Commit 11136957 authored by ankraft's avatar ankraft
Browse files

First version of pandoc filter tool

parent 35c69bd8
Loading
Loading
Loading
Loading
+119 −0
Original line number Original line Diff line number Diff line
#
#	pandocFilter.py
#
#	Script to convert a oneM2M Markdown file to pandoc input format
#
#	(c) 2023 by Andreas Kraft, Miguel Ortega
#	License: BSD 3-Clause License. See the LICENSE file for further details.
#

import argparse, os, re
from rich import print
from rich.progress import Progress, TextColumn, TimeElapsedColumn

def readMDFile(progress:Progress, document:str) -> list[str]:
	"""	Load a markdown document from disk.

		Args:
			progress: Progress display on which a "Reading document" task is registered.
			document: Path of the markdown file to read.

		Return:
			The lines of the file, each one still carrying its line ending.

		If the file does not exist, a message is printed and the process
		is terminated with exit code 1.
	"""
	readTask = progress.add_task('[blue]Reading document', start=False, total=0)

	# Bail out early when there is nothing to read
	fileFound = os.path.exists(document)
	if not fileFound:
		print(f'File {document} does not exist')
		exit(1)

	# Bytes that are not valid UTF-8 are replaced instead of raising an error
	with open(document, 'r', encoding='utf-8', errors = 'replace') as source:
		progress.stop_task(readTask)
		content = source.readlines()
	return content
	

def writeMDFile(progress:Progress, mdLines:list[str], document:str, outDirectory:str) -> None:
	"""	Store the processed markdown lines in the output directory.

		The output file gets the same file name as the input document.

		Args:
			progress: Progress display on which a "Writing document" task is registered.
			mdLines: Lines to write. They are written as they are, no line endings are added.
			document: Path of the original document. Only its base name is used.
			outDirectory: Directory that receives the output file.
	"""
	writeTask = progress.add_task('[blue]Writing document', start=False, total=0)

	# Same file name as the input, but located in the output directory
	targetPath = f'{outDirectory}/{os.path.basename(document)}'
	with open(targetPath, 'w', encoding='utf-8', errors = 'replace') as target:
		target.writelines(mdLines)

	progress.stop_task(writeTask)


def correctTOC(progress:Progress, mdLines:list[str], tocSection:str = 'Contents') -> list[str]:
	"""	Correct the TOC to be compatible with pandoc.

		The TOC is the part of the document between the level-1 heading
		`# <tocSection>` and the next line that starts with `#`. Inside the
		TOC every line that contains a bracketed entry is cut off after its
		last `]` (which removes, for example, a trailing `(#anchor)` link
		target) and is terminated with two spaces, a markdown hard line break.

		Lines inside the TOC that contain no bracketed entry, such as blank
		lines, are not copied to the result.
		NOTE(review): this includes a blank line directly in front of the
		heading that ends the TOC - confirm that this is intended.

		Args:
			progress: Progress display on which a "Correcting TOC" task is registered.
			mdLines: The lines of the markdown document.
			tocSection: Title of the heading that starts the TOC.

		Return:
			A new list with the corrected lines. Lines outside the TOC are unchanged.
	"""
	_taskID = progress.add_task('[blue]Correcting TOC', start=False, total=0)

	_contents = f'# {tocSection}'
	# Raw string: "\[" and "\]" are regex escapes, not string escapes
	tocregex = re.compile(r'^(.*\[.*\])')

	_lines:list[str] = []
	_inTOC = False
	for line in mdLines:
		# Find the TOC section first. Trailing whitespace and a missing
		# final newline are ignored when comparing the heading.
		if line.rstrip() == _contents:
			_inTOC = True
			_lines.append(line)
			continue

		if not _inTOC:
			_lines.append(line)
			continue

		if line.startswith('#'):	# The next heading ends the TOC
			_inTOC = False
			_lines.append(line)
			continue

		# Keep only the entry text up to the last closing bracket
		entry = tocregex.match(line)
		if entry is not None:
			_lines.append(f'{entry.group(1)}  \n')

	progress.stop_task(_taskID)
	return _lines


def replaceTableCaptions(progress:Progress, mdLines:list[str]) -> list[str]:
	"""	Replace table captions with a pandoc table caption.

		A caption is a line that starts with bold text beginning with
		"Table ", e.g. `**Table 1: Overview**`. It is replaced by a single
		line `Table: <caption text>`. When table rows (lines starting with
		`|`) directly precede the caption, the new caption line is moved in
		front of the first of these rows. Otherwise it stays at the position
		of the original caption.

		Args:
			progress: Progress display on which a "Replacing table captions" task is registered.
			mdLines: The lines of the markdown document.

		Return:
			A new list of lines with the replaced captions. All other lines are unchanged.
	"""
	_taskID = progress.add_task('[blue]Replacing table captions', start=False, total=0)
	# Raw string: "\*" is a regex escape, not a string escape
	tableregex = re.compile(r'^\*\*(Table .*)\*\*')

	_lines:list[str] = []
	for line in mdLines:
		caption = tableregex.match(line)
		if caption is None:
			_lines.append(line)
			continue

		# Walk back over the table rows that directly precede the caption
		_idx = len(_lines)
		while _idx > 0 and _lines[_idx-1].startswith('|'):
			_idx -= 1

		# Insert the caption exactly once, also when the table is located
		# at the very beginning of the document (_idx == 0)
		_lines.insert(_idx, f'Table: {caption.group(1)}\n')

	progress.stop_task(_taskID)
	return _lines


def process(document:str, outDirectory:str) -> None:
	"""	Run the complete conversion for a single document.

		The document is read, the TOC is corrected, the table captions are
		replaced, and the result is written to the output directory. Each
		step is shown with its elapsed time.

		Args:
			document: Path of the markdown document to convert.
			outDirectory: Directory that receives the converted document.
	"""
	columns = (TextColumn('{task.description}'),  TimeElapsedColumn())
	with Progress(*columns) as progressBar:
		lines = readMDFile(progressBar, document)
		# The filter steps share the same signature and are applied in order
		for step in (correctTOC, replaceTableCaptions):
			lines = step(progressBar, lines)
		writeMDFile(progressBar, lines, document, outDirectory)



if __name__ == '__main__':
	# Command line interface: one positional document, one optional output directory
	parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
	parser.add_argument(
		'--outdir', '-o',
		action='store',
		dest='outDirectory',
		default = 'out',
		metavar = '<output directory>',
		help = 'specify output directory',
	)
	parser.add_argument('document',  help = 'document to parse')
	args = parser.parse_args()

	# The output directory must exist before the converted document is written
	os.makedirs(args.outDirectory, exist_ok = True)
	process(args.document, args.outDirectory)
+14 −0
Original line number Original line Diff line number Diff line
#
# This file is autogenerated by pip-compile with Python 3.10
# by the following command:
#
#    pip-compile
#
markdown-it-py==2.2.0
    # via rich
mdurl==0.1.2
    # via markdown-it-py
pygments==2.15.1
    # via rich
rich==13.3.5
    # via oneM2M-markdown-to-pandoc-filter (setup.py)

pandocFilter/setup.py

0 → 100644
+13 −0
Original line number Original line Diff line number Diff line
from setuptools import setup, find_packages

# Package metadata for the oneM2M markdown to pandoc filter
setup(
	name='oneM2M markdown to pandoc filter',
	version='0.0.1',
	author='Andreas Kraft, Miguel Ortega',
	author_email='an.kraft@gmail.com',
	description='Convert oneM2M Markdown to Pandoc input',
	packages=find_packages(),
	# pandocFilter.py uses built-in generic annotations such as list[str],
	# which are evaluated at definition time and need Python 3.9 or newer
	python_requires='>=3.9',
	install_requires=[
		'rich',
	],
)