First version of pandoc filter tool (11136957) · Commits · Centre for Testing and Interoperability / Markdown specifications development / Specification tools

pandocFilter/pandocFilter.py

0 → 100644

+119 −0

Original line number	Diff line number	Diff line
		#
		# pandocFilter.py
		#
		# Script to convert a oneM2M Markdown file to pandoc input format
		#
		# (c) 2023 by Andreas Kraft, Miguel Ortega
		# License: BSD 3-Clause License. See the LICENSE file for further details.
		#

		import argparse, os, re
		from rich import print
		from rich.progress import Progress, TextColumn, TimeElapsedColumn

		def readMDFile(progress:Progress, document:str) -> list[str]:
		    """ Read the markdown file and return a list of lines.

		        Args:
		            progress: Progress display on which a 'Reading document' task is registered.
		            document: Path of the markdown file to read.

		        Return:
		            The lines of the file as returned by `readlines()`, i.e. each line
		            keeps its trailing newline. Bytes that are not valid UTF-8 are
		            replaced instead of raising a decoding error.

		        Raises:
		            SystemExit: With exit code 1 when *document* does not exist.
		    """
		    _taskID = progress.add_task('[blue]Reading document', start=False, total=0)

		    # Check if file exists
		    if not os.path.exists(document):
		        print(f'File {document} does not exist')
		        # Raise SystemExit directly. The exit() helper is injected by the
		        # site module for interactive use and is not guaranteed to exist.
		        raise SystemExit(1)

		    # Read the whole file first, then mark the task as finished
		    with open(document, 'r', encoding='utf-8', errors = 'replace') as f:
		        _content = f.readlines()
		    progress.stop_task(_taskID)
		    return _content


		def writeMDFile(progress:Progress, mdLines:list[str], document:str, outDirectory:str) -> None:
		    """ Write the markdown file.

		        The output file is created in *outDirectory* and gets the base name of
		        *document*. An existing file with that name is overwritten.

		        Args:
		            progress: Progress display on which a 'Writing document' task is registered.
		            mdLines: The lines to write. They are written as-is, no newlines are added.
		            document: Path of the source document; only its base name is used.
		            outDirectory: Directory that receives the output file.
		    """
		    _taskID = progress.add_task('[blue]Writing document', start=False, total=0)

		    # Build the path with os.path.join instead of a hard-coded '/', so that an
		    # empty output directory means "current directory" and not the file system root.
		    _target = os.path.join(outDirectory, os.path.basename(document))

		    # Write the file
		    with open(_target, 'w', encoding='utf-8', errors = 'replace') as f:
		        f.writelines(mdLines)
		    progress.stop_task(_taskID)


		def correctTOC(progress:Progress, mdLines:list[str], tocSection:str = 'Contents') -> list[str]:
		    """ Correct the TOC to be compatible with pandoc.

		        The TOC section starts at the line that is exactly '# <tocSection>' and
		        ends at the next line that starts with '#'. Inside the section every line
		        that contains a bracketed label is cut off after the closing bracket,
		        which removes the link target that follows it. Lines inside the section
		        without a bracketed label are dropped. All lines outside the section are
		        copied unchanged.

		        Args:
		            progress: Progress display on which a 'Correcting TOC' task is registered.
		            mdLines: The lines of the markdown document.
		            tocSection: Title of the heading that introduces the TOC.

		        Return:
		            A new list with the corrected lines.
		    """
		    _taskID = progress.add_task('[blue]Correcting TOC', start=False, total=0)

		    _contents = f'# {tocSection}\n'
		    # Raw string, so that the escaped brackets reach the regex engine untouched.
		    # The group captures everything up to and including the last ']' of the line.
		    tocregex = re.compile(r'^(.*\[.*\])')

		    _lines:list[str] = []
		    _inTOC = False
		    for line in mdLines:
		        if line == _contents:       # find TOC section first
		            _inTOC = True
		            _lines.append(line)
		        elif not _inTOC:            # outside the TOC: copy unchanged
		            _lines.append(line)
		        elif line.startswith('#'):  # End of TOC?
		            _inTOC = False
		            _lines.append(line)
		        elif (_entry := tocregex.match(line)) is not None:
		            # Replace entry: keep indentation and label, drop the link target.
		            # NOTE(review): only one trailing space is emitted; a Markdown hard
		            # line break needs two - confirm which one is intended.
		            _lines.append(f'{_entry.group(1)} \n')
		        # Any other line inside the TOC (e.g. an empty line) is dropped.

		    progress.stop_task(_taskID)
		    return _lines


		def replaceTableCaptions(progress:Progress, mdLines:list[str]) -> list[str]:
		    """ Replace table captions with a pandoc table caption.

		        A caption is a line that starts with a bold '**Table ...**' text. It is
		        removed from its position and re-inserted as a 'Table: <caption>' line
		        in front of the block of table rows (lines starting with '|') that
		        directly precedes it. If no table row directly precedes the caption,
		        the 'Table: ' line simply takes the place of the caption.

		        Args:
		            progress: Progress display on which a 'Replacing table captions' task is registered.
		            mdLines: The lines of the markdown document.

		        Return:
		            A new list with the converted lines.
		    """
		    _taskID = progress.add_task('[blue]Replacing table captions', start=False, total=0)

		    # Raw string: the asterisks of the bold markers must be escaped for the
		    # regex engine. The group captures the caption text without the markers.
		    tableregex = re.compile(r'^\*\*(Table .*)\*\*')

		    _lines:list[str] = []
		    for line in mdLines:
		        _caption = tableregex.match(line)
		        if _caption is None:
		            _lines.append(line)
		            continue

		        # Move the caption to the beginning of the table and add a "Table:" prefix.
		        # Walk backwards over the table rows that were already copied.
		        _idx = len(_lines) - 1
		        while _idx >= 0 and _lines[_idx].startswith('|'):
		            _idx -= 1
		        # _idx is now the last line before the table (-1 if the table is the
		        # very first thing in the document), so insert exactly once after it.
		        _lines.insert(_idx + 1, f'Table: {_caption.group(1)}\n')

		    progress.stop_task(_taskID)
		    return _lines


		def process(document:str, outDirectory:str) -> None:
		    """ Convert a single document.

		        The document is read, passed through the TOC correction and the table
		        caption replacement (in that order), and the result is written to
		        *outDirectory*. Every step reports to a shared progress display.

		        Args:
		            document: Path of the markdown document to convert.
		            outDirectory: Directory that receives the converted document.
		    """
		    _columns = (TextColumn('{task.description}'), TimeElapsedColumn())
		    with Progress(*_columns) as progress:
		        _content = readMDFile(progress, document)
		        # Apply the transformations one after the other
		        for _step in (correctTOC, replaceTableCaptions):
		            _content = _step(progress, _content)
		        writeMDFile(progress, _content, document, outDirectory)



		if __name__ == '__main__':
		    # Describe the command line interface: one optional output directory
		    # and the mandatory document to convert.
		    _cli = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
		    _cli.add_argument(
		        '--outdir', '-o',
		        action='store',
		        dest='outDirectory',
		        default='out',
		        metavar='<output directory>',
		        help='specify output directory',
		    )
		    _cli.add_argument('document', help='document to parse')
		    _options = _cli.parse_args()

		    # Make sure the output directory exists, then convert the document
		    os.makedirs(_options.outDirectory, exist_ok=True)
		    process(_options.document, _options.outDirectory)

pandocFilter/requirements.txt

0 → 100644

+14 −0

Original line number	Diff line number	Diff line
		#
		# This file is autogenerated by pip-compile with Python 3.10
		# by the following command:
		#
		# pip-compile
		#
		markdown-it-py==2.2.0
		# via rich
		mdurl==0.1.2
		# via markdown-it-py
		pygments==2.15.1
		# via rich
		rich==13.3.5
		# via oneM2M-markdown-to-pandoc-filter (setup.py)

pandocFilter/setup.py

0 → 100644

+13 −0

Original line number	Diff line number	Diff line
		from setuptools import setup, find_packages

		# Package metadata for the oneM2M Markdown to pandoc filter.
		setup(
		    # Distribution names may only consist of ASCII letters, digits, '.', '_'
		    # and '-', so the words are joined with hyphens instead of spaces.
		    name='oneM2M-markdown-to-pandoc-filter',
		    version='0.0.1',
		    author='Andreas Kraft, Miguel Ortega',
		    author_email='an.kraft@gmail.com',
		    description='Convert oneM2M Markdown to Pandoc input',
		    packages=find_packages(),
		    # pandocFilter.py uses built-in generics such as list[str] in annotations
		    # that are evaluated when the functions are defined (Python 3.9+).
		    python_requires='>=3.9',
		    install_requires=[
		        'rich',
		    ],
		)