Added converting bold and italic text in paragraphs, headers and tables. (10048d4b) · Commits · Centre for Testing and Interoperability / Markdown specifications development / spec2md

README.md

+7 −3

Original line number	Diff line number	Diff line
		@@ -13,8 +13,8 @@ python3 -m pip install -r requirements.txt
		## Usage
		- Create a directory with the Word document in it. The Word document must be in docx format. This can be achieved by opening the document with Word and saving it in docx format to another file.
		- Create a configuration file with the same base name as the Word document + .ini extension. This file may contain different configurations than the standard config.ini file provided.
		- Alternativaly, a file named config.ini will apply to all files in that directory.
		- It is only necessary to add the settings that are different from the config.ini file in the projects root directoy. That file will always act as a fallback.
		- Alternatively, a file named config.ini will apply to all files in that directory.
		- It is only necessary to add the settings that are different from the config.ini file in the project's root directory. That file will always act as a fallback.
		- Run the converter as follows:
		```
		python3 spec2md.py <path-to-word-document>
		@@ -25,3 +25,7 @@ python3 spec2md.py <path-to-word-document>
		### The converter doesn't seem to generate image files.

		Is LibreOffice already running? If yes, then close it.

		## Changes

		- 2023-07-27 - Added converting bold and italic text in paragraphs, headers and tables.
		No newline at end of file

requirements.txt

+11 −7

Original line number	Diff line number	Diff line
		#
		# This file is autogenerated by pip-compile with python 3.10
		# To update, run:
		# This file is autogenerated by pip-compile with Python 3.10
		# by the following command:
		#
		# pip-compile
		#
		commonmark==0.9.1
		lxml==4.9.3
		# via
		# oneM2M-spec-2-MD-converter (setup.py)
		# python-docx
		markdown-it-py==3.0.0
		# via rich
		lxml==4.9.1
		# via python-docx
		pygments==2.13.0
		mdurl==0.1.2
		# via markdown-it-py
		pygments==2.15.1
		# via rich
		python-docx==0.8.11
		# via oneM2M-spec-2-MD-converter (setup.py)
		rich==12.5.1
		rich==13.4.2
		# via oneM2M-spec-2-MD-converter (setup.py)

setup.py

+1 −0

Original line number	Diff line number	Diff line
		@@ -9,6 +9,7 @@ setup(
		description='Convert oneM2M specifications to Markdown',
		packages=find_packages(),
		install_requires=[
		'lxml',
		'rich',
		'python-docx',
		]

spec2md.py

+29 −8

Original line number	Diff line number	Diff line
		@@ -24,8 +24,7 @@ from rich.progress import Progress, TextColumn, BarColumn
		from rich.console import Console
		from rich import inspect
		import configparser, zipfile
		from xml.etree import ElementTree as ET

		from lxml import etree as ET

		class Style(IntEnum):
		example = auto()
		@@ -78,7 +77,9 @@ _captionMarker = '__CAPTION__'
		console = Console()
		_print:Callable = print


		# Some predefined tags and attributes
		wns = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
		_val = f'{{{wns}}}val'

		class SectionNumbers(object):

		@@ -286,7 +287,7 @@ def processDocuments(documents:list[str], outDirectory:str) -> None:
		return tag


		def getTextFromXML(elem:Paragraph) -> str:
		def getTextFromXML(elem:Paragraph|_Cell) -> str:

		# Not-used document tags.
		_ignoredTags = ( 'AlternateContent',
		@@ -322,7 +323,18 @@ def processDocuments(documents:list[str], outDirectory:str) -> None:
		for x in element:
		_result += _parseXML(x)
		case 't':
		_result += str(toMD(str(element.text)))
		bold = ''
		italics = ''
		for e in element.getparent():
		if strippedTag(e.tag) == 'rPr': # paragraph style
		for ep in e:
		match strippedTag(ep.tag):
		case 'b' if ep.attrib.get(_val, 'true') == 'true':
		bold = '**'
		case 'i' if ep.attrib.get(_val, 'true') == 'true':
		italics = '_'
		_result += f'{bold}{italics}{str(toMD(str(element.text)))}{italics}{bold}'

		case 'br':
		_result += _linebreak
		case 'bookmarkStart' | 'bookmarkEnd': # TODO ?
		@@ -381,7 +393,16 @@ def processDocuments(documents:list[str], outDirectory:str) -> None:
		return _result

		#_print(ET.fromstring(elem._p.xml))
		match elem:
		case Paragraph(): # type: ignore[misc]
		return _parseXML(ET.fromstring(elem._p.xml))
		case _Cell(): # type: ignore[misc]
		result = ''
		for p in elem.paragraphs:
		result += _parseXML(ET.fromstring(p._p.xml))
		return result
		case _:
		return ''



		@@ -620,7 +641,7 @@ def processDocuments(documents:list[str], outDirectory:str) -> None:
		for row in elem.rows:
		cells:list[str] = []
		for cell in row.cells:
		cells.append(f'{toMD(cell.text)} ') # add at least a space
		cells.append(f'{getTextFromXML(cell)} ') # add at least a space
		rows.append(cells)
		nrRows += 1

		@@ -706,7 +727,7 @@ def processDocuments(documents:list[str], outDirectory:str) -> None:

		for i in range(len(lines)):
		line = lines[i]
		lines[i] = re.sub(_referenceExpression, _repl, line)
		lines[i] = re.sub(_referenceExpression, _repl, line) # type:ignore[arg-type]

		#
		# Write produced Markdown file