Commit 9328ebdb authored by ankraft's avatar ankraft
Browse files

Added support for merging consecutive code paragraphs into a single code block

parent e0ababce
Loading
Loading
Loading
Loading
+3 −2
Original line number Original line Diff line number Diff line
@@ -12,8 +12,8 @@ python3 -m pip install -r requirements.txt


## Usage
## Usage
- Create a directory with the Word document in it. The Word document **must** be in *docx* format. This can be achieved by opening the document with *Word* and saving it in *docx* format to another file.
- Create a directory with the Word document in it. The Word document **must** be in *docx* format. This can be achieved by opening the document with *Word* and saving it in *docx* format to another file.
- Create a configuration file with the same base name as the Word document + *.ini* extension. This file may contain different configurations as the standard *config.ini* file provided. 
- Optional: Create a configuration file *config.ini* in that directory. This file may contain different configurations than the configuration file in the project's root directory. This configuration file will apply to all files in that directory.
	- Alternatively, a file named *config.ini* will apply to all files in that directory.
	- Alternatively, create a configuration file with the same base name as the Word document. This configuration file will only apply to the Word document with the same base name.
	- It is only necessary to add the settings that are different from the *config.ini* file in the project's root directory. That file will always act as a fallback.
	- It is only necessary to add the settings that are different from the *config.ini* file in the project's root directory. That file will always act as a fallback.
- Run the converter as follows:
- Run the converter as follows:
```
```
@@ -63,5 +63,6 @@ Lists in table cells are also not possible. One may use html lists for this, but


## Changes
## Changes


- **2024-01-09** - Added support for merging consecutive code paragraphs into a single code block.
- **2023-08-18** - Improved handling of sometimes broken inline formatting in table cells. Adding more default heading formats.
- **2023-08-18** - Improved handling of sometimes broken inline formatting in table cells. Adding more default heading formats.
- **2023-07-27** - Added converting bold and italic text in paragraphs, headers and tables.
- **2023-07-27** - Added converting bold and italic text in paragraphs, headers and tables.
 No newline at end of file
+9 −2
Original line number Original line Diff line number Diff line
@@ -7,8 +7,12 @@




[general]
[general]
# Replace non-breaking spaces in the word document with an HTML space entity.
replaceNbsp = &nbsp;
replaceNbsp = &nbsp;


# Replace the less than character in the word document with an HTML entity.
replaceLt = &lt;

; Rename EMF/WMF image references to a different file extension.
; Rename EMF/WMF image references to a different file extension.
; Allowed values: png, svg.
; Allowed values: png, svg.
; If not present, no renaming will happen.
; If not present, no renaming will happen.
@@ -18,12 +22,14 @@ renameEMFExtension = svg
; Default: false
; Default: false
skipUnreferencedMediaFiles = false
skipUnreferencedMediaFiles = false


replaceLt = <
# Combine code paragraphs into a single markdown code paragraph.
combineCodeParagraphs = true


; Add image captions to the markdown's alternate text.
; Add image captions to the markdown's alternate text.
; Note that the image caption has to follow the image in the document.
; Note that the image caption has to follow the image in the document.
imageCaptions2AltText = true
imageCaptions2AltText = true



[toc]
[toc]
addSectionNumbers = false
addSectionNumbers = false
excludeFromNumbering =
excludeFromNumbering =
@@ -35,7 +41,7 @@ addTocMacro = false


[paragraphs]
[paragraphs]
normal = normal
normal = normal
h1 = heading 1
h1 = heading 1, tt
h2 = heading 2
h2 = heading 2
h3 = heading 3
h3 = heading 3
h4 = heading 4
h4 = heading 4
@@ -48,6 +54,7 @@ a1 = heading 1
a2 = heading 2
a2 = heading 2
a3 = heading 3
a3 = heading 3
note = no
note = no
code = pl
example = ex, ew
example = ex, ew
ul1 = b1, b1+, list paragraph
ul1 = b1, b1+, list paragraph
ul2 = b2, b2+
ul2 = b2, b2+
+32 −0
Original line number Original line Diff line number Diff line
@@ -27,6 +27,7 @@ import configparser, zipfile
from lxml import etree as ET
from lxml import etree as ET


class Style(IntEnum):
class Style(IntEnum):
	code = auto()
	example = auto()
	example = auto()
	image = auto()
	image = auto()
	imagecaption = auto()
	imagecaption = auto()
@@ -145,6 +146,7 @@ class DocumentConfiguration(object):
		self.renameEMFExtension = config.get('general', 'renameEMFExtension', fallback = None)
		self.renameEMFExtension = config.get('general', 'renameEMFExtension', fallback = None)
		self.skipUnreferencedMediaFiles = config.getboolean('general', 'skipUnreferencedMediaFiles', fallback = False)
		self.skipUnreferencedMediaFiles = config.getboolean('general', 'skipUnreferencedMediaFiles', fallback = False)
		self.imageCaptions2AltText = config.getboolean('general', 'imageCaptions2AltText', fallback = True)
		self.imageCaptions2AltText = config.getboolean('general', 'imageCaptions2AltText', fallback = True)
		self.combineCodeParagraphs = config.getboolean('general', 'combineCodeParagraphs', fallback = True)


		#	Paragraphs
		#	Paragraphs
		self.paragraphs = { c : config.getlist('paragraphs', c)	# type: ignore [attr-defined]
		self.paragraphs = { c : config.getlist('paragraphs', c)	# type: ignore [attr-defined]
@@ -170,6 +172,7 @@ class DocumentConfiguration(object):
		self.ul4 = self.paragraphs['ul4']
		self.ul4 = self.paragraphs['ul4']
		self.ul5 = self.paragraphs['ul5']
		self.ul5 = self.paragraphs['ul5']
		#self.continuedlist = self.paragraphs['continuedlist']
		#self.continuedlist = self.paragraphs['continuedlist']
		self.code = self.paragraphs['code']
		self.note = self.paragraphs['note']
		self.note = self.paragraphs['note']
		self.example = self.paragraphs['example']
		self.example = self.paragraphs['example']
		self.tablecaption = self.paragraphs['tablecaption']
		self.tablecaption = self.paragraphs['tablecaption']
@@ -653,6 +656,12 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
							lines.append('')
							lines.append('')
							lines.append(text)
							lines.append(text)


						#	Code
						elif style in docConfig.code:
							checkSameStyle(Style.code, lambda:lines.append(''))
							for _t in text.split(_linebreak):
								lines.append(f'```{_t if _t else " "}```  ') # at least an empty space. And 2 spaces at the end for newline

						#	Example
						#	Example
						elif style in docConfig.example:
						elif style in docConfig.example:
							checkSameStyle(Style.example, lambda:lines.append(''))
							checkSameStyle(Style.example, lambda:lines.append(''))
@@ -745,6 +754,29 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
				lines[i] = line
				lines[i] = line




			#
			#	Combine multiple consecutive "code" lines
			#

			if docConfig.combineCodeParagraphs:
				# Code paragraphs were emitted above as one inline span per line,
				# i.e. '```<text>```  ' (three backticks, text, three backticks and
				# two trailing spaces). Consecutive spans are merged here into a
				# single fenced code block.
				codeblock:list[str] = []	# pending code lines of the current run
				_lines:list[str] = []		# resulting document lines
				for line in lines:
					if line.startswith('```') and line.endswith('```  '):
						# Store the code line without the leading '```' (3 chars)
						# and the trailing '```  ' (5 chars).
						codeblock.append(line[3:-5])
						continue

					if codeblock:
						# A non-code line ends the current run: emit the fenced block
						_lines.append('```')
						_lines.extend(codeblock)
						_lines.append('```')
						codeblock = []

					# Always keep the non-code line itself. Previously the line that
					# terminated a code run was dropped.
					_lines.append(line)

				if codeblock:
					# The document ended with a code run: flush it as well,
					# otherwise the trailing code block would be lost.
					_lines.append('```')
					_lines.extend(codeblock)
					_lines.append('```')
				lines = _lines

			#
			#
			#	Insert auto-generated table of contents
			#	Insert auto-generated table of contents
			#
			#