Add the insertLineAfterTableOrCodeBlock function (56d86df3) · Commits · Centre for Testing and Interoperability / Markdown specifications development / Specification tools

generateBaseline/pandocFilter.py

+21 −10

Original line number	Diff line number	Diff line
		@@ -267,8 +267,9 @@ def instertLineAfterTableOrCodeBlock(progress:Progress, mdLines:list[str]) -> li
		_taskID = progress.add_task('[blue]Inserting line after table or code block', total=0)
		# progress.update()

		matchCodefenceStart = re.compile(r'\s```\s?.', re.IGNORECASE)
		matchCodefenceEnd = re.compile(r'\s*```\s?', re.IGNORECASE)
		# Also support quoted code fences in examples, e.g. "> ```sparql"
		matchCodefenceStart = re.compile(r'^\s(?:>\s)```\s?.', re.IGNORECASE)
		matchCodefenceEnd = re.compile(r'^\s(?:>\s)*```\s?$', re.IGNORECASE)
		matchTable = re.compile(r'^\s\\|.\\|\s*$', re.IGNORECASE)
		matchTableSeparator = re.compile(r'^\s\\|([-: ]+\\|)+\s$', re.IGNORECASE)
		matchGridTable = re.compile(r'^\s\+-.\+\s$', re.IGNORECASE)
		@@ -291,13 +292,20 @@ def instertLineAfterTableOrCodeBlock(progress:Progress, mdLines:list[str]) -> li
		# Note, that longer codefences are allowed by the markdown specification.

		if matchCodefenceStart.match(line) and not inCodefence:
		# For quoted fences (e.g. "> ```"), add an empty quoted line before
		# the fence so the previous quoted text is not absorbed as code.
		if line.lstrip().startswith('>'):
		qprefix = re.match(r'^(\s(?:>\s)+)', line)
		if qprefix:
		_lines.append(f'{qprefix.group(1)}\n')
		inCodefence = True
		_lines.append(line)
		continue
		if matchCodefenceEnd.match(line):
		inCodefence = False
		_lines.append(line)
		_lines.append("<br />") # insert a blank line after the code block
		_lines.append("\n")
		_lines.append("<br />")
		continue
		if inCodefence:
		_lines.append(line)
		@@ -320,9 +328,9 @@ def instertLineAfterTableOrCodeBlock(progress:Progress, mdLines:list[str]) -> li
		inTable = False
		tableHasSeparator = False
		# Mark the previous line as the last row in the table
		_lines.append("<br />") # insert a blank line after the table
		continue
		# continue with other matches
		_lines.append("\n")
		_lines.append("<br />")
		# Continue processing the current line (first line after table).

		#Detect grid tables and convert them to html table
		if matchGridTable.match(line) and not inGridTable:
		@@ -344,8 +352,9 @@ def instertLineAfterTableOrCodeBlock(progress:Progress, mdLines:list[str]) -> li
		else:
		inGridTable = False
		#processGridTable()
		_lines.append("<br />") # insert a blank line after the grid table
		continue
		_lines.append("\n")
		_lines.append("<br />")
		# Continue processing the current line (first line after grid table).
		# continue with other matches

		# Detect notes
		@@ -358,7 +367,8 @@ def instertLineAfterTableOrCodeBlock(progress:Progress, mdLines:list[str]) -> li
		if inNote and line == "":
		inNote = False
		_lines.append(line)
		_lines.append("<br />") # insert a blank line after the note
		_lines.append("\n")
		_lines.append("<br />")
		continue
		if inNote:
		_lines.append(line)
		@@ -408,6 +418,7 @@ def process(args) -> None:
		mdLines = correctTableSeparators(progress, mdLines)
		mdLines = instertLineBeforeStartOfList(progress, mdLines)
		mdLines = replaceBulletedLettersLists(progress, mdLines)
		# Spacing after code/table blocks is now handled in DOCX postprocessing.
		# mdLines = instertLineAfterTableOrCodeBlock(progress, mdLines)
		mdLines = replaceLineBreaks(progress, mdLines)
		writeMDFile(progress, mdLines, args.document, args.outDirectory)

generateBaseline/postprocessing.py

+100 −0

Original line number	Diff line number	Diff line
		@@ -2376,6 +2376,105 @@ def update_source_code_style(docx_input, docx_output):
		if os.path.exists(tmp_path):
		os.remove(tmp_path)


		def add_break_after_code_blocks_and_tables(docx_input, docx_output):
		    """
		    Insert an empty, zero-spacing paragraph after code/example paragraphs and
		    after tables, to improve separation from following body text in Word.

		    Only direct children of the document body are inspected. A separator is
		    added after every paragraph whose style is 'PL' or 'EW' and after every
		    table, unless the next sibling is already an empty paragraph (no text and
		    no drawing). A summary of the number of inserted separators is printed.

		    The modified archive is written to a temporary file first and then moved
		    to ``docx_output``, whose mode is set to 0o644.

		    Args:
		        docx_input: Path of the DOCX file to read.
		        docx_output: Path the updated DOCX file is written to.

		    Returns:
		        None. If the document has no ``w:body`` element the function returns
		        early and ``docx_output`` is not written.
		    """
		    ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}

		    with zipfile.ZipFile(docx_input, 'r') as zin:
		        xml_data = zin.read("word/document.xml")

		    root = etree.fromstring(xml_data)
		    body = root.find(f".//{{{ns['w']}}}body")
		    if body is None:
		        return

		    # Fully qualified (Clark notation) names of the elements/attributes used.
		    w_p = f"{{{ns['w']}}}p"
		    w_pPr = f"{{{ns['w']}}}pPr"
		    w_spacing = f"{{{ns['w']}}}spacing"
		    w_tbl = f"{{{ns['w']}}}tbl"
		    w_val = f"{{{ns['w']}}}val"
		    w_before = f"{{{ns['w']}}}before"
		    w_after = f"{{{ns['w']}}}after"

		    def _has_target_block_style(elem):
		        # True for a paragraph whose first pStyle is 'PL' or 'EW'.
		        if elem is None or elem.tag != w_p:
		            return False
		        style_elems = elem.xpath('./w:pPr/w:pStyle', namespaces=ns)
		        return bool(style_elems and style_elems[0].get(w_val) in {"PL", "EW"})

		    def _is_empty_paragraph(elem):
		        # True for a paragraph without visible text and without a drawing.
		        if elem is None or elem.tag != w_p:
		            return False
		        texts = ''.join(elem.xpath('.//w:t/text()', namespaces=ns)).strip()
		        return texts == '' and not elem.xpath('.//w:drawing', namespaces=ns)

		    def _new_separator():
		        # Build <w:p><w:pPr><w:spacing w:before="0" w:after="0"/></w:pPr></w:p>.
		        sep_p = etree.Element(w_p)
		        sep_pPr = etree.Element(w_pPr)
		        sep_spacing = etree.Element(w_spacing)
		        sep_spacing.set(w_before, "0")
		        sep_spacing.set(w_after, "0")
		        sep_pPr.append(sep_spacing)
		        sep_p.append(sep_pPr)
		        return sep_p

		    code_break_paras = 0
		    table_break_paras = 0
		    # Iterate over a snapshot so the separators inserted below are not visited.
		    # NOTE(review): a separator is inserted after *every* PL/EW paragraph, also
		    # between two consecutive ones - confirm that code blocks are never split
		    # into several PL/EW paragraphs.
		    for elem in list(body):
		        is_code_para = _has_target_block_style(elem)
		        if not is_code_para and elem.tag != w_tbl:
		            continue

		        # Skip if an empty paragraph already follows (e.g. from an earlier run).
		        if _is_empty_paragraph(elem.getnext()):
		            continue

		        body.insert(body.index(elem) + 1, _new_separator())
		        if is_code_para:
		            code_break_paras += 1
		        else:
		            table_break_paras += 1

		    print(
		        f'Inserted minimal separators: post-code/example paragraphs={code_break_paras}, '
		        f'post-table paragraphs={table_break_paras}'
		    )

		    xml_data = etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone="yes")

		    tmp_fd, tmp_path = tempfile.mkstemp(suffix=".docx")
		    os.close(tmp_fd)  # only the path is needed; zipfile reopens the file

		    try:
		        with zipfile.ZipFile(docx_input, 'r') as zin, zipfile.ZipFile(tmp_path, 'w', zipfile.ZIP_DEFLATED) as zout:
		            # Copy every member except the document part, then add the new one.
		            for item in zin.infolist():
		                if item.filename != "word/document.xml":
		                    data = zin.read(item.filename)
		                    zout.writestr(item.filename, data)
		            zout.writestr("word/document.xml", xml_data)

		        shutil.move(tmp_path, docx_output)
		        os.chmod(docx_output, 0o644)
		    finally:
		        # After a successful move the temp file is gone; only clean up leftovers.
		        if os.path.exists(tmp_path):
		            os.remove(tmp_path)

		def update_equation_style(docx_input, docx_output):
		"""
		Updates equation styles from 'Equation' to 'EQ' in a DOCX file.
		@@ -2539,6 +2638,7 @@ def update_format_styles_cli():
		add_no_break_hyphens(args.docx_input, args.docx_output)
		update_references_style(args.docx_input, args.docx_output)
		update_source_code_style(args.docx_input, args.docx_output)
		add_break_after_code_blocks_and_tables(args.docx_input, args.docx_output)
		update_equation_style(args.docx_input, args.docx_output)
		correct_quotes_docx(args.docx_input, args.docx_output)