From 757d182cc3aea7dfeb6afd6b2b0788f330400363 Mon Sep 17 00:00:00 2001
From: Naum Spaseski <naum.spaseski@etsi.org>
Date: Mon, 15 Dec 2025 16:05:43 +0100
Subject: [PATCH 1/2] URI and other corrections, mostly inspired by NFV docs

---
 spec2md.py | 327 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 318 insertions(+), 9 deletions(-)

diff --git a/spec2md.py b/spec2md.py
index c2f9d58..0d8da69 100644
--- a/spec2md.py
+++ b/spec2md.py
@@ -153,7 +153,8 @@ class DocumentConfiguration(object):
 		self.skipUnreferencedMediaFiles = config.getboolean('general', 'skipUnreferencedMediaFiles', fallback = False)
 		self.imageCaptions2AltText = config.getboolean('general', 'imageCaptions2AltText', fallback = True)
 		self.combineCodeParagraphs = config.getboolean('general', 'combineCodeParagraphs', fallback = True)
-
+		self.pandocTableWarnings = config.getboolean('general', 'pandocTableWarnings', fallback = True)
+		self.escapeQuotations = config.getboolean('general', 'escapeQuotations', fallback = False)
 		#	Paragraphs
 		self.paragraphs = { c : config.getlist('paragraphs', c)	# type: ignore [attr-defined]
 							for c in config['paragraphs'] }
@@ -248,6 +249,7 @@ def processDocuments(documents:list[str],
 	headers:list[Tuple[int, str]]									= []
 	emfFiles:list[str]												= []
 	referencedImages:list[str]										= []
+	lastListIndent:dict[str, int]									= {}
 	footnotes:dict[str, str]										= {}
 
 	global _print
@@ -356,6 +358,50 @@ def processDocuments(documents:list[str],
 			)
 			newParagraphs = 0
 
+			def _isURILike(text:str) -> bool:
+				"""	Check if text appears to be a URI or URI template.
+					Text is URI-like when it holds both an opening and a closing curly
+					brace (e.g. a `{variableName}` placeholder) or at least one slash.
+					Empty text is never URI-like.
+				"""
+				if not text:
+					return False
+				# A slash alone is sufficient; no keyword such as 'http' or 'apiRoot' is
+				# required, so ordinary text like "and/or" is classified as URI-like as well.
+				# NOTE(review): tighten this test if such text must keep its bold formatting.
+				return ('{' in text and '}' in text) or '/' in text
+
+			def _isAPIReference(text:str) -> bool:
+				"""	Check if text appears to be an API reference.
+					Accepted shapes, after trimming surrounding whitespace, are
+					`VnfInstances.Post`, `VnfInstances.Post.201` and `VnfInstancesPostRequest`
+					(or `...Response`). The component name starts with an upper-case letter.
+				"""
+				if not text:
+					return False
+				_methods = r'(?:Post|Get|Put|Delete|Patch|Head|Options)'
+				# Dotted form: "Component.Method" with an optional ".StatusCode"
+				_dotted = rf'\.{_methods}(?:\.\d+)?'
+				# Fused form: "ComponentMethodRequest" or "ComponentMethodResponse"
+				_fused = rf'{_methods}(?:Request|Response)'
+				_candidate = text.strip()
+				return re.match(rf'^[A-Z][a-zA-Z0-9]+(?:{_dotted}|{_fused})$', _candidate) is not None
+
+			def _getRunFormatting(runElement:ET.Element) -> Tuple[bool, bool]:
+				"""	Return (bold, italic) as set by the `b` and `i` entries of the run's `rPr` child.
+					An entry counts as on when its `_val` attribute is absent or is "true", "1" or "on".
+				"""
+				bold = italic = False
+				for e in runElement:
+					if strippedTag(e.tag) == 'rPr':
+						for ep in e:
+							match strippedTag(ep.tag):
+								case 'b' if ep.attrib.get(_val, 'true') in ('true', '1', 'on'):
+									bold = True
+								case 'i' if ep.attrib.get(_val, 'true') in ('true', '1', 'on'):
+									italic = True
+				return (bold, italic)
+
 			def _parseXML(element:ET.Element, inCell:Optional[bool] = False) -> str:
 				"""	Recursively parse a document paragraph.
 				"""
@@ -365,12 +411,101 @@ def processDocuments(documents:list[str],
 				tag = strippedTag(element.tag)	# remove namespaces for easier handlings
 				match tag:
 					case 'p':
-						for x in element:
-							_result += _parseXML(x, inCell)
+						# Collect all runs first to merge consecutive runs with same formatting
+						current_run_text = ''
+						current_bold = False
+						current_italic = False
+						
+						def _outputCurrentRun() -> str:
+							"""	Output the current accumulated run with proper formatting.
+							"""
+							nonlocal current_run_text, current_bold, current_italic
+							if not current_run_text:
+								return ''
+							
+							# Determine if this looks like a URI or API reference
+							is_uri = _isURILike(current_run_text)
+							is_api_ref = _isAPIReference(current_run_text)
+							
+							# Process text
+							_s = str(toMD(current_run_text))
+							# Don't strip whitespace for URIs or if not bold/italic
+							if not is_uri and (current_bold or current_italic):
+								_s = _s.strip()
+							
+							# For URIs and API references that are bold, use inline code instead
+							if (is_uri or is_api_ref) and current_bold:
+								# Use inline code instead of bold for URIs and API references
+								# Replace single * or _ (but not needed in code blocks)
+								_s = _s.replace('`', '\\`')  # Escape backticks
+								result = f'`{_s}` '
+							else:
+								# Replace single * or _
+								_s = _s.replace('_', '\\_')
+								_s = _s.replace('*', '\\*')
+								if docConfig.escapeQuotations:
+									_s = _s.replace('"', '\\"') # Escape double quotes
+								
+								# Apply formatting
+								_bold_marker = '**' if current_bold else ''
+								_italic_marker = '_' if current_italic else ''
+								
+								# Add trailing space only if formatting is applied and not URI-like
+								_postfix = ' ' if (current_bold or current_italic) and not is_uri else ''
+								result = f'{_bold_marker}{_italic_marker}{_s}{_italic_marker}{_bold_marker}{_postfix}'
+							
+							# Reset current run
+							current_run_text = ''
+							current_bold = False
+							current_italic = False
+							
+							return result
+						
+						for child in element:
+							child_tag = strippedTag(child.tag)
+							if child_tag == 'r':
+								# Get formatting for this run
+								run_bold, run_italic = _getRunFormatting(child)
+								# Extract text from this run
+								run_text = ''
+								for text_elem in child:
+									if strippedTag(text_elem.tag) == 't':
+										run_text += str(text_elem.text) if text_elem.text else ''
+									elif strippedTag(text_elem.tag) == 'br':
+										run_text += _linebreak
+									elif strippedTag(text_elem.tag) == 'tab':
+										run_text += '    '
+									# Handle other inline elements recursively
+									else:
+										run_text += _parseXML(text_elem, inCell)
+								
+								# Merge with previous run if formatting matches
+								if run_text and (run_bold == current_bold and run_italic == current_italic and current_run_text):
+									current_run_text += run_text
+								else:
+									# Output previous run if any
+									if current_run_text:
+										_result += _outputCurrentRun()
+									# Start new run
+									current_run_text = run_text
+									current_bold = run_bold
+									current_italic = run_italic
+							else:
+								# Non-run element - output current run first, then process the element
+								if current_run_text:
+									_result += _outputCurrentRun()
+								_result += _parseXML(child, inCell)
+						
+						# Output last run if any
+						if current_run_text:
+							_result += _outputCurrentRun()
+					
 					case 'r':
+						# For runs not in paragraphs (shouldn't happen normally, but handle gracefully)
 						for x in element:
 							_result += _parseXML(x, inCell)
 					case 't':
+						# Fallback for text elements not processed in paragraph merge
 						_bold = ''
 						_italics = ''
 						for e in element.getparent():
@@ -384,13 +519,18 @@ def processDocuments(documents:list[str],
 										# case _:
 										# 	_print(f'[yellow]unsupported style: {ep.tag}')
 						
-						# Strip white spaces if bold or italics
-						_s = str(toMD(str(element.text))).strip() if _bold or _italics else str(toMD(str(element.text)))
+						_text = str(toMD(str(element.text))) if element.text else ''
+						is_uri = _isURILike(_text)
+						
+						# Strip white spaces if bold or italics, but not for URIs
+						_s = _text.strip() if (_bold or _italics) and not is_uri else _text
 						# Replace single * or _
 						_s = _s.replace('_', '\\_')
 						_s = _s.replace('*', '\\*')
-						# Add trailing white space when bold or italics
-						_postfix = ' ' if _bold or _italics else ''
+						if docConfig.escapeQuotations:
+							_s = _s.replace('"', '\\"') # Escape double quotes
+						# Add trailing white space when bold or italics, but not for URIs
+						_postfix = ' ' if (_bold or _italics) and not is_uri else ''
 						_result += f'{_bold}{_italics}{_s}{_italics}{_bold}{_postfix}'
 						# print(_result)
 
@@ -681,23 +821,128 @@ def processDocuments(documents:list[str],
 						elif style in docConfig.ul1:
 							checkSameStyle(Style.unorderedlist, lambda:lines.append(''))
 							if len(elem.text):	# ignore empty
+								if (match := re.match(r'^(\s*\d+\.\s+)', text)):
+									# Numbered item: capture indentation length
+									lastListIndent['ul1'] = len(match.group(1))
+									lines.append(text)
+								elif re.match(r'^\[[\w\.]+\]', text):
+									# Reference
+									lines.append('')
+									lines.append(text)
+								elif text.startswith(' '):
+									# Continuation: use captured indentation
+									lines.append('')
+									indent = lastListIndent.get('ul1', 4)
+									lines.append(' ' * indent + text.lstrip())
+								elif (bullet := re.match(r'\s*[-*](?:\s+|$)', text)):
+									# Explicit bullet: a lone '-' or '*' marker; leading `**bold**` or "-5" is kept
+									lastListIndent['ul1'] = 4
+									lines.append(f'- {text[bullet.end():].strip()}')
+								elif 'bullet' in style or style.startswith('b'):
+									# Style implies bullet
+									lastListIndent['ul1'] = 4
 								lines.append(f'- {text}')
+								else:
+									# Fallback: treat as continuation paragraph (e.g. List Paragraph without number/bullet)
+									lines.append('')
+									indent = lastListIndent.get('ul1', 4)
+									lines.append(' ' * indent + text.lstrip())
+
 						elif style in docConfig.ul2:
 							checkSameStyle(Style.unorderedlist2, lambda:lines.append(''))
 							if len(elem.text):	# ignore empty
+								if (match := re.match(r'^(\s*\d+\.\s+)', text)):
+									lastListIndent['ul2'] = len(match.group(1))
+									lines.append(f'{"    "*1}{text}')
+								elif re.match(r'^\[[\w\.]+\]', text):
+									lines.append('')
+									lines.append(f'{"    "*1}{text}')
+								elif text.startswith(' '):
+									lines.append('')
+									indent = lastListIndent.get('ul2', 4)
+									lines.append(f'{"    "*1}{" " * indent}{text.lstrip()}')
+								elif text.lstrip().startswith(('-', '*')):
+									lastListIndent['ul2'] = 4
+									lines.append(f'{"    "*1}- {text.lstrip("- *").strip()}')
+								elif 'bullet' in style or style.startswith('b'):
+									lastListIndent['ul2'] = 4
 								lines.append(f'{"    "*1}- {text}')
+								else:
+									lines.append('')
+									indent = lastListIndent.get('ul2', 4)
+									lines.append(f'{"    "*1}{" " * indent}{text.lstrip()}')
+
 						elif style in docConfig.ul3:
 							checkSameStyle(Style.unorderedlist3, lambda:lines.append(''))
 							if len(elem.text):	# ignore empty
+								if (match := re.match(r'^(\s*\d+\.\s+)', text)):
+									lastListIndent['ul3'] = len(match.group(1))
+									lines.append(f'{"    "*2}{text}')
+								elif re.match(r'^\[[\w\.]+\]', text):
+									lines.append('')
+									lines.append(f'{"    "*2}{text}')
+								elif text.startswith(' '):
+									lines.append('')
+									indent = lastListIndent.get('ul3', 4)
+									lines.append(f'{"    "*2}{" " * indent}{text.lstrip()}')
+								elif text.lstrip().startswith(('-', '*')):
+									lastListIndent['ul3'] = 4
+									lines.append(f'{"    "*2}- {text.lstrip("- *").strip()}')
+								elif 'bullet' in style or style.startswith('b'):
+									lastListIndent['ul3'] = 4
 								lines.append(f'{"    "*2}- {text}')
+								else:
+									lines.append('')
+									indent = lastListIndent.get('ul3', 4)
+									lines.append(f'{"    "*2}{" " * indent}{text.lstrip()}')
+
 						elif style in docConfig.ul4:
 							checkSameStyle(Style.unorderedlist4, lambda:lines.append(''))
 							if len(elem.text):	# ignore empty
+								if (match := re.match(r'^(\s*\d+\.\s+)', text)):
+									lastListIndent['ul4'] = len(match.group(1))
+									lines.append(f'{"    "*3}{text}')
+								elif re.match(r'^\[[\w\.]+\]', text):
+									lines.append('')
+									lines.append(f'{"    "*3}{text}')
+								elif text.startswith(' '):
+									lines.append('')
+									indent = lastListIndent.get('ul4', 4)
+									lines.append(f'{"    "*3}{" " * indent}{text.lstrip()}')
+								elif text.lstrip().startswith(('-', '*')):
+									lastListIndent['ul4'] = 4
+									lines.append(f'{"    "*3}- {text.lstrip("- *").strip()}')
+								elif 'bullet' in style or style.startswith('b'):
+									lastListIndent['ul4'] = 4
 								lines.append(f'{"    "*3}- {text}')
+								else:
+									lines.append('')
+									indent = lastListIndent.get('ul4', 4)
+									lines.append(f'{"    "*3}{" " * indent}{text.lstrip()}')
+
 						elif style in docConfig.ul5:
 							checkSameStyle(Style.unorderedlist5, lambda:lines.append(''))
 							if len(elem.text):	# ignore empty
+								if (match := re.match(r'^(\s*\d+\.\s+)', text)):
+									lastListIndent['ul5'] = len(match.group(1))
+									lines.append(f'{"    "*4}{text}')
+								elif re.match(r'^\[[\w\.]+\]', text):
+									lines.append('')
+									lines.append(f'{"    "*4}{text}')
+								elif text.startswith(' '):
+									lines.append('')
+									indent = lastListIndent.get('ul5', 4)
+									lines.append(f'{"    "*4}{" " * indent}{text.lstrip()}')
+								elif text.lstrip().startswith(('-', '*')):
+									lastListIndent['ul5'] = 4
+									lines.append(f'{"    "*4}- {text.lstrip("- *").strip()}')
+								elif 'bullet' in style or style.startswith('b'):
+									lastListIndent['ul5'] = 4
 								lines.append(f'{"    "*4}- {text}')
+								else:
+									lines.append('')
+									indent = lastListIndent.get('ul5', 4)
+									lines.append(f'{"    "*4}{" " * indent}{text.lstrip()}')
 
 						#	Table Caption
 						elif style in docConfig.tablecaption:
@@ -739,6 +984,11 @@ def processDocuments(documents:list[str],
 						#	Example
 						elif style in docConfig.example:
 							checkSameStyle(Style.example, lambda:lines.append(''))
+
+							# Special handling for references to ensure paragraph separation
+							if re.match(r'^\s*\[[\w\.]+\]', text):
+								lines.append('')
+
 							# Replace linebreaks
 							for _t in text.split(_linebreak):
 								lines.append(f'`{_t if _t else " "}`  ') # at least an empty space. And 2 spaces at the end for newline
@@ -883,12 +1133,71 @@ def processDocuments(documents:list[str],
 			progress.update(processTask, advance = 1)	# progress update
 			for i in range(len(lines)):
 				line = lines[i]
+				
+				# Remove empty formatting markers
 				line = line.replace('__', '')
 				line = line.replace('****', '')
 				line = line.replace('**  ', '** ')
 				line = line.replace('_  ', '_ ')
-				line = line.replace('** **', ' ')
-				#line = line.replace('  ', ' ')
+				
+				# Merge consecutive bold markers: **text1****text2** -> **text1 text2**
+				# This handles cases where formatting was split across runs
+				# Pattern: **text1****text2** (four asterisks between text)
+				line = re.sub(r'\*\*([^*]+)\*\*\*\*([^*]+)\*\*', r'**\1 \2**', line)
+				# Pattern: **text1**** (trailing four asterisks)
+				line = re.sub(r'\*\*([^*]+)\*\*\*\*(\s|$)', r'**\1**\2', line)
+				# Pattern: ****text1** (leading four asterisks)
+				line = re.sub(r'(\s|^)\*\*\*\*([^*]+)\*\*', r'\1**\2**', line)
+				# Pattern: **text** (trailing double asterisks not part of a pair)
+				# Remove trailing ** that's not part of a valid bold pair
+				line = re.sub(r'\*\*([^*\s]+)\*\*(\*\*)(\s|$)', r'**\1**\3', line)
+				
+				# Remove any remaining sequences of 4+ asterisks (shouldn't happen after merge, but safety)
+				line = re.sub(r'\*{4,}', '**', line)
+				
+				lines[i] = line
+
+			#
+			#	Post-process to convert bold URIs and API references to inline code
+			#	This catches any cases that weren't handled during initial parsing
+			#
+			progress.update(processTask, advance = 1)	# progress update
+			in_code_block = False
+			for i in range(len(lines)):
+				line = lines[i]
+				stripped = line.strip()
+
+				# Track code block state
+				if stripped.startswith('```'):
+					in_code_block = not in_code_block
+					continue
+				
+				# Skip processing if inside a code block
+				if in_code_block:
+					continue
+				
+				# Pattern 1: Bold URIs - **{apiRoot}/path/to/resource** -> `{apiRoot}/path/to/resource`
+				# Matches URIs with slashes and optional curly braces
+				line = re.sub(r'\*\*(\s*\{?[a-zA-Z0-9_]+\}?/[^*]+\})\*\*', r'`\1`', line)
+				line = re.sub(r'\*\*(\s*\{?[a-zA-Z0-9_]+\}?/[^*]+)\*\*', r'`\1`', line)
+				
+				# Pattern 2: API references like VnfInstances.Post.201
+				line = re.sub(r'\*\*([A-Z][a-zA-Z0-9]+\.(Post|Get|Put|Delete|Patch|Head|Options)\.\d+)\*\*', r'`\1`', line)
+				
+				# Pattern 3: API component names like VnfInstancesPostRequest
+				line = re.sub(r'\*\*([A-Z][a-zA-Z0-9]+(Post|Get|Put|Delete|Patch|Head|Options)(Request|Response))\*\*', r'`\1`', line)
+				
+				# Pattern 4: API references like VnfInstances.Post (without status code)
+				line = re.sub(r'\*\*([A-Z][a-zA-Z0-9]+\.(Post|Get|Put|Delete|Patch|Head|Options))\*\*', r'`\1`', line)
+				
+				# Pattern 5: Paths ending with ** (e.g., /path/to/resource**:)
+				# This handles cases where the colon is outside the bold markers
+				line = re.sub(r'(\S+)\*\*:', r'\1:', line)
+				
+				# Pattern 6: References with ** in the middle (e.g., VnfInstances**.Post.201)
+				line = re.sub(r'([A-Z][a-zA-Z0-9]+)\*\*\.', r'\1.', line)
+				line = re.sub(r'([A-Z][a-zA-Z0-9]+)\*\*([A-Z])', r'\1\2', line)
+				
 				lines[i] = line
 
 
-- 
GitLab


From 5c2c277163b98d79cb48b34b59ffb3a3c55b83fc Mon Sep 17 00:00:00 2001
From: Miguel Angel Reina Ortega <miguelangel.reinaortega@etsi.org>
Date: Thu, 18 Dec 2025 13:38:46 +0100
Subject: [PATCH 2/2] Fix indentation of bullet-style list appends (ul1-ul5)

---
 spec2md.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/spec2md.py b/spec2md.py
index 0d8da69..7d7536c 100644
--- a/spec2md.py
+++ b/spec2md.py
@@ -841,7 +841,7 @@ def processDocuments(documents:list[str],
 								elif 'bullet' in style or style.startswith('b'):
 									# Style implies bullet
 									lastListIndent['ul1'] = 4
-								lines.append(f'- {text}')
+									lines.append(f'- {text}')
 								else:
 									# Fallback: treat as continuation paragraph (e.g. List Paragraph without number/bullet)
 									lines.append('')
@@ -866,7 +866,7 @@ def processDocuments(documents:list[str],
 									lines.append(f'{"    "*1}- {text.lstrip("- *").strip()}')
 								elif 'bullet' in style or style.startswith('b'):
 									lastListIndent['ul2'] = 4
-								lines.append(f'{"    "*1}- {text}')
+									lines.append(f'{"    "*1}- {text}')
 								else:
 									lines.append('')
 									indent = lastListIndent.get('ul2', 4)
@@ -890,7 +890,7 @@ def processDocuments(documents:list[str],
 									lines.append(f'{"    "*2}- {text.lstrip("- *").strip()}')
 								elif 'bullet' in style or style.startswith('b'):
 									lastListIndent['ul3'] = 4
-								lines.append(f'{"    "*2}- {text}')
+									lines.append(f'{"    "*2}- {text}')
 								else:
 									lines.append('')
 									indent = lastListIndent.get('ul3', 4)
@@ -914,7 +914,7 @@ def processDocuments(documents:list[str],
 									lines.append(f'{"    "*3}- {text.lstrip("- *").strip()}')
 								elif 'bullet' in style or style.startswith('b'):
 									lastListIndent['ul4'] = 4
-								lines.append(f'{"    "*3}- {text}')
+									lines.append(f'{"    "*3}- {text}')
 								else:
 									lines.append('')
 									indent = lastListIndent.get('ul4', 4)
@@ -938,7 +938,7 @@ def processDocuments(documents:list[str],
 									lines.append(f'{"    "*4}- {text.lstrip("- *").strip()}')
 								elif 'bullet' in style or style.startswith('b'):
 									lastListIndent['ul5'] = 4
-								lines.append(f'{"    "*4}- {text}')
+									lines.append(f'{"    "*4}- {text}')
 								else:
 									lines.append('')
 									indent = lastListIndent.get('ul5', 4)
-- 
GitLab