URI and other corrections, mostly inspired from NFV docs (757d182c) · Commits · Centre for Testing and Interoperability / Markdown specifications development / spec2md

spec2md.py

+318 −9

Original line number	Diff line number	Diff line
		@@ -153,7 +153,8 @@ class DocumentConfiguration(object):
		self.skipUnreferencedMediaFiles = config.getboolean('general', 'skipUnreferencedMediaFiles', fallback = False)
		self.imageCaptions2AltText = config.getboolean('general', 'imageCaptions2AltText', fallback = True)
		self.combineCodeParagraphs = config.getboolean('general', 'combineCodeParagraphs', fallback = True)

		self.pandocTableWarnings = config.getboolean('general', 'pandocTableWarnings', fallback = True)
		self.escapeQuotations = config.getboolean('general', 'escapeQuotations', fallback = False)
		# Paragraphs
		self.paragraphs = { c : config.getlist('paragraphs', c) # type: ignore [attr-defined]
		for c in config['paragraphs'] }
		@@ -248,6 +249,7 @@ def processDocuments(documents:list[str],
		headers:list[Tuple[int, str]] = []
		emfFiles:list[str] = []
		referencedImages:list[str] = []
		lastListIndent:Dict[str, int] = {}
		footnotes:dict[str, str] = {}

		global _print
		@@ -356,6 +358,50 @@ def processDocuments(documents:list[str],
		)
		newParagraphs = 0

		def _isURILike(text:str) -> bool:
			""" Heuristically decide whether `text` looks like a URI or a code-like path.

				Args:
					text: The text to inspect.

				Return:
					True when the text contains both an opening and a closing curly brace,
					or when it contains a slash together with at least one of the indicator
					substrings (compared case-insensitively). Empty text yields False.
			"""
			if not text:
				return False

			# A '{' and a '}' anywhere in the text are sufficient (their order is not checked)
			if '{' in text and '}' in text:
				return True

			# Without curly braces a slash is mandatory
			if '/' not in text:
				return False

			# NOTE(review): '/' is itself one of the indicators, so once a slash is present
			# this check always succeeds. Confirm whether '/', '{' and '}' should really be
			# part of the keyword list.
			lowered = text.lower()
			indicators = ('/', '{', '}', 'apiRoot', 'apiMajorVersion', 'http', '://')
			return any(marker.lower() in lowered for marker in indicators)

		def _isAPIReference(text:str) -> bool:
			""" Check whether `text` is an API reference name.

				Surrounding whitespace is ignored. Three shapes are recognised:

				- `Component.Method.StatusCode`, e.g. `VnfInstances.Post.201`
				- `ComponentMethodRequest` / `ComponentMethodResponse`, e.g. `VnfInstancesPostRequest`
				- `Component.Method`, e.g. `VnfInstances.Post`

				The component must start with an upper-case ASCII letter, and the method must be
				one of Post, Get, Put, Delete, Patch, Head or Options (case-sensitive).

				Args:
					text: The text to inspect.

				Return:
					True if the whole stripped text matches one of the shapes above, False otherwise
					(including for empty text).
			"""
			if not text:
				return False

			candidate = text.strip()

			# The alternation must use an unescaped '|': an escaped '\|' would only match a
			# literal pipe character and none of the method names would ever be recognised.
			methods = r'(?:Post|Get|Put|Delete|Patch|Head|Options)'
			component = r'[A-Z][a-zA-Z0-9]+'
			patterns = (
				rf'{component}\.{methods}\.\d+',					# VnfInstances.Post.201
				rf'{component}{methods}(?:Request|Response)',		# VnfInstancesPostRequest
				rf'{component}\.{methods}',							# VnfInstances.Post
			)
			# fullmatch anchors both ends, so partial matches inside longer text are rejected
			return any(re.fullmatch(pattern, candidate) is not None for pattern in patterns)

		def _getRunFormatting(runElement:ET.Element) -> Tuple[bool, bool]:
			""" Determine whether a run element is marked as bold and/or italic.

				Only direct `rPr` children of the run are inspected. Inside them a `b` element
				switches bold on and an `i` element switches italic on, provided the element's
				`_val` attribute is either absent or equal to the string 'true'.

				Args:
					runElement: The run element whose properties are examined.

				Return:
					Tuple (bold, italic).
			"""
			isBold = False
			isItalic = False

			for properties in runElement:
				# Skip everything that is not a run-properties container
				if strippedTag(properties.tag) != 'rPr':
					continue

				for prop in properties:
					name = strippedTag(prop.tag)
					if name not in ('b', 'i'):
						continue
					# A missing value attribute counts as 'true'
					if prop.attrib.get(_val, 'true') != 'true':
						continue
					if name == 'b':
						isBold = True
					else:
						isItalic = True

			return (isBold, isItalic)

		def _parseXML(element:ET.Element, inCell:Optional[bool] = False) -> str:
		""" Recursively parse a document paragraph.
		"""
		@@ -365,12 +411,101 @@ def processDocuments(documents:list[str],
		tag = strippedTag(element.tag) # remove namespaces for easier handlings
		match tag:
		case 'p':
		for x in element:
		_result += _parseXML(x, inCell)
		# Collect all runs first to merge consecutive runs with same formatting
		current_run_text = ''
		current_bold = False
		current_italic = False

		def _outputCurrentRun() -> str:
		""" Output the current accumulated run with proper formatting.
		"""
		nonlocal current_run_text, current_bold, current_italic
		if not current_run_text:
		return ''

		# Determine if this looks like a URI or API reference
		is_uri = _isURILike(current_run_text)
		is_api_ref = _isAPIReference(current_run_text)

		# Process text
		_s = str(toMD(current_run_text))
		# Don't strip whitespace for URIs or if not bold/italic
		if not is_uri and (current_bold or current_italic):
		_s = _s.strip()

		# For URIs and API references that are bold, use inline code instead
		if (is_uri or is_api_ref) and current_bold:
		# Use inline code instead of bold for URIs and API references
		# Replace single * or _ (but not needed in code blocks)
		_s = _s.replace('`', '\\`') # Escape backticks
		result = f'`{_s}` '
		else:
		# Replace single * or _
		_s = _s.replace('_', '\\_')
		_s = _s.replace('', '\\')
		if docConfig.escapeQuotations:
		_s = _s.replace('"', '\\"') # Escape double quotes

		# Apply formatting
		_bold_marker = '**' if current_bold else ''
		_italic_marker = '_' if current_italic else ''

		# Add trailing space only if formatting is applied and not URI-like
		_postfix = ' ' if (current_bold or current_italic) and not is_uri else ''
		result = f'{_bold_marker}{_italic_marker}{_s}{_italic_marker}{_bold_marker}{_postfix}'

		# Reset current run
		current_run_text = ''
		current_bold = False
		current_italic = False

		return result

		for child in element:
		child_tag = strippedTag(child.tag)
		if child_tag == 'r':
		# Get formatting for this run
		run_bold, run_italic = _getRunFormatting(child)
		# Extract text from this run
		run_text = ''
		for text_elem in child:
		if strippedTag(text_elem.tag) == 't':
		run_text += str(text_elem.text) if text_elem.text else ''
		elif strippedTag(text_elem.tag) == 'br':
		run_text += _linebreak
		elif strippedTag(text_elem.tag) == 'tab':
		run_text += ' '
		# Handle other inline elements recursively
		else:
		run_text += _parseXML(text_elem, inCell)

		# Merge with previous run if formatting matches
		if run_text and (run_bold == current_bold and run_italic == current_italic and current_run_text):
		current_run_text += run_text
		else:
		# Output previous run if any
		if current_run_text:
		_result += _outputCurrentRun()
		# Start new run
		current_run_text = run_text
		current_bold = run_bold
		current_italic = run_italic
		else:
		# Non-run element - output current run first, then process the element
		if current_run_text:
		_result += _outputCurrentRun()
		_result += _parseXML(child, inCell)

		# Output last run if any
		if current_run_text:
		_result += _outputCurrentRun()

		case 'r':
		# For runs not in paragraphs (shouldn't happen normally, but handle gracefully)
		for x in element:
		_result += _parseXML(x, inCell)
		case 't':
		# Fallback for text elements not processed in paragraph merge
		_bold = ''
		_italics = ''
		for e in element.getparent():
		@@ -384,13 +519,18 @@ def processDocuments(documents:list[str],
		# case _:
		# _print(f'[yellow]unsupported style: {ep.tag}')

		# Strip white spaces if bold or italics
		_s = str(toMD(str(element.text))).strip() if _bold or _italics else str(toMD(str(element.text)))
		_text = str(toMD(str(element.text))) if element.text else ''
		is_uri = _isURILike(_text)

		# Strip white spaces if bold or italics, but not for URIs
		_s = _text.strip() if (_bold or _italics) and not is_uri else _text
		# Replace single * or _
		_s = _s.replace('_', '\\_')
		_s = _s.replace('', '\\')
		# Add trailing white space when bold or italics
		_postfix = ' ' if _bold or _italics else ''
		if docConfig.escapeQuotations:
		_s = _s.replace('"', '\\"') # Escape double quotes
		# Add trailing white space when bold or italics, but not for URIs
		_postfix = ' ' if (_bold or _italics) and not is_uri else ''
		_result += f'{_bold}{_italics}{_s}{_italics}{_bold}{_postfix}'
		# print(_result)

		@@ -681,23 +821,128 @@ def processDocuments(documents:list[str],
		elif style in docConfig.ul1:
		checkSameStyle(Style.unorderedlist, lambda:lines.append(''))
		if len(elem.text): # ignore empty
		if (match := re.match(r'^(\s*\d+\.\s+)', text)):
		# Numbered item: capture indentation length
		lastListIndent['ul1'] = len(match.group(1))
		lines.append(text)
		elif re.match(r'^\[[\w\.]+\]', text):
		# Reference
		lines.append('')
		lines.append(text)
		elif text.startswith(' '):
		# Continuation: use captured indentation
		lines.append('')
		indent = lastListIndent.get('ul1', 4)
		lines.append(' ' * indent + text.lstrip())
		elif text.lstrip().startswith(('-', '*')):
		# Explicit bullet in text
		lastListIndent['ul1'] = 4
		lines.append(f'- {text.lstrip("- *").strip()}')
		elif 'bullet' in style or style.startswith('b'):
		# Style implies bullet
		lastListIndent['ul1'] = 4
		lines.append(f'- {text}')
		else:
		# Fallback: treat as continuation paragraph (e.g. List Paragraph without number/bullet)
		lines.append('')
		indent = lastListIndent.get('ul1', 4)
		lines.append(' ' * indent + text.lstrip())

		elif style in docConfig.ul2:
		checkSameStyle(Style.unorderedlist2, lambda:lines.append(''))
		if len(elem.text): # ignore empty
		if (match := re.match(r'^(\s*\d+\.\s+)', text)):
		lastListIndent['ul2'] = len(match.group(1))
		lines.append(f'{" "*1}{text}')
		elif re.match(r'^\[[\w\.]+\]', text):
		lines.append('')
		lines.append(f'{" "*1}{text}')
		elif text.startswith(' '):
		lines.append('')
		indent = lastListIndent.get('ul2', 4)
		lines.append(f'{" "1}{" " indent}{text.lstrip()}')
		elif text.lstrip().startswith(('-', '*')):
		lastListIndent['ul2'] = 4
		lines.append(f'{" "1}- {text.lstrip("- ").strip()}')
		elif 'bullet' in style or style.startswith('b'):
		lastListIndent['ul2'] = 4
		lines.append(f'{" "*1}- {text}')
		else:
		lines.append('')
		indent = lastListIndent.get('ul2', 4)
		lines.append(f'{" "1}{" " indent}{text.lstrip()}')

		elif style in docConfig.ul3:
		checkSameStyle(Style.unorderedlist3, lambda:lines.append(''))
		if len(elem.text): # ignore empty
		if (match := re.match(r'^(\s*\d+\.\s+)', text)):
		lastListIndent['ul3'] = len(match.group(1))
		lines.append(f'{" "*2}{text}')
		elif re.match(r'^\[[\w\.]+\]', text):
		lines.append('')
		lines.append(f'{" "*2}{text}')
		elif text.startswith(' '):
		lines.append('')
		indent = lastListIndent.get('ul3', 4)
		lines.append(f'{" "2}{" " indent}{text.lstrip()}')
		elif text.lstrip().startswith(('-', '*')):
		lastListIndent['ul3'] = 4
		lines.append(f'{" "2}- {text.lstrip("- ").strip()}')
		elif 'bullet' in style or style.startswith('b'):
		lastListIndent['ul3'] = 4
		lines.append(f'{" "*2}- {text}')
		else:
		lines.append('')
		indent = lastListIndent.get('ul3', 4)
		lines.append(f'{" "2}{" " indent}{text.lstrip()}')

		elif style in docConfig.ul4:
		checkSameStyle(Style.unorderedlist4, lambda:lines.append(''))
		if len(elem.text): # ignore empty
		if (match := re.match(r'^(\s*\d+\.\s+)', text)):
		lastListIndent['ul4'] = len(match.group(1))
		lines.append(f'{" "*3}{text}')
		elif re.match(r'^\[[\w\.]+\]', text):
		lines.append('')
		lines.append(f'{" "*3}{text}')
		elif text.startswith(' '):
		lines.append('')
		indent = lastListIndent.get('ul4', 4)
		lines.append(f'{" "3}{" " indent}{text.lstrip()}')
		elif text.lstrip().startswith(('-', '*')):
		lastListIndent['ul4'] = 4
		lines.append(f'{" "3}- {text.lstrip("- ").strip()}')
		elif 'bullet' in style or style.startswith('b'):
		lastListIndent['ul4'] = 4
		lines.append(f'{" "*3}- {text}')
		else:
		lines.append('')
		indent = lastListIndent.get('ul4', 4)
		lines.append(f'{" "3}{" " indent}{text.lstrip()}')

		elif style in docConfig.ul5:
		checkSameStyle(Style.unorderedlist5, lambda:lines.append(''))
		if len(elem.text): # ignore empty
		if (match := re.match(r'^(\s*\d+\.\s+)', text)):
		lastListIndent['ul5'] = len(match.group(1))
		lines.append(f'{" "*4}{text}')
		elif re.match(r'^\[[\w\.]+\]', text):
		lines.append('')
		lines.append(f'{" "*4}{text}')
		elif text.startswith(' '):
		lines.append('')
		indent = lastListIndent.get('ul5', 4)
		lines.append(f'{" "4}{" " indent}{text.lstrip()}')
		elif text.lstrip().startswith(('-', '*')):
		lastListIndent['ul5'] = 4
		lines.append(f'{" "4}- {text.lstrip("- ").strip()}')
		elif 'bullet' in style or style.startswith('b'):
		lastListIndent['ul5'] = 4
		lines.append(f'{" "*4}- {text}')
		else:
		lines.append('')
		indent = lastListIndent.get('ul5', 4)
		lines.append(f'{" "4}{" " indent}{text.lstrip()}')

		# Table Caption
		elif style in docConfig.tablecaption:
		@@ -739,6 +984,11 @@ def processDocuments(documents:list[str],
		# Example
		elif style in docConfig.example:
		checkSameStyle(Style.example, lambda:lines.append(''))

		# Special handling for references to ensure paragraph separation
		if re.match(r'^\s*\[[\w\.]+\]', text):
		lines.append('')

		# Replace linebreaks
		for _t in text.split(_linebreak):
		lines.append(f'`{_t if _t else " "}` ') # at least an empty space. And 2 spaces at the end for newline
		@@ -883,12 +1133,71 @@ def processDocuments(documents:list[str],
		progress.update(processTask, advance = 1) # progress update
		for i in range(len(lines)):
		line = lines[i]

		# Remove empty formatting markers
		line = line.replace('__', '')
		line = line.replace('****', '')
		line = line.replace(' ', ' ')
		line = line.replace('_ ', '_ ')
		line = line.replace(' ', ' ')
		#line = line.replace(' ', ' ')

		# Merge consecutive bold markers: text1text2 -> text1 text2
		# This handles cases where formatting was split across runs
		# Pattern: text1text2 (four asterisks between text)
		line = re.sub(r'\\([^]+)\\\\([^]+)\\', r'\1 \2', line)
		# Pattern: text1** (trailing four asterisks)
		line = re.sub(r'\\([^]+)\\\\(\s\|$)', r'\1*\2', line)
		# Pattern: **text1 (leading four asterisks)
		line = re.sub(r'(\s\|^)\\\\([^]+)\\', r'\1\2*', line)
		# Pattern: text (trailing double asterisks not part of a pair)
		# Remove trailing ** that's not part of a valid bold pair
		line = re.sub(r'\\([^\s]+)\\(\\)(\s\|$)', r'\1*\3', line)

		# Remove any remaining sequences of 4+ asterisks (shouldn't happen after merge, but safety)
		line = re.sub(r'\{4,}', '*', line)

		lines[i] = line

		#
		# Post-process to convert bold URIs and API references to inline code
		# This catches any cases that weren't handled during initial parsing
		#
		progress.update(processTask, advance = 1) # progress update
		in_code_block = False
		for i in range(len(lines)):
		line = lines[i]
		stripped = line.strip()

		# Track code block state
		if stripped.startswith('```'):
		in_code_block = not in_code_block
		continue

		# Skip processing if inside a code block
		if in_code_block:
		continue

		# Pattern 1: Bold URIs - {apiRoot}/path/to/resource -> `{apiRoot}/path/to/resource`
		# Matches URIs with slashes and optional curly braces
		line = re.sub(r'\\(\s\{?[a-zA-Z0-9_]+\}?/[^]+\})\\', r'`\1`', line)
		line = re.sub(r'\\(\s\{?[a-zA-Z0-9_]+\}?/[^]+)\\', r'`\1`', line)

		# Pattern 2: API references like VnfInstances.Post.201
		line = re.sub(r'\\([A-Z][a-zA-Z0-9]+\.(Post\|Get\|Put\|Delete\|Patch\|Head\|Options)\.\d+)\\', r'`\1`', line)

		# Pattern 3: API component names like VnfInstancesPostRequest
		line = re.sub(r'\\([A-Z][a-zA-Z0-9]+(Post\|Get\|Put\|Delete\|Patch\|Head\|Options)(Request\|Response))\\', r'`\1`', line)

		# Pattern 4: API references like VnfInstances.Post (without status code)
		line = re.sub(r'\\([A-Z][a-zA-Z0-9]+\.(Post\|Get\|Put\|Delete\|Patch\|Head\|Options))\\', r'`\1`', line)

		# Pattern 5: Paths ending with (e.g., /path/to/resource:)
		# This handles cases where the colon is outside the bold markers
		line = re.sub(r'(\S+)\\:', r'\1:', line)

		# Pattern 6: References with in the middle (e.g., VnfInstances.Post.201)
		line = re.sub(r'([A-Z][a-zA-Z0-9]+)\\\.', r'\1.', line)
		line = re.sub(r'([A-Z][a-zA-Z0-9]+)\\([A-Z])', r'\1\2', line)

		lines[i] = line

Admin message