updated to work with pandoc chunked HTML option (Spezzatino) (96ab91ee) · Commits · CIM - Context Information Management / NGSI-LD API

filter_1.lua

+21 −3

Original line number	Diff line number	Diff line
		--THIS LUA FILTER MUST BE APPLIED TO PANDOC IN THE CORRECT ORDER


		--global table for reference links
		references = {}
		annex_headers = {}
		@@ -145,6 +146,13 @@ if FORMAT:match 'html' then
		-- fix annex headers
		if el.level == 8 then
		annex_headers[el.content[3].text] = "#"..el.attr.identifier -- annex letter / link correspondance
		el.level = 1
		return el
		end
		-- fix header level of sub-clauses of annexes
		if el.content[1].t == "Str" and el.content[1].text:find("^%u%.%d") then
		el.level = el.level + 1
		return el
		end
		--search for style
		return Style(el)
		@@ -152,6 +160,13 @@ if FORMAT:match 'html' then


		function Image(el)
		-- retrieve aspect ratio of image
		local width = tonumber(el.attr.attributes.width:sub(1,-3))
		local height = tonumber(el.attr.attributes.height:sub(1,-3))
		local ratio = height/width
		-- set height
		el.attr = { style = "width: 100%; height: calc(100%*"..ratio..");"}

		local filePath, extension = el.src:match("(.)%.(.)$") -- image.png, jpeg or emf
		--fixes extensions
		if extension == "emf" then
		@@ -160,10 +175,13 @@ if FORMAT:match 'html' then
		end
		--adds yellow overlay on top of the image
		if extension == "png" then
		local overlay = pandoc.Span({}, { style = "position: absolute; top: 0; right: 0; bottom: 0; left: 0; background-color: rgba(255, 255, 0, 0.5); pointer-events: none; z-index: 1;", class = "image_overlay"})
		el.attr = { style = "width: 100%; height: auto; position: relative;"}
		local overlay = pandoc.Span({}, {
		style = "position: absolute; top: 0; right: 0; bottom: 0; left: 0; background-color: rgba(255, 255, 0, 0.5); pointer-events: none; z-index: 1;",
		class = "image_overlay"
		})
		return pandoc.Span({el, overlay}, { style = "position: relative; display: inline-flex;" })
		end
		return el;
		end

		function Pandoc(el)

filter_2.lua

+1 −1

Original line number	Diff line number	Diff line
		@@ -238,7 +238,7 @@ if FORMAT:match 'html' then

		function Emph(el)
		el = Linking(el)
		return el
		return pandoc.RawInline('html', '<i>' .. pandoc.utils.stringify(el) .. '</i>')
		end

		function Underline(el)

html_to_docx.py

+38 −28

Original line number	Diff line number	Diff line
		import os
		import re
		import sys
		from dataclasses import dataclass
		@@ -9,7 +10,7 @@ from docx.api import Document
		from docx.enum.text import WD_ALIGN_PARAGRAPH
		from docx.shared import Emu, Pt, RGBColor

		DEBUG = False #print debug info switch
		DEBUG = True #print debug info switch


		@dataclass
		@@ -199,7 +200,7 @@ def insert_missing_space(para, inline_text):
		return ' '+inline_text



		#use new document to try word diff
		doc = Document("ETSI_GS_skeleton.docx")

		cssrules = {} #collection of parsed css rules as Style objects
		@@ -218,6 +219,7 @@ list_level = 0 # used to resolve multiple nested lists
		def handle_tag(HTML_tag : bs4.element.Tag):
		global para, run, cell, style, list_level, para_style, prev_inline, curr_inline

		##THIS HANDLES TEXT LITERALS
		if isinstance(HTML_tag, bs4.element.NavigableString):
		if DEBUG:
		print(HTML_tag)
		@@ -239,16 +241,16 @@ def handle_tag(HTML_tag : bs4.element.Tag):
		return

		curr_inline = None
		#resolve blockquotes
		##THIS HANDLES BLOCKQUOTES
		if HTML_tag.name == "blockquote":
		for child in HTML_tag.children:
		handle_tag(child)

		#resolve table
		##THIS HANDLES TABLES
		elif HTML_tag.name == "table":
		handle_table(HTML_tag)

		#resolve header
		##THIS HANDLES HEADERS
		elif header := re.match(r"h(\d)", HTML_tag.name):
		if HTML_tag.string:
		heading = doc.add_paragraph(style= "Heading"+header.group(1))
		@@ -262,7 +264,7 @@ def handle_tag(HTML_tag : bs4.element.Tag):
		if DEBUG:
		print("ERROR: header string is not available", list(HTML_tag.children))

		#resolve lists
		##THIS HANDLES LISTS
		elif HTML_tag.name in ["ol", "ul"]:
		list_level += 1 #start of the list

		@@ -276,7 +278,7 @@ def handle_tag(HTML_tag : bs4.element.Tag):
		list_level -= 1 #end of the list


		#start a new paragraph
		##THIS HANDLES DIVS AND PARAGRAPHS (BLOCKS)
		elif HTML_tag.name in blocks:

		#reset default
		@@ -337,7 +339,7 @@ def handle_tag(HTML_tag : bs4.element.Tag):
		handle_tag(child)


		#handle links
		##THIS HANDLES LINKS
		elif HTML_tag.name == "a":
		old_run = None
		if run: #we wont write here, but we want to save the style info, if present
		@@ -367,7 +369,7 @@ def handle_tag(HTML_tag : bs4.element.Tag):



		# add a new run
		##THIS HANDLES INLINES
		elif HTML_tag.name in inlines:
		curr_inline = HTML_tag

		@@ -424,7 +426,8 @@ def handle_tag(HTML_tag : bs4.element.Tag):
		def handle_table(table: bs4.element.Tag):
		global cell, para

		col_widths = [int(col.attrs["style"][-3:-1]) for col in table.find_all(lambda tag: tag.name == "col")] #list of each column width in percentages
		#each col tag has a style attribute with something like "width: xx%"; this extracts the xx as an int
		col_widths = [int(re.search(r'(\d+)%',col.attrs["style"]).group(1)) for col in table.find_all(lambda tag: tag.name == "col")] #list of each column width in percentages
		rows = [row for row in table.find_all(lambda tag: tag.name == "tr")]

		#create an empty table in docx document
		@@ -480,17 +483,25 @@ def handle_table(table: bs4.element.Tag):

		if __name__ == "__main__":

		if len(sys.argv) < 2 or len(sys.argv) > 3:
		print("Usage: html_to_docx.py file.html [style.css]")
		if len(sys.argv) != 2:
		print("Usage: html_to_docx.py <Diretory with html and css files>")
		sys.exit(1)

		# Define the directory path
		dir_path = sys.argv[1]
		directory = os.scandir(dir_path)

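		# Get list of files (htmls are sorted based on their leading number; index.html is excluded)
		# NOTE(review): `directory` is a one-shot os.scandir iterator; the html_files comprehension consumes it, so the css_files comprehension below appears to see no entries - confirm, and materialise the entries (e.g. into a list) before filtering twice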
		html_files = sorted([entry.path for entry in directory if entry.is_file() and entry.name.endswith('.html') if entry.name != "index.html" ], key = lambda x: int(x.split("/")[-1].split("-")[0]))
		css_files = [entry.path for entry in directory if entry.is_file() and entry.name.endswith('.css')]

		if len(sys.argv) == 3:
		#use the first css file found (this should by default be API.css); NOTE(review): css_files is never sorted, so alphabetical order is not guaranteed
		if css_files:
		#keeps long color definition
		cssutils.ser.prefs.minimizeColorHash = False
		#parse css
		parser = cssutils.CSSParser()
		sheet = parser.parseFile(sys.argv[2])
		sheet = parser.parseFile(css_files[0])
		for rule in sheet:
		if DEBUG:
		print(rule.selectorText)
		@@ -527,18 +538,17 @@ if __name__ == "__main__":

		cssrules[rule.selectorText[1:]] = _style # remove ".' form selector and uses it as key to add style to our rules dictionary

		#parse html
		with open(sys.argv[1], "r") as file:
		#parse htmls
		for html_file in html_files:
		if DEBUG:
		print(f"Parsing: {html_file}")
		with open(html_file, "r") as file:
		html_content = file.read()
		soup = bs4.BeautifulSoup(html_content, "html.parser")

		skip = True

		#construct doc
		for tag in soup.body.children:
		#skip to executive summary, everything before that is already in the skeleton
		if tag.string == "Executive summary":
		skip = False
		if not skip:
		handle_tag(tag)

template.html

+75 −2

Original line number	Diff line number	Diff line
		@@ -32,12 +32,84 @@ $if(mathjax)$
		$endif$
		$math$
		$endif$
		<script src="dist/bundle.js" defer></script>
		<!--[if lt IE 9]>
		<script src="//cdnjs.cloudflare.com/ajax/libs/html5shiv/3.7.3/html5shiv-printshiv.min.js"></script>
		<![endif]-->
		</head>
		<body>
		<button style="position: fixed; bottom: 10px; right: 10px;" onclick="toggleStyles()">Toggle Background highlighting</button>
		<button style="position: fixed; bottom: 50px; right: 10px;" id="download_btn">Download HTML</button>
		<label class="switch">Editing
		<input type="checkbox" id="editing">
		<span class="slider round"></span>
		</label>
		<style>
		/* Styles for the "Editing" toggle: a hidden checkbox drives a slider drawn with CSS. */

		/* The switch - the box around the slider, pinned to the bottom-right corner of the viewport */
		.switch {
		position: fixed;
		bottom: 90px;
		right: 10px;
		display: inline-block;
		width: 120px;
		height: 34px;
		}

		/* Hide default HTML checkbox (made invisible and zero-sized; the :checked and :focus rules below still match it) */
		.switch input {
		opacity: 0;
		width: 0;
		height: 0;
		}

		/* The slider track; it starts 60px from the left edge of .switch (presumably to leave room for the label text - confirm) */
		.slider {
		position: absolute;
		cursor: pointer;
		top: 0;
		left: 60px;
		right: 0;
		bottom: 0;
		background-color: #ccc;
		-webkit-transition: .4s;
		transition: .4s;
		}

		/* The knob: a 26px white box inset 4px from the left and bottom of the track */
		.slider:before {
		position: absolute;
		content: "";
		height: 26px;
		width: 26px;
		left: 4px;
		bottom: 4px;
		background-color: white;
		-webkit-transition: .4s;
		transition: .4s;
		}

		/* Checked state: the track turns blue */
		input:checked+.slider {
		background-color: #2196F3;
		}

		/* Focus state: a thin blue shadow around the track */
		input:focus+.slider {
		box-shadow: 0 0 1px #2196F3;
		}

		/* Checked state: the knob moves 26px to the right */
		input:checked+.slider:before {
		-webkit-transform: translateX(26px);
		-ms-transform: translateX(26px);
		transform: translateX(26px);
		}

		/* Rounded sliders: pill-shaped track */
		.slider.round {
		border-radius: 34px;
		}

		/* Rounded sliders: circular knob */
		.slider.round:before {
		border-radius: 50%;
		}
		</style>

		<script>
		let styleToggle = false;
		@@ -78,17 +150,18 @@ $abstract$
		$endif$
		</header>
		$endif$
		$if(toc)$
		<nav id="$idprefix$TOC" role="doc-toc">
		$if(toc-title)$
		<h2 id="$idprefix$toc-title">$toc-title$</h2>
		$endif$
		<ul><li><h1><a href="0-.html">ETSI title</a></h1></li></ul>
		$table-of-contents$
		</nav>
		$endif$
		<div id="editor">
		$body$
		$for(include-after)$
		$include-after$
		$endfor$
		</div>
		</body>
		</html>