Enhance reference handling and table formatting in Lua filters (4ec587dd) · Commits · CIM - Context Information Management / NGSI-LD API

API.docx

−30 KiB (8.22 MiB)

File changed.

No diff preview for this file type.

README.md

+66 −2

Original line number	Diff line number	Diff line
		# Requiements
		# Requirements
		---

		# Command line tools
		## Command line tools
		- Pandoc
		- libreoffice (command line)
		- mogrify / imagemagick
		@@ -15,6 +16,7 @@
		Check the requirements.txt file for the needed packages.

		# Generate html from docx
		---
		Move to the right folder:
		> `cd docx_to_html`

		@@ -243,3 +245,65 @@ This function is called while handling a paragraph (\<p> or \<div> elements) whe
		---
		Invoke postprocessing, and create the final docx, named html_to_docx_output_fixed.docx:
		> `postprocessing.py html_to_docx_output.docx ./media`


		# Converting Between HTML and Markdown
		---

		The script `convert.py` handles conversion between Markdown and HTML.

		## Requirements

		#### [Pandoc](https://pandoc.org/installing.html)
		Version 3.7.0.2

		## Description
		Converts HTML to Markdown and Markdown to HTML using Pandoc. It relies heavily on BeautifulSoup4 to preprocess the input HTML and to perform some postprocessing on the generated HTML.

		## Usage
		`convert.py -h` - List the script's arguments

		### Arguments

		#### Required
		The script will not run without being provided with these arguments.

		- `--frm [file type]` - Either `html`, `md`, or `markdown`. Tells the script which file extensions to look for in the source folder. `md` and `markdown` both correspond to the `.md` extension. It must be different from the destination file type provided with `--to`.

		- `--to` - Similar to `--frm`, either `md`, `markdown`, or `html`. Tells the script which file extension, either `.md` or `.html`, to assign to newly created files. It must be different from the source file type provided with `--frm`.

		#### Optional
		##### Directories
		The script will assign default values based on the destination filetype provided with `--to` if these arguments are omitted.

		- `--src [path/to/source/directory]` - The path to the directory containing the files to convert.

		Defaults:
		- Destination type is __Markdown__: `./API`
		- Destination type is __HTML__: `./API-md`

		- `--dest [path/to/destination/directory]` - The path to the directory that will contain the converted files. If it does not exist, it will be created.

		Defaults:
		- Destination type is __Markdown__: `./API-md`
		- Destination type is __HTML__: `./API-html`

		##### Markdown Cleanup
		If source HTML is poorly formatted, run additional preprocessing by passing the `--cleanup` flag.

		Ex. `convert.py --frm html --to md --cleanup`

		Passing this flag with any other destination type displays a warning and has no other effect.

		### Example Usage
		- `convert.py --src ./API --frm html --dest ./API-markdown --to md`

		Converts HTML files contained in the API directory (the destination directory for documents converted from docx to HTML) to Markdown and places them in the API-markdown directory, creating the directory if necessary.

		NOTE: Equivalent to `convert.py --frm html --to md`

		- `convert.py --src ./API-markdown --frm md --dest ./API-html --to html`

		Converts the Markdown files contained in ./API-markdown to HTML and places them in the API-html directory, creating the directory if necessary.

		NOTE: Equivalent to `convert.py --frm md --to html`
		No newline at end of file

convert.py

0 → 100644

+1663 −0

File added.

Preview size limit exceeded, changes collapsed.

filter_1.lua

+34 −2

Original line number	Diff line number	Diff line
		@@ -88,11 +88,43 @@ end
		--helper function to generate references
		function Reference(el)
		local text = pandoc.utils.stringify(el)
		for prefix, number in text:gmatch("%[(i?%.?)(%d+)%]") do
		for prefix, number in text:gmatch("%[?%s*(i?%.?)(%d+)%]") do
		if number then
		local key = (prefix or "")..number -- just the number or the prefix + number if prefix is not null
		if text:sub(2, #key+1) == key then --this is a reference in 2.1 or 2.2
		references[key] = "#"..key --we save the reerence in the global table

		-- Pandoc separates the opening bracket from the rest
		-- of the reference tag due to the hyperlink in the docx,
		-- this is to fix that
		if el.content[1] and el.content[1].content and el.content[1].content[1] and el.content[1].content[1].text == "[" then -- The separated bracket
		if el.content[2] and el.content[2].content and el.content[2].content[1] and el.content[2].content[1].text then
		el.content[2].content[1].text = "[" .. el.content[2].content[1].text
		table.remove(el.content, 1)
		end
		end
		-- Reference i.18 still needs to be fixed because its bracketed portion is split up like this: [i. , 18]
		if key == "i.18" then
		if el.content[2] and el.content[2].content and el.content[2].content[1] and el.content[2].content[1].text then
		el.content[2].content[1].text = "[i." .. el.content[2].content[1].text
		table.remove(el.content, 1)
		end
		end
		----

		-- Some references have portions of text attached to the element with the tag,
		-- so they must be separated
		if el.content[1] and el.content[1].content and el.content[1].content[1] and el.content[1].content[1].text then
		local ideal_length = string.len("[" .. prefix .. number .. "]\t")
		local actual_length = string.len(el.content[1].content[1].text)
		if ideal_length ~= actual_length then
		local text_to_split = el.content[1].content[1].text
		table.remove(el.content, 1)
		table.insert(el.content, 1, pandoc.Plain({pandoc.Str(text_to_split:sub(ideal_length, actual_length))}))
		table.insert(el.content, 1, pandoc.Plain({pandoc.Str(text_to_split:sub(1, ideal_length))}))
		end
		end
		----
		references[key] = "#"..key --we save the reference in the global table
		return pandoc.Div(el.content, {id = key, class = el.attr["classes"][1]})
		end
		end

filter_2.lua

+22 −2

Original line number	Diff line number	Diff line
		@@ -265,6 +265,19 @@ if FORMAT:match 'html' then
		secondDiv.classes = pandoc.List({})
		--set the content of the original div to be the two new divs
		el.content = pandoc.List({firstDiv, secondDiv})
		else
		-- This is a reference
		-- Organize into two columns - one for the tag and one for the citation
		if el.content[1] and el.content[1].t == "Span" and el.content[1].content[2] and el.content[1].content[2].t == "Link" and el.content[1].content[2].content[1] and el.content[1].content[2].content[1].text:find("(%[?%s*(i?%.?%d+)%])") then
		local reference_tag = el.content[1]
		local everything_else = {}

		for i = 2, #el.content do
		table.insert(everything_else, el.content[i])
		end

		el.content = pandoc.List({reference_tag, pandoc.Span(everything_else)})
		end
		end
		end
		if el.classes[1] == "NO" or el.classes[1] == "TAN" then
		@@ -333,9 +346,16 @@ if FORMAT:match 'html' then

		-- Str filter: searches the string's text for a bracketed reference tag and,
		-- when the captured key is present in the `references` table, returns a Span
		-- made of the text before the tag, a Link for the tag, and the text after it.
		-- Nothing is returned when no tag matches or the key is not in `references`.
		-- NOTE(review): this listing is a rendered diff, so the two `find` lines and
		-- the two `return` statements below look like the before/after versions of
		-- the same statement rather than code that runs together -- confirm against
		-- the actual file.
		function Str(el)
		--substitute reference with link
		-- Pattern variant that requires the opening "[" directly before the key.
		local startIndex, endIndex, reference, key = el.text:find("(%[(i?%.?%d+)%])")
		-- Pattern variant where the opening "[" is optional and may be followed by
		-- whitespace; `reference` is the whole match, `key` is the inner capture.
		local startIndex, endIndex, reference, key = el.text:find("(%[?%s*(i?%.?%d+)%])")
		if reference and references[key] then --reference found
		return pandoc.Span({pandoc.Str(el.text:sub(1,startIndex-1)), pandoc.Link(reference, references[key]), pandoc.Str(el.text:sub(endIndex+1))}) --start of the string + Link + last bit of string
		-- The match may start without "[" (the pattern makes it optional), so the
		-- link text is normalised to always begin with one.
		if reference and reference:sub(1,1) ~= "[" then -- Add missing opening bracket to reference
		reference = "[" .. reference
		end
		-- Link text is the tag itself; the link target is the value stored for `key`.
		return pandoc.Span({
		pandoc.Str(el.text:sub(1,startIndex-1)),
		pandoc.Link(reference, references[key]),
		pandoc.Str(el.text:sub(endIndex+1))
		}) --start of the string + Link + last bit of string
		end
		end
		end
		No newline at end of file