Commit 2fdbd6c5 authored by Marco Cavalli's avatar Marco Cavalli
Browse files

feat: add timing functionality to conversion scripts and update argument parsing

parent 78944bed
Loading
Loading
Loading
Loading
+58 −17
Original line number Diff line number Diff line
@@ -13,6 +13,7 @@ set "GIT_BRANCH="
set "REPO_SRC="
set "SRC_SUBFOLDER="
set "REBUILD=false"
set "TIMER_ENABLED=false"

:parse_args
if "%~1"=="" goto args_done
@@ -51,8 +52,19 @@ if "%~1"=="--diff" (
	shift
	goto parse_args
)
if "%~1"=="--git" (
if "%~1"=="--diff-git" (
	set "GIT=true"
	set "NEXT=%~2"
	if "!NEXT!"=="" (
		set "GIT_BRANCH="
		shift
		goto parse_args
	)
	if "!NEXT:~0,2!"=="--" (
		set "GIT_BRANCH="
		shift
		goto parse_args
	)
	set "GIT_BRANCH=%~2"
	shift
	shift
@@ -69,6 +81,29 @@ if "%~1"=="--rebuild" (
	shift
	goto parse_args
)
if "%~1"=="--time" (
	set "TIMER_ENABLED=true"
	shift
	goto parse_args
)
if "%~1"=="--help" (
	echo Usage: convert.bat --frm <format> --to <format> --folder <path> [options]
	echo.
	echo Required parameters:
	echo   --frm <format>        Source format (e.g., md)
	echo   --to <format>         Target format (e.g., html)
	echo   --folder <path>       Path to the folder containing files to convert
	echo.
	echo Optional parameters:
	echo   --src <path>          Path to source files (overrides --folder for source volume)
	echo   --file-order <file>   File specifying the order of conversion
	echo   --diff                 Enable diff conversion using folder-base
	echo   --diff-git <branch>    Enable git-based diff conversion. If specified, uses the input branch
	echo   --arch <arch>         Target architecture (amd64 or arm64^). Default is amd64.
	echo   --rebuild              Rebuild the Docker image before running
	echo   --time                 Enable timing of the conversion process
	exit /b 0
)
echo Unknown parameter passed: %~1
exit /b 1
:args_done
@@ -106,12 +141,13 @@ if "%FOLDER%"=="" (
	echo Error: --folder is required.
	exit /b 1
)
if %GIT%==true if %DIFF%==false (
	echo Error: --git can only be used together with --diff.
	exit /b 1
if %GIT%==true if %DIFF%==true (
	echo Warning: --diff-git cannot be used together with --diff.
	echo "--diff-git overrides --diff. Continuing with --diff-git behavior."
	set "DIFF=false"
)
if %GIT%==true if "%SRC%"=="" (
	echo Error: --git requires --src to be specified.
	echo Error: --diff-git requires --src to be specified.
	exit /b 1
)
set "SRC_VOL=%FOLDER%"
@@ -127,18 +163,18 @@ if "%GIT%"=="true" (
	popd
	
	if exist "!SRC_VOL_ABS!\.git" (
		echo SRC is a Git repository
		rem SRC is a Git repository
		set "REPO_SRC=!SRC_VOL_ABS!"
	) else (
		pushd "!SRC_VOL_ABS!\.."
		if exist ".git" (
			echo SRC's parent is a Git repository
			rem SRC's parent is a Git repository
			rem Extract subfolder name
			for %%F in ("!SRC_VOL_ABS!") do set "SRC_SUBFOLDER=%%~nxF"
			rem Get parent directory absolute path
			set "REPO_SRC=!CD!"
		) else (
			echo WARNING: neither src nor its parent is a Git repository - Proceeding without diff
			rem WARNING: neither src nor its parent is a Git repository - Proceeding without diff
			set "DIFF=false"
			set "GIT=false"
			set "SRC_SUBFOLDER="
@@ -153,16 +189,21 @@ if defined REPO_SRC (
set "EXTRA_ARGS="
if defined SRC (
	if defined SRC_SUBFOLDER (
		set "EXTRA_ARGS=!EXTRA_ARGS! --src ./sources/%SRC_SUBFOLDER%"
	)
		set "EXTRA_ARGS=!EXTRA_ARGS! --src "/data/sources/!SRC_SUBFOLDER!""
	) else (
	if defined SRC (
		set "EXTRA_ARGS=!EXTRA_ARGS! --src ./sources"
		set "EXTRA_ARGS=!EXTRA_ARGS! --src "/data/sources""
	)
)
if defined FILE_ORDER set "EXTRA_ARGS=!EXTRA_ARGS! --file-order ""%FILE_ORDER%"""
if defined FILE_ORDER set "EXTRA_ARGS=!EXTRA_ARGS! --file-order "!FILE_ORDER!""
if %DIFF%==true set "EXTRA_ARGS=!EXTRA_ARGS! --diff"
if %DIFF%==true if %GIT%==true ( set "EXTRA_ARGS=!EXTRA_ARGS! --git %GIT_BRANCH%" )
if %GIT%==true (
	if defined GIT_BRANCH (
		set "EXTRA_ARGS=!EXTRA_ARGS! --diff-git "!GIT_BRANCH!""
	) else (
		set "EXTRA_ARGS=!EXTRA_ARGS! --diff-git"
	)
)
if %TIMER_ENABLED%==true set "EXTRA_ARGS=!EXTRA_ARGS! --time"

set "APP_DIR=%CD%"
if defined EXTRA_ARGS (
+190 −155
Original line number Diff line number Diff line
@@ -27,6 +27,7 @@ from src.utils import (
    get_md_to_html_command,
    get_consolidated_md_path,
)
from src.time_book import get_timer, set_threshold
import git_helper

from src.to_md.preprocessing import preprocess as preprocess_md
@@ -75,13 +76,18 @@ parser.add_argument(
parser.add_argument(
    "--diff",
    action="store_true",
    help="Optionally, when converting from Markdown to HTML, highlight differences between the current Markdown files and the previous version stored on disk. When used with --git, the target branch or commit is used as the comparison source.",
    help="Optionally, when converting from Markdown to HTML, highlight differences between the current Markdown files and the previous version stored on disk. When using --diff-git, the target branch or commit is used as the comparison source.",
)
parser.add_argument(
    "--git",
    "--diff-git",
    nargs="?",
    const=True,
    help="Optionally, when provided, uses Git to get the target branch or commit and use it as the source of the diff when converting from Markdown to HTML. This argument can only be used with the --diff argument.",
    help="Optionally, when provided, uses Git to get the target branch or commit and use it as the source of the diff when converting from Markdown to HTML. This argument overrides the diff command.",
)
parser.add_argument(
    "--time",
    action="store_true",
    help="Optionally, print a timing report at the end of the conversion process, showing the time taken for each section of the process. This can be used to identify any bottlenecks in the conversion process and optimize them for better performance.",
)

args = parser.parse_args()
@@ -103,22 +109,24 @@ FOLDER: str = args.folder

IS_DIFF: bool = bool(args.diff)

IS_GIT_DIFF: bool = args.git is not None
IS_GIT_DIFF: bool = args.diff_git is not None

GIT_CHECKOUT_NAME: str = (
    args.diff_git if IS_GIT_DIFF and isinstance(args.diff_git, str) else ""
)

GIT_CHECKOUT_NAME: str = args.git if IS_GIT_DIFF and isinstance(args.git, str) else ""
TIMER_ENABLED: bool = bool(args.time)

if IS_GIT_DIFF and IS_DIFF:
    print("Warning: --diff-git cannot be used together with --diff.")
    print("--diff-git overrides --diff. Continuing with --diff-git behavior.")
if IS_GIT_DIFF and not IS_DIFF:
    print(
        p_error(
            f"The {p_label('--git')} argument can only be used together with {p_label('--diff')}."
        )
    )
    sys.exit(1)
    IS_DIFF = True

if not args.src and IS_DIFF and IS_GIT_DIFF:
    print(
        p_error(
            f"When using the {p_label('--git')} argument, the {p_label('--src')} argument must be provided with a valid path to a Git repository."
            f"When using the {p_label('--diff-git')} argument, the {p_label('--src')} argument must be provided with a valid path to a Git repository."
        )
    )
    sys.exit(1)
@@ -230,6 +238,7 @@ def convert(conversion_args=None):
    """
    if conversion_args is None:
        conversion_args = args
    with get_timer().section("Convert md to html"):

        def convert_md_to_html():
            """
@@ -245,19 +254,25 @@ def convert(conversion_args=None):
            if os.path.exists(DEST):
                shutil.rmtree(DEST)

        filename_numbers_mapping = preprocess_html(SRC, SRC_TYPE, CONSOLIDATED_MD_PATH, FILE_ORDER_JSON)
        filename_numbers_mapping_path = os.path.join(SRC, "filename_numbers_mapping.json")
            filename_numbers_mapping = preprocess_html(
                SRC, SRC_TYPE, CONSOLIDATED_MD_PATH, FILE_ORDER_JSON
            )
            filename_numbers_mapping_path = os.path.join(
                SRC, "filename_numbers_mapping.json"
            )
            with open(filename_numbers_mapping_path, "w") as f:
                json.dump(filename_numbers_mapping, f, indent=4)

            # Conversion
            command = get_md_to_html_command(SRC, DEST, CONSOLIDATED_MD_PATH, CSS_SRC)

            with get_timer().section("Preprocessing: Pandoc md to html conversion"):
                try:
                    subprocess.run(command, check=True, capture_output=True, text=True)
                    os.remove(filename_numbers_mapping_path)
                except subprocess.CalledProcessError as e:
            print(f"Error converting Markdown files in {SRC} to HTML:\n{e.stderr}")
                    print(
                        f"Error converting Markdown files in {SRC} to HTML:\n{e.stderr}"
                    )
                    os.remove(filename_numbers_mapping_path)
                    sys.exit(1)

@@ -275,6 +290,8 @@ def convert(conversion_args=None):
            # Cleanup the consolidated Markdown
            handle_consolidated_md("delete", SRC, CONSOLIDATED_MD_PATH)

    with get_timer().section("Convert html to md"):

        def convert_html_to_md():
            """
            Handles conversion from HTML to Markdown.
@@ -303,7 +320,11 @@ def convert(conversion_args=None):
                    md_input_path = os.path.join(DEST, filename)

                    headings_id_mapping = preprocess_md(
                    input_path, md_input_path, IS_CLEANUP, CSS_SRC, FILENAMES_MAPPING
                        input_path,
                        md_input_path,
                        IS_CLEANUP,
                        CSS_SRC,
                        FILENAMES_MAPPING,
                    )

                    if IS_CLEANUP:
@@ -335,6 +356,8 @@ def convert(conversion_args=None):
                if os.path.exists(md_input_path):
                    os.remove(md_input_path)

    with get_timer().section("Convert html to docx"):

        def convert_html_to_docx():
            """
            Handles conversion from HTML to a single Docx file.
@@ -352,7 +375,9 @@ def convert(conversion_args=None):
                subprocess.run(command, check=True, capture_output=True, text=True)
            except subprocess.CalledProcessError as e:
                print(
                p_error(f"Unable to convert HTML files in {SRC} to Docx:\n{e.stderr}")
                    p_error(
                        f"Unable to convert HTML files in {SRC} to Docx:\n{e.stderr}"
                    )
                )
                sys.exit(1)

@@ -571,19 +596,25 @@ def load_target_commit(path):

### Run script
print("Starting conversion...")
t = get_timer()
with t.section("Launch conversion"):
    convert()
print("Conversion from {} to {} completed successfully.".format(args.frm, args.to))

print("Post-processing started...")

if SRC_TYPE == "html" and DEST_TYPE == "md":
    with t.section("Post-process Markdown files"):
        postprocess_md(DEST, IS_CLEANUP)

if SRC_TYPE == "html" and DEST_TYPE == "docx":
    with t.section("Post-process Docx file"):
        postprocess_docx(OUTPUT_DOC_PATH)

if SRC_TYPE == "md" and DEST_TYPE == "html":
    with t.section("Post-process HTML files"):
        postprocess_html(DEST)
    with t.section("Apply diff to HTML files"):
        if IS_DIFF:
            # default diff_source_dir is GENERATED_FILE/{FOLDER}-base/html
            if IS_GIT_DIFF:
@@ -610,4 +641,8 @@ if SRC_TYPE == "md" and DEST_TYPE == "html":
            shutil.copy("diffVisualizer.js", DEST)
            print(f"Diff applied.")
print("Post-processing completed successfully.")
if TIMER_ENABLED:
    print("Timing report:")
    set_threshold(0.001)
    print(t.report())
# endregion
+36 −6
Original line number Diff line number Diff line
@@ -36,6 +36,7 @@ FILE_ORDER=""
DIFF=false
GIT=false
GIT_BRANCH=""
TIMER_ENABLED=false

while [[ "$#" -gt 0 ]]; do
    case $1 in
@@ -46,8 +47,35 @@ while [[ "$#" -gt 0 ]]; do
        --file-order) FILE_ORDER="$2"; shift ;;
        --arch) shift ;; # Already processed
        --diff) DIFF=true ;;
        --git) GIT=true; GIT_BRANCH="$2"; shift ;;
        --diff-git)
            GIT=true
            if [[ -n "$2" && "$2" != --* ]]; then
                GIT_BRANCH="$2"
                shift
            else
                GIT_BRANCH=""
            fi
            ;;
        --time) TIMER_ENABLED=true ;;
        --rebuild) ;; # Already processed
        --help)
            echo "Usage: bash convert.sh --frm <format> --to <format> --folder <path> [options]"
            echo ""
            echo "Required parameters:"
            echo "  --frm <format>        Source format (e.g., md)"
            echo "  --to <format>         Target format (e.g., html)"
            echo "  --folder <path>       Path to the folder containing files to convert"
            echo ""
            echo "Optional parameters:"
            echo "  --src <path>          Path to source files (overrides --folder for source volume)"
            echo "  --file-order <file>   File specifying the order of conversion"
            echo "  --diff                 Enable diff conversion using folder-base"
            echo "  --diff-git <branch>    Enable git-based diff conversion. If specified, uses the input branch"
            echo "  --arch <arch>         Target architecture (amd64 or arm64). Default is amd64."
            echo "  --rebuild              Rebuild the Docker image before running"
            echo "  --time                 Enable timing of the conversion process"
            exit 0
            ;;
        *) echo "Unknown parameter passed: $1"; exit 1 ;;
    esac
    shift
@@ -58,12 +86,13 @@ if [[ -z "$FRM" || -z "$TO" || -z "$FOLDER" ]]; then
    exit 1
fi
if [[ "$GIT" == true && -z "$SRC" ]]; then
    echo "Error: --src parameter is required when --git is specified."
    echo "Error: --src parameter is required when --diff-git is specified."
    exit 1
fi
if [[ "$GIT" == true && "$DIFF" != true ]]; then
    echo "Error: --git can only be used together with --diff."
    exit 1
if [[ "$GIT" == true && "$DIFF" == true ]]; then
    echo "Warning: --diff-git cannot be used together with --diff."
    echo "--diff-git overrides --diff. Continuing with --diff-git behavior."
    DIFF=false
fi
SRC_VOL=""
if [[ -n "$SRC" ]]; then
@@ -114,7 +143,8 @@ elif [[ -n "$SRC" ]]; then
fi
[[ -n "$FILE_ORDER" ]] && RUN_ARGS+=("--file-order" "$FILE_ORDER")
[[ "$DIFF" == true ]] && RUN_ARGS+=("--diff")
[[ "$DIFF" == true && "$GIT" == true ]] && RUN_ARGS+=("--git" "$GIT_BRANCH")
[[ "$GIT" == true ]] && RUN_ARGS+=("--diff-git" "$GIT_BRANCH")
[[ "$TIMER_ENABLED" == true ]] && RUN_ARGS+=("--time")

"${RUN_ARGS[@]}"
+49 −0
Original line number Diff line number Diff line
import time
from contextlib import contextmanager

class TimerBook:
    """Accumulate wall-clock durations for named sections of a run.

    A global clock starts when the instance is created (or last reset).
    Individual sections are timed with the ``section`` context manager and
    their durations are summed per name in ``sections``.
    """

    def __init__(self):
        # Reference point for global_elapsed(); perf_counter is monotonic.
        self._global_start = time.perf_counter()
        # Maps section name -> total seconds spent in that section so far.
        self.sections = {}
        # Sections whose total is below this many seconds are hidden by report().
        self._threshold = 0.0

    def global_elapsed(self):
        """Return the seconds elapsed since creation or the last reset()."""
        now = time.perf_counter()
        return now - self._global_start

    @contextmanager
    def section(self, name):
        """Time the enclosed ``with`` body and add it to the total for *name*.

        The duration is recorded even when the body raises, because the
        bookkeeping runs in a ``finally`` clause. Entering the same name
        several times accumulates the durations.
        """
        began_at = time.perf_counter()
        try:
            yield
        finally:
            spent = time.perf_counter() - began_at
            previous_total = self.sections.get(name, 0.0)
            self.sections[name] = previous_total + spent

    def reset(self):
        """Restart the global clock and discard every section measurement."""
        self._global_start = time.perf_counter()
        self.sections.clear()

    def report(self):
        """Return a newline-separated, human-readable timing summary.

        The first line is always the global elapsed time. It is followed by
        one ``name: seconds`` line for each section whose accumulated time is
        at least the configured threshold, in the dict's insertion order.
        """
        header = f"Global: {self.global_elapsed():.3f}s"
        rows = [
            f"{label}: {seconds:.3f}s"
            for label, seconds in self.sections.items()
            if seconds >= self._threshold
        ]
        return "\n".join([header, *rows])


# Module-level singleton shared by every module that imports this one.
# It is constructed at import time, so its global clock starts when this
# module is first imported. Obtain it through get_timer().
_TIMER_SINGLETON = TimerBook()


def get_timer():
    """Give access to the module-level timer.

    Returns:
        The single ``TimerBook`` instance created when this module was
        imported; every call hands back that same object.
    """
    shared_timer = _TIMER_SINGLETON
    return shared_timer


def set_threshold(threshold: float):
    """Configure the reporting cut-off of the shared timer.

    Args:
        threshold: Minimum accumulated time, in seconds, a section needs in
            order to be listed by the shared timer's ``report()``. Sections
            below it are left out; the global line is always printed.
    """
    # get_timer() returns the module-level singleton, so this updates the
    # same instance that every other caller reports from.
    get_timer()._threshold = threshold