Commit 55e8a9aa authored by Marco Cavalli's avatar Marco Cavalli
Browse files

feat: add diff command

parent d23f475e
Loading
Loading
Loading
Loading
+0 −21
Original line number Diff line number Diff line
__pycache__/
*.pyc
*.pyo
*.pyd
.Python
*.so
*.egg
*.egg-info/
dist/
build/
.git/
.gitignore
.vscode/
.idea/
*.md
!requirements.txt
.env
venv/
env/
GENERATED_FILES/*
!GENERATED_FILES/.gitkeep
 No newline at end of file
+7 −11
Original line number Diff line number Diff line
@@ -26,28 +26,24 @@ RUN if [ "$TARGETARCH" = "arm64" ]; then \
# Set working directory
WORKDIR /app

# Copy requirements first for better caching
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy the entire application
COPY . .
RUN pip install --no-cache-dir -r /app/requirements.txt

# Create GENERATED_FILES directory
RUN mkdir -p GENERATED_FILES
RUN mkdir -p sources
RUN mkdir -p /app/GENERATED_FILES
RUN mkdir -p /data/sources

# Create non-root user
RUN useradd -m -u 1000 appuser && \
    chown -R appuser:appuser /app

RUN git config --system --add safe.directory "*"

USER appuser

# Declare volume mount point
VOLUME /app/GENERATED_FILES
VOLUME /app/sources
VOLUME /app
VOLUME /data/sources

# Set environment variables
ENV PYTHONUNBUFFERED=1
+10 −0
Original line number Diff line number Diff line
@@ -238,6 +238,16 @@ Specify a different directory containing the Markdown files.

`convert.py --frm md --to html --folder {folder_name} --src relative/or/absolute/source/path`

Creates a diff version of the HTML by comparing it with the base HTML files in `./GENERATED_FILES/{folder_name}-base/html`.
When using `--diff`, `--src` is required.

`convert.py --frm md --to html --folder {folder_name} --src relative/or/absolute/source/path --diff`

Creates a diff version of the HTML by comparing it with a branch or a commit ID specified in `{target_name}`. If not specified, uses `main` or `master`.
When using `--git`, `--diff` is required.

`convert.py --frm md --to html --folder {folder_name} --src relative/or/absolute/source/path --diff --git {target_name}`

### 2.2.3 HTML to Docx

---
+98 −19
Original line number Diff line number Diff line
@@ -7,6 +7,12 @@ set "TO="
set "FOLDER="
set "SRC="
set "FILE_ORDER="
set "DIFF=false"
set "GIT=false"
set "GIT_BRANCH="
set "REPO_SRC="
set "SRC_SUBFOLDER="
set "REBUILD=false"

:parse_args
if "%~1"=="" goto args_done
@@ -40,22 +46,37 @@ if "%~1"=="--file-order" (
	shift
	goto parse_args
)
if "%~1"=="--diff" (
	set "DIFF=true"
	shift
	goto parse_args
)
if "%~1"=="--git" (
	set "GIT=true"
	set "GIT_BRANCH=%~2"
	shift
	shift
	goto parse_args
)
if "%~1"=="--arch" (
	set "ARCH=%~2"
	shift
	shift
	goto parse_args
)
if "%~1"=="--rebuild" (
	set "REBUILD=true"
	shift
	goto parse_args
)
echo Unknown parameter passed: %~1
exit /b 1

:args_done

if "%ARCH%" NEQ "amd64" if "%ARCH%" NEQ "arm64" (
	echo Error: --arch must be either 'amd64' or 'arm64'
	exit /b 1
)

rem Ensure the docker image exists; build if missing
docker image inspect md-converter >nul 2>&1
if errorlevel 1 (
@@ -65,7 +86,14 @@ if errorlevel 1 (
		exit /b 1
	)
)

rem Rebuild the image if --rebuild is present
if "%REBUILD%"=="true" (
	docker build --build-arg TARGETARCH=%ARCH% -t md-converter .
	if errorlevel 1 (
		echo Failed to build image md-converter.
		exit /b 1
	)
)
if "%FRM%"=="" (
	echo Error: --frm is required.
	exit /b 1
@@ -78,33 +106,84 @@ if "%FOLDER%"=="" (
	echo Error: --folder is required.
	exit /b 1
)

if %GIT%==true if %DIFF%==false (
	echo Error: --git can only be used together with --diff.
	exit /b 1
)
if %GIT%==true if "%SRC%"=="" (
	echo Error: --git requires --src to be specified.
	exit /b 1
)
set "SRC_VOL=%FOLDER%"
if defined SRC set "SRC_VOL=%SRC%"
if "%GIT%"=="true" (
	rem Convert relative path to absolute for proper .git detection
	pushd "%SRC_VOL%"
	if errorlevel 1 (
		echo Error: Cannot access source directory "%SRC_VOL%"
		exit /b 1
	)
	set "SRC_VOL_ABS=!CD!"
	popd
	
	if exist "!SRC_VOL_ABS!\.git" (
		echo SRC is a Git repository
		set "REPO_SRC=!SRC_VOL_ABS!"
	) else (
		pushd "!SRC_VOL_ABS!\.."
		if exist ".git" (
			echo SRC's parent is a Git repository
			rem Extract subfolder name
			for %%F in ("!SRC_VOL_ABS!") do set "SRC_SUBFOLDER=%%~nxF"
			rem Get parent directory absolute path
			set "REPO_SRC=!CD!"
		) else (
			echo WARNING: neither src nor its parent is a Git repository - Proceeding without diff
			set "DIFF=false"
			set "GIT=false"
			set "SRC_SUBFOLDER="
			set "REPO_SRC="
		)
		popd
	)
)
if defined REPO_SRC (
	set "SRC_VOL=%REPO_SRC%"
)
set "EXTRA_ARGS="
rem Pass --src to convert.py whenever a source directory was given.
rem SRC_SUBFOLDER is only set when SRC's parent directory is the Git repository,
rem in which case the path must point at that subfolder; otherwise use ./sources itself.
rem (The previous else-branch re-tested "if defined SRC" inside the else of the same
rem test and therefore could never run.)
if defined SRC (
	if defined SRC_SUBFOLDER (
		set "EXTRA_ARGS=!EXTRA_ARGS! --src ./sources/%SRC_SUBFOLDER%"
	) else (
		set "EXTRA_ARGS=!EXTRA_ARGS! --src ./sources"
	)
)
if defined FILE_ORDER set "EXTRA_ARGS=!EXTRA_ARGS! --file-order ""%FILE_ORDER%"""
if %DIFF%==true set "EXTRA_ARGS=!EXTRA_ARGS! --diff"
if %DIFF%==true if %GIT%==true ( set "EXTRA_ARGS=!EXTRA_ARGS! --git %GIT_BRANCH%" )

set "GEN_DIR=%CD%\GENERATED_FILES"

set "APP_DIR=%CD%"
if defined EXTRA_ARGS (
	docker run --rm ^
		-v "%GEN_DIR%:/app/GENERATED_FILES:rw" ^
		-v "%SRC_VOL%:/app/sources:rw" ^
		--user 1000:1000 ^
		-v "%APP_DIR%:/app:rw" ^
		-v "%SRC_VOL%:/data/sources:rw" ^
		md-converter convert.py ^
		--frm "%FRM%" ^
		--to "%TO%" ^
		--folder "%FOLDER%" ^
		--frm %FRM% ^
		--to %TO% ^
		--folder %FOLDER% ^
		!EXTRA_ARGS!
) else (
	docker run --rm ^
		-v "%GEN_DIR%:/app/GENERATED_FILES:rw" ^
		-v "%SRC_VOL%:/app/sources:rw" ^
		--user 1000:1000 ^
		-v "%APP_DIR%:/app:rw" ^
		-v "%SRC_VOL%:/data/sources:rw" ^
		md-converter convert.py ^
		--frm "%FRM%" ^
		--to "%TO%" ^
		--folder "%FOLDER%"
		--frm %FRM% ^
		--to %TO% ^
		--folder %FOLDER%
)

endlocal
+263 −2
Original line number Diff line number Diff line
# region Imports
import argparse, subprocess, os, sys, shutil, json
from html_diff import make_trackchanges_diff
from src.constants import (
    FILEGEN_DIR,
    TO_DOCX_EXCLUDED_HTML_FILES,
    TO_MD_EXCLUDED_HTML_FILES,
    SCOPE,
    REFS,
    DEFS
    DEFS,
    GIT_STASH_STATUSES,
)
from src.utils import (
    get_consolidated_html_path,
@@ -25,6 +27,7 @@ from src.utils import (
    get_md_to_html_command,
    get_consolidated_md_path,
)
import git_helper

from src.to_md.preprocessing import preprocess as preprocess_md
from src.to_md.postprocessing import postprocess as postprocess_md
@@ -69,6 +72,17 @@ parser.add_argument(
    "--file_order",
    help=f"Optionally, when converting from {p_label('Markdown')} to {p_label('HTML')}, provide the relative or absolute path to a JSON file containing the order in which the files should be arranged in the final produced HTML. If a file is not provided, the default ordering will be used.  The default ordering parses first clauses (named {p_label(f'clause-{{number 4 - 20}}')}), then annexes (named {p_label(f'annex-{{letter a - z}}')}).  Any files in the source directory not included in the list will be included at the end in alphabetical order. However, any files present in the JSON that are not present in the source directory will cause an error and the script to exit prematurely.",
)
# --diff is a plain on/off flag (store_true): False unless present on the command line.
parser.add_argument(
    "--diff",
    action="store_true",
    help="Optionally, when converting from Markdown to HTML, highlight differences between the current Markdown files and the previous version stored on disk. When used with --git, the target branch or commit is used as the comparison source.",
)
# --git takes an optional value (nargs="?"): argparse stores None when the option is
# absent, const (True) when it is given without a value, and the supplied string
# (branch name or commit ID) otherwise.
parser.add_argument(
    "--git",
    nargs="?",
    const=True,
    help="Optionally, when provided, uses Git to get the target branch or commit and use it as the source of the diff when converting from Markdown to HTML. This argument can only be used with the --diff argument.",
)

args = parser.parse_args()

@@ -87,6 +101,28 @@ validate_type(args.to, is_src=False)
# Argument values
FOLDER: str = args.folder

# True when --diff was passed on the command line.
IS_DIFF: bool = bool(args.diff)

# --git is declared with nargs="?": None means the option was not passed at all.
IS_GIT_DIFF: bool = args.git is not None

# Only an explicit string value names a branch/commit; a bare --git (stored as True)
# leaves this empty.
GIT_CHECKOUT_NAME: str = args.git if isinstance(args.git, str) else ""

# Both validations only matter when --git was requested.
if IS_GIT_DIFF:
    if not IS_DIFF:
        git_without_diff_message = f"The {p_label('--git')} argument can only be used together with {p_label('--diff')}."
        print(p_error(git_without_diff_message))
        sys.exit(1)

    if not args.src:
        git_without_src_message = f"When using the {p_label('--git')} argument, the {p_label('--src')} argument must be provided with a valid path to a Git repository."
        print(p_error(git_without_src_message))
        sys.exit(1)

SRC_TYPE: str = "html" if str(args.frm).startswith("html") else args.frm
if args.src:
    SRC = args.src
@@ -184,7 +220,7 @@ shutil.copytree(SRC_MEDIA_PATH, DEST_MEDIA_PATH)


# region Conversion Logic
def convert():
def convert(conversion_args=None):
    """
    Converts between Markdown and HTML, based on provided script arguments

@@ -192,6 +228,8 @@ def convert():
    1. Preprocess inputs
    2. Convert using Pandoc
    """
    if conversion_args is None:
        conversion_args = args

    def convert_md_to_html():
        """
@@ -333,6 +371,204 @@ def convert():
        convert_md_to_html()


def get_comparing_folder_from_git():
    """
    Build (or reuse) the base HTML files used as the comparison side of a Git diff.

    The comparison target is ``GIT_CHECKOUT_NAME`` when set, otherwise the name
    returned by ``git_helper.get_master_branch_name``. A branch target is
    resolved to the hash returned by ``git_helper.get_last_commit_remote_hash``
    for ``origin``. The target is then checked out in the ``SRC`` repository
    (after stashing local changes), ``convert.py`` is run in a subprocess to
    generate ``FILEGEN_DIR/{FOLDER}-base``, and the original branch and stash
    are restored afterwards.

    The generated commit is recorded with ``save_target_commit`` so that a
    later call for the same commit returns the existing files without touching
    the repository.

    Returns:
        The path ``FILEGEN_DIR/{FOLDER}-base/html``.

    Exits the process with status 1 (``sys.exit``) when ``SRC`` is not a Git
    repository, when the current branch or the target cannot be determined,
    when the target equals the current state, or when stashing, checkout,
    pull or the conversion subprocess fails.
    """
    # Check that SRC (the --src path) is a git repo
    if not git_helper.is_git_repo(SRC):
        print(
            p_error(f"The provided path at {p_label('--src')} is not a Git repository.")
        )
        sys.exit(1)
    current_branch = git_helper.get_current_branch(SRC)
    if current_branch is None:
        print(
            p_error(
                f"Could not determine the current branch for the Git repository at {p_label('--src')}."
            )
        )
        sys.exit(1)
    # Use the explicit target if one was given, otherwise the main branch name
    target_to_checkout = (
        GIT_CHECKOUT_NAME
        if GIT_CHECKOUT_NAME
        else git_helper.get_master_branch_name(SRC)
    )
    if not target_to_checkout:
        print(
            p_error(
                f"Could not determine the main branch name for the Git repository at {p_label('--src')}."
            )
        )
        sys.exit(1)
    target_is_branch = git_helper.is_target_a_branch(SRC, target_to_checkout)
    target_is_commit = git_helper.is_target_a_commit(SRC, target_to_checkout)
    current_commit_hash = git_helper.get_current_commit_hash(SRC)
    last_remote_commit_hash = git_helper.get_last_commit_remote_hash(
        SRC, remote_name="origin", branch_name=target_to_checkout
    )
    # Refuse to diff the working tree against the commit it is already on
    if target_is_commit and current_commit_hash == target_to_checkout:
        print(
            p_error(
                f"The provided Git commit hash {p_label(target_to_checkout)} is the same as the current commit in the Git repository at {p_label('--src')}. Please provide a different commit hash to compare against."
            )
        )
        sys.exit(1)
    if (
        target_is_branch
        and current_branch == target_to_checkout
        and current_commit_hash == last_remote_commit_hash
    ):
        print(
            p_error(
                f"The provided Git branch name {p_label(target_to_checkout)} is the same as the current branch in the Git repository at {p_label('--src')}. Please provide a different branch name to compare against."
            )
        )
        sys.exit(1)
    # If target_to_checkout is a branch name, get the latest remote commit hash
    if target_is_branch:
        branch_target = target_to_checkout
        target_to_checkout = last_remote_commit_hash
        if not target_to_checkout:
            print(
                p_error(
                    f"Could not retrieve the latest remote commit hash for branch {p_label(branch_target)} in the Git repository at {p_label('--src')}."
                )
            )
            sys.exit(1)
    else:
        branch_target = None

    base_dir = os.path.join(FILEGEN_DIR, f"{FOLDER}-base")
    base_html_dir = os.path.join(base_dir, "html")

    # Reuse a previous run if it was generated from the very same commit
    if load_target_commit(base_dir) == target_to_checkout:
        print(
            f"The base HTML files for commit {p_label(target_to_checkout)} {f'(branch: {p_label(branch_target)}) ' if branch_target else ''}have already been generated. Using existing files for diff..."
        )
        return base_html_dir
    # Stash any changes
    changes_stashed = git_helper.stash_changes(SRC)
    if changes_stashed == GIT_STASH_STATUSES.FAILED:
        print(
            p_error(
                f"Could not stash changes in the Git repository at {p_label('--src')}."
            )
        )
        sys.exit(1)
    # Checkout to the target branch/commit
    if not git_helper.checkout(SRC, target_to_checkout):
        print(
            p_error(
                f"Could not checkout to commit {p_label(target_to_checkout)} {f'(branch: {p_label(branch_target)}) ' if branch_target else ''}in the Git repository at {p_label('--src')}."
            )
        )
        restore_original_state(current_branch, changes_stashed)
        sys.exit(1)
    # pull the latest changes
    if target_is_branch and current_branch != branch_target and branch_target:
        print(
            f"Pulling latest changes for branch {p_label(branch_target)} in the Git repository at {p_label('--src')}..."
        )
        if not git_helper.pull_changes(
            SRC, remote_name="origin", branch_name=branch_target
        ):
            print(
                p_error(
                    f"Could not pull latest changes in the Git repository at {p_label('--src')}."
                )
            )
            restore_original_state(current_branch, changes_stashed)
            sys.exit(1)

    # Everything from here on runs while the repository is checked out at the
    # target, so it all lives inside try/finally: a failure while preparing
    # the output directory must not leave the repository on the wrong commit
    # with the user's changes still stashed.
    try:
        if os.path.exists(base_dir):
            shutil.rmtree(base_dir)
        os.makedirs(base_dir)
        print(
            f"Generating base HTML files from commit {p_label(target_to_checkout)} {f'(branch: {p_label(branch_target)}) ' if branch_target else ''}for diff..."
        )
        # Run the conversion in a separate process so it starts from a clean
        # module state and writes into the "{FOLDER}-base" output folder
        subprocess.run(
            [
                sys.executable,
                "convert.py",
                "--frm",
                "md",
                "--to",
                "html",
                "--folder",
                f"{FOLDER}-base",
                "--src",
                SRC,
            ],
            check=True,
            capture_output=True,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        # convert.py reports its own errors with print(), i.e. on stdout, so
        # show both captured streams instead of stderr alone
        captured_output = "\n".join(
            stream for stream in (e.stdout, e.stderr) if stream
        )
        print(p_error(f"Error generating comparing HTML files:\n{captured_output}"))
        sys.exit(1)
    finally:
        restore_original_state(current_branch, changes_stashed)
        print(
            f"Restored original Git state. Returning to branch {p_label(current_branch)}..."
        )
    print("Base HTML files generated successfully.")
    # Remember which commit these files came from so the next run can reuse them
    save_target_commit(target_to_checkout, base_dir)
    return base_html_dir


def apply_stash(changes_stashed):
    """
    Pop the stash in the ``SRC`` repository when ``changes_stashed`` reports one.

    Does nothing unless ``changes_stashed`` equals ``GIT_STASH_STATUSES.STASHED``.
    If ``git_helper.stash_pop_changes`` returns a falsy value, an error is
    printed and the process exits with status 1.
    """
    has_stash = changes_stashed == GIT_STASH_STATUSES.STASHED
    if not has_stash:
        return

    if git_helper.stash_pop_changes(SRC):
        return

    failure_message = f"Could not apply stashed changes in the Git repository at {p_label('--src')}."
    print(p_error(failure_message))
    sys.exit(1)


def return_to_original_branch(original_branch):
    """
    Check out ``original_branch`` in the ``SRC`` repository.

    If ``git_helper.checkout`` returns a falsy value, an error is printed and
    the process exits with status 1.
    """
    if git_helper.checkout(SRC, original_branch):
        return

    failure_message = f"Could not checkout back to branch {p_label(original_branch)} in the Git repository at {p_label('--src')}."
    print(p_error(failure_message))
    sys.exit(1)


def restore_original_state(original_branch, changes_stashed):
    """
    Put the ``SRC`` repository back the way it was before the temporary checkout.

    First checks out ``original_branch`` via ``return_to_original_branch``,
    then calls ``apply_stash`` only when ``changes_stashed`` equals
    ``GIT_STASH_STATUSES.STASHED``.
    """
    return_to_original_branch(original_branch)

    had_stash = changes_stashed == GIT_STASH_STATUSES.STASHED
    if not had_stash:
        return
    apply_stash(changes_stashed)


def save_target_commit(target_commit, path):
    """
    Record ``target_commit`` in a ``.gittargetcommit`` file inside ``path``.

    Any existing marker file is overwritten. ``path`` must already exist.
    """
    marker_file = os.path.join(path, ".gittargetcommit")
    with open(marker_file, "w") as marker:
        marker.write(target_commit)


def load_target_commit(path):
    """
    Read the commit recorded in the ``.gittargetcommit`` file inside ``path``.

    Returns the file content with surrounding whitespace stripped, or ``None``
    when the marker file does not exist.
    """
    marker_file = os.path.join(path, ".gittargetcommit")
    if not os.path.exists(marker_file):
        return None

    with open(marker_file, "r") as marker:
        recorded_commit = marker.read()
    return recorded_commit.strip()


### Run script
print("Starting conversion...")
convert()
@@ -348,5 +584,30 @@ if SRC_TYPE == "html" and DEST_TYPE == "docx":

# Markdown -> HTML: post-process the generated HTML and, when --diff was requested,
# rewrite every generated HTML file as a diff against a base version.
if SRC_TYPE == "md" and DEST_TYPE == "html":
    postprocess_html(DEST)
    if IS_DIFF:
        # Without --git the base HTML is expected at FILEGEN_DIR/{FOLDER}-base/html;
        # with --git it is generated (or reused) from the target branch/commit.
        if IS_GIT_DIFF:
            # NOTE(review): the same --git/--src check already runs right after
            # argument parsing, so this guard looks redundant - confirm before removing.
            if not args.src:
                print(
                    p_error(
                        f"When using the {p_label('--git')} argument, the {p_label('--src')} argument must be provided with a valid path to a Git repository."
                    )
                )
                sys.exit(1)
            diff_source_dir = get_comparing_folder_from_git()
        else:
            diff_source_dir = os.path.join(FILEGEN_DIR, f"{FOLDER}-base", "html")
        print(f"Applying diff from source directory: {diff_source_dir}...")
        # Iterate through new HTML files and create diffs
        for filename in os.listdir(DEST):
            if filename.endswith(".html"):
                new_file_path = os.path.join(DEST, filename)
                # The base file is looked up under the same file name.
                # NOTE(review): nothing here checks that it exists (e.g. a file added
                # since the base version) - TODO confirm how make_trackchanges_diff
                # handles a missing base file.
                comparing_file_path = os.path.join(diff_source_dir, filename)
                # Create diff and overwrite the new file
                # (arguments are presumably base, new, output - verify against html_diff)
                make_trackchanges_diff(
                    comparing_file_path, new_file_path, new_file_path
                )
        # "diffVisualizer.js" is a relative path, so it is resolved against the
        # current working directory.
        shutil.copy("diffVisualizer.js", DEST)
        print(f"Diff applied.")
print("Post-processing completed successfully.")
# endregion
Loading