Handle code blocks, fix some bugs, add `fix-all` command

5 months ago · beff498743
2 changed files with 279 additions and 6 deletions
--- a/scripts/doc_parsing_utils.py
+++ b/scripts/doc_parsing_utils.py
@ -20,10 +20,15 @@ MARKDOWN_LINK_RE = re.compile(
 )

 HTML_LINK_RE = re.compile(r"<a\s+[^>]*>.*?</a>")
-HTML_LINK_TEXT = re.compile(r"<a\b([^>]*)>(.*?)</a>")
+HTML_LINK_TEXT_RE = re.compile(r"<a\b([^>]*)>(.*?)</a>")
 HTML_LINK_OPEN_TAG_RE = re.compile(r"<a\b([^>]*)>")
 HTML_ATTR_RE = re.compile(r'(\w+)\s*=\s*([\'"])(.*?)\2')

+# Captures the language tag that directly follows the first three backticks of a
+# fence line. For a fence of four or more backticks the captured tag is empty,
+# because the fourth backtick is not matched by `[\w-]`.
+CODE_BLOCK_LANG_RE = re.compile(r"^```([\w-]*)", re.MULTILINE)
+
+# Split a line into `code` and an optional trailing `comment`. The comment marker
+# must be followed by a space; any whitespace right before the marker is part of
+# the `comment` group. `code` is matched lazily, so the first marker wins.
+SLASHES_COMMENT_RE = re.compile(r"^(?P<code>.*?)(?P<comment>\s*// .*)?$")
+HASH_COMMENT_RE = re.compile(r"^(?P<code>.*?)(?P<comment>\s*# .*)?$")
+

 class CodeIncludeInfo(TypedDict):
    line_no: int
@ -57,6 +62,12 @@ class HtmlLinkInfo(TypedDict):
    text: str


+class MultilineCodeBlockInfo(TypedDict):
+    # Fields as filled in by `extract_multiline_code_blocks`:
+    # language tag taken from the opening fence ("" when none was captured)
+    lang: str
+    # 1-based line number of the opening fence
+    start_line_no: int
+    # lines of the block, including the opening and the closing fence lines
+    content: list[str]
+
+
 # Code includes
 # -----------------------------------------------------------------------------------------

@ -82,10 +93,11 @@ def replace_code_includes_with_placeholders(text: list[str]) -> list[str]:
    Replace code includes with placeholders.
    """

+    modified_text = text.copy()
    includes = extract_code_includes(text)
    for include in includes:
-        text[include["line_no"] - 1] = CODE_INCLUDE_PLACEHOLDER
-    return text
+        modified_text[include["line_no"] - 1] = CODE_INCLUDE_PLACEHOLDER
+    return modified_text


 def replace_placeholders_with_code_includes(
@ -274,7 +286,7 @@ def _construct_markdown_link(
        link = f"[{text}]({url})"

    if attributes:
-        link += f" {{{attributes}}}"
+        link += f"{{{attributes}}}"

    return link

@ -345,7 +357,7 @@ def extract_html_links(lines: list[str]) -> list[HtmlLinkInfo]:
        for html_link in HTML_LINK_RE.finditer(line):
            link_str = html_link.group(0)

-            link_text_match = HTML_LINK_TEXT.match(link_str)
+            link_text_match = HTML_LINK_TEXT_RE.match(link_str)
            assert link_text_match is not None
            link_text = link_text_match.group(2)
            assert isinstance(link_text, str)
@ -442,3 +454,188 @@ def replace_html_links(
        )

    return modified_text
+
+
+# Multiline code blocks
+# -----------------------------------------------------------------------------------------
+
+
+def get_code_block_lang(line: str) -> str:
+    """
+    Return the language tag captured by `CODE_BLOCK_LANG_RE` at the start of `line`,
+    or an empty string if the pattern does not match.
+    """
+
+    fence = CODE_BLOCK_LANG_RE.match(line)
+    return fence.group(1) if fence else ""
+
+
+def extract_multiline_code_blocks(text: list[str]) -> list[MultilineCodeBlockInfo]:
+    """
+    Collect the fenced code blocks found in `text`.
+
+    A block opened by exactly three backticks is closed by a line that starts with
+    exactly three backticks; a block opened by four or more backticks is closed by
+    a line that starts with four or more. Whitespace before a fence is ignored.
+    A block that is still open when `text` ends is not returned.
+    """
+
+    found: list[MultilineCodeBlockInfo] = []
+
+    # 0 while outside of a block, otherwise 3 or 4 depending on the opening fence
+    open_fence = 0
+    lang = ""
+    start_line_no = -1
+    collected = []
+
+    for line_no, line in enumerate(text, start=1):
+        unindented = line.lstrip()
+        # Number of backticks the line starts with
+        ticks = len(unindented) - len(unindented.lstrip("`"))
+
+        if not open_fence:
+            if ticks >= 3:
+                open_fence = 3 if ticks == 3 else 4
+                lang = get_code_block_lang(unindented)
+                start_line_no = line_no
+                collected = [line]
+            continue
+
+        # Inside a block every line is kept, the closing fence included
+        collected.append(line)
+        is_closing = ticks == 3 if open_fence == 3 else ticks >= 4
+        if not is_closing:
+            continue
+
+        found.append(
+            MultilineCodeBlockInfo(
+                lang=lang,
+                start_line_no=start_line_no,
+                content=collected,
+            )
+        )
+        open_fence = 0
+        lang = ""
+        start_line_no = -1
+        collected = []
+
+    return found
+
+
+def _split_hash_comment(line: str) -> tuple[str, str | None]:
+    """
+    Split `line` into its code part (right-stripped) and its trailing `# ` comment.
+
+    The comment is None when `HASH_COMMENT_RE` finds none or does not match at all.
+    """
+
+    parts = HASH_COMMENT_RE.match(line)
+    if parts is None:
+        return line.rstrip(), None
+    return parts.group("code").rstrip(), parts.group("comment")
+
+
+def _split_slashes_comment(line: str) -> tuple[str, str | None]:
+    """
+    Split `line` into its code part (right-stripped) and its trailing `// ` comment.
+
+    The comment is None when `SLASHES_COMMENT_RE` finds none or does not match at all.
+    """
+
+    match = SLASHES_COMMENT_RE.match(line)
+    if match:
+        code = match.group("code").rstrip()
+        comment = match.group("comment")
+        return code, comment
+    # Strip here too, so the code part is right-stripped on every path, the same
+    # way `_split_hash_comment` does it
+    return line.rstrip(), None
+
+
+def replace_multiline_code_block(
+    block_a: MultilineCodeBlockInfo, block_b: MultilineCodeBlockInfo
+) -> list[str]:
+    """
+    Replace multiline code block a with block b leaving comments intact.
+
+    Each resulting line is the line of block b whose trailing comment (if any) is
+    replaced by the trailing comment of the corresponding line of block a. When the
+    line of block a has no comment, the line of block b is kept unchanged.
+
+    Syntax of comments depends on the language of the code block. Blocks in a
+    language without a known comment syntax are taken from block b as they are;
+    mermaid blocks are returned from block a unchanged.
+
+    Raises ValueError if the blocks are not compatible (different languages or different number of lines).
+    """
+
+    if block_a["lang"] != block_b["lang"]:
+        raise ValueError("Code blocks have different languages")
+    if len(block_a["content"]) != len(block_b["content"]):
+        raise ValueError("Code blocks have different number of lines")
+
+    block_language = block_a["lang"].lower()
+    if block_language in {"mermaid"}:
+        return block_a["content"].copy()  # We don't handle mermaid code blocks for now
+
+    # Pick the comment splitter based on language
+    if block_language in {
+        "python",
+        "py",
+        "sh",
+        "bash",
+        "dockerfile",
+        "requirements",
+        "gitignore",
+        "toml",
+        "yaml",
+        "yml",
+    }:
+        split_comment = _split_hash_comment
+    elif block_language in {"console", "json"}:
+        split_comment = _split_slashes_comment
+    else:
+        return block_b["content"].copy()
+
+    code_block: list[str] = []
+    for line_a, line_b in zip(block_a["content"], block_b["content"]):
+        _line_a_code, line_a_comment = split_comment(line_a)
+        _line_b_code, line_b_comment = split_comment(line_b)
+
+        # Only swap when both lines carry a comment: `str.replace` raises TypeError
+        # if the replacement is None
+        if line_a_comment and line_b_comment:
+            code_block.append(line_b.replace(line_b_comment, line_a_comment, 1))
+        else:
+            code_block.append(line_b)
+
+    return code_block
+
+
+def replace_multiline_code_blocks_in_text(
+    text: list[str],
+    code_blocks: list[MultilineCodeBlockInfo],
+    original_code_blocks: list[MultilineCodeBlockInfo],
+) -> list[str]:
+    """
+    Update each code block in `text` with the corresponding code block from
+    `original_code_blocks` with comments taken from `code_blocks`.
+
+    `text` itself is not modified; a modified copy of its lines is returned.
+    Blocks are paired by position, and each block is written at the position given
+    by the `start_line_no` of the block from `code_blocks`.
+
+    Raises ValueError if the number, language, or shape of code blocks do not match.
+    """
+
+    if len(code_blocks) != len(original_code_blocks):
+        raise ValueError(
+            "Number of code blocks does not match the number of original code blocks"
+        )
+
+    modified_text = text.copy()
+    for block, original_block in zip(code_blocks, original_code_blocks):
+        updated_content = replace_multiline_code_block(block, original_block)
+
+        # `start_line_no` is 1-based, list indexes are 0-based
+        first_index = block["start_line_no"] - 1
+        for index, updated_line in enumerate(updated_content, start=first_index):
+            modified_text[index] = updated_line
+
+    return modified_text
--- a/scripts/translation_fixer.py
+++ b/scripts/translation_fixer.py
@ -1,3 +1,6 @@
+import difflib
+import os
+from collections.abc import Iterable
 from pathlib import Path
 from typing import Annotated

@ -8,13 +11,27 @@ from scripts.doc_parsing_utils import (
    extract_header_permalinks,
    extract_html_links,
    extract_markdown_links,
+    extract_multiline_code_blocks,
    replace_code_includes_with_placeholders,
    replace_header_permalinks,
    replace_html_links,
    replace_markdown_links,
+    replace_multiline_code_blocks_in_text,
    replace_placeholders_with_code_includes,
 )

+# Prefixes of paths, relative to the docs root of a language, that `get_all_paths`
+# leaves out of its result
+non_translated_sections = (
+    f"reference{os.sep}",
+    "release-notes.md",
+    "fastapi-people.md",
+    "external-links.md",
+    "newsletter.md",
+    "management-tasks.md",
+    "management.md",
+    "contributing.md",
+)
+
+
 cli = typer.Typer()


@ -23,6 +40,53 @@ def callback():
    pass


+def iter_all_lang_paths(lang_path_root: Path) -> Iterable[Path]:
+    """
+    Iterate on the markdown files to translate in order of priority.
+
+    Files directly in `lang_path_root` come first, then the files under each of the
+    priority directories (recursively, in the order listed below), then every other
+    markdown file under `lang_path_root`. Each file is yielded once.
+    """
+
+    first_dirs = [
+        lang_path_root / "learn",
+        lang_path_root / "tutorial",
+        lang_path_root / "advanced",
+        lang_path_root / "about",
+        lang_path_root / "how-to",
+    ]
+    first_parent = lang_path_root
+    yield from first_parent.glob("*.md")
+    for dir_path in first_dirs:
+        yield from dir_path.rglob("*.md")
+    for path in lang_path_root.rglob("*.md"):
+        if path.parent == first_parent:
+            continue
+        # Compare whole path components instead of string prefixes, so that a
+        # sibling directory such as "learning" is not mistaken for "learn"
+        if any(dir_path in path.parents for dir_path in first_dirs):
+            continue
+        yield path
+
+
+def get_all_paths(lang: str):
+    """
+    List the markdown files of `docs/<lang>/docs` as path strings relative to that
+    directory, in the order given by `iter_all_lang_paths`.
+
+    Paths starting with one of `non_translated_sections` are left out.
+    """
+
+    lang_docs_root = Path("docs") / lang / "docs"
+    relative_paths = (
+        str(path.relative_to(lang_docs_root))
+        for path in iter_all_lang_paths(lang_docs_root)
+    )
+    return [
+        relpath
+        for relpath in relative_paths
+        if not relpath.startswith(non_translated_sections)
+    ]
+
+
+@cli.command()
+def fix_all(ctx: typer.Context, language: str):
+    # Run `fix_pages` on every page returned by `get_all_paths` for `language`,
+    # one page at a time, so that a ValueError on one page is reported without
+    # stopping the remaining pages.
+    # (Kept as comments: a docstring would become the help text of the command.)
+    lang_docs_root = Path("docs") / language / "docs"
+
+    for relative_page in get_all_paths(language):
+        doc_path = lang_docs_root / relative_page
+        try:
+            fix_pages(doc_paths=[doc_path])
+        except ValueError as e:
+            print(f"Error processing {doc_path}: {e}")
+
+
@cli.command()
 def fix_pages(
    doc_paths: Annotated[
@ -49,6 +113,11 @@ def fix_pages(
        )
        if fixed_doc_lines != doc_lines:
            print(f"Fixing code includes in: {path}")
+            diff = difflib.unified_diff(
+                doc_lines, fixed_doc_lines, fromfile="translation", tofile="fixed"
+            )
+            print("\n".join(diff))
+
        doc_lines = fixed_doc_lines

        # Fix permalinks
@ -75,7 +144,14 @@ def fix_pages(
        doc_lines = fixed_doc_lines

        # Fix multiline code blocks
-        # TODO: Implement
+        en_code_blocks = extract_multiline_code_blocks(en_doc_lines)
+        doc_code_blocks = extract_multiline_code_blocks(doc_lines)
+        fixed_doc_lines = replace_multiline_code_blocks_in_text(
+            doc_lines, doc_code_blocks, en_code_blocks
+        )
+        if fixed_doc_lines != doc_lines:
+            print(f"Fixing multiline code blocks in: {path}")
+        doc_lines = fixed_doc_lines

        # Write back the fixed document
        doc_lines.append("")  # Ensure file ends with a newline