Fix links, permalinks, code includes

6 months ago · 0339277673
2 changed files with 530 additions and 0 deletions
--- a/scripts/doc_parsing_utils.py
+++ b/scripts/doc_parsing_utils.py
@ -0,0 +1,444 @@
+import re
+from typing import TypedDict
+
+# Matches a line that consists only of a code include directive `{* ... *}`.
+# Group 1 is the first whitespace-free token after `{*`, group 2 is everything
+# between that token and the closing `*}`.
+CODE_INCLUDE_RE = re.compile(r"^\{\*\s*(\S+)\s*(.*)\*\}$")
+# Line content that temporarily stands in for a code include.
+CODE_INCLUDE_PLACEHOLDER = "<CODE_INCLUDE>"
+
+# Matches a header line (1-6 hashes, a space, a title).
+# Groups: hashes, title, optional permalink. The permalink group keeps its
+# leading whitespace and the braces (e.g. " {#anchor}") and is None when the
+# header has no permalink.
+HEADER_WITH_PERMALINK_RE = re.compile(r"^(#{1,6}) (.+?)(\s*\{\s*#.*\s*\})?\s*$")
+# Same header shape, but the optional third group captures only the text that
+# starts at `#` inside the braces (no braces, no leading whitespace).
+HEADER_LINE_RE = re.compile(r"^(#{1,6}) (.+?)(?:\s*\{\s*(#.*)\s*\})?\s*$")
+
+# URL prefix of links that get a language code inserted right after it.
+TIANGOLO_COM = "https://fastapi.tiangolo.com"
+
+# Matches an inline markdown link `[text](url "title"){attrs}`; title and attrs
+# are optional. Named groups: text, url, title, attrs.
+MARKDOWN_LINK_RE = re.compile(
+    r"(?<!\\)(?<!\!)"  # not an image ![...] and not escaped \[...]
+    r"\[(?P<text>.*?)\]"  # link text (non-greedy)
+    r"\("
+    r"(?P<url>[^)\s]+)"  # url (no spaces and `)`)
+    r'(?:\s+["\'](?P<title>.*?)["\'])?'  # optional title in "" or ''
+    r"\)"
+    r"(?:\s*\{(?P<attrs>[^}]*)\})?"  # optional attributes in {}
+)
+
+# Matches a complete `<a ...>...</a>` element. `.` does not match newlines here,
+# so the whole element has to sit on a single line.
+HTML_LINK_RE = re.compile(r"<a\s+[^>]*>.*?</a>")
+# Splits an `<a>` element: group 1 is the raw attribute string of the opening
+# tag, group 2 is the link text.
+HTML_LINK_TEXT = re.compile(r"<a\b([^>]*)>(.*?)</a>")
+# Matches only the opening `<a ...>` tag: group 1 is the raw attribute string.
+HTML_LINK_OPEN_TAG_RE = re.compile(r"<a\b([^>]*)>")
+# Matches one quoted attribute `name="value"` or `name='value'`.
+# Groups: name, quote character, value.
+HTML_ATTR_RE = re.compile(r'(\w+)\s*=\s*([\'"])(.*?)\2')
+
+
+class CodeIncludeInfo(TypedDict):
+    """A code include line found in a document."""
+
+    # Line number of the include (1-based)
+    line_no: int
+    # Full text of the include line
+    line: str
+
+
+class HeaderPermalinkInfo(TypedDict):
+    """A header line found in a document, with its permalink if it has one."""
+
+    # Line number of the header (1-based)
+    line_no: int
+    # Hashes that give the header level, e.g. "###"
+    hashes: str
+    # Permalink exactly as captured after the title, including leading whitespace
+    # and braces (e.g. " {#anchor}"); None for a header without a permalink
+    permalink: str | None
+
+
+class MarkdownLinkInfo(TypedDict):
+    """An inline markdown link found in a document."""
+
+    # Line number of the link (1-based)
+    line_no: int
+    # Link target as written inside the parentheses
+    url: str
+    # Link text as written inside the square brackets
+    text: str
+    # Quoted title following the URL, None when the link has no title
+    title: str | None
+    # Content of the `{...}` attribute list following the link, None when absent
+    attributes: str | None
+
+
+class HTMLLinkAttribute(TypedDict):
+    """A single quoted attribute of an HTML `<a>` tag."""
+
+    # Attribute name, e.g. "href"
+    name: str
+    # Quote character that wraps the value (single or double quote)
+    quote: str
+    # Attribute value without the surrounding quotes
+    value: str
+
+
+class HtmlLinkInfo(TypedDict):
+    """An HTML `<a>` link found in a document."""
+
+    # Line number of the link (1-based)
+    line_no: int
+    # Complete element text, from `<a` up to and including `</a>`
+    full_tag: str
+    # Quoted attributes of the opening tag, in order of appearance
+    attributes: list[HTMLLinkAttribute]
+    # Text between the opening and the closing tag
+    text: str
+
+
+# Code includes
+# -----------------------------------------------------------------------------------------
+
+
+def extract_code_includes(lines: list[str]) -> list[CodeIncludeInfo]:
+    """
+    Collect the lines that are code includes.
+
+    Return a list of CodeIncludeInfo dicts with the keys:
+    - `line_no` - line number (1-based)
+    - `line` - text of the line
+    """
+
+    return [
+        CodeIncludeInfo(line_no=number, line=content)
+        for number, content in enumerate(lines, start=1)
+        if CODE_INCLUDE_RE.match(content)
+    ]
+
+
+def replace_code_includes_with_placeholders(text: list[str]) -> list[str]:
+    """
+    Replace code include lines with placeholders.
+
+    Return a new list of lines. The given `text` is left untouched, so callers
+    can still compare their original lines against the processed result.
+    """
+
+    include_line_numbers = {
+        include["line_no"] for include in extract_code_includes(text)
+    }
+    return [
+        CODE_INCLUDE_PLACEHOLDER if line_no in include_line_numbers else line
+        for line_no, line in enumerate(text, start=1)
+    ]
+
+
+def replace_placeholders_with_code_includes(
+    text: list[str], original_includes: list[CodeIncludeInfo]
+) -> list[str]:
+    """
+    Put the code includes of the original (English) document back in place of the placeholders.
+
+    Placeholders are filled in order of appearance: the n-th placeholder line
+    receives the n-th original include line.
+    Raise ValueError if the number of placeholders differs from the number of original includes.
+    """
+
+    expected_count = len(original_includes)
+    restored: list[str] = []
+    used = 0  # how many original includes have been consumed so far
+    for line in text:
+        if line.strip() != CODE_INCLUDE_PLACEHOLDER:
+            restored.append(line)
+            continue
+        if used >= expected_count:
+            raise ValueError(
+                "Number of placeholders exceeds number of code includes in the original document"
+            )
+        restored.append(original_includes[used]["line"])
+        used += 1
+
+    if used < expected_count:
+        raise ValueError(
+            "Number of placeholders is less than number of code includes in the original document"
+        )
+
+    return restored
+
+
+# Header permalinks
+# -----------------------------------------------------------------------------------------
+
+
+def extract_header_permalinks(lines: list[str]) -> list[HeaderPermalinkInfo]:
+    """
+    Collect the headers (and their permalinks) from the given lines.
+
+    Lines inside fenced code blocks are skipped. A block opened by exactly three
+    backticks is closed by a line starting with exactly three backticks; a block
+    opened by four or more backticks is closed by a line starting with four or more.
+
+    Return a list of HeaderPermalinkInfo dicts with the keys:
+    - `line_no` - line number (1-based)
+    - `hashes` - string of hashes representing header level (e.g., "###")
+    - `permalink` - permalink as captured after the title, including leading
+      whitespace and braces (e.g., " {#permalink}"); None if the header has none
+    """
+
+    found: list[HeaderPermalinkInfo] = []
+    # Kind of the currently open fence: 0 = none, 3 = three backticks, 4 = four or more
+    open_fence = 0
+
+    for line_no, line in enumerate(lines, start=1):
+        backticks = len(line) - len(line.lstrip("`"))
+
+        if open_fence == 3:
+            if backticks == 3:
+                open_fence = 0
+            continue
+        if open_fence == 4:
+            if backticks >= 4:
+                open_fence = 0
+            continue
+
+        if backticks >= 3:
+            # The fence line itself is never a header
+            open_fence = 3 if backticks == 3 else 4
+            continue
+
+        matched = HEADER_WITH_PERMALINK_RE.match(line)
+        if matched:
+            hashes, _title, permalink = matched.groups()
+            found.append(
+                HeaderPermalinkInfo(hashes=hashes, line_no=line_no, permalink=permalink)
+            )
+
+    return found
+
+
+def remove_header_permalinks(lines: list[str]) -> list[str]:
+    """
+    Strip the permalink from every header line.
+
+    Header lines are rebuilt as "<hashes> <title>", which also drops trailing
+    whitespace; all other lines are passed through unchanged.
+    """
+
+    stripped: list[str] = []
+    for line in lines:
+        found = HEADER_WITH_PERMALINK_RE.match(line)
+        if found is None:
+            stripped.append(line)
+            continue
+        stripped.append(f"{found.group(1)} {found.group(2)}")
+    return stripped
+
+
+def replace_header_permalinks(
+    text: list[str], original_permalinks: list[HeaderPermalinkInfo]
+) -> list[str]:
+    """
+    Replace permalinks in the given text with the permalinks from the original document.
+
+    Headers are paired with `original_permalinks` in order of appearance. Lines
+    inside fenced code blocks are never treated as headers, using the same fence
+    rule as `extract_header_permalinks`; otherwise comment lines such as
+    `# comment` in a code block would be counted as headers here but not in the
+    original.
+
+    Fail with ValueError if the number or order of headers does not match the original.
+    """
+
+    modified_text: list[str] = []
+    permalink_index = 0
+    # Kind of the currently open fence: 0 = none, 3 = three backticks, 4 = four or more
+    open_fence = 0
+    for line in text:
+        backticks = len(line) - len(line.lstrip("`"))
+
+        if open_fence:
+            closes = backticks == 3 if open_fence == 3 else backticks >= 4
+            if closes:
+                open_fence = 0
+            modified_text.append(line)
+            continue
+
+        if backticks >= 3:
+            open_fence = 3 if backticks == 3 else 4
+            modified_text.append(line)
+            continue
+
+        header_match = HEADER_LINE_RE.match(line)
+        if header_match is None:
+            modified_text.append(line)
+            continue
+
+        if permalink_index >= len(original_permalinks):
+            raise ValueError(
+                "Number of headers exceeds number of headers in the original document"
+            )
+        hashes, title, _permalink = header_match.groups()
+        original_permalink_info = original_permalinks[permalink_index]
+        if original_permalink_info["hashes"] != hashes:
+            raise ValueError(
+                "Header levels do not match between document and original document"
+            )
+
+        # The original permalink is None for a header without one; emit nothing
+        # then instead of the literal text "None".
+        original_permalink = original_permalink_info["permalink"] or ""
+        modified_text.append(f"{hashes} {title}{original_permalink}")
+        permalink_index += 1
+
+    if permalink_index < len(original_permalinks):
+        raise ValueError(
+            "Number of headers is less than number of headers in the original document"
+        )
+
+    return modified_text
+
+
+# Markdown links
+# -----------------------------------------------------------------------------------------
+
+
+def extract_markdown_links(lines: list[str]) -> list[MarkdownLinkInfo]:
+    """
+    Extract all markdown links from the given lines.
+
+    Return a list of MarkdownLinkInfo dicts with the keys:
+    - `line_no` - line number (1-based)
+    - `url` - link URL
+    - `text` - link text
+    - `title` - link title (None if the link has no title)
+    - `attributes` - content of the `{...}` attribute list (None if absent)
+    """
+
+    return [
+        MarkdownLinkInfo(
+            line_no=line_no,
+            url=m.group("url"),
+            text=m.group("text"),
+            title=m.group("title"),
+            attributes=m.group("attrs"),
+        )
+        for line_no, line in enumerate(lines, start=1)
+        for m in MARKDOWN_LINK_RE.finditer(line)
+    ]
+
+
+def _construct_markdown_link(
+    url: str, text: str, title: str | None, attributes: str | None, lang_code: str
+) -> str:
+    """
+    Construct a markdown link, adjusting the URL for the given language code if needed.
+
+    - `url` - link target; if it starts with TIANGOLO_COM the language code is
+      inserted right after that prefix
+    - `text` - link text
+    - `title` - optional title, emitted in double quotes after the URL
+    - `attributes` - optional attribute list content, emitted as `{...}`
+    - `lang_code` - language code to insert into TIANGOLO_COM URLs
+
+    Return the markdown source of the link.
+    """
+
+    if url.startswith(TIANGOLO_COM):
+        url = url.replace(TIANGOLO_COM, f"{TIANGOLO_COM}/{lang_code}")
+
+    if title:
+        link = f'[{text}]({url} "{title}")'
+    else:
+        link = f"[{text}]({url})"
+
+    if attributes:
+        # An inline attribute list only applies to the link when it follows the
+        # closing parenthesis directly, so no whitespace is put in between.
+        link += f"{{{attributes}}}"
+
+    return link
+
+
+def replace_markdown_links(
+    text: list[str], original_links: list[MarkdownLinkInfo], lang_code: str
+) -> list[str]:
+    """
+    Replace markdown links in the given text with the original links.
+
+    Links are paired with `original_links` in order of appearance. Each link keeps
+    its own (translated) text and title, and takes URL and attributes from the
+    original link. URLs are adjusted for the given language code.
+
+    Fail with ValueError if the number of links does not match the original.
+    """
+
+    modified_text: list[str] = []
+    link_index = 0
+    for line in text:
+        # Rebuild the line from the match positions. Replacing by matched text
+        # could hit the wrong occurrence when a line holds identical links.
+        pieces: list[str] = []
+        cursor = 0  # end of the previous link in `line`
+        for m in MARKDOWN_LINK_RE.finditer(line):
+            if link_index >= len(original_links):
+                raise ValueError(
+                    "Number of markdown links exceeds number of markdown links in the original document"
+                )
+            original_link_info = original_links[link_index]
+
+            replacement_link = _construct_markdown_link(
+                url=original_link_info["url"],
+                text=m.group("text"),
+                title=m.group("title"),
+                attributes=original_link_info["attributes"],
+                lang_code=lang_code,
+            )
+            pieces.append(line[cursor : m.start()])
+            pieces.append(replacement_link)
+            cursor = m.end()
+
+            link_index += 1
+        pieces.append(line[cursor:])
+        modified_text.append("".join(pieces))
+
+    if link_index < len(original_links):
+        raise ValueError(
+            "Number of markdown links is less than in the original document"
+        )
+
+    return modified_text
+
+
+# HTML links
+# -----------------------------------------------------------------------------------------
+
+
+def extract_html_links(lines: list[str]) -> list[HtmlLinkInfo]:
+    """
+    Collect all HTML `<a>` links from the given lines.
+
+    Return a list of HtmlLinkInfo dicts with the keys:
+    - `line_no` - line number (1-based)
+    - `full_tag` - full HTML link element
+    - `attributes` - list of HTMLLinkAttribute dicts (name, quote, value)
+    - `text` - link text
+    """
+
+    found = []
+    for line_no, line in enumerate(lines, start=1):
+        for element in HTML_LINK_RE.finditer(line):
+            full_tag = element.group(0)
+
+            text_match = HTML_LINK_TEXT.match(full_tag)
+            assert text_match is not None
+            open_tag_match = HTML_LINK_OPEN_TAG_RE.match(full_tag)
+            assert open_tag_match is not None
+
+            # Only quoted attributes of the opening tag are picked up
+            parsed_attributes: list[HTMLLinkAttribute] = [
+                HTMLLinkAttribute(name=name, quote=quote, value=value)
+                for name, quote, value in HTML_ATTR_RE.findall(open_tag_match.group(1))
+            ]
+            found.append(
+                HtmlLinkInfo(
+                    line_no=line_no,
+                    full_tag=full_tag,
+                    attributes=parsed_attributes,
+                    text=text_match.group(2),
+                )
+            )
+    return found
+
+
+def _construct_html_link(
+    link_text: str,
+    attributes: list[HTMLLinkAttribute],
+    lang_code: str,
+) -> str:
+    """
+    Build the HTML source of a link from its text and attributes.
+
+    The value of an `href` attribute that starts with TIANGOLO_COM gets the
+    language code inserted after that prefix; every other attribute is emitted
+    as given, with its original quote character.
+    """
+
+    rendered: list[str] = []
+    for attribute in attributes:
+        value = attribute["value"]
+        if attribute["name"] == "href" and value.startswith(TIANGOLO_COM):
+            value = value.replace(TIANGOLO_COM, f"{TIANGOLO_COM}/{lang_code}")
+        quote = attribute["quote"]
+        rendered.append(f"{attribute['name']}={quote}{value}{quote}")
+
+    attrs_str = " ".join(rendered)
+    return f"<a {attrs_str}>{link_text}</a>"
+
+
+def replace_html_links(
+    text: list[str], original_links: list[HtmlLinkInfo], lang_code: str
+) -> list[str]:
+    """
+    Replace HTML links in the given text with the links from the original document.
+
+    Links are paired with `original_links` in order of appearance. Each link keeps
+    its own (translated) text and takes all attributes from the original link.
+    URLs are adjusted for the given language code.
+    Return a new list; `text` is not modified.
+
+    Fail with ValueError if the number of links does not match the original.
+    """
+
+    links = extract_html_links(text)
+    if len(links) > len(original_links):
+        raise ValueError(
+            "Number of HTML links exceeds number of HTML links in the original document"
+        )
+    elif len(links) < len(original_links):
+        raise ValueError("Number of HTML links is less than in the original document")
+
+    # Per line index: the fragments rebuilt so far, and the offset in the
+    # untouched line up to which those fragments account for.
+    rebuilt_parts: dict[int, list[str]] = {}
+    consumed: dict[int, int] = {}
+    for link, original_link_info in zip(links, original_links):
+        line_index = link["line_no"] - 1
+        line = text[line_index]
+        search_from = consumed.get(line_index, 0)
+        # Look for the tag only behind the previous link of this line, in the
+        # untouched line. Replacing the first textual occurrence in the already
+        # modified line could hit the wrong link when tags are identical.
+        start = line.find(link["full_tag"], search_from)
+
+        replacement_link = _construct_html_link(
+            link_text=link["text"],
+            attributes=original_link_info["attributes"],
+            lang_code=lang_code,
+        )
+        rebuilt_parts.setdefault(line_index, []).extend(
+            [line[search_from:start], replacement_link]
+        )
+        consumed[line_index] = start + len(link["full_tag"])
+
+    modified_text = text.copy()
+    for line_index, parts in rebuilt_parts.items():
+        remainder = text[line_index][consumed[line_index] :]
+        modified_text[line_index] = "".join(parts) + remainder
+
+    return modified_text
--- a/scripts/translation_fixer.py
+++ b/scripts/translation_fixer.py
@ -0,0 +1,86 @@
+from pathlib import Path
+from typing import Annotated
+
+import typer
+
+from scripts.doc_parsing_utils import (
+    extract_code_includes,
+    extract_header_permalinks,
+    extract_html_links,
+    extract_markdown_links,
+    replace_code_includes_with_placeholders,
+    replace_header_permalinks,
+    replace_html_links,
+    replace_markdown_links,
+    replace_placeholders_with_code_includes,
+)
+
+# Typer application; the callback and the command below are registered on it via decorators.
+cli = typer.Typer()
+
+
+# Registered as the Typer app callback; it does nothing itself.
+# NOTE(review): presumably present so that `fix_pages` stays an explicit
+# sub-command even though it is the only command - confirm against Typer docs.
+@cli.callback()
+def callback():
+    pass
+
+
+@cli.command()
+def fix_pages(
+    doc_paths: Annotated[
+        list[Path],
+        typer.Argument(help="List of paths to documents."),
+    ],
+):
+    """
+    Fix code includes, header permalinks and links in translated documents,
+    taking the English documents as the source of truth.
+    """
+    for path in doc_paths:
+        # The second path component is taken as the language code
+        lang_code = path.parts[1]
+        if lang_code == "en":
+            print(f"Skipping English document: {path}")
+            continue
+
+        # English counterpart: same relative path below docs/en
+        en_doc_path = Path("docs") / "en" / Path(*path.parts[2:])
+
+        doc_lines = path.read_text(encoding="utf-8").splitlines()
+        en_doc_lines = en_doc_path.read_text(encoding="utf-8").splitlines()
+
+        # Fix code includes
+        en_code_includes = extract_code_includes(en_doc_lines)
+        # Pass a copy: the helper may modify the list it receives, and `doc_lines`
+        # has to stay untouched for the comparison below to be meaningful.
+        doc_lines_with_placeholders = replace_code_includes_with_placeholders(
+            list(doc_lines)
+        )
+        fixed_doc_lines = replace_placeholders_with_code_includes(
+            doc_lines_with_placeholders, en_code_includes
+        )
+        if fixed_doc_lines != doc_lines:
+            print(f"Fixing code includes in: {path}")
+        doc_lines = fixed_doc_lines
+
+        # Fix permalinks
+        en_permalinks = extract_header_permalinks(en_doc_lines)
+        fixed_doc_lines = replace_header_permalinks(doc_lines, en_permalinks)
+        if fixed_doc_lines != doc_lines:
+            print(f"Fixing header permalinks in: {path}")
+        doc_lines = fixed_doc_lines
+
+        # Fix markdown links
+        en_markdown_links = extract_markdown_links(en_doc_lines)
+        fixed_doc_lines = replace_markdown_links(
+            doc_lines, en_markdown_links, lang_code
+        )
+        if fixed_doc_lines != doc_lines:
+            print(f"Fixing markdown links in: {path}")
+        doc_lines = fixed_doc_lines
+
+        # Fix HTML links
+        en_html_links = extract_html_links(en_doc_lines)
+        fixed_doc_lines = replace_html_links(doc_lines, en_html_links, lang_code)
+        if fixed_doc_lines != doc_lines:
+            print(f"Fixing HTML links in: {path}")
+        doc_lines = fixed_doc_lines
+
+        # Fix multiline code blocks
+        # TODO: Implement
+
+        # Write back the fixed document
+        doc_lines.append("")  # Ensure file ends with a newline
+        path.write_text("\n".join(doc_lines), encoding="utf-8")
+
+
+# Run the Typer application when the file is executed as a script.
+if __name__ == "__main__":
+    cli()