Fix links, permalinks, code includes

6 months ago · 0339277673
2 changed files with 530 additions and 0 deletions
--- a/scripts/doc_parsing_utils.py
+++ b/scripts/doc_parsing_utils.py
@@ -0,0 +1,444 @@
 import re
 from typing import TypedDict
 # A whole line of the form `{* <token> <rest> *}`.
 # Group 1 is the first whitespace-free token, group 2 everything after it.
 CODE_INCLUDE_RE = re.compile(r"^\{\*\s*(\S+)\s*(.*)\*\}$")
 # Stand-in line that temporarily takes the place of a code include.
 CODE_INCLUDE_PLACEHOLDER = "<CODE_INCLUDE>"
 # Markdown header: 1-6 `#`, one space, title, optional trailing `{#...}` block.
 # Groups: hashes, title, the whole `{#...}` block including its leading
 # whitespace (None when the header has no such block).
 HEADER_WITH_PERMALINK_RE = re.compile(r"^(#{1,6}) (.+?)(\s*\{\s*#.*\s*\})?\s*$")
 # Same header shape, but group 3 captures only the `#...` part inside the braces.
 HEADER_LINE_RE = re.compile(r"^(#{1,6}) (.+?)(?:\s*\{\s*(#.*)\s*\})?\s*$")
 # URLs starting with this prefix get a language-code path segment added when
 # links are reconstructed.
 TIANGOLO_COM = "https://fastapi.tiangolo.com"
 MARKDOWN_LINK_RE = re.compile(
    r"(?<!\\)(?<!\!)"  # not an image ![...] and not escaped \[...]
    r"\[(?P<text>.*?)\]"  # link text (non-greedy)
    r"\("
    r"(?P<url>[^)\s]+)"  # url (no spaces and `)`)
    r'(?:\s+["\'](?P<title>.*?)["\'])?'  # optional title in "" or ''
    r"\)"
    r"(?:\s*\{(?P<attrs>[^}]*)\})?"  # optional attributes in {}
 )
 # A complete `<a ...>...</a>` element; whitespace is required after `<a`.
 HTML_LINK_RE = re.compile(r"<a\s+[^>]*>.*?</a>")
 # Splits a link element into its raw attribute string (group 1) and text (group 2).
 HTML_LINK_TEXT = re.compile(r"<a\b([^>]*)>(.*?)</a>")
 # The opening `<a ...>` tag only; group 1 is the raw attribute string.
 HTML_LINK_OPEN_TAG_RE = re.compile(r"<a\b([^>]*)>")
 # One quoted attribute: name, quote character, value.
 # The name may contain `-`, `:` and `.` so that attributes such as `data-x` or
 # `aria-label` are captured whole instead of being cut down to their last word.
 HTML_ATTR_RE = re.compile(r'([\w:.-]+)\s*=\s*([\'"])(.*?)\2')
 class CodeIncludeInfo(TypedDict):
    """A code include line located in a document."""

    # 1-based number of the line within the document
    line_no: int
    # full text of the include line
    line: str
 class HeaderPermalinkInfo(TypedDict):
    line_no: int
    hashes: str
    permalink: str
 class MarkdownLinkInfo(TypedDict):
    """A Markdown link `[text](url "title"){attributes}` located in a document."""

    # 1-based number of the line that contains the link
    line_no: int
    # link target as written between the parentheses
    url: str
    # text between the square brackets
    text: str
    # quoted title after the URL, or None when the link has none
    title: str | None
    # content between the trailing `{` and `}`, or None when there is no such block
    attributes: str | None
 class HTMLLinkAttribute(TypedDict):
    """One quoted attribute of an HTML `<a>` tag."""

    # attribute name, e.g. "href"
    name: str
    # quote character that wrapped the value (`'` or `"`)
    quote: str
    # attribute value without the surrounding quotes
    value: str
 class HtmlLinkInfo(TypedDict):
    """An HTML `<a ...>...</a>` element located in a document."""

    # 1-based number of the line that contains the element
    line_no: int
    # the complete element text, from `<a` through `</a>`
    full_tag: str
    # quoted attributes parsed from the opening tag, in order of appearance
    attributes: list[HTMLLinkAttribute]
    # content between the opening and the closing tag
    text: str
 # Code includes
 # -----------------------------------------------------------------------------------------
 def extract_code_includes(lines: list[str]) -> list[CodeIncludeInfo]:
    """
    Collect every line that is a code include.

    Return a list of CodeIncludeInfo dicts, each holding:
    - `line_no` - line number (1-based)
    - `line` - text of the line
    """
    return [
        CodeIncludeInfo(line_no=position, line=content)
        for position, content in enumerate(lines, start=1)
        if CODE_INCLUDE_RE.match(content)
    ]
 def replace_code_includes_with_placeholders(text: list[str]) -> list[str]:
    """
    Return a copy of `text` with every code include line replaced by
    CODE_INCLUDE_PLACEHOLDER.

    The list passed in is left untouched, so callers can still compare the
    result against the lines they started with.
    """
    modified_text = list(text)
    for include in extract_code_includes(text):
        # `line_no` is 1-based, list indices are 0-based.
        modified_text[include["line_no"] - 1] = CODE_INCLUDE_PLACEHOLDER
    return modified_text
 def replace_placeholders_with_code_includes(
    text: list[str], original_includes: list[CodeIncludeInfo]
 ) -> list[str]:
    """
    Put the code includes of the original (English) document back in place of
    the placeholder lines, in order of appearance.

    A line counts as a placeholder when, stripped of surrounding whitespace,
    it equals CODE_INCLUDE_PLACEHOLDER.

    Raise ValueError when the number of placeholders differs from the number
    of original includes.
    """
    total = len(original_includes)
    consumed = 0
    result: list[str] = []
    for current in text:
        if current.strip() != CODE_INCLUDE_PLACEHOLDER:
            result.append(current)
            continue
        if consumed >= total:
            raise ValueError(
                "Number of placeholders exceeds number of code includes in the original document"
            )
        result.append(original_includes[consumed]["line"])
        consumed += 1
    if consumed < total:
        raise ValueError(
            "Number of placeholders is less than number of code includes in the original document"
        )
    return result
 # Header permalinks
 # -----------------------------------------------------------------------------------------
 def extract_header_permalinks(lines: list[str]) -> list[HeaderPermalinkInfo]:
    """
    Collect the headers found outside fenced code blocks.

    Return a list of HeaderPermalinkInfo dicts, each holding:
    - `line_no` - line number (1-based)
    - `hashes` - string of hashes representing header level (e.g., "###")
    - `permalink` - the trailing `{#...}` block as matched, or None when the
      header has none

    A fence opened by exactly three backticks is closed only by a line that
    starts with exactly three backticks; a fence opened by four or more is
    closed by any line that starts with four or more.
    """
    found: list[HeaderPermalinkInfo] = []
    # 0 = outside any fence, 3 = inside a ``` fence, 4 = inside a ````(+) fence
    fence = 0
    for idx, raw in enumerate(lines, start=1):
        ticks = len(raw) - len(raw.lstrip("`"))
        if fence == 3:
            if ticks == 3:
                fence = 0
            continue
        if fence == 4:
            if ticks >= 4:
                fence = 0
            continue
        if ticks >= 3:
            # The opening fence line itself is never a header.
            fence = 3 if ticks == 3 else 4
            continue
        matched = HEADER_WITH_PERMALINK_RE.match(raw)
        if matched is None:
            continue
        level, _heading, anchor = matched.groups()
        found.append(HeaderPermalinkInfo(hashes=level, line_no=idx, permalink=anchor))
    return found
 def remove_header_permalinks(lines: list[str]) -> list[str]:
    """
    Strip the trailing `{#...}` permalink from header lines.

    Every line is tested against the header pattern; a matching line is
    rebuilt as "<hashes> <title>", any other line is passed through as is.
    """

    def _without_permalink(raw: str) -> str:
        found = HEADER_WITH_PERMALINK_RE.match(raw)
        if found is None:
            return raw
        return f"{found.group(1)} {found.group(2)}"

    return [_without_permalink(raw) for raw in lines]
 def replace_header_permalinks(
    text: list[str], original_permalinks: list[HeaderPermalinkInfo]
 ) -> list[str]:
    """
    Replace permalinks in the given text with the permalinks from the original document.

    Headers are located with `extract_header_permalinks`, so lines inside
    fenced code blocks (e.g. `# comment` lines) are never treated as headers.
    Each header is rebuilt as "<hashes> <title><original permalink>"; when the
    original header has no permalink, nothing is appended.

    Return a new list; `text` is not modified.

    Raise ValueError if the number or the levels of the headers do not match
    the original.
    """
    modified_text = list(text)
    expected = len(original_permalinks)
    permalink_index = 0
    for header in extract_header_permalinks(text):
        if permalink_index >= expected:
            raise ValueError(
                "Number of headers exceeds number of headers in the original document"
            )
        original_permalink_info = original_permalinks[permalink_index]
        hashes = header["hashes"]
        if original_permalink_info["hashes"] != hashes:
            raise ValueError(
                "Header levels do not match between document and original document"
            )
        line_index = header["line_no"] - 1
        header_match = HEADER_LINE_RE.match(text[line_index])
        # Both header patterns accept exactly the same lines, so this always matches.
        assert header_match is not None
        title = header_match.group(2)
        # `permalink` is None for headers without a `{#...}` block; formatting
        # None directly would append the literal text "None".
        permalink = original_permalink_info["permalink"] or ""
        modified_text[line_index] = f"{hashes} {title}{permalink}"
        permalink_index += 1
    if permalink_index < expected:
        raise ValueError(
            "Number of headers is less than number of headers in the original document"
        )
    return modified_text
 # Markdown links
 # -----------------------------------------------------------------------------------------
 def extract_markdown_links(lines: list[str]) -> list[MarkdownLinkInfo]:
    """
    Extract all markdown links from the given lines.

    Return a list of MarkdownLinkInfo dicts, each holding:
    - `line_no` - line number (1-based)
    - `url` - link URL
    - `text` - link text
    - `title` - link title, or None when the link has none
    - `attributes` - content of the trailing `{...}` block, or None when absent

    Links are reported in document order: line by line, left to right.
    """
    links: list[MarkdownLinkInfo] = []
    for line_no, line in enumerate(lines, start=1):
        for m in MARKDOWN_LINK_RE.finditer(line):
            links.append(
                MarkdownLinkInfo(
                    line_no=line_no,
                    url=m.group("url"),
                    text=m.group("text"),
                    title=m.group("title"),
                    attributes=m.group("attrs"),
                )
            )
    return links
 def _construct_markdown_link(
    url: str, text: str, title: str | None, attributes: str | None, lang_code: str
 ) -> str:
    """
    Construct a markdown link, adjusting the URL for the given language code if needed.

    - `url` - link target; when it starts with TIANGOLO_COM, `/<lang_code>` is
      inserted right after that prefix
    - `text` - link text placed between the square brackets
    - `title` - optional title; omitted when None or empty
    - `attributes` - optional content for a trailing `{...}` block
    - `lang_code` - language code inserted into matching URLs

    Return the assembled link as a string.
    """
    if url.startswith(TIANGOLO_COM):
        # Rewrite the leading host only; the same text further inside the URL
        # (e.g. in a query string) must stay as it is.
        url = f"{TIANGOLO_COM}/{lang_code}{url.removeprefix(TIANGOLO_COM)}"
    if title:
        # Wrap in single quotes when the title itself contains a double quote.
        quote = "'" if '"' in title else '"'
        link = f"[{text}]({url} {quote}{title}{quote})"
    else:
        link = f"[{text}]({url})"
    if attributes:
        # No space before the braces: attribute lists (attr_list-style Markdown
        # extensions) only apply when they directly follow the link.
        link += f"{{{attributes}}}"
    return link
 def replace_markdown_links(
    text: list[str], original_links: list[MarkdownLinkInfo], lang_code: str
 ) -> list[str]:
    """
    Replace markdown links in the given text with the original links.

    For the n-th link of `text`, the link text and title found in `text` are
    kept while the URL and attributes are taken from the n-th entry of
    `original_links` (the URL is adjusted for `lang_code`).

    Substitution is done at the position of each match, so several identical
    links on one line are each replaced by their own counterpart.

    Return a new list of lines.

    Raise ValueError if the number of links does not match the original.
    """
    expected = len(original_links)
    link_index = 0

    def _swap(m: re.Match[str]) -> str:
        nonlocal link_index
        if link_index >= expected:
            raise ValueError(
                "Number of markdown links exceeds number of markdown links in the original document"
            )
        original_link_info = original_links[link_index]
        link_index += 1
        return _construct_markdown_link(
            url=original_link_info["url"],
            text=m.group("text"),
            title=m.group("title"),
            attributes=original_link_info["attributes"],
            lang_code=lang_code,
        )

    # `sub` visits the same non-overlapping matches, left to right, that
    # `finditer` reports, and replaces each one exactly where it was found.
    modified_text = [MARKDOWN_LINK_RE.sub(_swap, line) for line in text]
    if link_index < expected:
        raise ValueError(
            "Number of markdown links is less than in the original document"
        )
    return modified_text
 # HTML links
 # -----------------------------------------------------------------------------------------
 def extract_html_links(lines: list[str]) -> list[HtmlLinkInfo]:
    """
    Collect every HTML `<a ...>...</a>` element found in the given lines.

    Return a list of HtmlLinkInfo dicts, each holding:
    - `line_no` - line number (1-based)
    - `full_tag` - full HTML link tag
    - `attributes` - list of HTMLLinkAttribute dicts (name, quote, value)
    - `text` - link text

    Only attributes with a quoted value are recorded.
    """
    found = []
    for idx, content in enumerate(lines, start=1):
        for tag_match in HTML_LINK_RE.finditer(content):
            whole_tag = tag_match.group(0)
            # One match yields both the raw attribute string and the link text.
            pieces = HTML_LINK_TEXT.match(whole_tag)
            assert pieces is not None
            raw_attrs, inner_text = pieces.group(1), pieces.group(2)
            parsed: list[HTMLLinkAttribute] = [
                HTMLLinkAttribute(name=attr_name, quote=attr_quote, value=attr_value)
                for attr_name, attr_quote, attr_value in HTML_ATTR_RE.findall(raw_attrs)
            ]
            found.append(
                HtmlLinkInfo(
                    line_no=idx,
                    full_tag=whole_tag,
                    attributes=parsed,
                    text=inner_text,
                )
            )
    return found
 def _construct_html_link(
    link_text: str,
    attributes: list[HTMLLinkAttribute],
    lang_code: str,
 ) -> str:
    """
    Reconstruct HTML link, adjusting the URL for the given language code if needed.

    - `link_text` - content placed between the opening and closing tag
    - `attributes` - attributes to emit, in order, each with its own quote character
    - `lang_code` - language code inserted into `href` values that start with
      TIANGOLO_COM

    Return the element as `<a name="value" ...>text</a>`.
    """
    rendered: list[str] = []
    for attribute in attributes:
        value = attribute["value"]
        if attribute["name"] == "href" and value.startswith(TIANGOLO_COM):
            # Rewrite the leading host only; the same text further inside the
            # URL (e.g. in a query string) must stay as it is.
            value = f"{TIANGOLO_COM}/{lang_code}{value.removeprefix(TIANGOLO_COM)}"
        quote = attribute["quote"]
        rendered.append(f"{attribute['name']}={quote}{value}{quote}")
    attrs_str = " ".join(rendered)
    return f"<a {attrs_str}>{link_text}</a>"
 def replace_html_links(
    text: list[str], original_links: list[HtmlLinkInfo], lang_code: str
 ) -> list[str]:
    """
    Replace HTML links in the given text with the links from the original document.

    For the n-th link of `text`, the link text found in `text` is kept while
    the attributes come from the n-th entry of `original_links`; URLs are
    adjusted for the given language code.

    Substitution is done at the position of each match, so several identical
    links on one line are each replaced by their own counterpart.

    Return a new list of lines; `text` is not modified.

    Raise ValueError if the number of links does not match the original.
    """
    links = extract_html_links(text)
    if len(links) > len(original_links):
        raise ValueError(
            "Number of HTML links exceeds number of HTML links in the original document"
        )
    if len(links) < len(original_links):
        raise ValueError("Number of HTML links is less than in the original document")

    # `extract_html_links` walks the lines in order and uses the same pattern,
    # so the pairs line up one-to-one with the matches `sub` visits below.
    pairs = iter(zip(links, original_links))

    def _swap(_match: re.Match[str]) -> str:
        link, original_link_info = next(pairs)
        return _construct_html_link(
            link_text=link["text"],
            attributes=original_link_info["attributes"],
            lang_code=lang_code,
        )

    return [HTML_LINK_RE.sub(_swap, line) for line in text]
--- a/scripts/translation_fixer.py
+++ b/scripts/translation_fixer.py
@@ -0,0 +1,86 @@
 from pathlib import Path
 from typing import Annotated
 import typer
 from scripts.doc_parsing_utils import (
    extract_code_includes,
    extract_header_permalinks,
    extract_html_links,
    extract_markdown_links,
    replace_code_includes_with_placeholders,
    replace_header_permalinks,
    replace_html_links,
    replace_markdown_links,
    replace_placeholders_with_code_includes,
 )
 cli = typer.Typer()
@cli.callback()
 def callback():
    """
    No-op callback registered on the Typer app.

    NOTE(review): presumably present so that `fix_pages` remains an explicit
    subcommand although it is the only command - confirm against Typer docs.
    """
    pass
@cli.command()
def fix_pages(
    doc_paths: Annotated[
        list[Path],
        typer.Argument(help="List of paths to documents."),
    ],
):
    """
    Restore code includes, header permalinks and links of translated documents
    from their English originals.

    Each path is expected to look like `docs/<lang>/<relative path>`: the second
    path component is used as the language code and the remainder is looked up
    under `docs/en/`. English documents are skipped. Every other document is
    rewritten in place.

    A ValueError raised by one of the fixing steps (counts or header levels
    differ from the English document) is not caught here.
    """
    for path in doc_paths:
        lang_code = path.parts[1]
        if lang_code == "en":
            print(f"Skipping English document: {path}")
            continue
        en_doc_path = Path("docs") / "en" / Path(*path.parts[2:])
        doc_lines = path.read_text(encoding="utf-8").splitlines()
        en_doc_lines = en_doc_path.read_text(encoding="utf-8").splitlines()

        # Fix code includes
        en_code_includes = extract_code_includes(en_doc_lines)
        # Hand over a copy: the helper may overwrite entries of the list it is
        # given, and `doc_lines` must stay intact for the comparison below.
        doc_lines_with_placeholders = replace_code_includes_with_placeholders(
            list(doc_lines)
        )
        fixed_doc_lines = replace_placeholders_with_code_includes(
            doc_lines_with_placeholders, en_code_includes
        )
        if fixed_doc_lines != doc_lines:
            print(f"Fixing code includes in: {path}")
        doc_lines = fixed_doc_lines

        # Fix permalinks
        en_permalinks = extract_header_permalinks(en_doc_lines)
        fixed_doc_lines = replace_header_permalinks(doc_lines, en_permalinks)
        if fixed_doc_lines != doc_lines:
            print(f"Fixing header permalinks in: {path}")
        doc_lines = fixed_doc_lines

        # Fix markdown links
        en_markdown_links = extract_markdown_links(en_doc_lines)
        fixed_doc_lines = replace_markdown_links(
            doc_lines, en_markdown_links, lang_code
        )
        if fixed_doc_lines != doc_lines:
            print(f"Fixing markdown links in: {path}")
        doc_lines = fixed_doc_lines

        # Fix HTML links
        en_html_links = extract_html_links(en_doc_lines)
        fixed_doc_lines = replace_html_links(doc_lines, en_html_links, lang_code)
        if fixed_doc_lines != doc_lines:
            print(f"Fixing HTML links in: {path}")
        doc_lines = fixed_doc_lines

        # Fix multiline code blocks
        # TODO: Implement

        # Write back the fixed document; the empty last element makes the
        # joined text end with a newline.
        path.write_text("\n".join([*doc_lines, ""]), encoding="utf-8")
 if __name__ == "__main__":
    cli()