🛠️ Add `add-permalinks` and `add-permalinks-page` to `scripts/docs.py` (#14033)

Co-authored-by: Sebastián Ramírez <[email protected]>
8 months ago · 32b375c5e4
2 changed files with 107 additions and 3 deletions
--- a/requirements-docs.txt
+++ b/requirements-docs.txt
@ -18,3 +18,4 @@ griffe-warnings-deprecated==1.1.0
 black==25.1.0
 mkdocs-macros-plugin==1.4.1
 markdown-include-variants==0.0.5
+python-slugify==8.0.4
--- a/scripts/docs.py
+++ b/scripts/docs.py
@ -4,6 +4,7 @@ import os
 import re
 import shutil
 import subprocess
+from html.parser import HTMLParser
 from http.server import HTTPServer, SimpleHTTPRequestHandler
 from multiprocessing import Pool
 from pathlib import Path
@ -14,6 +15,7 @@ import typer
 import yaml
 from jinja2 import Template
 from ruff.__main__ import find_ruff_bin
+from slugify import slugify as py_slugify

 logging.basicConfig(level=logging.INFO)

@ -25,8 +27,8 @@ missing_translation_snippet = """
 {!../../docs/missing-translation.md!}
 """

-non_translated_sections = [
-    "reference/",
+non_translated_sections = (
+    f"reference{os.sep}",
    "release-notes.md",
    "fastapi-people.md",
    "external-links.md",
@ -34,7 +36,7 @@ non_translated_sections = [
    "management-tasks.md",
    "management.md",
    "contributing.md",
-]
+)

 docs_path = Path("docs")
 en_docs_path = Path("docs/en")
@ -42,7 +44,39 @@ en_config_path: Path = en_docs_path / mkdocs_name
 site_path = Path("site").absolute()
 build_site_path = Path("site_build").absolute()

+# Markdown header line: 1-6 `#`, one space, the title, then an optional
+# `{ #permalink }` attribute block. Groups: (hashes, title, permalink); the
+# permalink group starts with `#` and is None when the block is absent.
+header_pattern = re.compile(r"^(#{1,6}) (.+?)(?:\s*\{\s*(#.*)\s*\})?\s*$")
+# Same shape, but the `{ #... }` block is required; group 3 is the whole block
+# including its leading whitespace and braces.
 header_with_permalink_pattern = re.compile(r"^(#{1,6}) (.+?)(\s*\{\s*#.*\s*\})\s*$")
+# Line starting (after optional whitespace) with three backticks.
+# Note: a line starting with four backticks matches this pattern as well.
+code_block3_pattern = re.compile(r"^\s*```")
+# Line starting (after optional whitespace) with four backticks.
+code_block4_pattern = re.compile(r"^\s*````")
+
+
+class VisibleTextExtractor(HTMLParser):
+    """Extract the text content from a string that may contain HTML tags.
+
+    Tags are discarded and only the character data between them is kept.
+    A single instance can be reused for any number of strings.
+    """
+
+    def __init__(self):
+        super().__init__()
+        # Text fragments collected by `handle_data` for the current input.
+        self.text_parts = []
+
+    def handle_data(self, data):
+        """Collect one run of character data reported by the parser."""
+        self.text_parts.append(data)
+
+    def extract_visible_text(self, html: str) -> str:
+        """Return the text of `html` without tags and surrounding whitespace."""
+        self.reset()
+        self.text_parts = []
+        self.feed(html)
+        # `feed()` alone keeps trailing input buffered while it could still be
+        # the start of a character reference (e.g. "R&D" ends in "&D"),
+        # waiting for more data. `close()` marks the input as complete so the
+        # buffered text is delivered to `handle_data` instead of being lost.
+        self.close()
+        return "".join(self.text_parts).strip()
+
+
+def slugify(text: str) -> str:
+    """Slugify `text` with `py_slugify`.
+
+    Backticks, bold markers and the apostrophes in `'s` / `'t` are passed as
+    replacement rules so they are dropped rather than treated as separators.
+    """
+    cleanup_rules = [
+        ("`", ""),  # `dict`s -> dicts
+        ("'s", "s"),  # it's -> its
+        ("'t", "t"),  # don't -> dont
+        ("**", ""),  # **FastAPI**s -> FastAPIs
+    ]
+    return py_slugify(text, replacements=cleanup_rules)


 def get_en_config() -> Dict[str, Any]:
@ -426,5 +460,74 @@ def generate_docs_src_versions_for_file(file_path: Path) -> None:
        version_file.write_text(content_format, encoding="utf-8")


+def _existing_permalink_id(permalink: str) -> str:
+    """Return the bare anchor id from a captured `#id ...` attribute string."""
+    parts = permalink.split()
+    return parts[0][1:] if parts else ""
+
+
+@app.command()
+def add_permalinks_page(path: Path, update_existing: bool = False):
+    """
+    Add or update header permalinks in a specific page of the En docs.
+
+    Headers inside fenced code blocks are ignored. Headers that already have
+    a permalink are left untouched unless `update_existing` is set, in which
+    case every permalink in the page is regenerated from its header title.
+    Pages in non-translated sections are skipped. Raises `RuntimeError` if
+    `path` is not inside the En docs content directory.
+    """
+
+    docs_root = en_docs_path / "docs"
+    if not path.is_relative_to(docs_root):
+        # Report the directory that is actually checked.
+        raise RuntimeError(f"Path must be inside {docs_root}")
+    rel_path = path.relative_to(docs_root)
+
+    # Skip excluded sections
+    if str(rel_path).startswith(non_translated_sections):
+        return
+
+    with path.open("r", encoding="utf-8") as f:
+        lines = f.readlines()
+
+    # First pass: for every line, record the header match, or None when the
+    # line is not a header or lies inside a fenced code block.
+    header_matches = []
+    in_code_block3 = False
+    in_code_block4 = False
+    for line in lines:
+        # Handle codeblocks start and end. The four-backtick pattern is
+        # checked first because such lines match the three-backtick one too.
+        if not (in_code_block3 or in_code_block4):
+            if code_block4_pattern.match(line):
+                in_code_block4 = True
+            elif code_block3_pattern.match(line):
+                in_code_block3 = True
+        else:
+            if in_code_block4 and code_block4_pattern.match(line):
+                in_code_block4 = False
+            elif in_code_block3 and code_block3_pattern.match(line):
+                in_code_block3 = False
+
+        if in_code_block3 or in_code_block4:
+            header_matches.append(None)
+        else:
+            header_matches.append(header_pattern.match(line))
+
+    # Permalinks that are kept as-is must be reserved up front, otherwise a
+    # generated slug could duplicate an anchor that already exists anywhere
+    # in the page. When updating, everything is regenerated instead.
+    permalinks = set()
+    if not update_existing:
+        for match in header_matches:
+            if match and match.group(3):
+                anchor = _existing_permalink_id(match.group(3))
+                if anchor:
+                    permalinks.add(anchor)
+
+    # Second pass: generate the missing (or all) permalinks.
+    visible_text_extractor = VisibleTextExtractor()
+    updated_lines = []
+    for line, match in zip(lines, header_matches):
+        if match:
+            hashes, title, _permalink = match.groups()
+            if (not _permalink) or update_existing:
+                original_slug = slugify(
+                    visible_text_extractor.extract_visible_text(title)
+                )
+                slug = original_slug
+                # If the slug is already used, append a number to make it unique
+                count = 1
+                while slug in permalinks:
+                    slug = f"{original_slug}_{count}"
+                    count += 1
+                permalinks.add(slug)
+
+                line = f"{hashes} {title} {{ #{slug} }}\n"
+
+        updated_lines.append(line)
+
+    # Leave the file untouched when no header needed a change.
+    if updated_lines != lines:
+        with path.open("w", encoding="utf-8") as f:
+            f.writelines(updated_lines)
+
+
+@app.command()
+def add_permalinks(update_existing: bool = False) -> None:
+    """
+    Add or update header permalinks in all pages of En docs.
+
+    Every Markdown file found below the En docs content directory is passed
+    to `add_permalinks_page` with the same `update_existing` setting.
+    """
+    # Only walk the directory `add_permalinks_page` accepts: it raises
+    # `RuntimeError` for any path outside of it, which would abort the run.
+    for md_file in (en_docs_path / "docs").rglob("*.md"):
+        add_permalinks_page(md_file, update_existing=update_existing)
+
+
+# Run the CLI when executed as a script (`app` is defined outside this
+# excerpt; presumably the Typer application the commands are registered on).
 if __name__ == "__main__":
     app()