2 changed files with 530 additions and 0 deletions
@ -0,0 +1,444 @@ |
|||||
|
import re |
||||
|
from typing import TypedDict |
||||
|
|
||||
|
# A whole line of the form `{* <token> <rest> *}`.
# Group 1 is the first whitespace-free token, group 2 is everything after it.
CODE_INCLUDE_RE = re.compile(r"^\{\*\s*(\S+)\s*(.*)\*\}$")
# Text written in place of a code-include line while a document is processed.
CODE_INCLUDE_PLACEHOLDER = "<CODE_INCLUDE>"

# Header line with 1-6 hashes. Groups: hashes, title, and the optional
# `{#...}` suffix *including* its leading whitespace and braces
# (None when the header has no such suffix).
HEADER_WITH_PERMALINK_RE = re.compile(r"^(#{1,6}) (.+?)(\s*\{\s*#.*\s*\})?\s*$")
# Same header shape, but the optional third group captures only the `#...`
# text found inside the braces.
HEADER_LINE_RE = re.compile(r"^(#{1,6}) (.+?)(?:\s*\{\s*(#.*)\s*\})?\s*$")

# URL prefix that gets a language code inserted after it when links are rebuilt.
TIANGOLO_COM = "https://fastapi.tiangolo.com"

# Inline markdown link `[text](url "title"){attrs}`; title and attrs are optional.
MARKDOWN_LINK_RE = re.compile(
    r"(?<!\\)(?<!\!)"  # not an image ![...] and not escaped \[...]
    r"\[(?P<text>.*?)\]"  # link text (non-greedy)
    r"\("
    r"(?P<url>[^)\s]+)"  # url (no spaces and `)`)
    r'(?:\s+["\'](?P<title>.*?)["\'])?'  # optional title in "" or ''
    r"\)"
    r"(?:\s*\{(?P<attrs>[^}]*)\})?"  # optional attributes in {}
)

# A complete `<a ...>...</a>` element (requires at least one whitespace after `<a`).
HTML_LINK_RE = re.compile(r"<a\s+[^>]*>.*?</a>")
# Group 1: raw attribute text of the opening tag; group 2: the link's inner text.
HTML_LINK_TEXT = re.compile(r"<a\b([^>]*)>(.*?)</a>")
# Group 1: raw attribute text of the opening `<a ...>` tag.
HTML_LINK_OPEN_TAG_RE = re.compile(r"<a\b([^>]*)>")
# One quoted attribute: name, quote character, value (closing quote must match).
HTML_ATTR_RE = re.compile(r'(\w+)\s*=\s*([\'"])(.*?)\2')
||||
|
|
||||
|
|
||||
|
class CodeIncludeInfo(TypedDict):
    """Position and raw text of one code-include line in a document."""

    line_no: int  # 1-based line number
    line: str  # full text of the line
||||
|
|
||||
|
|
||||
|
class HeaderPermalinkInfo(TypedDict):
    """Level and permalink suffix of one markdown header."""

    line_no: int  # 1-based line number of the header
    hashes: str  # leading hashes giving the header level, e.g. "###"
    # The `{#...}` suffix as matched by HEADER_WITH_PERMALINK_RE (leading
    # whitespace included).
    # NOTE(review): that regex group is optional, so producers that store the
    # raw group may put None here for headers without a permalink - confirm.
    permalink: str
||||
|
|
||||
|
|
||||
|
class MarkdownLinkInfo(TypedDict):
    """Parts of one inline markdown link as captured by MARKDOWN_LINK_RE."""

    line_no: int  # 1-based line number the link was found on
    url: str  # link target
    text: str  # text between the square brackets
    title: str | None  # quoted title after the URL, None if absent
    attributes: str | None  # content of the trailing `{...}`, None if absent
||||
|
|
||||
|
|
||||
|
class HTMLLinkAttribute(TypedDict):
    """One quoted attribute of an `<a>` tag, as captured by HTML_ATTR_RE."""

    name: str  # attribute name, e.g. "href"
    quote: str  # quote character used around the value (' or ")
    value: str  # attribute value without the surrounding quotes
||||
|
|
||||
|
|
||||
|
class HtmlLinkInfo(TypedDict):
    """One `<a ...>...</a>` element found in a document line."""

    line_no: int  # 1-based line number the element was found on
    full_tag: str  # the whole matched element, opening tag through `</a>`
    attributes: list[HTMLLinkAttribute]  # quoted attributes of the opening tag, in order
    text: str  # text between the opening and closing tags
||||
|
|
||||
|
|
||||
|
# Code includes |
||||
|
# ----------------------------------------------------------------------------------------- |
||||
|
|
||||
|
|
||||
|
def extract_code_includes(lines: list[str]) -> list[CodeIncludeInfo]:
    """
    Collect every line that is a code include directive.

    Return a list of CodeIncludeInfo dicts, each holding:
    - `line_no` - line number (1-based)
    - `line` - text of the line
    """

    return [
        CodeIncludeInfo(line_no=position, line=content)
        for position, content in enumerate(lines, start=1)
        if CODE_INCLUDE_RE.match(content)
    ]
||||
|
|
||||
|
|
||||
|
def replace_code_includes_with_placeholders(text: list[str]) -> list[str]:
    """
    Replace code includes with placeholders.

    Return a new list in which every code-include line is replaced by
    CODE_INCLUDE_PLACEHOLDER. The input list is left untouched, so callers can
    still compare the original lines against later processing results.
    """

    # Work on a copy: mutating the caller's list in place would make any later
    # "did anything change?" comparison against it meaningless.
    modified_text = text.copy()
    for include in extract_code_includes(text):
        modified_text[include["line_no"] - 1] = CODE_INCLUDE_PLACEHOLDER
    return modified_text
||||
|
|
||||
|
|
||||
|
def replace_placeholders_with_code_includes(
    text: list[str], original_includes: list[CodeIncludeInfo]
) -> list[str]:
    """
    Swap each code include placeholder for the matching include line of the
    original (English) document, in order of appearance.

    Raise ValueError when the number of placeholders differs from the number of
    original includes.
    """

    expected = len(original_includes)
    consumed = 0
    restored: list[str] = []

    for current in text:
        # Ordinary line: keep it verbatim.
        if current.strip() != CODE_INCLUDE_PLACEHOLDER:
            restored.append(current)
            continue

        if consumed >= expected:
            raise ValueError(
                "Number of placeholders exceeds number of code includes in the original document"
            )
        restored.append(original_includes[consumed]["line"])
        consumed += 1

    if consumed < expected:
        raise ValueError(
            "Number of placeholders is less than number of code includes in the original document"
        )

    return restored
||||
|
|
||||
|
|
||||
|
# Header permalinks |
||||
|
# ----------------------------------------------------------------------------------------- |
||||
|
|
||||
|
|
||||
|
def extract_header_permalinks(lines: list[str]) -> list[HeaderPermalinkInfo]:
    """
    Extract list of header permalinks from the given lines.

    Lines inside fenced code blocks are ignored. A fence is a line starting
    with three or more backticks; a block opened with exactly three backticks
    is closed only by a line with exactly three leading backticks, while a
    block opened with four or more is closed by a line with four or more.

    Return list of HeaderPermalinkInfo dicts, each containing:
    - `line_no` - line number (1-based)
    - `hashes` - string of hashes representing header level (e.g., "###")
    - `permalink` - permalink suffix including its leading whitespace
      (e.g., " {#permalink}"), or "" when the header has none
    """

    headers: list[HeaderPermalinkInfo] = []
    # 0 = outside any fence, 3 = inside a ``` block, 4 = inside a ````+ block
    open_fence = 0

    for line_no, line in enumerate(lines, start=1):
        leading_ticks = len(line) - len(line.lstrip("`"))

        if open_fence == 3:
            if leading_ticks == 3:
                open_fence = 0
            continue
        if open_fence == 4:
            if leading_ticks >= 4:
                open_fence = 0
            continue

        if leading_ticks >= 3:
            # Opening fence line itself is never a header.
            open_fence = 3 if leading_ticks == 3 else 4
            continue

        header_match = HEADER_WITH_PERMALINK_RE.match(line)
        if header_match:
            hashes, _title, permalink = header_match.groups()
            headers.append(
                HeaderPermalinkInfo(
                    hashes=hashes,
                    line_no=line_no,
                    # The permalink group is optional; store "" rather than
                    # None so the value honours the declared `str` type and
                    # never gets rendered as the text "None".
                    permalink=permalink or "",
                )
            )

    return headers
||||
|
|
||||
|
|
||||
|
def remove_header_permalinks(lines: list[str]) -> list[str]:
    """
    Return the lines with header permalinks removed.

    Every line matching the header pattern is rebuilt as "<hashes> <title>";
    all other lines are passed through unchanged. Fenced code blocks are not
    tracked here, so header-looking lines inside them are rewritten as well.
    """

    cleaned: list[str] = []
    for current in lines:
        found = HEADER_WITH_PERMALINK_RE.match(current)
        if found is None:
            cleaned.append(current)
            continue
        level, heading = found.group(1), found.group(2)
        cleaned.append(f"{level} {heading}")
    return cleaned
||||
|
|
||||
|
|
||||
|
def replace_header_permalinks(
    text: list[str], original_permalinks: list[HeaderPermalinkInfo]
) -> list[str]:
    """
    Replace permalinks in the given text with the permalinks from the original document.

    Headers are paired with `original_permalinks` in order of appearance; each
    header keeps its own title and receives the original permalink suffix.
    Lines inside fenced code blocks are copied unchanged and are not counted as
    headers, using the same fence rules as `extract_header_permalinks` (a
    three-backtick block closes on exactly three backticks, a four-or-more
    block closes on four or more).

    Fail with ValueError if the number or order (level) of headers does not
    match the original.
    """

    modified_text: list[str] = []
    permalink_index = 0
    # 0 = outside any fence, 3 = inside a ``` block, 4 = inside a ````+ block
    open_fence = 0

    for line in text:
        leading_ticks = len(line) - len(line.lstrip("`"))

        if open_fence:
            closes = leading_ticks == 3 if open_fence == 3 else leading_ticks >= 4
            if closes:
                open_fence = 0
            modified_text.append(line)
            continue

        if leading_ticks >= 3:
            open_fence = 3 if leading_ticks == 3 else 4
            modified_text.append(line)
            continue

        header_match = HEADER_LINE_RE.match(line)
        if not header_match:
            modified_text.append(line)
            continue

        if permalink_index >= len(original_permalinks):
            raise ValueError(
                "Number of headers exceeds number of headers in the original document"
            )
        hashes, title, _permalink = header_match.groups()
        original_permalink_info = original_permalinks[permalink_index]
        if original_permalink_info["hashes"] != hashes:
            raise ValueError(
                "Header levels do not match between document and original document"
            )

        # A header without a permalink may carry None here; treat it as empty
        # instead of writing the text "None" into the header.
        permalink = original_permalink_info["permalink"] or ""
        modified_text.append(f"{hashes} {title}{permalink}")
        permalink_index += 1

    if permalink_index < len(original_permalinks):
        raise ValueError(
            "Number of headers is less than number of headers in the original document"
        )

    return modified_text
||||
|
|
||||
|
|
||||
|
# Markdown links |
||||
|
# ----------------------------------------------------------------------------------------- |
||||
|
|
||||
|
|
||||
|
def extract_markdown_links(lines: list[str]) -> list[MarkdownLinkInfo]:
    """
    Extract all markdown links from the given lines.

    Return list of MarkdownLinkInfo dicts, each containing:
    - `line_no` - line number (1-based)
    - `url` - link URL
    - `text` - link text
    - `title` - link title (None if absent)
    - `attributes` - content of the trailing `{...}` block (None if absent)
    """

    return [
        MarkdownLinkInfo(
            line_no=line_no,
            url=m.group("url"),
            text=m.group("text"),
            title=m.group("title"),
            attributes=m.group("attrs"),
        )
        for line_no, line in enumerate(lines, start=1)
        for m in MARKDOWN_LINK_RE.finditer(line)
    ]
||||
|
|
||||
|
|
||||
|
def _construct_markdown_link(
    url: str, text: str, title: str | None, attributes: str | None, lang_code: str
) -> str:
    """
    Construct a markdown link, adjusting the URL for the given language code if needed.

    A URL starting with TIANGOLO_COM gets `/<lang_code>` inserted right after
    that prefix. The title, when truthy, is emitted in double quotes; the
    attributes, when truthy, are appended as ` {attributes}`.
    """

    if url.startswith(TIANGOLO_COM):
        # Rewrite only the leading prefix: the domain may legitimately appear
        # again later in the URL (e.g. inside a query string).
        url = url.replace(TIANGOLO_COM, f"{TIANGOLO_COM}/{lang_code}", 1)

    if title:
        link = f'[{text}]({url} "{title}")'
    else:
        link = f"[{text}]({url})"

    if attributes:
        link += f" {{{attributes}}}"

    return link
||||
|
|
||||
|
|
||||
|
def replace_markdown_links(
    text: list[str], original_links: list[MarkdownLinkInfo], lang_code: str
) -> list[str]:
    """
    Replace markdown links in the given text with the original links.

    Links are paired with `original_links` in order of appearance. Each rebuilt
    link keeps the text and title found in `text` and takes its URL and
    attributes from the corresponding original link.

    Fail with ValueError if the number of links does not match the original.
    """

    modified_text: list[str] = []
    link_index = 0
    for line in text:
        # Rebuild the line from the match spans. Searching the line for the
        # matched substring instead would hit the first textual occurrence,
        # which is the wrong link when identical links repeat on one line.
        pieces: list[str] = []
        cursor = 0
        for m in MARKDOWN_LINK_RE.finditer(line):
            if link_index >= len(original_links):
                raise ValueError(
                    "Number of markdown links exceeds number of markdown links in the original document"
                )
            original_link_info = original_links[link_index]
            replacement_link = _construct_markdown_link(
                url=original_link_info["url"],
                text=m.group("text"),
                title=m.group("title"),
                attributes=original_link_info["attributes"],
                lang_code=lang_code,
            )
            pieces.append(line[cursor : m.start()])
            pieces.append(replacement_link)
            cursor = m.end()
            link_index += 1
        pieces.append(line[cursor:])
        modified_text.append("".join(pieces))

    if link_index < len(original_links):
        raise ValueError(
            "Number of markdown links is less than in the original document"
        )

    return modified_text
||||
|
|
||||
|
|
||||
|
# HTML links |
||||
|
# ----------------------------------------------------------------------------------------- |
||||
|
|
||||
|
|
||||
|
def extract_html_links(lines: list[str]) -> list[HtmlLinkInfo]:
    """
    Find every `<a ...>...</a>` element in the given lines.

    Return a list of HtmlLinkInfo dicts, each holding:
    - `line_no` - line number (1-based)
    - `full_tag` - full HTML link tag
    - `attributes` - list of HTMLLinkAttribute dicts (name, quote, value)
    - `text` - link text
    """

    found = []
    for line_no, line in enumerate(lines, start=1):
        for tag_match in HTML_LINK_RE.finditer(line):
            full_tag = tag_match.group(0)

            # Both patterns accept anything HTML_LINK_RE matched; the asserts
            # only narrow the Optional types.
            text_match = HTML_LINK_TEXT.match(full_tag)
            assert text_match is not None
            open_tag_match = HTML_LINK_OPEN_TAG_RE.match(full_tag)
            assert open_tag_match is not None

            parsed_attributes: list[HTMLLinkAttribute] = [
                HTMLLinkAttribute(name=name, quote=quote, value=value)
                for name, quote, value in HTML_ATTR_RE.findall(
                    open_tag_match.group(1)
                )
            ]
            found.append(
                HtmlLinkInfo(
                    line_no=line_no,
                    full_tag=full_tag,
                    attributes=parsed_attributes,
                    text=text_match.group(2),
                )
            )
    return found
||||
|
|
||||
|
|
||||
|
def _construct_html_link(
    link_text: str,
    attributes: list[HTMLLinkAttribute],
    lang_code: str,
) -> str:
    """
    Reconstruct HTML link, adjusting the URL for the given language code if needed.

    Attributes are emitted in the given order with their original quote
    character. An `href` value starting with TIANGOLO_COM gets `/<lang_code>`
    inserted right after that prefix; every other attribute is kept as is.
    """

    attributes_upd: list[HTMLLinkAttribute] = []
    for attribute in attributes:
        if attribute["name"] != "href":
            attributes_upd.append(attribute)
            continue

        url = attribute["value"]
        if url.startswith(TIANGOLO_COM):
            # Rewrite only the leading prefix: the domain may legitimately
            # appear again later in the URL (e.g. inside a query string).
            url = url.replace(TIANGOLO_COM, f"{TIANGOLO_COM}/{lang_code}", 1)
        attributes_upd.append(
            HTMLLinkAttribute(name="href", quote=attribute["quote"], value=url)
        )

    attrs_str = " ".join(
        f"{attribute['name']}={attribute['quote']}{attribute['value']}{attribute['quote']}"
        for attribute in attributes_upd
    )
    return f"<a {attrs_str}>{link_text}</a>"
||||
|
|
||||
|
|
||||
|
def replace_html_links(
    text: list[str], original_links: list[HtmlLinkInfo], lang_code: str
) -> list[str]:
    """
    Replace HTML links in the given text with the links from the original document.

    Links are paired with `original_links` in order of appearance. Each rebuilt
    link keeps the text found in `text` and takes its attributes from the
    corresponding original link. Adjust URLs for the given language code.

    Fail with ValueError if the number of links does not match the original.
    The input list is not modified; a new list is returned.
    """

    links = extract_html_links(text)
    if len(links) > len(original_links):
        raise ValueError(
            "Number of HTML links exceeds number of HTML links in the original document"
        )
    elif len(links) < len(original_links):
        raise ValueError("Number of HTML links is less than in the original document")

    modified_text = text.copy()
    # Per line: offset just past the last replacement made on that line.
    # Searching from there keeps repeated identical tags on one line from
    # being matched against an already-rewritten earlier link.
    resume_at: dict[int, int] = {}
    for link, original_link_info in zip(links, original_links):
        replacement_link = _construct_html_link(
            link_text=link["text"],
            attributes=original_link_info["attributes"],
            lang_code=lang_code,
        )
        line_idx = link["line_no"] - 1
        current = modified_text[line_idx]
        full_tag = link["full_tag"]
        # Links arrive in left-to-right order and the text after the cursor is
        # still the original text, so the tag is found at its own position.
        start = current.find(full_tag, resume_at.get(line_idx, 0))
        modified_text[line_idx] = (
            current[:start] + replacement_link + current[start + len(full_tag) :]
        )
        resume_at[line_idx] = start + len(replacement_link)

    return modified_text
||||
@ -0,0 +1,86 @@ |
|||||
|
from pathlib import Path |
||||
|
from typing import Annotated |
||||
|
|
||||
|
import typer |
||||
|
|
||||
|
from scripts.doc_parsing_utils import ( |
||||
|
extract_code_includes, |
||||
|
extract_header_permalinks, |
||||
|
extract_html_links, |
||||
|
extract_markdown_links, |
||||
|
replace_code_includes_with_placeholders, |
||||
|
replace_header_permalinks, |
||||
|
replace_html_links, |
||||
|
replace_markdown_links, |
||||
|
replace_placeholders_with_code_includes, |
||||
|
) |
||||
|
|
||||
|
# Typer application object; the callback and command below register on it.
cli = typer.Typer()
||||
|
|
||||
|
|
||||
|
@cli.callback()
def callback():
    # Intentionally empty: no app-level options or setup are defined.
    # NOTE(review): presumably registered so the app keeps explicit
    # sub-command names even with a single command - confirm.
    pass
||||
|
|
||||
|
|
||||
|
@cli.command()
def fix_pages(
    doc_paths: Annotated[
        list[Path],
        typer.Argument(help="List of paths to documents."),
    ],
):
    # Re-sync each translated document with its English counterpart: code
    # includes, header permalinks, markdown links and HTML links are taken
    # from the English file, then the document is written back in place.
    # A ValueError from any helper (structure mismatch) propagates.
    # (Comments rather than a docstring, so the CLI help text stays unchanged.)
    for path in doc_paths:
        # Assumes paths shaped like docs/<lang>/<relative path>.
        lang_code = path.parts[1]
        if lang_code == "en":
            print(f"Skipping English document: {path}")
            continue

        en_doc_path = Path("docs") / "en" / Path(*path.parts[2:])

        doc_lines = path.read_text(encoding="utf-8").splitlines()
        en_doc_lines = en_doc_path.read_text(encoding="utf-8").splitlines()

        # Fix code includes
        en_code_includes = extract_code_includes(en_doc_lines)
        # Hand over a copy: the helper may write placeholders into the list it
        # receives, which would make the comparison below always report a fix.
        doc_lines_with_placeholders = replace_code_includes_with_placeholders(
            list(doc_lines)
        )
        fixed_doc_lines = replace_placeholders_with_code_includes(
            doc_lines_with_placeholders, en_code_includes
        )
        if fixed_doc_lines != doc_lines:
            print(f"Fixing code includes in: {path}")
            doc_lines = fixed_doc_lines

        # Fix permalinks
        en_permalinks = extract_header_permalinks(en_doc_lines)
        fixed_doc_lines = replace_header_permalinks(doc_lines, en_permalinks)
        if fixed_doc_lines != doc_lines:
            print(f"Fixing header permalinks in: {path}")
            doc_lines = fixed_doc_lines

        # Fix markdown links
        en_markdown_links = extract_markdown_links(en_doc_lines)
        fixed_doc_lines = replace_markdown_links(
            doc_lines, en_markdown_links, lang_code
        )
        if fixed_doc_lines != doc_lines:
            print(f"Fixing markdown links in: {path}")
            doc_lines = fixed_doc_lines

        # Fix HTML links
        en_html_links = extract_html_links(en_doc_lines)
        fixed_doc_lines = replace_html_links(doc_lines, en_html_links, lang_code)
        if fixed_doc_lines != doc_lines:
            print(f"Fixing HTML links in: {path}")
            doc_lines = fixed_doc_lines

        # Fix multiline code blocks
        # TODO: Implement

        # Write back the fixed document
        doc_lines.append("")  # Ensure file ends with a newline
        path.write_text("\n".join(doc_lines), encoding="utf-8")
||||
|
|
||||
|
|
||||
|
# Run the Typer app only when this file is executed as a script.
if __name__ == "__main__":
    cli()
||||
Loading…
Reference in new issue