From acd9c4e1aac6888c462e21dc7f25f1f7d5c228ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebasti=C3=A1n=20Ram=C3=ADrez?= Date: Mon, 30 Dec 2024 18:46:43 +0000 Subject: [PATCH] =?UTF-8?q?=F0=9F=94=A8=20Add=20internal=20scripts=20to=20?= =?UTF-8?q?generate=20language=20translations=20with=20PydanticAI,=20inclu?= =?UTF-8?q?de=20Spanish=20prompt=20(#13123)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/es/llm-prompt.md | 148 +++++++++++++++++++++++++++++++ requirements-translations.txt | 1 + scripts/translate.py | 162 ++++++++++++++++++++++++++++++++++ 3 files changed, 311 insertions(+) create mode 100644 docs/es/llm-prompt.md create mode 100644 requirements-translations.txt create mode 100644 scripts/translate.py diff --git a/docs/es/llm-prompt.md b/docs/es/llm-prompt.md new file mode 100644 index 000000000..3340dbc99 --- /dev/null +++ b/docs/es/llm-prompt.md @@ -0,0 +1,148 @@ +Translate to Spanish (español). + +Use the informal grammar (use "tú" instead of "usted"). + +For instructions or titles in imperative, keep them in imperative, for example "Edit it" to "Edítalo". 
+ +There are special blocks of notes, tips and others that look like: + +/// note + +To translate it, keep the same line and add the translation after a vertical bar: + +/// note | Nota + +Some examples: + +Source: + +/// tip + +Result: + +/// tip | Consejo + +Source: + +/// details | Preview + +Result: + +/// details | Vista previa + +Source: + +/// warning + +Result: + +/// warning | Advertencia + +Source: + +/// info + +Result: + +/// info | Información + +Source: + +/// note | Technical Details + +Result: + +/// note | Detalles Técnicos + +--- + +For the next terms, use the following translations: + +* framework: framework (do not translate to "marco") +* performance: rendimiento +* program (verb): programar +* code (verb): programar +* type hints: anotaciones de tipos +* type annotations: anotaciones de tipos +* autocomplete: autocompletado +* completion (in the context of autocompletion): autocompletado +* feature: funcionalidad +* sponsor: sponsor +* host (in a podcast): host +* request (as in HTTP request): request +* response (as in HTTP response): response +* path operation function: path operation function (do not translate to "función de operación de ruta") +* path operation: path operation (do not translate to "operación de ruta") +* path (as in URL path): path (do not translate to "ruta") +* query (as in URL query): query (do not translate to "consulta") +* cookie (as in HTTP cookie): cookie +* header (as in HTTP header): header +* form (as in HTML form): formulario +* type checks: chequeo de tipos +* parse: parse +* parsing: parsing +* marshall: marshall +* library: paquete (do not translate to "biblioteca" or "librería") +* instance: instance (do not translate to "instancia") +* scratch the surface: tocar los conceptos básicos +* string: string +* bug: bug +* docs: documentación (do not translate to "documentos") +* cheat sheet: cheat sheet (do not translate to "chuleta") +* key (as in key-value pair, dictionary key): clave +* array (as in JSON 
array): array +* API key: API key (do not translate to "clave API") +* 100% test coverage: cobertura de tests del 100% +* back and forth: de un lado a otro +* I/O (as in "input and output"): I/O (do not translate to "E/S") +* Machine Learning: Machine Learning (do not translate to "Aprendizaje Automático") +* Deep Learning: Deep Learning (do not translate to "Aprendizaje Profundo") +* callback hell: callback hell (do not translate to "infierno de callbacks") +* tip: Consejo (do not translate to "tip") +* check: Revisa (do not translate to "chequea" or "comprobación") +* Cross-Origin Resource Sharing: Cross-Origin Resource Sharing (do not translate to "Compartición de Recursos de Origen Cruzado") +* Release Notes: Release Notes (do not translate to "Notas de la Versión") +* Semantic Versioning: Semantic Versioning (do not translate to "Versionado Semántico") +* dependable: dependable (do not translate to "confiable" or "fiable") +* list (as in Python list): list +* context manager: context manager (do not translate to "gestor de contexto" or "administrador de contexto") +* a little bit: un poquito +* graph (data structure, as in "dependency graph"): grafo (do not translate to "gráfico") +* form data: form data (do not translate to "datos de formulario" or "datos de form") +* import (as in code import): import (do not translate to "importación") +* JSON Schema: JSON Schema (do not translate to "Esquema JSON") +* embed: embeber (do not translate to "incrustar") +* request body: request body (do not translate to "cuerpo de la petición") +* response body: response body (do not translate to "cuerpo de la respuesta") +* cross domain: cross domain (do not translate to "dominio cruzado") +* cross origin: cross origin (do not translate to "origen cruzado") +* plugin: plugin (do not translate to "complemento" or "extensión") +* plug-in: plug-in (do not translate to "complemento" or "extensión") +* plug-ins: plug-ins (do not translate to "complementos" or "extensiones") +* full 
stack: full stack (do not translate to "pila completa") +* full-stack: full-stack (do not translate to "de pila completa") +* stack: stack (do not translate to "pila") +* loop (as in async loop): loop (do not translate to "bucle" or "ciclo") +* hard dependencies: dependencias obligatorias (do not translate to "dependencias duras") +* locking: locking (do not translate to "bloqueo") +* testing (as in software testing): escribir pruebas (do not translate to "probar") +* code base: code base (do not translate to "base de código") +* default: por defecto (do not translate to "predeterminado") +* default values: valores por defecto (do not translate to "valores predeterminados") +* media type: media type (do not translate to "tipo de medio") +* instantiate: crear un instance (do not translate to "instanciar") +* OAuth2 Scopes: Scopes de OAuth2 (do not translate to "Alcances de OAuth2") +* on the fly: sobre la marcha (do not translate to "al vuelo") +* terminal: terminal (feminine, as in "la terminal") +* terminals: terminales (plural feminine, as in "las terminales") +* lifespan: lifespan (do not translate to "vida útil" or "tiempo de vida") +* unload: quitar de memoria (do not translate to "descargar") +* mount (noun): mount (do not translate to "montura") +* mount (verb): montar +* statement (as in code statement): statement (do not translate to "declaración" or "sentencia") +* worker process: worker process (do not translate to "proceso trabajador" or "proceso de trabajo") +* worker processes: worker processes (do not translate to "procesos trabajadores" or "procesos de trabajo") +* worker: worker (do not translate to "trabajador") +* load balancer: load balancer (do not translate to "balanceador de carga") +* load balance: load balance (do not translate to "balancear carga") +* self hosting: self hosting (do not translate to "auto alojamiento") diff --git a/requirements-translations.txt b/requirements-translations.txt new file mode 100644 index 000000000..a8f8a02d7 
--- /dev/null
+++ b/requirements-translations.txt
@@ -0,0 +1 @@
+pydantic-ai==0.0.15
diff --git a/scripts/translate.py b/scripts/translate.py
new file mode 100644
index 000000000..ce11b3877
--- /dev/null
+++ b/scripts/translate.py
@@ -0,0 +1,210 @@
+"""Generate or update docs translations with an LLM through PydanticAI."""
+
+from functools import lru_cache
+from pathlib import Path
+from typing import Iterable
+
+import typer
+import yaml
+from pydantic_ai import Agent
+
+# Prefixes, relative to docs/en/docs, of the pages that translate_all() skips.
+non_translated_sections = (
+    "reference/",
+    "release-notes.md",
+    "fastapi-people.md",
+    "external-links.md",
+    "newsletter.md",
+    "management-tasks.md",
+    "management.md",
+    "contributing.md",
+)
+
+
+# Language-independent instructions; translate_page() adds them to the prompt
+# right after the language-specific docs/<lang>/llm-prompt.md content.
+general_prompt = """
+For technical terms in English that don't have a common translation term use the original term in English.
+
+For code snippets or fragments, surrounded by backticks (`), don't translate the content, keep the original in English. For example, `list`, `dict`, keep them as is.
+
+The content is written in markdown, write the translation in markdown as well. Don't add triple backticks (`) around the generated translation content.
+
+When there's an example of code, the console or a terminal, normally surrounded by triple backticks and a keyword like "console" or "bash" (e.g. ```console), do not translate the content, keep the original in English.
+
+The original content will be surrounded by triple percentage signs (%) and you should translate it to the target language. Do not include the triple percentage signs in the translation.
+"""
+
+
+@lru_cache
+def get_langs() -> dict[str, str]:
+    """Return the mapping loaded from ``docs/language_names.yml``.
+
+    translate_page() indexes it with the language code (e.g. "es") to get the
+    language name used in the prompt. The result is cached, so the file is
+    read only once per process.
+    """
+    langs_path = Path("docs/language_names.yml")
+    return yaml.safe_load(langs_path.read_text(encoding="utf-8"))
+
+
+def generate_lang_path(*, lang: str, path: Path) -> Path:
+    """Return the path under ``docs/<lang>/docs`` mirroring an English docs path.
+
+    For example, ``docs/en/docs/tutorial/index.md`` with ``lang="es"`` maps to
+    ``docs/es/docs/tutorial/index.md``.
+
+    Raises:
+        ValueError: If ``path`` is not inside ``docs/en/docs``.
+    """
+    en_docs_path = Path("docs/en/docs")
+    try:
+        # relative_to() matches whole path components, so a sibling such as
+        # "docs/en/docs-old" is rejected and only the leading prefix is swapped
+        # (str.replace() would rewrite every occurrence in the path).
+        relative_path = path.relative_to(en_docs_path)
+    except ValueError:
+        raise ValueError(f"Path must be inside {en_docs_path}") from None
+    return Path(f"docs/{lang}/docs") / relative_path
+
+
+def translate_page(*, lang: str, path: Path) -> None:
+    """Translate one English docs page into ``lang`` and write it to disk.
+
+    The prompt is built from ``docs/<lang>/llm-prompt.md``, ``general_prompt``,
+    the previous translation (when the output file already exists) and the
+    English content. The model's answer overwrites the output file.
+
+    Raises:
+        KeyError: If ``lang`` is not a key in ``docs/language_names.yml``.
+        FileNotFoundError: If ``docs/<lang>/llm-prompt.md`` does not exist.
+        ValueError: If ``path`` is not inside ``docs/en/docs``.
+    """
+    language = get_langs()[lang]
+    lang_path = Path(f"docs/{lang}")
+    lang_path.mkdir(exist_ok=True)
+    lang_prompt_path = lang_path / "llm-prompt.md"
+    if not lang_prompt_path.exists():
+        # Raise instead of assert: asserts are stripped under "python -O".
+        raise FileNotFoundError(f"Prompt file not found: {lang_prompt_path}")
+    # Prompts and docs contain non-ASCII text; don't depend on the locale's
+    # default encoding.
+    lang_prompt_content = lang_prompt_path.read_text(encoding="utf-8")
+
+    out_path = generate_lang_path(lang=lang, path=path)
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    original_content = path.read_text(encoding="utf-8")
+    old_translation: str | None = None
+    if out_path.exists():
+        old_translation = out_path.read_text(encoding="utf-8")
+    agent = Agent("openai:gpt-4o")
+
+    prompt_segments = [
+        lang_prompt_content,
+        general_prompt,
+    ]
+    if old_translation:
+        prompt_segments.extend(
+            [
+                "There's an existing previous translation for this content that is probably outdated with old content or old instructions.",
+                "Update the translation given your current instructions and the original content.",
+                "If you have instructions to translate specific terms or phrases in a specific way, please follow those instructions instead of keeping the old and outdated content.",
+                "Previous translation:",
+                f"%%%\n{old_translation}%%%",
+            ]
+        )
+    prompt_segments.extend(
+        [
+            f"Translate to {language} ({lang}).",
+            "Original content:",
+            f"%%%\n{original_content}%%%",
+        ]
+    )
+    prompt = "\n\n".join(prompt_segments)
+
+    result = agent.run_sync(prompt)
+    out_content = f"{result.data.strip()}\n"
+    out_path.write_text(out_content, encoding="utf-8")
+
+
+def iter_paths_to_translate() -> Iterable[Path]:
+    """
+    Iterate on the markdown files to translate in order of priority.
+
+    Top-level pages come first, then the sections in ``first_dirs`` (in that
+    order), then every remaining page. Each group is sorted because glob
+    results come in no particular order.
+    """
+    first_dirs = [
+        Path("docs/en/docs/learn"),
+        Path("docs/en/docs/tutorial"),
+        Path("docs/en/docs/advanced"),
+        Path("docs/en/docs/about"),
+        Path("docs/en/docs/how-to"),
+    ]
+    first_parent = Path("docs/en/docs")
+    yield from sorted(first_parent.glob("*.md"))
+    for dir_path in first_dirs:
+        yield from sorted(dir_path.rglob("*.md"))
+    for path in sorted(first_parent.rglob("*.md")):
+        if path.parent == first_parent:
+            continue
+        # Compare path components, not string prefixes, so that e.g.
+        # "docs/en/docs/learning/x.md" isn't taken as part of "learn".
+        if any(dir_path in path.parents for dir_path in first_dirs):
+            continue
+        yield path
+
+
+def translate_all(lang: str) -> None:
+    """Translate every English page that has no translation in ``lang`` yet.
+
+    Pages matching ``non_translated_sections`` are excluded and pages whose
+    target file already exists are skipped. A summary is printed before the
+    translation starts.
+    """
+    en_docs_path = Path("docs/en/docs")
+    paths_to_process: list[Path] = []
+    for path in iter_paths_to_translate():
+        # as_posix() keeps the "reference/" prefix check working on Windows.
+        relative_path = path.relative_to(en_docs_path).as_posix()
+        if relative_path.startswith(non_translated_sections):
+            continue
+        paths_to_process.append(path)
+    print("Original paths:")
+    for p in paths_to_process:
+        print(f" - {p}")
+    print(f"Total original paths: {len(paths_to_process)}")
+    missing_paths: list[Path] = []
+    skipped_paths: list[Path] = []
+    for p in paths_to_process:
+        lang_path = generate_lang_path(lang=lang, path=p)
+        if lang_path.exists():
+            skipped_paths.append(p)
+            continue
+        missing_paths.append(p)
+    print("Paths to skip:")
+    for p in skipped_paths:
+        print(f" - {p}")
+    print(f"Total paths to skip: {len(skipped_paths)}")
+    print("Paths to process:")
+    for p in missing_paths:
+        print(f" - {p}")
+    print(f"Total paths to process: {len(missing_paths)}")
+    for p in missing_paths:
+        print(f"Translating: {p}")
+        # Use the requested language; it was previously hard-coded to "es".
+        translate_page(lang=lang, path=p)
+        print(f"Done translating: {p}")
+
+
+def main(*, lang: str, path: Path = None) -> None:
+    """Translate one page if path is given, otherwise every page missing in lang."""
+    if path:
+        translate_page(lang=lang, path=path)
+    else:
+        translate_all(lang=lang)
+
+
+if __name__ == "__main__":
+    typer.run(main)