#!/usr/bin/env python3 """ Translation Processor A script to process reStructuredText and HTML files by replacing translation tags with content from .po files, handling image references, and optionally converting to markdown. """ import argparse import os import re import shutil import sys import urllib.parse from typing import Dict, List, Optional, Tuple import polib # Optional dependency for markdown conversion try: import docutils.core DOCUTILS_AVAILABLE = True except ImportError: DOCUTILS_AVAILABLE = False try: import pypandoc PANDOC_AVAILABLE = True except ImportError: PANDOC_AVAILABLE = False def load_translations(po_file_path: str) -> Dict[str, str]: """ Load translations from a .po file into a dictionary. Args: po_file_path: Path to the .po file Returns: Dictionary mapping original text to translated text """ try: po = polib.pofile(po_file_path) translations = {} for entry in po: if entry.msgstr and entry.msgid: # Only include entries with translations translations[entry.msgid] = entry.msgstr return translations except Exception as e: print(f"Error loading translations from {po_file_path}: {e}", file=sys.stderr) return {} def replace_template_vars(content: str) -> str: """ Replace template variables like {{spec_url()}} with their values. Args: content: Content with template variables Returns: Content with template variables replaced """ # Define base URLs for different types of links base_urls = { 'spec_url': 'https://i2p.net/spec/', 'proposal_url': 'https://i2p.net/spec/proposals/proposal', 'i2p_url': 'https://i2p.net/', 'site_url': 'https://i2p.net/', 'get_url': 'https://i2p.net/' } # Handle {{spec_url("name")}} pattern def replace_spec_url(match): func_name = match.group(1) arg = match.group(2).strip('"\'') if match.group(2) else "" if func_name in base_urls: if func_name == 'proposal_url': return f"{base_urls[func_name]}{arg}.html" else: return f"{base_urls[func_name]}{arg}" # Handle special cases for other template functions if func_name == 'url_for': # Extract the filename from patterns like url_for('static', filename='images/...') filename_match = re.search(r'filename=[\'"](.*?)[\'"]', arg) if filename_match: return f"/_static/{filename_match.group(1)}" elif func_name == 'i2pconv': # For i2p domain conversions, return as is return arg return match.group(0) # Return unchanged if not recognized # This pattern matches template functions like {{spec_url("ntcp2")}} template_pattern = r'{{([a-zA-Z_]+)\(([^}]*?)\)}}' processed_content = re.sub(template_pattern, replace_spec_url, content) # Handle other simple variable substitutions like {{ _('text') }} def replace_simple_var(match): var_name = match.group(1).strip() # For translation function calls like _('text'), return just the text if var_name.startswith("_('") and var_name.endswith("')"): return var_name[3:-2] # Extract the text between quotes return match.group(0) # Return unchanged if not recognized simple_var_pattern = r'{{([^}]+?)}}' return re.sub(simple_var_pattern, replace_simple_var, processed_content) def replace_translations(content: str, translations: Dict[str, str]) -> str: """ Replace translation tags in the content with translated text. Args: content: Content with translation tags translations: Dictionary of translations Returns: Content with translations applied """ # First, handle simple {% trans %}...{% endtrans %} blocks def replace_simple_match(match): text = match.group(1).strip() return translations.get(text, text) # Match {% trans %}...{% endtrans %} patterns simple_pattern = r'{%\s*trans\s*%}(.*?){%\s*endtrans\s*%}' content = re.sub(simple_pattern, replace_simple_match, content, flags=re.DOTALL) # Now handle more complex translation blocks with arguments def replace_complex_match(match): # Extract parameters if present params_str = match.group(1) or "" text = match.group(2).strip() # Process parameters (for future use) params = {} if params_str: param_matches = re.finditer(r'(\w+)=["\'](.*?)["\']', params_str) for param_match in param_matches: key, value = param_match.groups() params[key] = value # Replace parameter references in the text if f"{{{key}}}" in text: text = text.replace(f"{{{key}}}", value) # Apply translation translated = translations.get(text, text) # If there are parameter values, they need to be maintained in the translation for key, value in params.items(): if f"{{{key}}}" in translated: translated = translated.replace(f"{{{key}}}", value) return translated # Match {% trans param1="value" -%}...{%- endtrans %} patterns with optional parameters complex_pattern = r'{%\s*trans\s*(.*?)-%}(.*?){%-\s*endtrans\s*%}' content = re.sub(complex_pattern, replace_complex_match, content, flags=re.DOTALL) return content def process_rst_images(content: str, base_dir: str, assets_dir: str) -> str: """ Process image references in RST content. Args: content: RST content with image references base_dir: Base directory of the input file assets_dir: Directory to store images Returns: Content with updated image references """ # Create assets directory if it doesn't exist images_dir = os.path.join(assets_dir, "images") os.makedirs(images_dir, exist_ok=True) # Find image references # This pattern matches both basic image directives and figure directives pattern = r'\.\.\s+(image|figure)::\s+([\S]+)' def process_match(match): directive_type = match.group(1) # image or figure old_path = match.group(2).strip() # Handle relative paths if not os.path.isabs(old_path): full_old_path = os.path.join(base_dir, old_path) else: full_old_path = old_path # Extract filename from path filename = os.path.basename(old_path) new_rel_path = os.path.join("images", filename) new_full_path = os.path.join(assets_dir, new_rel_path) # Copy the image if it exists if os.path.exists(full_old_path): try: shutil.copy2(full_old_path, new_full_path) print(f"Copied image: {full_old_path} -> {new_full_path}") except Exception as e: print(f"Error copying image {full_old_path}: {e}", file=sys.stderr) else: print(f"Warning: Image file not found: {full_old_path}", file=sys.stderr) # Return the updated directive return f".. {directive_type}:: {new_rel_path}" return re.sub(pattern, process_match, content) def process_html_images(content: str, base_dir: str, assets_dir: str) -> str: """ Process image references in HTML content. Args: content: HTML content with image references base_dir: Base directory of the input file assets_dir: Directory to store images Returns: Content with updated image references """ # Create assets directory if it doesn't exist images_dir = os.path.join(assets_dir, "images") os.makedirs(images_dir, exist_ok=True) # Find image references in HTML # This pattern matches tags pattern = r']*src=["\']((?!https?://)[^"\']+)["\'][^>]*>' def process_match(match): img_tag = match.group(0) path = match.group(1) # Skip URLs if path.startswith(('http://', 'https://')): return img_tag # Handle templated paths if "{{" in path: # Extract paths from template expressions like {{ url_for('static', filename='images/file.png') }} template_match = re.search(r'filename=[\'"](.*?)[\'"]', path) if template_match: path = f"/_static/{template_match.group(1)}" else: return img_tag # Can't process this template # Remove leading /_static/ if present if path.startswith('/_static/'): path = path[9:] # Remove /_static/ prefix # Handle relative paths if not os.path.isabs(path): full_old_path = os.path.join(base_dir, path) if not os.path.exists(full_old_path): # Try looking in static directory full_old_path = os.path.join(base_dir, 'static', path) else: full_old_path = path # Extract filename from path filename = os.path.basename(path) new_rel_path = os.path.join("images", filename) new_full_path = os.path.join(assets_dir, new_rel_path) # Copy the image if it exists if os.path.exists(full_old_path): try: shutil.copy2(full_old_path, new_full_path) print(f"Copied image: {full_old_path} -> {new_full_path}") except Exception as e: print(f"Error copying image {full_old_path}: {e}", file=sys.stderr) return img_tag # Return original if error else: print(f"Warning: Image file not found: {full_old_path}", file=sys.stderr) return img_tag # Return original if not found # Return the updated img tag with new path return img_tag.replace(match.group(1), f"images/{filename}") return re.sub(pattern, process_match, content) def clean_markdown_output(markdown: str) -> str: """ Clean up markdown output by handling URL-encoded template variables. Args: markdown: Markdown content with possible URL-encoded templates Returns: Cleaned markdown content """ # Replace URL-encoded template variables def replace_encoded_templates(match): # Decode the URL-encoded string encoded_text = match.group(0) decoded_text = urllib.parse.unquote(encoded_text) # Extract values from template expressions like {{ url_for('static', filename='images/file.png') }} if decoded_text.startswith('{{') and decoded_text.endswith('}}'): template_content = decoded_text.strip('{}').strip() # Handle url_for template function if 'url_for' in template_content and 'filename=' in template_content: filename_match = re.search(r'filename=[\'"](.*?)[\'"]', template_content) if filename_match: return filename_match.group(1) # Return empty string for other template functions return '' return encoded_text # Find URL-encoded sequences that might be template variables encoded_pattern = r'%7B%7B.*?%7D%7D' markdown = re.sub(encoded_pattern, replace_encoded_templates, markdown) # Clean up any broken image links that might have resulted from template replacements # Change ![text](broken_link) to ![text](images/filename.ext) when possible def fix_image_links(match): alt_text = match.group(1) link = match.group(2) # If link is empty or looks like a broken template if not link or link.startswith('%7B') or link.startswith('{{'): # Try to extract image filename from alt text or use a placeholder filename = alt_text.replace(' ', '-').lower() if filename: return f"![{alt_text}](images/{filename}.png)" return match.group(0) # Return unchanged # Fix image links image_pattern = r'!\[(.*?)\]\((.*?)\)' markdown = re.sub(image_pattern, fix_image_links, markdown) return markdown def convert_to_markdown(content: str, is_rst: bool = True) -> str: """ Convert content to Markdown. Args: content: Content to convert is_rst: Whether the content is RST (True) or HTML (False) Returns: Markdown content """ if not PANDOC_AVAILABLE: print("Warning: pypandoc not available. Conversion to Markdown skipped.", file=sys.stderr) return content try: if is_rst: if not DOCUTILS_AVAILABLE: print("Warning: docutils not available. RST to HTML conversion skipped.", file=sys.stderr) return content # Convert RST to HTML html = docutils.core.publish_string( source=content, writer_name='html', settings_overrides={'output_encoding': 'unicode'} ) # Convert HTML to Markdown markdown = pypandoc.convert_text(html, 'md', format='html') else: # Convert HTML directly to Markdown markdown = pypandoc.convert_text(content, 'md', format='html') # Post-process markdown to clean up URL-encoded template variables markdown = clean_markdown_output(markdown) return markdown except Exception as e: print(f"Error converting to markdown: {e}", file=sys.stderr) return content def is_draft(file_path: str) -> bool: """ Check if a file is a draft based on its name. Args: file_path: Path to the file Returns: True if the file is a draft, False otherwise """ return '.draft.' in file_path.lower() def process_file( input_path: str, po_file_path: str, output_path: Optional[str] = None, to_markdown: bool = False, assets_dir: str = "./assets" ) -> Tuple[bool, str]: """ Process a file by replacing translations and handling images. Args: input_path: Path to input file (RST or HTML) po_file_path: Path to .po file with translations output_path: Path to write output (default: add .translated suffix) to_markdown: Whether to convert to markdown assets_dir: Directory to store assets Returns: Tuple of (success: bool, output_path: str) """ # Check that input files exist if not os.path.exists(input_path): print(f"Error: Input file does not exist: {input_path}", file=sys.stderr) return False, "" if not os.path.exists(po_file_path): print(f"Error: PO file does not exist: {po_file_path}", file=sys.stderr) return False, "" # Determine file type is_rst = input_path.lower().endswith('.rst') is_html = input_path.lower().endswith(('.html', '.htm')) if not (is_rst or is_html): print(f"Error: Unsupported file type: {input_path}. Only .rst, .html, and .htm files are supported.", file=sys.stderr) return False, "" # Determine output path if not specified if not output_path: base, ext = os.path.splitext(input_path) # Handle .draft.rst/.draft.html case if '.draft.' in base.lower(): base = base.replace('.draft', '') # Set extension based on conversion type if to_markdown: out_ext = '.md' else: out_ext = ext output_path = f"{base}.translated{out_ext}" # Create assets directory os.makedirs(assets_dir, exist_ok=True) try: # Load translations translations = load_translations(po_file_path) if not translations: print("Warning: No translations found in the .po file", file=sys.stderr) # Read input file with open(input_path, 'r', encoding='utf-8') as f: content = f.read() # Replace template variables content = replace_template_vars(content) # Replace translations content = replace_translations(content, translations) # Process images based on file type base_dir = os.path.dirname(os.path.abspath(input_path)) if is_rst: content = process_rst_images(content, base_dir, assets_dir) elif is_html: content = process_html_images(content, base_dir, assets_dir) # Convert to markdown if requested if to_markdown: content = convert_to_markdown(content, is_rst=is_rst) # Create directory for output file if it doesn't exist os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True) # Write output file with open(output_path, 'w', encoding='utf-8') as f: f.write(content) print(f"Successfully processed {input_path} -> {output_path}") return True, output_path except Exception as e: print(f"Error processing {input_path}: {e}", file=sys.stderr) return False, "" def find_files(directory: str, include_drafts: bool = False) -> List[str]: """ Find all RST and HTML files in a directory recursively. Args: directory: Directory to search include_drafts: Whether to include draft files Returns: List of file paths """ result_files = [] for root, _, files in os.walk(directory): for file in files: if file.lower().endswith(('.rst', '.html', '.htm')): # Skip draft files if not included if not include_drafts and '.draft.' in file.lower(): continue result_files.append(os.path.join(root, file)) return result_files def process_directory( input_dir: str, po_file_path: str, output_dir: Optional[str] = None, to_markdown: bool = False, assets_dir: str = "./assets", include_drafts: bool = False ) -> bool: """ Process all RST and HTML files in a directory. Args: input_dir: Directory containing files po_file_path: Path to .po file with translations output_dir: Directory to write output files (default: add .translated suffix) to_markdown: Whether to convert to markdown assets_dir: Directory to store assets include_drafts: Whether to include draft files Returns: True if all files were processed successfully, False otherwise """ if not os.path.isdir(input_dir): print(f"Error: Input directory does not exist: {input_dir}", file=sys.stderr) return False # Find all RST and HTML files in the directory input_files = find_files(input_dir, include_drafts) if not input_files: print(f"No RST or HTML files found in {input_dir}", file=sys.stderr) return False success = True for input_file in input_files: # Determine output path based on relative path from input_dir rel_path = os.path.relpath(input_file, input_dir) if output_dir: out_path = os.path.join(output_dir, rel_path) # Adjust extension if converting to markdown if to_markdown: out_path = os.path.splitext(out_path)[0] + '.md' # Remove .draft from path if present if '.draft.' in out_path: out_path = out_path.replace('.draft', '') else: out_path = None # Process the file file_success, _ = process_file( input_file, po_file_path, out_path, to_markdown, assets_dir ) if not file_success: success = False return success def main(): """Parse arguments and run the script.""" # Fix for locale/gettext issues by forcing English locale import locale import os # Force 'C' locale to avoid gettext issues os.environ['LC_ALL'] = 'C' locale.setlocale(locale.LC_ALL, 'C') parser = argparse.ArgumentParser( description='Process RST and HTML files by replacing translations and handling images' ) parser.add_argument('input_path', help='Path to input file or directory') parser.add_argument('po_file_path', help='Path to .po file with translations') parser.add_argument('-o', '--output-path', help='Path to write output file or directory') parser.add_argument('--to-markdown', action='store_true', help='Convert output to markdown') parser.add_argument('--assets-dir', default='./assets', help='Directory to store assets (default: ./assets)') parser.add_argument('--include-drafts', action='store_true', help='Process draft files (ending in .draft.rst or .draft.html)') parser.add_argument('--recursive', action='store_true', help='Process directories recursively') args = parser.parse_args() if args.to_markdown and not PANDOC_AVAILABLE: print("Warning: Markdown conversion requires pypandoc package.", file=sys.stderr) print("Install it with: pip install pypandoc", file=sys.stderr) # Check if input path is a directory if os.path.isdir(args.input_path): if args.recursive: success = process_directory( args.input_path, args.po_file_path, args.output_path, args.to_markdown, args.assets_dir, args.include_drafts ) else: print("Error: Input path is a directory. Use --recursive to process it.", file=sys.stderr) success = False else: # Process a single file success, _ = process_file( args.input_path, args.po_file_path, args.output_path, args.to_markdown, args.assets_dir ) sys.exit(0 if success else 1) if __name__ == '__main__': main()