Files
www/main.py
eyedeekay 2c524f049f basics
2025-05-06 22:23:32 -04:00

641 lines
22 KiB
Python

#!/usr/bin/env python3
"""
Translation Processor
A script to process reStructuredText and HTML files by replacing translation tags
with content from .po files, handling image references, and optionally
converting to markdown.
"""
import argparse
import os
import re
import shutil
import sys
import urllib.parse
from typing import Dict, List, Optional, Tuple
import polib
# Optional dependency for markdown conversion
try:
import docutils.core
DOCUTILS_AVAILABLE = True
except ImportError:
DOCUTILS_AVAILABLE = False
try:
import pypandoc
PANDOC_AVAILABLE = True
except ImportError:
PANDOC_AVAILABLE = False
def load_translations(po_file_path: str) -> Dict[str, str]:
    """
    Read a .po catalog and return its translated entries as a mapping.

    Entries whose msgid or msgstr is empty are left out. If the catalog
    cannot be loaded for any reason, the error is reported on stderr and an
    empty mapping is returned instead of raising.

    Args:
        po_file_path: Path to the .po file

    Returns:
        Dictionary mapping original text (msgid) to translated text (msgstr)
    """
    try:
        catalog = polib.pofile(po_file_path)
        # Later duplicates of a msgid overwrite earlier ones, as in a plain loop.
        return {
            item.msgid: item.msgstr
            for item in catalog
            if item.msgstr and item.msgid
        }
    except Exception as e:
        print(f"Error loading translations from {po_file_path}: {e}", file=sys.stderr)
        return {}
def replace_template_vars(content: str) -> str:
"""
Replace template variables like {{spec_url()}} with their values.
Args:
content: Content with template variables
Returns:
Content with template variables replaced
"""
# Define base URLs for different types of links
base_urls = {
'spec_url': 'https://i2p.net/spec/',
'proposal_url': 'https://i2p.net/spec/proposals/proposal',
'i2p_url': 'https://i2p.net/',
'site_url': 'https://i2p.net/',
'get_url': 'https://i2p.net/'
}
# Handle {{spec_url("name")}} pattern
def replace_spec_url(match):
func_name = match.group(1)
arg = match.group(2).strip('"\'') if match.group(2) else ""
if func_name in base_urls:
if func_name == 'proposal_url':
return f"{base_urls[func_name]}{arg}.html"
else:
return f"{base_urls[func_name]}{arg}"
# Handle special cases for other template functions
if func_name == 'url_for':
# Extract the filename from patterns like url_for('static', filename='images/...')
filename_match = re.search(r'filename=[\'"](.*?)[\'"]', arg)
if filename_match:
return f"/_static/{filename_match.group(1)}"
elif func_name == 'i2pconv':
# For i2p domain conversions, return as is
return arg
return match.group(0) # Return unchanged if not recognized
# This pattern matches template functions like {{spec_url("ntcp2")}}
template_pattern = r'{{([a-zA-Z_]+)\(([^}]*?)\)}}'
processed_content = re.sub(template_pattern, replace_spec_url, content)
# Handle other simple variable substitutions like {{ _('text') }}
def replace_simple_var(match):
var_name = match.group(1).strip()
# For translation function calls like _('text'), return just the text
if var_name.startswith("_('") and var_name.endswith("')"):
return var_name[3:-2] # Extract the text between quotes
return match.group(0) # Return unchanged if not recognized
simple_var_pattern = r'{{([^}]+?)}}'
return re.sub(simple_var_pattern, replace_simple_var, processed_content)
def replace_translations(content: str, translations: Dict[str, str]) -> str:
    """
    Replace translation tags in the content with translated text.

    Every ``{% trans ... %}body{% endtrans %}`` block is replaced by the
    translation of its (whitespace-stripped) body, or by the body itself
    when no translation is known. Jinja whitespace-control dashes
    (``{%-`` / ``-%}``) are accepted on either tag in any combination.

    String parameters given on the opening tag (``name="value"``) are
    substituted for ``{{ name }}`` and ``{name}`` references. A body that
    uses ``{{ name }}`` variables is also looked up under its gettext msgid
    form, where each variable is written ``%(name)s`` and literal ``%`` is
    doubled.

    Args:
        content: Content with translation tags
        translations: Dictionary of translations

    Returns:
        Content with translations applied
    """
    # One pattern for all tag spellings; \b keeps "{% transfoo %}" out.
    block_re = re.compile(
        r'{%-?\s*trans\b(.*?)-?%}(.*?){%-?\s*endtrans\s*-?%}',
        re.DOTALL,
    )
    # The closing quote must be the same character as the opening one.
    param_re = re.compile(r'(\w+)=(["\'])(.*?)\2')
    jinja_var_re = re.compile(r'{{\s*(\w+)\s*}}')
    gettext_var_re = re.compile(r'%\((\w+)\)s')

    def translate_block(match):
        params = {
            key: value
            for key, _quote, value in param_re.findall(match.group(1))
        }
        text = match.group(2).strip()

        def fill(fragment):
            # Substitute known parameters; unknown variables stay untouched.
            fragment = jinja_var_re.sub(
                lambda var: params.get(var.group(1), var.group(0)), fragment
            )
            for key, value in params.items():
                fragment = fragment.replace(f"{{{key}}}", value)
            return fragment

        # 1. The body exactly as written.
        if text in translations:
            return fill(translations[text])

        # 2. The body with parameter values already substituted.
        filled = fill(text)
        if filled in translations:
            return fill(translations[filled])

        # 3. The gettext msgid form of a body that references variables.
        if jinja_var_re.search(text):
            msgid = jinja_var_re.sub(r'%(\1)s', text.replace('%', '%%'))
            if msgid in translations:
                translated = gettext_var_re.sub(
                    lambda var: params.get(
                        var.group(1), f"{{{{ {var.group(1)} }}}}"
                    ),
                    translations[msgid],
                )
                return translated.replace('%%', '%')

        # No translation available: keep the source text.
        return filled

    return block_re.sub(translate_block, content)
def process_rst_images(content: str, base_dir: str, assets_dir: str) -> str:
    """
    Process image references in RST content.

    Each ``.. image::`` / ``.. figure::`` directive that points at a local
    file is rewritten to ``images/<filename>`` and the file is copied to
    ``<assets_dir>/images``. A missing local file only produces a warning on
    stderr; the directive is still rewritten. Directives that reference an
    ``http://`` or ``https://`` URL are left untouched.

    Args:
        content: RST content with image references
        base_dir: Base directory of the input file
        assets_dir: Directory to store images

    Returns:
        Content with updated image references
    """
    # Create the image directory if it doesn't exist
    images_dir = os.path.join(assets_dir, "images")
    os.makedirs(images_dir, exist_ok=True)

    # Directive name and argument must be on the same line, so only spaces
    # and tabs are accepted as separators (not newlines).
    pattern = r'\.\.[ \t]+(image|figure)::[ \t]+(\S+)'

    def process_match(match):
        directive_type = match.group(1)  # image or figure
        old_path = match.group(2).strip()

        # Remote images cannot be copied; keep the directive as written
        if old_path.lower().startswith(('http://', 'https://')):
            return match.group(0)

        # Resolve relative paths against the input file's directory
        if os.path.isabs(old_path):
            full_old_path = old_path
        else:
            full_old_path = os.path.join(base_dir, old_path)

        filename = os.path.basename(old_path)
        new_full_path = os.path.join(images_dir, filename)

        # Copy the image if it exists
        if os.path.exists(full_old_path):
            try:
                shutil.copy2(full_old_path, new_full_path)
                print(f"Copied image: {full_old_path} -> {new_full_path}")
            except Exception as e:
                print(f"Error copying image {full_old_path}: {e}", file=sys.stderr)
        else:
            print(f"Warning: Image file not found: {full_old_path}", file=sys.stderr)

        # Document paths always use forward slashes, whatever the platform
        return f".. {directive_type}:: images/{filename}"

    return re.sub(pattern, process_match, content)
def process_html_images(content: str, base_dir: str, assets_dir: str) -> str:
    """
    Process image references in HTML content.

    For each ``<img>`` tag whose ``src`` is a local path, the file is copied
    to ``<assets_dir>/images`` and the ``src`` value is rewritten to
    ``images/<filename>``. Tags are returned unchanged when the source is an
    http(s) URL, an unrecognised template expression, a missing file, or
    when the copy fails.

    Args:
        content: HTML content with image references
        base_dir: Base directory of the input file
        assets_dir: Directory to store images

    Returns:
        Content with updated image references
    """
    # Create the image directory if it doesn't exist
    images_dir = os.path.join(assets_dir, "images")
    os.makedirs(images_dir, exist_ok=True)

    # - "\ssrc=" requires a standalone src attribute (not data-src etc.)
    # - the value runs up to the *same* quote that opened it, so a template
    #   such as {{ url_for('static', filename='x.png') }} inside double
    #   quotes is captured whole
    pattern = re.compile(
        r'<img\b[^>]*?\ssrc=(["\'])(?!https?://)((?:(?!\1).)+)\1[^>]*>',
        re.IGNORECASE | re.DOTALL,
    )

    def process_match(match):
        img_tag = match.group(0)
        path = match.group(2)

        # Skip URLs
        if path.lower().startswith(('http://', 'https://')):
            return img_tag

        # Handle templated paths
        if "{{" in path:
            # Extract the file from expressions like
            # {{ url_for('static', filename='images/file.png') }}
            template_match = re.search(r'filename=[\'"](.*?)[\'"]', path)
            if template_match:
                path = f"/_static/{template_match.group(1)}"
            else:
                return img_tag  # Can't process this template

        # Remove leading /_static/ if present
        if path.startswith('/_static/'):
            path = path[len('/_static/'):]

        # Resolve relative paths, falling back to the static directory
        if not os.path.isabs(path):
            full_old_path = os.path.join(base_dir, path)
            if not os.path.exists(full_old_path):
                full_old_path = os.path.join(base_dir, 'static', path)
        else:
            full_old_path = path

        filename = os.path.basename(path)
        new_full_path = os.path.join(images_dir, filename)

        # Copy the image if it exists
        if os.path.exists(full_old_path):
            try:
                shutil.copy2(full_old_path, new_full_path)
                print(f"Copied image: {full_old_path} -> {new_full_path}")
            except Exception as e:
                print(f"Error copying image {full_old_path}: {e}", file=sys.stderr)
                return img_tag  # Return original if error
        else:
            print(f"Warning: Image file not found: {full_old_path}", file=sys.stderr)
            return img_tag  # Return original if not found

        # Rewrite only the src value itself; the same text may also occur
        # in other attributes (alt, title, ...) and must not be touched.
        value_start = match.start(2) - match.start(0)
        value_end = match.end(2) - match.start(0)
        return f"{img_tag[:value_start]}images/{filename}{img_tag[value_end:]}"

    return pattern.sub(process_match, content)
def clean_markdown_output(markdown: str) -> str:
    """
    Tidy converted markdown that still carries URL-encoded template calls.

    First, every ``%7B%7B...%7D%7D`` run is URL-decoded; a ``url_for`` call
    with a ``filename=`` argument is replaced by that filename and any other
    template expression is removed. Second, image links whose target is
    empty or still looks like a template get a target guessed from the alt
    text (``images/<alt-with-dashes>.png``).

    Args:
        markdown: Markdown content with possible URL-encoded templates

    Returns:
        Cleaned markdown content
    """
    def decode_template(found):
        raw = found.group(0)
        decoded = urllib.parse.unquote(raw)
        if not (decoded.startswith('{{') and decoded.endswith('}}')):
            return raw
        inner = decoded.strip('{}').strip()
        if 'url_for' in inner and 'filename=' in inner:
            name_hit = re.search(r'filename=[\'"](.*?)[\'"]', inner)
            if name_hit:
                return name_hit.group(1)
        # Any other template expression is dropped from the output
        return ''

    def repair_image(found):
        alt_text, link = found.group(1), found.group(2)
        looks_broken = (not link) or link.startswith(('%7B', '{{'))
        if looks_broken:
            # Derive a file name from the alt text; give up if it is empty
            guessed = alt_text.replace(' ', '-').lower()
            if guessed:
                return f"![{alt_text}](images/{guessed}.png)"
        return found.group(0)

    without_templates = re.sub(r'%7B%7B.*?%7D%7D', decode_template, markdown)
    return re.sub(r'!\[(.*?)\]\((.*?)\)', repair_image, without_templates)
def convert_to_markdown(content: str, is_rst: bool = True) -> str:
    """
    Turn RST or HTML content into Markdown.

    RST is first rendered to HTML with docutils and the HTML is then handed
    to pandoc; HTML input goes to pandoc directly. The result is passed
    through clean_markdown_output. When a required converter is not
    installed, or the conversion raises, a message is printed to stderr and
    the content is returned unchanged.

    Args:
        content: Content to convert
        is_rst: Whether the content is RST (True) or HTML (False)

    Returns:
        Markdown content, or the original content if conversion was not possible
    """
    if not PANDOC_AVAILABLE:
        print("Warning: pypandoc not available. Conversion to Markdown skipped.", file=sys.stderr)
        return content

    try:
        html_source = content
        if is_rst:
            if not DOCUTILS_AVAILABLE:
                print("Warning: docutils not available. RST to HTML conversion skipped.", file=sys.stderr)
                return content
            # Render RST to an HTML string (unicode output, not bytes)
            html_source = docutils.core.publish_string(
                source=content,
                writer_name='html',
                settings_overrides={'output_encoding': 'unicode'}
            )
        converted = pypandoc.convert_text(html_source, 'md', format='html')
        # Remove URL-encoded template leftovers from the pandoc output
        return clean_markdown_output(converted)
    except Exception as e:
        print(f"Error converting to markdown: {e}", file=sys.stderr)
        return content
def is_draft(file_path: str) -> bool:
    """
    Tell whether a path names a draft file.

    A path counts as a draft when it contains ``.draft.`` anywhere,
    compared case-insensitively.

    Args:
        file_path: Path to the file

    Returns:
        True if the file is a draft, False otherwise
    """
    lowered = file_path.lower()
    return lowered.find('.draft.') != -1
def process_file(
    input_path: str,
    po_file_path: str,
    output_path: Optional[str] = None,
    to_markdown: bool = False,
    assets_dir: str = "./assets"
) -> Tuple[bool, str]:
    """
    Process a file by replacing translations and handling images.

    The steps are: load the .po catalog, read the input, expand template
    variables, apply translations, copy/rewrite image references for the
    detected file type, optionally convert to markdown, and write the result.

    When no output path is given, the output is written next to the input as
    ``<name>.translated<ext>`` (``.md`` when converting to markdown); a
    ``.draft`` marker in the file name is dropped, so ``page.draft.rst``
    becomes ``page.translated.rst``.

    Args:
        input_path: Path to input file (RST or HTML)
        po_file_path: Path to .po file with translations
        output_path: Path to write output (default: add .translated suffix)
        to_markdown: Whether to convert to markdown
        assets_dir: Directory to store assets

    Returns:
        Tuple of (success: bool, output_path: str); the path is "" on failure
    """
    # Check that input files exist
    if not os.path.exists(input_path):
        print(f"Error: Input file does not exist: {input_path}", file=sys.stderr)
        return False, ""
    if not os.path.exists(po_file_path):
        print(f"Error: PO file does not exist: {po_file_path}", file=sys.stderr)
        return False, ""

    # Determine file type
    lowered_input = input_path.lower()
    is_rst = lowered_input.endswith('.rst')
    is_html = lowered_input.endswith(('.html', '.htm'))
    if not (is_rst or is_html):
        print(f"Error: Unsupported file type: {input_path}. Only .rst, .html, and .htm files are supported.",
              file=sys.stderr)
        return False, ""

    # Determine output path if not specified
    if not output_path:
        base, ext = os.path.splitext(input_path)
        # splitext turns "page.draft.rst" into ("page.draft", ".rst"), so the
        # marker sits at the END of the root. Strip it from the file name
        # only, case-insensitively, leaving directory names alone.
        head, tail = os.path.split(base)
        tail = re.sub(r'\.draft(?=\.|$)', '', tail, flags=re.IGNORECASE)
        base = os.path.join(head, tail)
        out_ext = '.md' if to_markdown else ext
        output_path = f"{base}.translated{out_ext}"

    # Create assets directory
    os.makedirs(assets_dir, exist_ok=True)

    try:
        # Load translations; an empty catalog is only a warning
        translations = load_translations(po_file_path)
        if not translations:
            print("Warning: No translations found in the .po file", file=sys.stderr)

        # Read input file
        with open(input_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Template variables are expanded before translation lookup
        content = replace_template_vars(content)
        content = replace_translations(content, translations)

        # Process images based on file type
        base_dir = os.path.dirname(os.path.abspath(input_path))
        if is_rst:
            content = process_rst_images(content, base_dir, assets_dir)
        elif is_html:
            content = process_html_images(content, base_dir, assets_dir)

        # Convert to markdown if requested
        if to_markdown:
            content = convert_to_markdown(content, is_rst=is_rst)

        # Create directory for output file if it doesn't exist
        os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)

        # Write output file
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(content)

        print(f"Successfully processed {input_path} -> {output_path}")
        return True, output_path
    except Exception as e:
        print(f"Error processing {input_path}: {e}", file=sys.stderr)
        return False, ""
def find_files(directory: str, include_drafts: bool = False) -> List[str]:
    """
    Walk a directory tree and collect its RST and HTML files.

    A file qualifies when its name ends in .rst, .html or .htm (compared
    case-insensitively). Names containing ``.draft.`` are skipped unless
    include_drafts is set.

    Args:
        directory: Directory to search
        include_drafts: Whether to include draft files

    Returns:
        List of file paths
    """
    wanted_suffixes = ('.rst', '.html', '.htm')
    collected = []
    for root, _, files in os.walk(directory):
        for name in files:
            lowered = name.lower()
            if not lowered.endswith(wanted_suffixes):
                continue
            if '.draft.' in lowered and not include_drafts:
                continue
            collected.append(os.path.join(root, name))
    return collected
def process_directory(
    input_dir: str,
    po_file_path: str,
    output_dir: Optional[str] = None,
    to_markdown: bool = False,
    assets_dir: str = "./assets",
    include_drafts: bool = False
) -> bool:
    """
    Process all RST and HTML files in a directory.

    Files are located with find_files and handed to process_file one by one.
    With an output directory, each file keeps its path relative to
    input_dir, gets a ``.md`` extension when converting to markdown, and has
    a ``.draft`` marker removed from its file name. Without one, process_file
    chooses the output path itself.

    Args:
        input_dir: Directory containing files
        po_file_path: Path to .po file with translations
        output_dir: Directory to write output files (default: add .translated suffix)
        to_markdown: Whether to convert to markdown
        assets_dir: Directory to store assets
        include_drafts: Whether to include draft files

    Returns:
        True if all files were processed successfully, False otherwise
        (including when the directory is missing or holds no matching files)
    """
    if not os.path.isdir(input_dir):
        print(f"Error: Input directory does not exist: {input_dir}", file=sys.stderr)
        return False

    # Find all RST and HTML files in the directory
    input_files = find_files(input_dir, include_drafts)
    if not input_files:
        print(f"No RST or HTML files found in {input_dir}", file=sys.stderr)
        return False

    success = True
    for input_file in input_files:
        out_path = None
        if output_dir:
            # Mirror the input tree below the output directory
            rel_path = os.path.relpath(input_file, input_dir)
            out_path = os.path.join(output_dir, rel_path)
            # Adjust extension if converting to markdown
            if to_markdown:
                out_path = os.path.splitext(out_path)[0] + '.md'
            # Drop the draft marker from the FILE NAME only, so that a
            # directory such as "my.drafts/" is not mangled; match it
            # case-insensitively, as the draft detection does.
            head, tail = os.path.split(out_path)
            tail = re.sub(r'\.draft(?=\.|$)', '', tail, flags=re.IGNORECASE)
            out_path = os.path.join(head, tail)

        # Process the file; keep going after a failure but remember it
        file_success, _ = process_file(
            input_file,
            po_file_path,
            out_path,
            to_markdown,
            assets_dir
        )
        if not file_success:
            success = False
    return success
def main():
    """
    Command-line entry point: parse arguments, run the processing, exit.

    The process exits with status 0 when processing succeeded and 1
    otherwise. A directory input is only processed when --recursive is
    given.
    """
    import locale
    import os

    # Pin the 'C' locale so that gettext/locale lookups cannot interfere
    os.environ['LC_ALL'] = 'C'
    locale.setlocale(locale.LC_ALL, 'C')

    parser = argparse.ArgumentParser(
        description='Process RST and HTML files by replacing translations and handling images'
    )
    # Positional arguments
    parser.add_argument('input_path', help='Path to input file or directory')
    parser.add_argument('po_file_path', help='Path to .po file with translations')
    # Options
    parser.add_argument('-o', '--output-path', help='Path to write output file or directory')
    parser.add_argument('--to-markdown', action='store_true', help='Convert output to markdown')
    parser.add_argument('--assets-dir', default='./assets', help='Directory to store assets (default: ./assets)')
    parser.add_argument('--include-drafts', action='store_true', help='Process draft files (ending in .draft.rst or .draft.html)')
    parser.add_argument('--recursive', action='store_true', help='Process directories recursively')
    args = parser.parse_args()

    # Markdown was requested but the converter is missing: warn, don't abort
    if args.to_markdown and not PANDOC_AVAILABLE:
        print("Warning: Markdown conversion requires pypandoc package.", file=sys.stderr)
        print("Install it with: pip install pypandoc", file=sys.stderr)

    input_is_dir = os.path.isdir(args.input_path)
    if input_is_dir and not args.recursive:
        print("Error: Input path is a directory. Use --recursive to process it.", file=sys.stderr)
        success = False
    elif input_is_dir:
        success = process_directory(
            args.input_path,
            args.po_file_path,
            args.output_path,
            args.to_markdown,
            args.assets_dir,
            args.include_drafts
        )
    else:
        # Anything that is not a directory is treated as a single file
        success, _ = process_file(
            args.input_path,
            args.po_file_path,
            args.output_path,
            args.to_markdown,
            args.assets_dir
        )

    exit_code = 0 if success else 1
    sys.exit(exit_code)
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()