Files
www/main.py

641 lines
22 KiB
Python
Raw Normal View History

2025-05-06 20:31:59 -04:00
#!/usr/bin/env python3
"""
Translation Processor

A script to process reStructuredText and HTML files by replacing translation tags
with content from .po files, handling image references, and optionally
converting to markdown.
"""
import argparse
import os
import re
import shutil
import sys
2025-05-06 22:23:32 -04:00
import urllib.parse
from typing import Dict, List, Optional, Tuple
2025-05-06 20:31:59 -04:00
import polib
# Optional dependency for markdown conversion
# Each probe records whether the import succeeded in a module-level flag so
# that later code can check the flag instead of failing at import time.
try:
    import docutils.core
    DOCUTILS_AVAILABLE = True
except ImportError:
    # docutils is not installed; the flag lets callers skip the RST -> HTML step.
    DOCUTILS_AVAILABLE = False
try:
    import pypandoc
    PANDOC_AVAILABLE = True
except ImportError:
    # pypandoc is not installed; the flag lets callers skip Markdown conversion.
    PANDOC_AVAILABLE = False
def load_translations(po_file_path: str) -> Dict[str, str]:
    """
    Build a msgid -> msgstr lookup table from a .po catalogue.

    Args:
        po_file_path: Path to the .po file

    Returns:
        Dictionary mapping original text to translated text. Entries with an
        empty msgid or an empty msgstr are left out. If reading the catalogue
        fails for any reason, the error is printed to stderr and an empty
        dictionary is returned.
    """
    try:
        catalogue = polib.pofile(po_file_path)
        # Only keep entries that actually carry a translation.
        return {
            item.msgid: item.msgstr
            for item in catalogue
            if item.msgstr and item.msgid
        }
    except Exception as e:
        print(f"Error loading translations from {po_file_path}: {e}", file=sys.stderr)
        return {}
2025-05-06 22:23:32 -04:00
def replace_template_vars(content: str) -> str:
"""
Replace template variables like {{spec_url()}} with their values.
Args:
content: Content with template variables
Returns:
Content with template variables replaced
"""
# Define base URLs for different types of links
base_urls = {
'spec_url': 'https://i2p.net/spec/',
'proposal_url': 'https://i2p.net/spec/proposals/proposal',
'i2p_url': 'https://i2p.net/',
'site_url': 'https://i2p.net/',
'get_url': 'https://i2p.net/'
}
# Handle {{spec_url("name")}} pattern
def replace_spec_url(match):
func_name = match.group(1)
arg = match.group(2).strip('"\'') if match.group(2) else ""
if func_name in base_urls:
if func_name == 'proposal_url':
return f"{base_urls[func_name]}{arg}.html"
else:
return f"{base_urls[func_name]}{arg}"
# Handle special cases for other template functions
if func_name == 'url_for':
# Extract the filename from patterns like url_for('static', filename='images/...')
filename_match = re.search(r'filename=[\'"](.*?)[\'"]', arg)
if filename_match:
return f"/_static/{filename_match.group(1)}"
elif func_name == 'i2pconv':
# For i2p domain conversions, return as is
return arg
return match.group(0) # Return unchanged if not recognized
# This pattern matches template functions like {{spec_url("ntcp2")}}
template_pattern = r'{{([a-zA-Z_]+)\(([^}]*?)\)}}'
processed_content = re.sub(template_pattern, replace_spec_url, content)
# Handle other simple variable substitutions like {{ _('text') }}
def replace_simple_var(match):
var_name = match.group(1).strip()
# For translation function calls like _('text'), return just the text
if var_name.startswith("_('") and var_name.endswith("')"):
return var_name[3:-2] # Extract the text between quotes
return match.group(0) # Return unchanged if not recognized
simple_var_pattern = r'{{([^}]+?)}}'
return re.sub(simple_var_pattern, replace_simple_var, processed_content)
2025-05-06 20:31:59 -04:00
def replace_translations(content: str, translations: Dict[str, str]) -> str:
    """
    Replace translation tags in the content with translated text.

    Every ``{% trans ... %}...{% endtrans %}`` block is replaced by the
    translation of its (whitespace-stripped) body, or by the body itself when
    no translation is known. The whitespace-control markers ``{%-`` / ``-%}``
    are accepted on either tag in any combination, with or without
    ``name="value"`` parameters.

    Parameters given as quoted literals are substituted for ``{name}``
    placeholders, both in the text used for the lookup and in the translated
    result.

    NOTE(review): Jinja's i18n extension references parameters as
    ``{{ name }}`` and extracts them as ``%(name)s`` in msgids; only the
    ``{name}`` form is handled here - confirm against the .po files.

    Args:
        content: Content with translation tags
        translations: Dictionary of translations

    Returns:
        Content with translations applied
    """
    def replace_block(match):
        params_str = match.group(1) or ""
        text = match.group(2).strip()

        # Collect name="value" parameters and resolve their placeholders in
        # the source text before it is looked up.
        params = {}
        for param_match in re.finditer(r'(\w+)=["\'](.*?)["\']', params_str):
            key, value = param_match.groups()
            params[key] = value
            text = text.replace(f"{{{key}}}", value)

        # Apply translation, falling back to the original text
        translated = translations.get(text, text)

        # A translation may still carry the placeholders; fill them in too.
        for key, value in params.items():
            translated = translated.replace(f"{{{key}}}", value)
        return translated

    # The parameter group must not run past the end of the opening tag
    # ("%}"); otherwise an opening tag without "-%}" could be paired with a
    # "-%}" much later in the document and swallow everything in between.
    block_pattern = (
        r'\{%-?\s*trans\b((?:(?!%\}).)*?)-?%\}'
        r'(.*?)'
        r'\{%-?\s*endtrans\s*-?%\}'
    )
    return re.sub(block_pattern, replace_block, content, flags=re.DOTALL)
2025-05-06 20:31:59 -04:00
2025-05-06 22:23:32 -04:00
def process_rst_images(content: str, base_dir: str, assets_dir: str) -> str:
    """
    Process image references in RST content.

    Each ``.. image::`` / ``.. figure::`` target that refers to a local file
    is copied into ``<assets_dir>/images`` and the directive is rewritten to
    ``images/<filename>``. Remote ``http://`` / ``https://`` targets are left
    untouched, matching the HTML image handling. A local target that cannot
    be found or copied is reported on stderr; the directive is still
    rewritten.

    Args:
        content: RST content with image references
        base_dir: Base directory of the input file
        assets_dir: Directory to store images

    Returns:
        Content with updated image references
    """
    # Create assets directory if it doesn't exist
    images_dir = os.path.join(assets_dir, "images")
    os.makedirs(images_dir, exist_ok=True)

    # This pattern matches both basic image directives and figure directives
    pattern = r'\.\.\s+(image|figure)::\s+([\S]+)'

    def process_match(match):
        directive_type = match.group(1)  # image or figure
        old_path = match.group(2).strip()

        # Remote images cannot be copied; keep the directive as written.
        if old_path.startswith(('http://', 'https://')):
            return match.group(0)

        # Relative paths are resolved against the input file's directory
        if os.path.isabs(old_path):
            full_old_path = old_path
        else:
            full_old_path = os.path.join(base_dir, old_path)

        filename = os.path.basename(old_path)
        new_full_path = os.path.join(images_dir, filename)

        # Copy the image if it exists
        if os.path.exists(full_old_path):
            try:
                shutil.copy2(full_old_path, new_full_path)
                print(f"Copied image: {full_old_path} -> {new_full_path}")
            except Exception as e:
                print(f"Error copying image {full_old_path}: {e}", file=sys.stderr)
        else:
            print(f"Warning: Image file not found: {full_old_path}", file=sys.stderr)

        # The reference goes into a document, so always use "/" regardless
        # of the operating system's path separator.
        return f".. {directive_type}:: images/{filename}"

    return re.sub(pattern, process_match, content)
2025-05-06 22:23:32 -04:00
def process_html_images(content: str, base_dir: str, assets_dir: str) -> str:
    """
    Process image references in HTML content.

    For every ``<img>`` tag whose ``src`` is not an http(s) URL, the
    referenced file is copied into ``<assets_dir>/images`` and the ``src``
    value is rewritten to ``images/<filename>``. Templated values such as
    ``{{ url_for('static', filename='images/x.png') }}`` are resolved through
    their ``filename=`` argument. Tags whose image cannot be resolved, found
    or copied are returned unchanged.

    Args:
        content: HTML content with image references
        base_dir: Base directory of the input file
        assets_dir: Directory to store images

    Returns:
        Content with updated image references
    """
    # Create assets directory if it doesn't exist
    images_dir = os.path.join(assets_dir, "images")
    os.makedirs(images_dir, exist_ok=True)

    # Group 1 is the quote character, group 2 the src value. The value ends
    # at the *same* quote that opened it, so a template expression that
    # contains the other kind of quote is captured in full.
    pattern = r'<img\s+[^>]*src=(["\'])((?!https?://).+?)\1[^>]*>'
    static_prefix = '/_static/'

    def process_match(match):
        img_tag = match.group(0)
        path = match.group(2)

        # Skip URLs
        if path.startswith(('http://', 'https://')):
            return img_tag

        # Handle templated paths
        if "{{" in path:
            template_match = re.search(r'filename=[\'"](.*?)[\'"]', path)
            if not template_match:
                return img_tag  # Can't process this template
            path = f"{static_prefix}{template_match.group(1)}"

        # Remove leading /_static/ if present
        if path.startswith(static_prefix):
            path = path[len(static_prefix):]

        # Relative paths are tried next to the input file first, then under
        # its "static" directory.
        if os.path.isabs(path):
            full_old_path = path
        else:
            full_old_path = os.path.join(base_dir, path)
            if not os.path.exists(full_old_path):
                full_old_path = os.path.join(base_dir, 'static', path)

        filename = os.path.basename(path)
        new_full_path = os.path.join(images_dir, filename)

        if not os.path.exists(full_old_path):
            print(f"Warning: Image file not found: {full_old_path}", file=sys.stderr)
            return img_tag  # Return original if not found

        try:
            shutil.copy2(full_old_path, new_full_path)
            print(f"Copied image: {full_old_path} -> {new_full_path}")
        except Exception as e:
            print(f"Error copying image {full_old_path}: {e}", file=sys.stderr)
            return img_tag  # Return original if error

        # Replace only the src value itself; a plain str.replace would also
        # rewrite the same text where it appears in other attributes.
        start = match.start(2) - match.start(0)
        end = match.end(2) - match.start(0)
        return f"{img_tag[:start]}images/{filename}{img_tag[end:]}"

    return re.sub(pattern, process_match, content, flags=re.DOTALL)
def clean_markdown_output(markdown: str) -> str:
    """
    Tidy converted Markdown that still carries URL-encoded template calls.

    First every ``%7B%7B...%7D%7D`` run (a percent-encoded ``{{...}}``) is
    decoded: a ``url_for`` call with a ``filename=`` argument collapses to
    that filename, any other template expression collapses to an empty
    string. Afterwards image links whose target is empty or still looks like
    a template get a target guessed from the alt text.

    Args:
        markdown: Markdown content with possible URL-encoded templates

    Returns:
        Cleaned markdown content
    """
    def decode_template(hit):
        raw = hit.group(0)
        plain = urllib.parse.unquote(raw)
        if not (plain.startswith('{{') and plain.endswith('}}')):
            return raw
        expression = plain.strip('{}').strip()
        # url_for('static', filename='images/file.png') -> images/file.png
        if 'url_for' in expression and 'filename=' in expression:
            found = re.search(r'filename=[\'"](.*?)[\'"]', expression)
            if found:
                return found.group(1)
        # Any other template expression is dropped
        return ''

    def guess_image_target(hit):
        alt_text, link = hit.group(1), hit.group(2)
        looks_broken = (not link) or link.startswith(('%7B', '{{'))
        if looks_broken:
            # Derive a file name from the alt text: spaces -> dashes, lowercase
            guessed = alt_text.replace(' ', '-').lower()
            if guessed:
                return f"![{alt_text}](images/{guessed}.png)"
        return hit.group(0)

    without_templates = re.sub(r'%7B%7B.*?%7D%7D', decode_template, markdown)
    return re.sub(r'!\[(.*?)\]\((.*?)\)', guess_image_target, without_templates)
def convert_to_markdown(content: str, is_rst: bool = True) -> str:
    """
    Turn RST or HTML content into Markdown.

    RST is first rendered to HTML with docutils and the HTML is then handed
    to pypandoc; HTML input goes to pypandoc directly. The result is passed
    through clean_markdown_output. When a required package is unavailable or
    the conversion raises, a message is printed to stderr and the content is
    returned unconverted.

    Args:
        content: Content to convert
        is_rst: Whether the content is RST (True) or HTML (False)

    Returns:
        Markdown content, or the original content if conversion was not possible
    """
    if not PANDOC_AVAILABLE:
        print("Warning: pypandoc not available. Conversion to Markdown skipped.", file=sys.stderr)
        return content

    try:
        html_input = content
        if is_rst:
            if not DOCUTILS_AVAILABLE:
                print("Warning: docutils not available. RST to HTML conversion skipped.", file=sys.stderr)
                return content
            # RST has to go through HTML before pandoc sees it
            html_input = docutils.core.publish_string(
                source=content,
                writer_name='html',
                settings_overrides={'output_encoding': 'unicode'}
            )

        converted = pypandoc.convert_text(html_input, 'md', format='html')
        # Post-process markdown to clean up URL-encoded template variables
        return clean_markdown_output(converted)
    except Exception as e:
        print(f"Error converting to markdown: {e}", file=sys.stderr)
        return content
2025-05-06 20:31:59 -04:00
2025-05-06 22:23:32 -04:00
def is_draft(file_path: str) -> bool:
    """
    Tell whether a path names a draft file.

    A path counts as a draft when it contains ``.draft.`` in any letter case.

    Args:
        file_path: Path to the file

    Returns:
        True if the file is a draft, False otherwise
    """
    lowered = file_path.lower()
    return lowered.find('.draft.') >= 0
def _default_output_path(input_path: str, to_markdown: bool) -> str:
    """Derive "<name>.translated<ext>" next to the input, dropping a ".draft" marker from the file name."""
    name = os.path.basename(input_path)
    prefix = input_path[:len(input_path) - len(name)]
    # Only touch the file name: "foo.draft.rst" -> "foo.rst". Directory
    # names are left alone so the output stays next to the input.
    name = re.sub(r'(?<=.)\.draft(?=\.)', '', name, flags=re.IGNORECASE)
    stem, ext = os.path.splitext(name)
    out_ext = '.md' if to_markdown else ext
    return f"{prefix}{stem}.translated{out_ext}"


def process_file(
    input_path: str,
    po_file_path: str,
    output_path: Optional[str] = None,
    to_markdown: bool = False,
    assets_dir: str = "./assets"
) -> Tuple[bool, str]:
    """
    Process a file by replacing translations and handling images.

    The steps are: resolve template variables, apply translations, copy and
    re-link images, optionally convert to Markdown, then write the result.
    All failures are reported on stderr rather than raised.

    Args:
        input_path: Path to input file (RST or HTML)
        po_file_path: Path to .po file with translations
        output_path: Path to write output (default: the input path with a
            ".translated" suffix before the extension and any ".draft"
            marker removed from the file name)
        to_markdown: Whether to convert to markdown
        assets_dir: Directory to store assets

    Returns:
        Tuple of (success: bool, output_path: str); the path is "" on failure
    """
    # Check that input files exist
    if not os.path.exists(input_path):
        print(f"Error: Input file does not exist: {input_path}", file=sys.stderr)
        return False, ""
    if not os.path.exists(po_file_path):
        print(f"Error: PO file does not exist: {po_file_path}", file=sys.stderr)
        return False, ""

    # Determine file type
    lowered_path = input_path.lower()
    is_rst = lowered_path.endswith('.rst')
    is_html = lowered_path.endswith(('.html', '.htm'))
    if not (is_rst or is_html):
        print(f"Error: Unsupported file type: {input_path}. Only .rst, .html, and .htm files are supported.",
              file=sys.stderr)
        return False, ""

    # Determine output path if not specified
    if not output_path:
        output_path = _default_output_path(input_path, to_markdown)

    # Create assets directory
    os.makedirs(assets_dir, exist_ok=True)

    try:
        # Load translations
        translations = load_translations(po_file_path)
        if not translations:
            # Not fatal: the file is still processed with untranslated text
            print("Warning: No translations found in the .po file", file=sys.stderr)

        # Read input file
        with open(input_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Replace template variables
        content = replace_template_vars(content)

        # Replace translations
        content = replace_translations(content, translations)

        # Process images based on file type
        base_dir = os.path.dirname(os.path.abspath(input_path))
        if is_rst:
            content = process_rst_images(content, base_dir, assets_dir)
        elif is_html:
            content = process_html_images(content, base_dir, assets_dir)

        # Convert to markdown if requested
        if to_markdown:
            content = convert_to_markdown(content, is_rst=is_rst)

        # Create directory for output file if it doesn't exist
        os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)

        # Write output file
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(content)

        print(f"Successfully processed {input_path} -> {output_path}")
        return True, output_path
    except Exception as e:
        print(f"Error processing {input_path}: {e}", file=sys.stderr)
        return False, ""
2025-05-06 22:23:32 -04:00
def find_files(directory: str, include_drafts: bool = False) -> List[str]:
    """
    Collect every RST and HTML file below a directory.

    The tree is walked recursively; files are matched on a case-insensitive
    .rst / .html / .htm suffix. Files whose name contains ".draft." (in any
    letter case) are left out unless include_drafts is set.

    Args:
        directory: Directory to search
        include_drafts: Whether to include draft files

    Returns:
        List of file paths
    """
    wanted_suffixes = ('.rst', '.html', '.htm')
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(directory)
        for name in names
        if name.lower().endswith(wanted_suffixes)
        and (include_drafts or '.draft.' not in name.lower())
    ]
def process_directory(
    input_dir: str,
    po_file_path: str,
    output_dir: Optional[str] = None,
    to_markdown: bool = False,
    assets_dir: str = "./assets",
    include_drafts: bool = False
) -> bool:
    """
    Process all RST and HTML files in a directory.

    When output_dir is given, each file is written to the same relative
    location below it, with the extension switched to .md for Markdown
    output and a ".draft" marker removed from the file name. Without
    output_dir, process_file picks its default output path.

    Args:
        input_dir: Directory containing files
        po_file_path: Path to .po file with translations
        output_dir: Directory to write output files (default: add .translated suffix)
        to_markdown: Whether to convert to markdown
        assets_dir: Directory to store assets
        include_drafts: Whether to include draft files

    Returns:
        True if all files were processed successfully, False otherwise
        (including when the directory is missing or holds no matching files)
    """
    if not os.path.isdir(input_dir):
        print(f"Error: Input directory does not exist: {input_dir}", file=sys.stderr)
        return False

    # Find all RST and HTML files in the directory
    input_files = find_files(input_dir, include_drafts)
    if not input_files:
        print(f"No RST or HTML files found in {input_dir}", file=sys.stderr)
        return False

    success = True
    for input_file in input_files:
        out_path = None
        if output_dir:
            # Mirror the file's location relative to input_dir
            rel_dir, name = os.path.split(os.path.relpath(input_file, input_dir))
            # Adjust extension if converting to markdown
            if to_markdown:
                name = os.path.splitext(name)[0] + '.md'
            # Drop the draft marker from the file name only, so directory
            # names (including output_dir itself) are never altered. The
            # match is case-insensitive, like the draft detection used when
            # collecting files.
            name = re.sub(r'(?<=.)\.draft(?=\.)', '', name, flags=re.IGNORECASE)
            out_path = os.path.join(output_dir, rel_dir, name)

        # Process the file
        file_success, _ = process_file(
            input_file,
            po_file_path,
            out_path,
            to_markdown,
            assets_dir
        )
        if not file_success:
            success = False
    return success
2025-05-06 20:31:59 -04:00
def main():
    """Command-line entry point: parse the arguments, run the processing, exit with 0 on success and 1 on failure."""
    import locale

    # Pin the 'C' locale, both in the environment and for this process, to
    # sidestep locale/gettext issues.
    os.environ['LC_ALL'] = 'C'
    locale.setlocale(locale.LC_ALL, 'C')

    parser = argparse.ArgumentParser(
        description='Process RST and HTML files by replacing translations and handling images'
    )
    parser.add_argument('input_path', help='Path to input file or directory')
    parser.add_argument('po_file_path', help='Path to .po file with translations')
    parser.add_argument('-o', '--output-path', help='Path to write output file or directory')
    parser.add_argument('--to-markdown', action='store_true', help='Convert output to markdown')
    parser.add_argument('--assets-dir', default='./assets', help='Directory to store assets (default: ./assets)')
    parser.add_argument('--include-drafts', action='store_true', help='Process draft files (ending in .draft.rst or .draft.html)')
    parser.add_argument('--recursive', action='store_true', help='Process directories recursively')
    args = parser.parse_args()

    if args.to_markdown and not PANDOC_AVAILABLE:
        print("Warning: Markdown conversion requires pypandoc package.", file=sys.stderr)
        print("Install it with: pip install pypandoc", file=sys.stderr)

    if not os.path.isdir(args.input_path):
        # Anything that is not a directory is handled as a single file
        success, _ = process_file(
            args.input_path,
            args.po_file_path,
            args.output_path,
            args.to_markdown,
            args.assets_dir
        )
    elif args.recursive:
        success = process_directory(
            args.input_path,
            args.po_file_path,
            args.output_path,
            args.to_markdown,
            args.assets_dir,
            args.include_drafts
        )
    else:
        # Directories are only processed when explicitly requested
        print("Error: Input path is a directory. Use --recursive to process it.", file=sys.stderr)
        success = False

    sys.exit(0 if success else 1)


if __name__ == '__main__':
    main()