From 2c524f049f9aa8a68f485bb654605e3848f18e14 Mon Sep 17 00:00:00 2001 From: eyedeekay Date: Tue, 6 May 2025 22:23:32 -0400 Subject: [PATCH] basics --- clean_markdown.py | 97 ++++++++++ cmd.sh | 12 +- main.py | 476 +++++++++++++++++++++++++++++++++++++++++----- run.sh | 23 ++- 4 files changed, 554 insertions(+), 54 deletions(-) create mode 100644 clean_markdown.py diff --git a/clean_markdown.py b/clean_markdown.py new file mode 100644 index 00000000..2fc6615e --- /dev/null +++ b/clean_markdown.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 +# filepath: clean_markdown.py + +import re +import os +import sys +import argparse +from pathlib import Path + +def clean_markdown_file(file_path, dry_run=False): + """Remove curly-braced elements from markdown files""" + try: + with open(file_path, 'r', encoding='utf-8') as f: + content = f.read() + + # Store original content for comparison + original_content = content + + # Replace {{...}} expressions with empty strings + # Pattern matches {{ followed by any characters (non-greedy) followed by }} + content = re.sub(r'\{\{\s*([^}]*?)\s*\}\}', '', content) + + # Replace {%...%} template tags with empty strings + content = re.sub(r'\{%[^%]*?%\}', '', content) + + # Replace broken links that might result from removing template variables + # e.g., [network database](%7B%7B%20netdb%20%7D%7D) -> [network database]() + content = re.sub(r'\]\(%7B%7B[^)]*?%7D%7D\)', ']()', content) + + # Handle other URL-encoded template variables + content = re.sub(r'%7B%7B[^%]*?%7D%7D', '', content) + + # Fix escaped backslashes that might appear in code blocks + content = re.sub(r'\\\\([`*_{}[\]()#+-.!])', r'\1', content) + + # Clean up any double spaces created by removals + content = re.sub(r' +', ' ', content) + + # Only write if content changed and not in dry run mode + if content != original_content and not dry_run: + with open(file_path, 'w', encoding='utf-8') as f: + f.write(content) + print(f"Cleaned: {file_path}") + return True + elif content != original_content and dry_run: + print(f"Would clean: {file_path} (dry run)") + return True + else: + print(f"No changes needed: {file_path}") + return False + + except Exception as e: + print(f"Error processing {file_path}: {e}") + return False + +def main(): + parser = argparse.ArgumentParser(description="Clean markdown files by removing template variables and expressions.") + parser.add_argument("paths", nargs='+', help="Markdown files or directories to process") + parser.add_argument("--dry-run", action="store_true", help="Show what would be changed without making changes") + parser.add_argument("--recursive", "-r", action="store_true", help="Process directories recursively") + args = parser.parse_args() + + files_processed = 0 + files_changed = 0 + + for path in args.paths: + path_obj = Path(path) + if path_obj.is_file() and path_obj.suffix.lower() in ['.md', '.markdown']: + files_processed += 1 + if clean_markdown_file(path_obj, args.dry_run): + files_changed += 1 + elif path_obj.is_dir(): + if args.recursive: + for md_file in path_obj.glob('**/*.md'): + files_processed += 1 + if clean_markdown_file(md_file, args.dry_run): + files_changed += 1 + for md_file in path_obj.glob('**/*.markdown'): + files_processed += 1 + if clean_markdown_file(md_file, args.dry_run): + files_changed += 1 + else: + for md_file in path_obj.glob('*.md'): + files_processed += 1 + if clean_markdown_file(md_file, args.dry_run): + files_changed += 1 + for md_file in path_obj.glob('*.markdown'): + files_processed += 1 + if clean_markdown_file(md_file, args.dry_run): + files_changed += 1 + else: + print(f"Skipping {path}: Not a markdown file or directory") + + print(f"\nSummary: Processed {files_processed} files, changed {files_changed} files") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/cmd.sh b/cmd.sh index 1134c51f..581be213 100755 --- a/cmd.sh +++ b/cmd.sh @@ -1,9 +1,11 @@ #!/usr/bin/env sh -#outdir is processed+path to output file -outdir=processed/$(dirname $1) -mkdir -p "$outdir" +#outdir is processed/lang/path to output file +outdir="processed/$LANGUAGE/"$(dirname $1) # $1 is the input file input=$1 -output=$(basename $1 .rst).md -python3 "$main" --to-markdown --assets-dir static/ -o "$outdir/$output" "$input" "translations/$LANGUAGE/LC_MESSAGES/blog.po" \ No newline at end of file +output=$(basename $1).md +mkdir -p "$outdir" +echo python3 "$main" --to-markdown --assets-dir static/ -o "$outdir/$output" "$input" "translations/$LANGUAGE/LC_MESSAGES/$posource" +python3 "$main" --to-markdown --assets-dir static/ -o "$outdir/$output" "$input" "translations/$LANGUAGE/LC_MESSAGES/$posource" 2>> err.$LANGUAGE.log 1>> log.$LANGUAGE.log +python3 clean_markdown.py "$outdir/$output" \ No newline at end of file diff --git a/main.py b/main.py index e8ce2edc..03b3d27f 100644 --- a/main.py +++ b/main.py @@ -1,8 +1,8 @@ #!/usr/bin/env python3 """ -RST Translation Processor +Translation Processor -A script to process reStructuredText files by replacing translation tags +A script to process reStructuredText and HTML files by replacing translation tags with content from .po files, handling image references, and optionally converting to markdown. """ @@ -12,7 +12,8 @@ import os import re import shutil import sys -from typing import Dict, Optional, Tuple +import urllib.parse +from typing import Dict, List, Optional, Tuple import polib @@ -52,29 +53,122 @@ def load_translations(po_file_path: str) -> Dict[str, str]: return {} +def replace_template_vars(content: str) -> str: + """ + Replace template variables like {{spec_url()}} with their values. + + Args: + content: Content with template variables + + Returns: + Content with template variables replaced + """ + # Define base URLs for different types of links + base_urls = { + 'spec_url': 'https://i2p.net/spec/', + 'proposal_url': 'https://i2p.net/spec/proposals/proposal', + 'i2p_url': 'https://i2p.net/', + 'site_url': 'https://i2p.net/', + 'get_url': 'https://i2p.net/' + } + + # Handle {{spec_url("name")}} pattern + def replace_spec_url(match): + func_name = match.group(1) + arg = match.group(2).strip('"\'') if match.group(2) else "" + + if func_name in base_urls: + if func_name == 'proposal_url': + return f"{base_urls[func_name]}{arg}.html" + else: + return f"{base_urls[func_name]}{arg}" + + # Handle special cases for other template functions + if func_name == 'url_for': + # Extract the filename from patterns like url_for('static', filename='images/...') + filename_match = re.search(r'filename=[\'"](.*?)[\'"]', arg) + if filename_match: + return f"/_static/{filename_match.group(1)}" + elif func_name == 'i2pconv': + # For i2p domain conversions, return as is + return arg + + return match.group(0) # Return unchanged if not recognized + + # This pattern matches template functions like {{spec_url("ntcp2")}} + template_pattern = r'{{([a-zA-Z_]+)\(([^}]*?)\)}}' + processed_content = re.sub(template_pattern, replace_spec_url, content) + + # Handle other simple variable substitutions like {{ _('text') }} + def replace_simple_var(match): + var_name = match.group(1).strip() + # For translation function calls like _('text'), return just the text + if var_name.startswith("_('") and var_name.endswith("')"): + return var_name[3:-2] # Extract the text between quotes + return match.group(0) # Return unchanged if not recognized + + simple_var_pattern = r'{{([^}]+?)}}' + return re.sub(simple_var_pattern, replace_simple_var, processed_content) + + def replace_translations(content: str, translations: Dict[str, str]) -> str: """ Replace translation tags in the content with translated text. Args: - content: RST content with translation tags + content: Content with translation tags translations: Dictionary of translations Returns: Content with translations applied """ - def replace_match(match): + # First, handle simple {% trans %}...{% endtrans %} blocks + def replace_simple_match(match): text = match.group(1).strip() return translations.get(text, text) # Match {% trans %}...{% endtrans %} patterns - pattern = r'{%\s*trans\s*%}(.*?){%\s*endtrans\s*%}' - return re.sub(pattern, replace_match, content, flags=re.DOTALL) + simple_pattern = r'{%\s*trans\s*%}(.*?){%\s*endtrans\s*%}' + content = re.sub(simple_pattern, replace_simple_match, content, flags=re.DOTALL) + + # Now handle more complex translation blocks with arguments + def replace_complex_match(match): + # Extract parameters if present + params_str = match.group(1) or "" + text = match.group(2).strip() + + # Process parameters (for future use) + params = {} + if params_str: + param_matches = re.finditer(r'(\w+)=["\'](.*?)["\']', params_str) + for param_match in param_matches: + key, value = param_match.groups() + params[key] = value + + # Replace parameter references in the text + if f"{{{key}}}" in text: + text = text.replace(f"{{{key}}}", value) + + # Apply translation + translated = translations.get(text, text) + + # If there are parameter values, they need to be maintained in the translation + for key, value in params.items(): + if f"{{{key}}}" in translated: + translated = translated.replace(f"{{{key}}}", value) + + return translated + + # Match {% trans param1="value" -%}...{%- endtrans %} patterns with optional parameters + complex_pattern = r'{%\s*trans\s*(.*?)-%}(.*?){%-\s*endtrans\s*%}' + content = re.sub(complex_pattern, replace_complex_match, content, flags=re.DOTALL) + + return content -def process_images(content: str, base_dir: str, assets_dir: str) -> str: +def process_rst_images(content: str, base_dir: str, assets_dir: str) -> str: """ - Process image references in the content. + Process image references in RST content. Args: content: RST content with image references @@ -123,41 +217,192 @@ def process_images(content: str, base_dir: str, assets_dir: str) -> str: return re.sub(pattern, process_match, content) -def convert_to_markdown(rst_content: str) -> str: +def process_html_images(content: str, base_dir: str, assets_dir: str) -> str: """ - Convert RST content to Markdown. + Process image references in HTML content. Args: - rst_content: RST content to convert + content: HTML content with image references + base_dir: Base directory of the input file + assets_dir: Directory to store images + + Returns: + Content with updated image references + """ + # Create assets directory if it doesn't exist + images_dir = os.path.join(assets_dir, "images") + os.makedirs(images_dir, exist_ok=True) + + # Find image references in HTML + # This pattern matches tags + pattern = r']*src=["\']((?!https?://)[^"\']+)["\'][^>]*>' + + def process_match(match): + img_tag = match.group(0) + path = match.group(1) + + # Skip URLs + if path.startswith(('http://', 'https://')): + return img_tag + + # Handle templated paths + if "{{" in path: + # Extract paths from template expressions like {{ url_for('static', filename='images/file.png') }} + template_match = re.search(r'filename=[\'"](.*?)[\'"]', path) + if template_match: + path = f"/_static/{template_match.group(1)}" + else: + return img_tag # Can't process this template + + # Remove leading /_static/ if present + if path.startswith('/_static/'): + path = path[9:] # Remove /_static/ prefix + + # Handle relative paths + if not os.path.isabs(path): + full_old_path = os.path.join(base_dir, path) + if not os.path.exists(full_old_path): + # Try looking in static directory + full_old_path = os.path.join(base_dir, 'static', path) + else: + full_old_path = path + + # Extract filename from path + filename = os.path.basename(path) + new_rel_path = os.path.join("images", filename) + new_full_path = os.path.join(assets_dir, new_rel_path) + + # Copy the image if it exists + if os.path.exists(full_old_path): + try: + shutil.copy2(full_old_path, new_full_path) + print(f"Copied image: {full_old_path} -> {new_full_path}") + except Exception as e: + print(f"Error copying image {full_old_path}: {e}", file=sys.stderr) + return img_tag # Return original if error + else: + print(f"Warning: Image file not found: {full_old_path}", file=sys.stderr) + return img_tag # Return original if not found + + # Return the updated img tag with new path + return img_tag.replace(match.group(1), f"images/{filename}") + + return re.sub(pattern, process_match, content) + + +def clean_markdown_output(markdown: str) -> str: + """ + Clean up markdown output by handling URL-encoded template variables. + + Args: + markdown: Markdown content with possible URL-encoded templates + + Returns: + Cleaned markdown content + """ + # Replace URL-encoded template variables + def replace_encoded_templates(match): + # Decode the URL-encoded string + encoded_text = match.group(0) + decoded_text = urllib.parse.unquote(encoded_text) + + # Extract values from template expressions like {{ url_for('static', filename='images/file.png') }} + if decoded_text.startswith('{{') and decoded_text.endswith('}}'): + template_content = decoded_text.strip('{}').strip() + + # Handle url_for template function + if 'url_for' in template_content and 'filename=' in template_content: + filename_match = re.search(r'filename=[\'"](.*?)[\'"]', template_content) + if filename_match: + return filename_match.group(1) + + # Return empty string for other template functions + return '' + + return encoded_text + + # Find URL-encoded sequences that might be template variables + encoded_pattern = r'%7B%7B.*?%7D%7D' + markdown = re.sub(encoded_pattern, replace_encoded_templates, markdown) + + # Clean up any broken image links that might have resulted from template replacements + # Change ![text](broken_link) to ![text](images/filename.ext) when possible + def fix_image_links(match): + alt_text = match.group(1) + link = match.group(2) + + # If link is empty or looks like a broken template + if not link or link.startswith('%7B') or link.startswith('{{'): + # Try to extract image filename from alt text or use a placeholder + filename = alt_text.replace(' ', '-').lower() + if filename: + return f"![{alt_text}](images/{filename}.png)" + + return match.group(0) # Return unchanged + + # Fix image links + image_pattern = r'!\[(.*?)\]\((.*?)\)' + markdown = re.sub(image_pattern, fix_image_links, markdown) + + return markdown + + +def convert_to_markdown(content: str, is_rst: bool = True) -> str: + """ + Convert content to Markdown. + + Args: + content: Content to convert + is_rst: Whether the content is RST (True) or HTML (False) Returns: Markdown content """ - if not DOCUTILS_AVAILABLE: - print("Warning: docutils not available. RST to HTML conversion skipped.", file=sys.stderr) - return rst_content - if not PANDOC_AVAILABLE: - print("Warning: pypandoc not available. HTML to Markdown conversion skipped.", file=sys.stderr) - return rst_content + print("Warning: pypandoc not available. Conversion to Markdown skipped.", file=sys.stderr) + return content try: - # Convert RST to HTML - html = docutils.core.publish_string( - source=rst_content, - writer_name='html', - settings_overrides={'output_encoding': 'unicode'} - ) + if is_rst: + if not DOCUTILS_AVAILABLE: + print("Warning: docutils not available. RST to HTML conversion skipped.", file=sys.stderr) + return content + + # Convert RST to HTML + html = docutils.core.publish_string( + source=content, + writer_name='html', + settings_overrides={'output_encoding': 'unicode'} + ) + # Convert HTML to Markdown + markdown = pypandoc.convert_text(html, 'md', format='html') + else: + # Convert HTML directly to Markdown + markdown = pypandoc.convert_text(content, 'md', format='html') + + # Post-process markdown to clean up URL-encoded template variables + markdown = clean_markdown_output(markdown) - # Convert HTML to Markdown - markdown = pypandoc.convert_text(html, 'md', format='html') return markdown except Exception as e: print(f"Error converting to markdown: {e}", file=sys.stderr) - return rst_content + return content -def process_rst_file( +def is_draft(file_path: str) -> bool: + """ + Check if a file is a draft based on its name. + + Args: + file_path: Path to the file + + Returns: + True if the file is a draft, False otherwise + """ + return '.draft.' in file_path.lower() + + +def process_file( input_path: str, po_file_path: str, output_path: Optional[str] = None, @@ -165,10 +410,10 @@ def process_rst_file( assets_dir: str = "./assets" ) -> Tuple[bool, str]: """ - Process an RST file by replacing translations and handling images. + Process a file by replacing translations and handling images. Args: - input_path: Path to input RST file + input_path: Path to input file (RST or HTML) po_file_path: Path to .po file with translations output_path: Path to write output (default: add .translated suffix) to_markdown: Whether to convert to markdown @@ -186,10 +431,29 @@ def process_rst_file( print(f"Error: PO file does not exist: {po_file_path}", file=sys.stderr) return False, "" + # Determine file type + is_rst = input_path.lower().endswith('.rst') + is_html = input_path.lower().endswith(('.html', '.htm')) + + if not (is_rst or is_html): + print(f"Error: Unsupported file type: {input_path}. Only .rst, .html, and .htm files are supported.", + file=sys.stderr) + return False, "" + # Determine output path if not specified if not output_path: base, ext = os.path.splitext(input_path) - output_path = f"{base}.translated{'.md' if to_markdown else ext}" + # Handle .draft.rst/.draft.html case + if '.draft.' in base.lower(): + base = base.replace('.draft', '') + + # Set extension based on conversion type + if to_markdown: + out_ext = '.md' + else: + out_ext = ext + + output_path = f"{base}.translated{out_ext}" # Create assets directory os.makedirs(assets_dir, exist_ok=True) @@ -204,16 +468,25 @@ def process_rst_file( with open(input_path, 'r', encoding='utf-8') as f: content = f.read() + # Replace template variables + content = replace_template_vars(content) + # Replace translations content = replace_translations(content, translations) - # Process images + # Process images based on file type base_dir = os.path.dirname(os.path.abspath(input_path)) - content = process_images(content, base_dir, assets_dir) + if is_rst: + content = process_rst_images(content, base_dir, assets_dir) + elif is_html: + content = process_html_images(content, base_dir, assets_dir) # Convert to markdown if requested if to_markdown: - content = convert_to_markdown(content) + content = convert_to_markdown(content, is_rst=is_rst) + + # Create directory for output file if it doesn't exist + os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True) # Write output file with open(output_path, 'w', encoding='utf-8') as f: @@ -227,30 +500,139 @@ def process_rst_file( return False, "" +def find_files(directory: str, include_drafts: bool = False) -> List[str]: + """ + Find all RST and HTML files in a directory recursively. + + Args: + directory: Directory to search + include_drafts: Whether to include draft files + + Returns: + List of file paths + """ + result_files = [] + for root, _, files in os.walk(directory): + for file in files: + if file.lower().endswith(('.rst', '.html', '.htm')): + # Skip draft files if not included + if not include_drafts and '.draft.' in file.lower(): + continue + result_files.append(os.path.join(root, file)) + return result_files + + +def process_directory( + input_dir: str, + po_file_path: str, + output_dir: Optional[str] = None, + to_markdown: bool = False, + assets_dir: str = "./assets", + include_drafts: bool = False +) -> bool: + """ + Process all RST and HTML files in a directory. + + Args: + input_dir: Directory containing files + po_file_path: Path to .po file with translations + output_dir: Directory to write output files (default: add .translated suffix) + to_markdown: Whether to convert to markdown + assets_dir: Directory to store assets + include_drafts: Whether to include draft files + + Returns: + True if all files were processed successfully, False otherwise + """ + if not os.path.isdir(input_dir): + print(f"Error: Input directory does not exist: {input_dir}", file=sys.stderr) + return False + + # Find all RST and HTML files in the directory + input_files = find_files(input_dir, include_drafts) + if not input_files: + print(f"No RST or HTML files found in {input_dir}", file=sys.stderr) + return False + + success = True + for input_file in input_files: + # Determine output path based on relative path from input_dir + rel_path = os.path.relpath(input_file, input_dir) + if output_dir: + out_path = os.path.join(output_dir, rel_path) + # Adjust extension if converting to markdown + if to_markdown: + out_path = os.path.splitext(out_path)[0] + '.md' + # Remove .draft from path if present + if '.draft.' in out_path: + out_path = out_path.replace('.draft', '') + else: + out_path = None + + # Process the file + file_success, _ = process_file( + input_file, + po_file_path, + out_path, + to_markdown, + assets_dir + ) + + if not file_success: + success = False + + return success + + def main(): """Parse arguments and run the script.""" + # Fix for locale/gettext issues by forcing English locale + import locale + import os + # Force 'C' locale to avoid gettext issues + os.environ['LC_ALL'] = 'C' + locale.setlocale(locale.LC_ALL, 'C') + parser = argparse.ArgumentParser( - description='Process RST files by replacing translations and handling images' + description='Process RST and HTML files by replacing translations and handling images' ) - parser.add_argument('input_path', help='Path to input RST file') + parser.add_argument('input_path', help='Path to input file or directory') parser.add_argument('po_file_path', help='Path to .po file with translations') - parser.add_argument('-o', '--output-path', help='Path to write output file') + parser.add_argument('-o', '--output-path', help='Path to write output file or directory') parser.add_argument('--to-markdown', action='store_true', help='Convert output to markdown') parser.add_argument('--assets-dir', default='./assets', help='Directory to store assets (default: ./assets)') + parser.add_argument('--include-drafts', action='store_true', help='Process draft files (ending in .draft.rst or .draft.html)') + parser.add_argument('--recursive', action='store_true', help='Process directories recursively') args = parser.parse_args() - if args.to_markdown and not (DOCUTILS_AVAILABLE and PANDOC_AVAILABLE): - print("Warning: Markdown conversion requires docutils and pypandoc packages.", file=sys.stderr) - print("Install them with: pip install docutils pypandoc", file=sys.stderr) + if args.to_markdown and not PANDOC_AVAILABLE: + print("Warning: Markdown conversion requires pypandoc package.", file=sys.stderr) + print("Install it with: pip install pypandoc", file=sys.stderr) - success, output_path = process_rst_file( - args.input_path, - args.po_file_path, - args.output_path, - args.to_markdown, - args.assets_dir - ) + # Check if input path is a directory + if os.path.isdir(args.input_path): + if args.recursive: + success = process_directory( + args.input_path, + args.po_file_path, + args.output_path, + args.to_markdown, + args.assets_dir, + args.include_drafts + ) + else: + print("Error: Input path is a directory. Use --recursive to process it.", file=sys.stderr) + success = False + else: + # Process a single file + success, _ = process_file( + args.input_path, + args.po_file_path, + args.output_path, + args.to_markdown, + args.assets_dir + ) sys.exit(0 if success else 1) diff --git a/run.sh b/run.sh index 54ffe910..b27ce78b 100755 --- a/run.sh +++ b/run.sh @@ -2,5 +2,24 @@ export main=$(pwd)/main.py export cmd=$(pwd)/cmd.sh -export LANGUAGE=ru -find blog -name '*.rst' -exec "$cmd" {} \; \ No newline at end of file + + +export LANGUAGES="ar az ca cs da de el es es_AR et_EE fa fi fr gl he hu id it ja ko mg nb nl pl pt pt_BR ro ru sk sl sq sv tr uk zh zh_TW" +# produce translated files for all languages and all directories +for lang in $LANGUAGES; do + export LANGUAGE=$lang + export dir=pages + export posource=docs.po + export ext=html + echo "Processing $lang in $dir for $ext" + find "$dir" -name "*.$ext" -exec "$cmd" {} \; +done + +for lang in $LANGUAGES; do + export LANGUAGE=$lang + export dir=pages + export posource=blog.po + export ext=rst + echo "Processing $lang in $dir for $ext" + find "$dir" -name "*.$ext" -exec "$cmd" {} \; +done