This commit is contained in:
eyedeekay
2025-05-06 22:23:32 -04:00
parent 099860e818
commit 2c524f049f
4 changed files with 554 additions and 54 deletions

97
clean_markdown.py Normal file
View File

@ -0,0 +1,97 @@
#!/usr/bin/env python3
# filepath: clean_markdown.py
import re
import os
import sys
import argparse
from pathlib import Path
def clean_markdown_file(file_path, dry_run=False):
    """Strip leftover template markup from one markdown file, in place.

    Removes ``{{ ... }}`` expressions, ``{% ... %}`` tags and their
    URL-encoded ``%7B%7B ... %7D%7D`` forms, drops a double backslash placed
    in front of markdown punctuation, and collapses the runs of spaces that
    the removals leave inside a line.

    Args:
        file_path: Path of the markdown file; read and written as UTF-8.
        dry_run: When true, only report what would change; nothing is written.

    Returns:
        True if the file was (or, in a dry run, would be) modified. False if
        no change was needed or the file could not be read or written.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            original_content = f.read()

        content = original_content

        # {{ expression }} -> removed.
        content = re.sub(r'\{\{\s*([^}]*?)\s*\}\}', '', content)

        # {% tag %} -> removed.
        content = re.sub(r'\{%[^%]*?%\}', '', content)

        # Links whose whole target was a URL-encoded template keep their
        # text but get an empty target:
        # [network database](%7B%7B%20netdb%20%7D%7D) -> [network database]()
        content = re.sub(r'\]\(%7B%7B[^)]*?%7D%7D\)', ']()', content)

        # Any other URL-encoded template.  The body must be allowed to
        # contain '%', because encoded templates are full of %20, %28, ...;
        # excluding '%' made this rule skip every spaced template.
        content = re.sub(r'%7B%7B.*?%7D%7D', '', content)

        # Two backslashes in front of markdown punctuation -> the bare
        # character.  '-' is escaped so that "+-." is three literal
        # characters rather than a range that also swallows ','.
        content = re.sub(r'\\\\([`*_{}\[\]()#+\-.!])', r'\1', content)

        # Collapse runs of spaces *between* words only.  Leading indentation
        # (nested lists, indented code) and trailing hard-break spaces are
        # structural in markdown and must survive.
        content = re.sub(r'(?<=\S) {2,}(?=\S)', ' ', content)

        if content == original_content:
            print(f"No changes needed: {file_path}")
            return False

        if dry_run:
            print(f"Would clean: {file_path} (dry run)")
            return True

        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)
        print(f"Cleaned: {file_path}")
        return True
    except (OSError, UnicodeError) as e:
        # Best effort: one unreadable file must not abort a batch run.
        print(f"Error processing {file_path}: {e}")
        return False
def _iter_markdown_files(directory, recursive):
    """Yield the markdown files in *directory*, optionally descending into it.

    Matching is by case-insensitive suffix (the same rule main() applies to
    a path given directly), and only regular files are yielded, so a
    directory that happens to be named ``something.md`` is not passed on.
    Results are sorted so the processing order is reproducible.
    """
    pattern = '**/*' if recursive else '*'
    for entry in sorted(directory.glob(pattern)):
        if entry.is_file() and entry.suffix.lower() in ('.md', '.markdown'):
            yield entry


def main():
    """Command-line entry point: clean every markdown file named on argv.

    Each positional argument is either a ``.md``/``.markdown`` file or a
    directory.  Directories are scanned one level deep unless
    ``--recursive`` is given.  ``--dry-run`` is forwarded to
    clean_markdown_file so nothing is written.  A summary line with the
    processed/changed counts is printed at the end.
    """
    parser = argparse.ArgumentParser(description="Clean markdown files by removing template variables and expressions.")
    parser.add_argument("paths", nargs='+', help="Markdown files or directories to process")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be changed without making changes")
    parser.add_argument("--recursive", "-r", action="store_true", help="Process directories recursively")
    args = parser.parse_args()

    files_processed = 0
    files_changed = 0

    for path in args.paths:
        path_obj = Path(path)
        if path_obj.is_file() and path_obj.suffix.lower() in ['.md', '.markdown']:
            targets = [path_obj]
        elif path_obj.is_dir():
            targets = _iter_markdown_files(path_obj, args.recursive)
        else:
            print(f"Skipping {path}: Not a markdown file or directory")
            continue

        for md_file in targets:
            files_processed += 1
            if clean_markdown_file(md_file, args.dry_run):
                files_changed += 1

    print(f"\nSummary: Processed {files_processed} files, changed {files_changed} files")
if __name__ == "__main__":
main()

12
cmd.sh
View File

@ -1,9 +1,11 @@
#!/usr/bin/env sh #!/usr/bin/env sh
#outdir is processed+path to output file #outdir is processed/lang/path to output file
outdir=processed/$(dirname $1) outdir="processed/$LANGUAGE/"$(dirname $1)
mkdir -p "$outdir"
# $1 is the input file # $1 is the input file
input=$1 input=$1
output=$(basename $1 .rst).md output=$(basename $1).md
python3 "$main" --to-markdown --assets-dir static/ -o "$outdir/$output" "$input" "translations/$LANGUAGE/LC_MESSAGES/blog.po" mkdir -p "$outdir"
echo python3 "$main" --to-markdown --assets-dir static/ -o "$outdir/$output" "$input" "translations/$LANGUAGE/LC_MESSAGES/$posource"
python3 "$main" --to-markdown --assets-dir static/ -o "$outdir/$output" "$input" "translations/$LANGUAGE/LC_MESSAGES/$posource" 2>> err.$LANGUAGE.log 1>> log.$LANGUAGE.log
python3 clean_markdown.py "$outdir/$output"

452
main.py
View File

@ -1,8 +1,8 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
RST Translation Processor Translation Processor
A script to process reStructuredText files by replacing translation tags A script to process reStructuredText and HTML files by replacing translation tags
with content from .po files, handling image references, and optionally with content from .po files, handling image references, and optionally
converting to markdown. converting to markdown.
""" """
@ -12,7 +12,8 @@ import os
import re import re
import shutil import shutil
import sys import sys
from typing import Dict, Optional, Tuple import urllib.parse
from typing import Dict, List, Optional, Tuple
import polib import polib
@ -52,29 +53,122 @@ def load_translations(po_file_path: str) -> Dict[str, str]:
return {} return {}
def replace_template_vars(content: str) -> str:
"""
Replace template variables like {{spec_url()}} with their values.
Args:
content: Content with template variables
Returns:
Content with template variables replaced
"""
# Define base URLs for different types of links
base_urls = {
'spec_url': 'https://i2p.net/spec/',
'proposal_url': 'https://i2p.net/spec/proposals/proposal',
'i2p_url': 'https://i2p.net/',
'site_url': 'https://i2p.net/',
'get_url': 'https://i2p.net/'
}
# Handle {{spec_url("name")}} pattern
def replace_spec_url(match):
func_name = match.group(1)
arg = match.group(2).strip('"\'') if match.group(2) else ""
if func_name in base_urls:
if func_name == 'proposal_url':
return f"{base_urls[func_name]}{arg}.html"
else:
return f"{base_urls[func_name]}{arg}"
# Handle special cases for other template functions
if func_name == 'url_for':
# Extract the filename from patterns like url_for('static', filename='images/...')
filename_match = re.search(r'filename=[\'"](.*?)[\'"]', arg)
if filename_match:
return f"/_static/{filename_match.group(1)}"
elif func_name == 'i2pconv':
# For i2p domain conversions, return as is
return arg
return match.group(0) # Return unchanged if not recognized
# This pattern matches template functions like {{spec_url("ntcp2")}}
template_pattern = r'{{([a-zA-Z_]+)\(([^}]*?)\)}}'
processed_content = re.sub(template_pattern, replace_spec_url, content)
# Handle other simple variable substitutions like {{ _('text') }}
def replace_simple_var(match):
var_name = match.group(1).strip()
# For translation function calls like _('text'), return just the text
if var_name.startswith("_('") and var_name.endswith("')"):
return var_name[3:-2] # Extract the text between quotes
return match.group(0) # Return unchanged if not recognized
simple_var_pattern = r'{{([^}]+?)}}'
return re.sub(simple_var_pattern, replace_simple_var, processed_content)
def replace_translations(content: str, translations: Dict[str, str]) -> str: def replace_translations(content: str, translations: Dict[str, str]) -> str:
""" """
Replace translation tags in the content with translated text. Replace translation tags in the content with translated text.
Args: Args:
content: RST content with translation tags content: Content with translation tags
translations: Dictionary of translations translations: Dictionary of translations
Returns: Returns:
Content with translations applied Content with translations applied
""" """
def replace_match(match): # First, handle simple {% trans %}...{% endtrans %} blocks
def replace_simple_match(match):
text = match.group(1).strip() text = match.group(1).strip()
return translations.get(text, text) return translations.get(text, text)
# Match {% trans %}...{% endtrans %} patterns # Match {% trans %}...{% endtrans %} patterns
pattern = r'{%\s*trans\s*%}(.*?){%\s*endtrans\s*%}' simple_pattern = r'{%\s*trans\s*%}(.*?){%\s*endtrans\s*%}'
return re.sub(pattern, replace_match, content, flags=re.DOTALL) content = re.sub(simple_pattern, replace_simple_match, content, flags=re.DOTALL)
# Now handle more complex translation blocks with arguments
def replace_complex_match(match):
# Extract parameters if present
params_str = match.group(1) or ""
text = match.group(2).strip()
# Process parameters (for future use)
params = {}
if params_str:
param_matches = re.finditer(r'(\w+)=["\'](.*?)["\']', params_str)
for param_match in param_matches:
key, value = param_match.groups()
params[key] = value
# Replace parameter references in the text
if f"{{{key}}}" in text:
text = text.replace(f"{{{key}}}", value)
# Apply translation
translated = translations.get(text, text)
# If there are parameter values, they need to be maintained in the translation
for key, value in params.items():
if f"{{{key}}}" in translated:
translated = translated.replace(f"{{{key}}}", value)
return translated
# Match {% trans param1="value" -%}...{%- endtrans %} patterns with optional parameters
complex_pattern = r'{%\s*trans\s*(.*?)-%}(.*?){%-\s*endtrans\s*%}'
content = re.sub(complex_pattern, replace_complex_match, content, flags=re.DOTALL)
return content
def process_images(content: str, base_dir: str, assets_dir: str) -> str: def process_rst_images(content: str, base_dir: str, assets_dir: str) -> str:
""" """
Process image references in the content. Process image references in RST content.
Args: Args:
content: RST content with image references content: RST content with image references
@ -123,41 +217,192 @@ def process_images(content: str, base_dir: str, assets_dir: str) -> str:
return re.sub(pattern, process_match, content) return re.sub(pattern, process_match, content)
def convert_to_markdown(rst_content: str) -> str: def process_html_images(content: str, base_dir: str, assets_dir: str) -> str:
""" """
Convert RST content to Markdown. Process image references in HTML content.
Args: Args:
rst_content: RST content to convert content: HTML content with image references
base_dir: Base directory of the input file
assets_dir: Directory to store images
Returns:
Content with updated image references
"""
# Create assets directory if it doesn't exist
images_dir = os.path.join(assets_dir, "images")
os.makedirs(images_dir, exist_ok=True)
# Find image references in HTML
# This pattern matches <img src="..."> tags
pattern = r'<img\s+[^>]*src=["\']((?!https?://)[^"\']+)["\'][^>]*>'
def process_match(match):
img_tag = match.group(0)
path = match.group(1)
# Skip URLs
if path.startswith(('http://', 'https://')):
return img_tag
# Handle templated paths
if "{{" in path:
# Extract paths from template expressions like {{ url_for('static', filename='images/file.png') }}
template_match = re.search(r'filename=[\'"](.*?)[\'"]', path)
if template_match:
path = f"/_static/{template_match.group(1)}"
else:
return img_tag # Can't process this template
# Remove leading /_static/ if present
if path.startswith('/_static/'):
path = path[9:] # Remove /_static/ prefix
# Handle relative paths
if not os.path.isabs(path):
full_old_path = os.path.join(base_dir, path)
if not os.path.exists(full_old_path):
# Try looking in static directory
full_old_path = os.path.join(base_dir, 'static', path)
else:
full_old_path = path
# Extract filename from path
filename = os.path.basename(path)
new_rel_path = os.path.join("images", filename)
new_full_path = os.path.join(assets_dir, new_rel_path)
# Copy the image if it exists
if os.path.exists(full_old_path):
try:
shutil.copy2(full_old_path, new_full_path)
print(f"Copied image: {full_old_path} -> {new_full_path}")
except Exception as e:
print(f"Error copying image {full_old_path}: {e}", file=sys.stderr)
return img_tag # Return original if error
else:
print(f"Warning: Image file not found: {full_old_path}", file=sys.stderr)
return img_tag # Return original if not found
# Return the updated img tag with new path
return img_tag.replace(match.group(1), f"images/{filename}")
return re.sub(pattern, process_match, content)
def clean_markdown_output(markdown: str) -> str:
    """
    Tidy converted markdown that still carries URL-encoded template markup.

    First every ``%7B%7B ... %7D%7D`` run is decoded: a ``url_for`` call with
    a ``filename=`` argument is replaced by that filename, any other
    template is removed.  Afterwards image links whose target is empty or
    still looks like a template get a target derived from their alt text
    (``images/<alt-text-with-dashes>.png``).

    Args:
        markdown: Markdown content with possible URL-encoded templates

    Returns:
        Cleaned markdown content
    """
    filename_arg = re.compile(r'filename=[\'"](.*?)[\'"]')

    def _decode_template(hit):
        raw = hit.group(0)
        text = urllib.parse.unquote(raw)
        if not (text.startswith('{{') and text.endswith('}}')):
            return raw

        inner = text.strip('{}').strip()
        if 'url_for' in inner and 'filename=' in inner:
            found = filename_arg.search(inner)
            if found:
                return found.group(1)
        # Every other template expression is dropped
        return ''

    def _repair_image(hit):
        alt_text, target = hit.group(1), hit.group(2)
        looks_broken = not target or target.startswith(('%7B', '{{'))
        if looks_broken:
            # No usable target: guess one from the alt text
            slug = alt_text.replace(' ', '-').lower()
            if slug:
                return f"![{alt_text}](images/{slug}.png)"
        return hit.group(0)

    without_templates = re.sub(r'%7B%7B.*?%7D%7D', _decode_template, markdown)
    return re.sub(r'!\[(.*?)\]\((.*?)\)', _repair_image, without_templates)
def convert_to_markdown(content: str, is_rst: bool = True) -> str:
"""
Convert content to Markdown.
Args:
content: Content to convert
is_rst: Whether the content is RST (True) or HTML (False)
Returns: Returns:
Markdown content Markdown content
""" """
if not DOCUTILS_AVAILABLE:
print("Warning: docutils not available. RST to HTML conversion skipped.", file=sys.stderr)
return rst_content
if not PANDOC_AVAILABLE: if not PANDOC_AVAILABLE:
print("Warning: pypandoc not available. HTML to Markdown conversion skipped.", file=sys.stderr) print("Warning: pypandoc not available. Conversion to Markdown skipped.", file=sys.stderr)
return rst_content return content
try: try:
if is_rst:
if not DOCUTILS_AVAILABLE:
print("Warning: docutils not available. RST to HTML conversion skipped.", file=sys.stderr)
return content
# Convert RST to HTML # Convert RST to HTML
html = docutils.core.publish_string( html = docutils.core.publish_string(
source=rst_content, source=content,
writer_name='html', writer_name='html',
settings_overrides={'output_encoding': 'unicode'} settings_overrides={'output_encoding': 'unicode'}
) )
# Convert HTML to Markdown # Convert HTML to Markdown
markdown = pypandoc.convert_text(html, 'md', format='html') markdown = pypandoc.convert_text(html, 'md', format='html')
else:
# Convert HTML directly to Markdown
markdown = pypandoc.convert_text(content, 'md', format='html')
# Post-process markdown to clean up URL-encoded template variables
markdown = clean_markdown_output(markdown)
return markdown return markdown
except Exception as e: except Exception as e:
print(f"Error converting to markdown: {e}", file=sys.stderr) print(f"Error converting to markdown: {e}", file=sys.stderr)
return rst_content return content
def process_rst_file( def is_draft(file_path: str) -> bool:
"""
Check if a file is a draft based on its name.
Args:
file_path: Path to the file
Returns:
True if the file is a draft, False otherwise
"""
return '.draft.' in file_path.lower()
def process_file(
input_path: str, input_path: str,
po_file_path: str, po_file_path: str,
output_path: Optional[str] = None, output_path: Optional[str] = None,
@ -165,10 +410,10 @@ def process_rst_file(
assets_dir: str = "./assets" assets_dir: str = "./assets"
) -> Tuple[bool, str]: ) -> Tuple[bool, str]:
""" """
Process an RST file by replacing translations and handling images. Process a file by replacing translations and handling images.
Args: Args:
input_path: Path to input RST file input_path: Path to input file (RST or HTML)
po_file_path: Path to .po file with translations po_file_path: Path to .po file with translations
output_path: Path to write output (default: add .translated suffix) output_path: Path to write output (default: add .translated suffix)
to_markdown: Whether to convert to markdown to_markdown: Whether to convert to markdown
@ -186,10 +431,29 @@ def process_rst_file(
print(f"Error: PO file does not exist: {po_file_path}", file=sys.stderr) print(f"Error: PO file does not exist: {po_file_path}", file=sys.stderr)
return False, "" return False, ""
# Determine file type
is_rst = input_path.lower().endswith('.rst')
is_html = input_path.lower().endswith(('.html', '.htm'))
if not (is_rst or is_html):
print(f"Error: Unsupported file type: {input_path}. Only .rst, .html, and .htm files are supported.",
file=sys.stderr)
return False, ""
# Determine output path if not specified # Determine output path if not specified
if not output_path: if not output_path:
base, ext = os.path.splitext(input_path) base, ext = os.path.splitext(input_path)
output_path = f"{base}.translated{'.md' if to_markdown else ext}" # Handle .draft.rst/.draft.html case
if '.draft.' in base.lower():
base = base.replace('.draft', '')
# Set extension based on conversion type
if to_markdown:
out_ext = '.md'
else:
out_ext = ext
output_path = f"{base}.translated{out_ext}"
# Create assets directory # Create assets directory
os.makedirs(assets_dir, exist_ok=True) os.makedirs(assets_dir, exist_ok=True)
@ -204,16 +468,25 @@ def process_rst_file(
with open(input_path, 'r', encoding='utf-8') as f: with open(input_path, 'r', encoding='utf-8') as f:
content = f.read() content = f.read()
# Replace template variables
content = replace_template_vars(content)
# Replace translations # Replace translations
content = replace_translations(content, translations) content = replace_translations(content, translations)
# Process images # Process images based on file type
base_dir = os.path.dirname(os.path.abspath(input_path)) base_dir = os.path.dirname(os.path.abspath(input_path))
content = process_images(content, base_dir, assets_dir) if is_rst:
content = process_rst_images(content, base_dir, assets_dir)
elif is_html:
content = process_html_images(content, base_dir, assets_dir)
# Convert to markdown if requested # Convert to markdown if requested
if to_markdown: if to_markdown:
content = convert_to_markdown(content) content = convert_to_markdown(content, is_rst=is_rst)
# Create directory for output file if it doesn't exist
os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)
# Write output file # Write output file
with open(output_path, 'w', encoding='utf-8') as f: with open(output_path, 'w', encoding='utf-8') as f:
@ -227,24 +500,133 @@ def process_rst_file(
return False, "" return False, ""
def find_files(directory: str, include_drafts: bool = False) -> List[str]:
    """
    Collect the RST and HTML files below a directory.

    The whole tree is walked.  A file qualifies when its name ends in
    ``.rst``, ``.html`` or ``.htm`` (compared case-insensitively).  Names
    containing ``.draft.`` are left out unless include_drafts is set.

    Args:
        directory: Directory to search
        include_drafts: Whether to include draft files

    Returns:
        List of file paths, each joined onto the directory it was found in
    """
    wanted_suffixes = ('.rst', '.html', '.htm')

    def _wanted(name: str) -> bool:
        lowered = name.lower()
        if not lowered.endswith(wanted_suffixes):
            return False
        return include_drafts or '.draft.' not in lowered

    return [
        os.path.join(dirpath, name)
        for dirpath, _dirnames, filenames in os.walk(directory)
        for name in filenames
        if _wanted(name)
    ]
def process_directory(
    input_dir: str,
    po_file_path: str,
    output_dir: Optional[str] = None,
    to_markdown: bool = False,
    assets_dir: str = "./assets",
    include_drafts: bool = False
) -> bool:
    """
    Process all RST and HTML files in a directory.

    Files are discovered with find_files() and handed to process_file() one
    by one.  When output_dir is given, each output keeps its path relative
    to input_dir; the extension becomes ``.md`` when converting to markdown
    and a ``.draft`` marker is removed from the file name.

    Args:
        input_dir: Directory containing files
        po_file_path: Path to .po file with translations
        output_dir: Directory to write output files; when not given,
            process_file() is called without an output path
        to_markdown: Whether to convert to markdown
        assets_dir: Directory to store assets
        include_drafts: Whether to include draft files

    Returns:
        True if all files were processed successfully, False otherwise
        (including a missing input directory or no matching files)
    """
    if not os.path.isdir(input_dir):
        print(f"Error: Input directory does not exist: {input_dir}", file=sys.stderr)
        return False

    # Find all RST and HTML files in the directory
    input_files = find_files(input_dir, include_drafts)

    if not input_files:
        print(f"No RST or HTML files found in {input_dir}", file=sys.stderr)
        return False

    success = True
    for input_file in input_files:
        out_path = None
        if output_dir:
            # Mirror the layout of input_dir below output_dir
            rel_dir, name = os.path.split(os.path.relpath(input_file, input_dir))

            if to_markdown:
                name = os.path.splitext(name)[0] + '.md'

            # Strip the ".draft" marker from the file name only, and only
            # when it is a whole dotted component.  Running a plain
            # str.replace over the full path would also rewrite the
            # caller's output_dir and names such as "a.drafty.draft.rst".
            # Case-insensitive to agree with how drafts are detected.
            name = re.sub(r'\.draft(?=\.)', '', name, flags=re.IGNORECASE)

            out_path = os.path.join(output_dir, rel_dir, name)

        file_success, _ = process_file(
            input_file,
            po_file_path,
            out_path,
            to_markdown,
            assets_dir
        )

        if not file_success:
            success = False

    return success
def main(): def main():
"""Parse arguments and run the script.""" """Parse arguments and run the script."""
# Fix for locale/gettext issues by forcing English locale
import locale
import os
# Force 'C' locale to avoid gettext issues
os.environ['LC_ALL'] = 'C'
locale.setlocale(locale.LC_ALL, 'C')
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description='Process RST files by replacing translations and handling images' description='Process RST and HTML files by replacing translations and handling images'
) )
parser.add_argument('input_path', help='Path to input RST file') parser.add_argument('input_path', help='Path to input file or directory')
parser.add_argument('po_file_path', help='Path to .po file with translations') parser.add_argument('po_file_path', help='Path to .po file with translations')
parser.add_argument('-o', '--output-path', help='Path to write output file') parser.add_argument('-o', '--output-path', help='Path to write output file or directory')
parser.add_argument('--to-markdown', action='store_true', help='Convert output to markdown') parser.add_argument('--to-markdown', action='store_true', help='Convert output to markdown')
parser.add_argument('--assets-dir', default='./assets', help='Directory to store assets (default: ./assets)') parser.add_argument('--assets-dir', default='./assets', help='Directory to store assets (default: ./assets)')
parser.add_argument('--include-drafts', action='store_true', help='Process draft files (ending in .draft.rst or .draft.html)')
parser.add_argument('--recursive', action='store_true', help='Process directories recursively')
args = parser.parse_args() args = parser.parse_args()
if args.to_markdown and not (DOCUTILS_AVAILABLE and PANDOC_AVAILABLE): if args.to_markdown and not PANDOC_AVAILABLE:
print("Warning: Markdown conversion requires docutils and pypandoc packages.", file=sys.stderr) print("Warning: Markdown conversion requires pypandoc package.", file=sys.stderr)
print("Install them with: pip install docutils pypandoc", file=sys.stderr) print("Install it with: pip install pypandoc", file=sys.stderr)
success, output_path = process_rst_file( # Check if input path is a directory
if os.path.isdir(args.input_path):
if args.recursive:
success = process_directory(
args.input_path,
args.po_file_path,
args.output_path,
args.to_markdown,
args.assets_dir,
args.include_drafts
)
else:
print("Error: Input path is a directory. Use --recursive to process it.", file=sys.stderr)
success = False
else:
# Process a single file
success, _ = process_file(
args.input_path, args.input_path,
args.po_file_path, args.po_file_path,
args.output_path, args.output_path,

23
run.sh
View File

@ -2,5 +2,24 @@
export main=$(pwd)/main.py export main=$(pwd)/main.py
export cmd=$(pwd)/cmd.sh export cmd=$(pwd)/cmd.sh
export LANGUAGE=ru
find blog -name '*.rst' -exec "$cmd" {} \;
export LANGUAGES="ar az ca cs da de el es es_AR et_EE fa fi fr gl he hu id it ja ko mg nb nl pl pt pt_BR ro ru sk sl sq sv tr uk zh zh_TW"

# Run "$cmd" on every *.<ext> file below <dir>, once per language.
#   $1 = dir       directory to search
#   $2 = posource  exported for "$cmd"
#   $3 = ext       file extension to match
# LANGUAGE is re-exported for each language before "$cmd" is invoked.
process_all_languages() {
    export dir=$1
    export posource=$2
    export ext=$3
    # $LANGUAGES is deliberately unquoted: it is a space-separated list.
    for lang in $LANGUAGES; do
        export LANGUAGE=$lang
        echo "Processing $lang in $dir for $ext"
        find "$dir" -name "*.$ext" -exec "$cmd" {} \;
    done
}

# produce translated files for all languages and all directories
process_all_languages pages docs.po html
process_all_languages pages blog.po rst