# www/clean_markdown.py

#!/usr/bin/env python3
# filepath: clean_markdown.py

import re
import os
import sys
import argparse
from pathlib import Path

def _strip_template_markup(content):
    """Return *content* with template expressions and their leftovers removed."""
    # {{ ... }} expressions; the body may span lines but cannot contain '}'.
    content = re.sub(r'\{\{[^}]*\}\}', '', content)

    # {% ... %} template tags.
    content = re.sub(r'\{%[^%]*?%\}', '', content)

    # Links whose target was a URL-encoded template variable keep their text
    # but lose the target, e.g.
    # [network database](%7B%7B%20netdb%20%7D%7D) -> [network database]()
    content = re.sub(r'\]\(%7B%7B[^)]*?%7D%7D\)', ']()', content)

    # Any other URL-encoded {{ ... }} on a single line. The body must be
    # allowed to contain '%', because spaces are encoded as %20.
    content = re.sub(r'%7B%7B.*?%7D%7D', '', content)

    # Drop a doubled backslash that precedes a Markdown special character.
    # '-' is escaped so that '+-.' is not read as a character range (which
    # would also match ',').
    content = re.sub(r'\\\\([`*_{}\[\]()#+\-.!])', r'\1', content)

    # Collapse runs of spaces left between words by the removals above.
    # Leading indentation and trailing spaces are left untouched because both
    # are significant in Markdown (nested lists, indented code, hard breaks).
    content = re.sub(r'(?<=\S) {2,}(?=\S)', ' ', content)

    return content


def clean_markdown_file(file_path, dry_run=False):
    """Remove template expressions and tags from a Markdown file in place.

    Args:
        file_path: Path of the file to clean (read and written as UTF-8).
        dry_run: When true, report what would happen without writing.

    Returns:
        True if the cleaned text differs from the file's current content
        (the file is rewritten unless ``dry_run`` is set), False if nothing
        needs to change or the file could not be read or written.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            original_content = f.read()

        content = _strip_template_markup(original_content)

        if content == original_content:
            print(f"No changes needed: {file_path}")
            return False

        if dry_run:
            print(f"Would clean: {file_path} (dry run)")
            return True

        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)
        print(f"Cleaned: {file_path}")
        return True

    except (OSError, ValueError) as e:
        # OSError covers missing/unreadable/unwritable files; ValueError covers
        # undecodable content (UnicodeDecodeError) and malformed paths.
        print(f"Error processing {file_path}: {e}")
        return False

def main():
    parser = argparse.ArgumentParser(description="Clean markdown files by removing template variables and expressions.")
    parser.add_argument("paths", nargs='+', help="Markdown files or directories to process")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be changed without making changes")
    parser.add_argument("--recursive", "-r", action="store_true", help="Process directories recursively")
    args = parser.parse_args()

    files_processed = 0
    files_changed = 0

    for path in args.paths:
        path_obj = Path(path)
        if path_obj.is_file() and path_obj.suffix.lower() in ['.md', '.markdown']:
            files_processed += 1
            if clean_markdown_file(path_obj, args.dry_run):
                files_changed += 1
        elif path_obj.is_dir():
            if args.recursive:
                for md_file in path_obj.glob('**/*.md'):
                    files_processed += 1
                    if clean_markdown_file(md_file, args.dry_run):
                        files_changed += 1
                for md_file in path_obj.glob('**/*.markdown'):
                    files_processed += 1
                    if clean_markdown_file(md_file, args.dry_run):
                        files_changed += 1
            else:
                for md_file in path_obj.glob('*.md'):
                    files_processed += 1
                    if clean_markdown_file(md_file, args.dry_run):
                        files_changed += 1
                for md_file in path_obj.glob('*.markdown'):
                    files_processed += 1
                    if clean_markdown_file(md_file, args.dry_run):
                        files_changed += 1
        else:
            print(f"Skipping {path}: Not a markdown file or directory")

    print(f"\nSummary: Processed {files_processed} files, changed {files_changed} files")

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()