Files
www/clean_markdown.py
eyedeekay 2c524f049f basics
2025-05-06 22:23:32 -04:00

97 lines
3.9 KiB
Python

#!/usr/bin/env python3
# filepath: clean_markdown.py
import re
import os
import sys
import argparse
from pathlib import Path
def _strip_template_markup(content):
    """Return *content* with template syntax and its leftovers removed.

    Pure text transformation used by ``clean_markdown_file``; performs no I/O.
    """
    # Drop {{ ... }} expressions (anything up to the first closing brace).
    content = re.sub(r'\{\{\s*([^}]*?)\s*\}\}', '', content)
    # Drop {% ... %} template tags.
    content = re.sub(r'\{%[^%]*?%\}', '', content)
    # A link whose target was a URL-encoded template variable keeps its text
    # but gets an empty target, e.g.
    # [network database](%7B%7B%20netdb%20%7D%7D) -> [network database]()
    content = re.sub(r'\]\(%7B%7B[^)]*?%7D%7D\)', ']()', content)
    # Remove any other URL-encoded {{ ... }}. The body must be allowed to
    # contain '%' (encoded spaces are %20), otherwise the typical form
    # %7B%7B%20name%20%7D%7D is never matched. The negative lookahead stops a
    # stray, unclosed opener from swallowing text up to a later variable.
    content = re.sub(
        r'%7B%7B(?:(?!%7B%7B).)*?%7D%7D', '', content, flags=re.DOTALL
    )
    # Unescape doubled backslashes in front of markdown punctuation. The
    # hyphen is escaped so it is a literal: an unescaped "+-." is a character
    # range that would also match ",".
    content = re.sub(r'\\\\([`*_{}\[\]()#+\-.!])', r'\1', content)
    # Collapse runs of spaces left behind *inside* a line by the removals.
    # Leading indentation (code blocks, nested lists) and trailing spaces
    # (two-space hard line breaks) are significant in markdown and are kept.
    content = re.sub(r'(?<=\S) {2,}(?=\S)', ' ', content)
    return content


def clean_markdown_file(file_path, dry_run=False):
    """Remove template markup (``{{...}}``, ``{%...%}``) from a markdown file.

    The file is read as UTF-8, cleaned, and rewritten in place only when the
    cleaned text differs from the original. A status line is printed for
    every outcome.

    Args:
        file_path: Path (``str`` or ``Path``) of the file to clean.
        dry_run: When true, report what would change without writing.

    Returns:
        True if the file was changed (or would be changed in dry-run mode);
        False if no change was needed or the file could not be processed.
    """
    try:
        # newline='' disables newline translation on read and write so the
        # file's existing line endings (LF or CRLF) survive the rewrite.
        with open(file_path, 'r', encoding='utf-8', newline='') as f:
            original_content = f.read()

        content = _strip_template_markup(original_content)

        if content == original_content:
            print(f"No changes needed: {file_path}")
            return False

        if dry_run:
            print(f"Would clean: {file_path} (dry run)")
            return True

        with open(file_path, 'w', encoding='utf-8', newline='') as f:
            f.write(content)
        print(f"Cleaned: {file_path}")
        return True
    except Exception as e:
        # Deliberate per-file boundary: report the failure and let a batch
        # run continue with the remaining files.
        print(f"Error processing {file_path}: {e}")
        return False
# Suffixes (lower-cased) that identify a markdown file.
_MARKDOWN_SUFFIXES = frozenset({'.md', '.markdown'})


def _is_markdown_file(path_obj):
    """Return True when *path_obj* is a regular file with a markdown suffix."""
    return path_obj.is_file() and path_obj.suffix.lower() in _MARKDOWN_SUFFIXES


def _find_markdown_files(directory, recursive):
    """Return the markdown files under *directory*, sorted for stable output."""
    candidates = directory.rglob('*') if recursive else directory.iterdir()
    return sorted(p for p in candidates if _is_markdown_file(p))


def main():
    """Command-line entry point: clean every markdown file named in argv.

    Each positional path may be a markdown file or a directory. Directories
    are scanned one level deep, or recursively with ``--recursive``/``-r``.
    Files found in directories are selected with the same rule as files
    named directly (regular file, case-insensitive ``.md``/``.markdown``
    suffix), so ``README.MD`` is treated the same either way and directories
    that merely look like markdown files are ignored. Prints a summary of
    how many files were processed and changed.
    """
    parser = argparse.ArgumentParser(description="Clean markdown files by removing template variables and expressions.")
    parser.add_argument("paths", nargs='+', help="Markdown files or directories to process")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be changed without making changes")
    parser.add_argument("--recursive", "-r", action="store_true", help="Process directories recursively")
    args = parser.parse_args()

    files_processed = 0
    files_changed = 0

    for path in args.paths:
        path_obj = Path(path)
        if _is_markdown_file(path_obj):
            targets = [path_obj]
        elif path_obj.is_dir():
            targets = _find_markdown_files(path_obj, args.recursive)
        else:
            print(f"Skipping {path}: Not a markdown file or directory")
            continue

        for md_file in targets:
            files_processed += 1
            if clean_markdown_file(md_file, args.dry_run):
                files_changed += 1

    print(f"\nSummary: Processed {files_processed} files, changed {files_changed} files")
# Script entry point: run the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()