#!/usr/bin/env python3 """ 41_strip_stubs.py — Eliminate "Will be enriched in Phase N" placeholders from wiki/entities/**/*.md. For each entity markdown file: Frontmatter: - narrative_summary: "_Stub. Will be enriched in Phase 7._" → null - narrative_summary_pt_br: "_Stub. Será enriquecido na Fase 7._" → null - drop narrative_summary_confidence - add summary_status: 'none' - add summary_confidence: null Body: - Remove paragraphs containing "_Stub generated by entity dedup. Will be enriched in Phase 6._" / "_Stub gerado pela deduplicação de entidades. Será enriquecido na Fase 6._" - Keep all other content untouched (curated narrative is preserved). Idempotent: a file already migrated (has `summary_status` field) is skipped. Usage: ./41_strip_stubs.py # process all entities ./41_strip_stubs.py --class events # only one class ./41_strip_stubs.py --dry-run # report counts, don't write ./41_strip_stubs.py --verbose # list each touched file """ from __future__ import annotations import argparse import re import sys from datetime import datetime, timezone from pathlib import Path try: import yaml except ImportError: sys.stderr.write("pip3 install pyyaml\n") sys.exit(1) UFO_ROOT = Path(__file__).resolve().parents[2] ENTITIES_BASE = UFO_ROOT / "wiki" / "entities" LOG_PATH = UFO_ROOT / "wiki" / "log.md" STUB_EN = "_Stub. Will be enriched in Phase 7._" STUB_PT = "_Stub. Será enriquecido na Fase 7._" BODY_STUB_EN = "_Stub generated by entity dedup. Will be enriched in Phase 6._" BODY_STUB_PT = "_Stub gerado pela deduplicação de entidades. Será enriquecido na Fase 6._" # Match any line that is exactly one of the body stubs (ignoring surrounding whitespace). 
# Line-anchored pattern for the two body stub sentences. `\s` also matches
# newlines, so a match can extend over blank lines adjacent to the stub line.
BODY_STUB_LINE_RE = re.compile(
    r"^\s*(?:" + re.escape(BODY_STUB_EN) + r"|" + re.escape(BODY_STUB_PT) + r")\s*$",
    re.MULTILINE,
)

# Frontmatter block: an opening `---` line at the very start of the file, then
# the YAML text (group 1), then a closing `---` that occupies a whole line.
# Anchoring the closing fence to a line keeps a `---` that merely appears
# inside a YAML value from being mistaken for the end of the frontmatter.
_FRONTMATTER_RE = re.compile(
    r"\A---[ \t]*\r?\n(.*?)^---[ \t]*\r?$",
    re.DOTALL | re.MULTILINE,
)


def utc_iso() -> str:
    """Return the current UTC time formatted as `YYYY-MM-DDTHH:MM:SSZ`."""
    return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")


def split_md(path: Path) -> tuple[dict, str, bool]:
    """Split a markdown file into YAML frontmatter and body.

    Args:
        path: Markdown file to read (decoded as UTF-8).

    Returns:
        `(frontmatter, body, had_frontmatter)`. When the file has no
        frontmatter block, the YAML does not parse, or the YAML document is
        not a mapping, the result is `({}, <entire file text>, False)`.
        Otherwise `body` is the text after the closing fence with leading
        newlines removed.
    """
    raw = path.read_text(encoding="utf-8")
    match = _FRONTMATTER_RE.match(raw)
    if match is None:
        return {}, raw, False
    try:
        loaded = yaml.safe_load(match.group(1))
    except yaml.YAMLError:
        return {}, raw, False
    if loaded is None:
        # An empty frontmatter block is still a (blank) mapping.
        loaded = {}
    if not isinstance(loaded, dict):
        # A list or scalar cannot carry the summary fields; leave the file alone.
        return {}, raw, False
    body = raw[match.end():].lstrip("\n")
    return loaded, body, True


def render_md(fm: dict, body: str) -> str:
    """Serialise `fm` as a YAML frontmatter block followed by `body`.

    Key order is preserved (`sort_keys=False`) and non-ASCII text is written
    unescaped. Exactly one blank line separates the closing fence from the
    body unless the body already starts with a newline.
    """
    yaml_str = yaml.dump(fm, allow_unicode=True, sort_keys=False, default_flow_style=False)
    sep = "" if body.startswith("\n") else "\n"
    return f"---\n{yaml_str}---\n{sep}{body}"


def clean_body(body: str) -> str:
    """Drop lines that are exactly a stub paragraph; collapse 3+ newlines to 2."""
    cleaned = BODY_STUB_LINE_RE.sub("", body)
    return re.sub(r"\n{3,}", "\n\n", cleaned)


def migrate_file(path: Path, dry_run: bool, verbose: bool) -> tuple[bool, bool]:
    """Strip stub placeholders from one entity file and mark it as migrated.

    Args:
        path: Entity markdown file.
        dry_run: When true, nothing is written to disk.
        verbose: When true, print one line per file that is (or would be)
            rewritten.

    Returns:
        `(changed, already_migrated)`:
        `(False, False)` if the file has no usable frontmatter,
        `(False, True)` if `summary_status` is already present,
        `(True, False)` otherwise — every not-yet-migrated file counts as
        changed because the status fields are always added, and the same
        value is reported in dry-run mode.
    """
    fm, body, had_fm = split_md(path)
    if not had_fm:
        return (False, False)

    # Idempotency: presence of summary_status means already migrated.
    if "summary_status" in fm:
        return (False, True)

    new_fm = dict(fm)  # copy; insertion order of the original keys is kept

    # Clear stub textual values (exact match only, curated text is untouched).
    if new_fm.get("narrative_summary") == STUB_EN:
        new_fm["narrative_summary"] = None
    if new_fm.get("narrative_summary_pt_br") == STUB_PT:
        new_fm["narrative_summary_pt_br"] = None

    # Drop old confidence key (replaced by summary_confidence).
    new_fm.pop("narrative_summary_confidence", None)

    # Always add summary_status / summary_confidence so subsequent runs detect
    # the migration as already done (even if this file had no stub fields).
    # NOTE(review): the status is 'none' even when narrative_summary holds
    # non-stub text — confirm that is the intended value for curated files.
    new_fm["summary_status"] = "none"
    new_fm.setdefault("summary_confidence", None)

    # Touch last_lint so downstream tools see the change.
    new_fm["last_lint"] = utc_iso()

    new_body = clean_body(body)

    if dry_run:
        if verbose:
            print(f" (dry) {path.relative_to(UFO_ROOT)}")
        return (True, False)

    path.write_text(render_md(new_fm, new_body), encoding="utf-8")
    if verbose:
        print(f" ✓ {path.relative_to(UFO_ROOT)}")
    return (True, False)


def main() -> int:
    """Command-line entry point.

    Scans `*.md` directly inside the selected class folder(s) under
    `ENTITIES_BASE`, migrates each file, prints a summary, and — unless this
    is a dry run or nothing was migrated — appends a section to `LOG_PATH`.

    Returns:
        Process exit status: 0 on success, 1 when the entities folder or the
        folder requested with `--class` does not exist.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--class", dest="cls", default=None,
                   help="restrict to one class folder under wiki/entities/")
    p.add_argument("--dry-run", action="store_true")
    p.add_argument("--verbose", action="store_true")
    args = p.parse_args()

    if not ENTITIES_BASE.exists():
        sys.stderr.write(f"missing {ENTITIES_BASE}\n")
        return 1

    if args.cls:
        class_dir = ENTITIES_BASE / args.cls
        # A mistyped class name would otherwise scan nothing and exit 0.
        if not class_dir.is_dir():
            sys.stderr.write(f"missing {class_dir}\n")
            return 1
        roots = [class_dir]
    else:
        # Sorted so the processing (and verbose output) order is deterministic.
        roots = sorted(d for d in ENTITIES_BASE.iterdir() if d.is_dir())

    total = 0
    migrated = 0
    skipped_already = 0
    for root in roots:
        for path in sorted(root.glob("*.md")):
            total += 1
            changed, already = migrate_file(path, args.dry_run, args.verbose)
            if already:
                skipped_already += 1
            elif changed:
                migrated += 1

    print()
    print(f" total scanned: {total}")
    print(f" migrated: {migrated}")
    print(f" already migrated: {skipped_already}")
    print(f" dry-run: {args.dry_run}")

    if not args.dry_run and migrated > 0:
        LOG_PATH.parent.mkdir(parents=True, exist_ok=True)
        with LOG_PATH.open("a", encoding="utf-8") as f:
            f.write(
                f"\n## {utc_iso()} · STRIP_STUBS\n"
                f"- script: scripts/maintain/41_strip_stubs.py\n"
                f"- scanned: {total}\n"
                f"- migrated: {migrated}\n"
                f"- already_migrated: {skipped_already}\n"
            )
    return 0


if __name__ == "__main__":
    sys.exit(main())