disclosure-bureau/scripts/22-update-stub-messages.py

80 lines
2.7 KiB
Python
Executable file

#!/usr/bin/env python3
"""
22-update-stub-messages.py — Phase 0: replace misleading "Will be enriched in
Phase 6" stubs with an honest low-signal message.
For each entity file whose body still has the stub phrasing:
- Read total_mentions, documents_count from frontmatter
- Rewrite body with calibrated message that reflects reality
- Preserve frontmatter as-is
"""
from __future__ import annotations
import re
import sys
from pathlib import Path
try:
import yaml
except ImportError:
sys.stderr.write("pip3 install pyyaml\n"); sys.exit(1)
# Root directory of the entity pages. main() globs "*/*.md" under it, i.e. it
# only looks at Markdown files exactly one subdirectory below this path.
# NOTE(review): hard-coded absolute path into one user's home directory —
# confirm it is correct for the machine this script runs on.
ENTITIES = Path("/Users/guto/ufo/wiki/entities")
# Pattern for a body that consists solely of the auto-generated stub: an H1
# title, a "## Description (EN)" section whose italic text starts with
# "Stub generated by entity dedup.", then a "## Descrição (PT-BR)" section
# whose italic text starts with "Stub gerado pela deduplicação de entidades.".
# re.DOTALL lets the lazy `.*?` span newlines; without re.MULTILINE the `^`/`$`
# anchors tie the match to the whole body (trailing newlines allowed).
# NOTE(review): no function visible in this file references STUB_RE; main()
# selects stubs with a "Phase 6"/"Phase 7" substring check instead.
STUB_RE = re.compile(
r"^# [^\n]+\n\n## Description \(EN\)\n\n_Stub generated by entity dedup\..*?_\n\n"
r"## Descrição \(PT-BR\)\n\n_Stub gerado pela deduplicação de entidades\..*?_\n*$",
re.DOTALL,
)
def new_body(canonical: str, total: int, docs: int) -> str:
    """Build the replacement Markdown body for a low-signal entity page.

    The result is an H1 heading with the canonical name followed by an
    English and a Brazilian-Portuguese description section. Each section
    holds one italic paragraph quoting the mention and document counts.
    Blocks are separated by a blank line and the text ends with a single
    newline.

    Args:
        canonical: Entity name rendered in the H1 heading.
        total: Number of mentions interpolated into both paragraphs.
        docs: Number of documents interpolated into both paragraphs.

    Returns:
        The complete body text (frontmatter not included).
    """
    english = (
        f"_Low-signal entity — referenced **{total} time(s)** across **{docs} document(s)**. "
        "No external enrichment performed (criteria: ≥3 mentions). "
        "Use the page references below for raw context._"
    )
    portuguese = (
        f"_Entidade de baixo sinal — referenciada **{total} vez(es)** em **{docs} documento(s)**. "
        "Sem enriquecimento externo (critério: ≥3 menções). "
        "Use as referências de páginas abaixo para contexto bruto._"
    )
    blocks = [
        f"# {canonical}",
        "## Description (EN)",
        english,
        "## Descrição (PT-BR)",
        portuguese,
    ]
    # Blank line between blocks, exactly one newline at the very end.
    return "\n\n".join(blocks) + "\n"
def main() -> None:
    """Replace stub bodies of entity pages under ENTITIES with new_body() text.

    Every ``*/*.md`` file below ENTITIES is read and classified:

    * no leading ``---`` frontmatter, no closing delimiter, unparsable or
      non-mapping frontmatter, or non-numeric counts -> skipped;
    * ``enrichment_status`` is "deep"/"shallow" and the body mentions
      ``external_sources`` -> kept as enriched;
    * body mentions neither "Phase 6" nor "Phase 7" -> skipped;
    * otherwise the body is replaced by ``new_body(...)`` built from the
      frontmatter's ``canonical_name`` (falling back to the file stem),
      ``total_mentions`` and ``documents_count``. The frontmatter text itself
      is written back unchanged.

    A three-line summary of the counters is printed at the end.
    """
    updated = 0
    skipped = 0
    enriched = 0
    for p in ENTITIES.glob("*/*.md"):
        c = p.read_text(encoding="utf-8")
        if not c.startswith("---"):
            skipped += 1
            continue
        # The closing delimiter has to start a line. Searching for a bare
        # "---" could stop at dashes inside a frontmatter value, truncating
        # the YAML and corrupting the header when the file is rewritten.
        newline_before_close = c.find("\n---", 3)
        if newline_before_close < 0:
            skipped += 1
            continue
        end = newline_before_close + 1  # index of the closing "---"
        try:
            fm = yaml.safe_load(c[3:end].strip()) or {}
        except yaml.YAMLError as exc:
            # One malformed page must not abort the whole run.
            sys.stderr.write(f"skip {p}: invalid YAML frontmatter ({exc})\n")
            skipped += 1
            continue
        if not isinstance(fm, dict):
            # Frontmatter that parses to a list/scalar has no fields to read.
            skipped += 1
            continue
        body = c[end + 3:].lstrip("\n")
        # Don't touch entities that have real enrichment content
        if fm.get("enrichment_status") in ("deep", "shallow") and "external_sources" in body:
            enriched += 1
            continue
        # Don't touch the seeded entities that had hand-curated bodies.
        # NOTE(review): this is a plain substring test, so any body that
        # merely mentions "Phase 6"/"Phase 7" is treated as a stub — confirm
        # that no hand-written page contains those words.
        if "Phase 6" not in body and "Phase 7" not in body:
            skipped += 1
            continue
        canonical = fm.get("canonical_name") or p.stem
        try:
            total = int(fm.get("total_mentions") or 0)
            docs = int(fm.get("documents_count") or 0)
        except (TypeError, ValueError):
            sys.stderr.write(f"skip {p}: non-numeric mention counts in frontmatter\n")
            skipped += 1
            continue
        new = new_body(canonical, total, docs)
        # Keep everything up to and including the closing "---", then one
        # blank line before the body. Same output as before for well-formed
        # files, but independent of which character follows the delimiter.
        new_full = c[:end + 3] + "\n\n" + new
        if new_full == c:
            skipped += 1
            continue
        p.write_text(new_full, encoding="utf-8")
        updated += 1
    print(f"Updated: {updated}\nSkipped (no stub / hand-curated): {skipped}\nKept enriched: {enriched}")
# Script entry point: run the rewrite only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()