80 lines
2.7 KiB
Python
Executable file
80 lines
2.7 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
22-update-stub-messages.py — Phase 0: replace misleading "Will be enriched in
|
|
Phase 6" stubs with an honest low-signal message.
|
|
|
|
For each entity file whose body still has the stub phrasing:
|
|
- Read total_mentions, documents_count from frontmatter
|
|
- Rewrite body with calibrated message that reflects reality
|
|
- Preserve frontmatter as-is
|
|
"""
|
|
from __future__ import annotations
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
try:
|
|
import yaml
|
|
except ImportError:
|
|
sys.stderr.write("pip3 install pyyaml\n"); sys.exit(1)
|
|
|
|
# Root directory of the entity pages. main() globs "*/*.md" under it, i.e. it
# looks at Markdown files exactly one subdirectory below this path.
# NOTE(review): absolute, machine-specific path — confirm before running elsewhere.
ENTITIES = Path("/Users/guto/ufo/wiki/entities")
|
|
|
|
# Pattern for a body that consists solely of the auto-generated stub: a level-1
# heading, an English "Description" section holding the italic stub sentence,
# and a Portuguese "Descrição" section holding its translation.  One fragment
# per logical piece; adjacent raw strings concatenate into a single pattern.
# NOTE(review): nothing in the visible part of this file references STUB_RE.
_STUB_PATTERN = (
    r"^# [^\n]+\n\n"
    r"## Description \(EN\)\n\n"
    r"_Stub generated by entity dedup\..*?_\n\n"
    r"## Descrição \(PT-BR\)\n\n"
    r"_Stub gerado pela deduplicação de entidades\..*?_\n*$"
)

# DOTALL lets the lazy ``.*?`` inside each italic sentence span line breaks.
STUB_RE = re.compile(_STUB_PATTERN, flags=re.DOTALL)
|
|
|
|
|
|
def new_body(canonical: str, total: int, docs: int) -> str:
    """Return the bilingual "low-signal entity" Markdown body.

    Args:
        canonical: Entity name, used as the level-1 heading.
        total: Mention count interpolated into both messages.
        docs: Document count interpolated into both messages.

    Returns:
        Heading, English section and Portuguese section, separated by blank
        lines and terminated by a single newline.
    """
    english = (
        f"_Low-signal entity — referenced **{total} time(s)** across **{docs} document(s)**. "
        "No external enrichment performed (criteria: ≥3 mentions). "
        "Use the page references below for raw context._"
    )
    portuguese = (
        f"_Entidade de baixo sinal — referenciada **{total} vez(es)** em **{docs} documento(s)**. "
        "Sem enriquecimento externo (critério: ≥3 menções). "
        "Use as referências de páginas abaixo para contexto bruto._"
    )
    paragraphs = [
        f"# {canonical}",
        "## Description (EN)",
        english,
        "## Descrição (PT-BR)",
        portuguese,
    ]
    # Every paragraph is separated by one blank line; the file ends in "\n".
    return "\n\n".join(paragraphs) + "\n"
|
|
|
|
|
|
# A frontmatter fence is a line consisting only of ``---`` (optionally followed
# by spaces/tabs).  Matching whole lines keeps a ``---`` that appears inside a
# YAML value from being mistaken for the closing fence.
_FRONTMATTER_RE = re.compile(
    r"\A---[ \t]*\n(?P<yaml>.*?)^---[ \t]*(?:\n|\Z)",
    re.DOTALL | re.MULTILINE,
)


def _split_frontmatter(text: str) -> tuple[str, str, str] | None:
    """Split *text* into ``(yaml_source, header, rest)``; None if not fenced.

    ``header`` is everything up to and including the closing fence line and
    always ends with a newline, so it can be written back verbatim.
    """
    match = _FRONTMATTER_RE.match(text)
    if match is None:
        return None
    header = match.group(0)
    if not header.endswith("\n"):
        # Closing fence was the last line of the file with no trailing newline.
        header += "\n"
    return match.group("yaml"), header, text[match.end():]


def _warn(path: Path, reason: str) -> None:
    """Report on stderr why *path* was left untouched."""
    sys.stderr.write(f"{path}: {reason}; skipped\n")


def main():
    """Replace placeholder bodies of entity pages with the low-signal message.

    Scans ``ENTITIES/*/*.md``.  For each file with a YAML frontmatter block:

    - files whose ``enrichment_status`` is "deep" or "shallow" and whose body
      mentions ``external_sources`` are kept as-is (counted as enriched);
    - files whose body mentions neither "Phase 6" nor "Phase 7" are kept
      as-is (counted as skipped);
    - otherwise the body is replaced by ``new_body(...)`` built from the
      frontmatter's ``canonical_name`` (falling back to the file stem),
      ``total_mentions`` and ``documents_count``; the frontmatter itself is
      written back unchanged.

    Files without a well-formed frontmatter fence, with unparseable or
    non-mapping YAML, or with non-numeric counts are reported on stderr and
    counted as skipped instead of aborting the run part-way through.
    A three-line summary is printed to stdout at the end.
    """
    updated = 0
    skipped = 0
    enriched = 0

    for p in ENTITIES.glob("*/*.md"):
        c = p.read_text(encoding="utf-8")

        parts = _split_frontmatter(c)
        if parts is None:
            skipped += 1
            continue
        yaml_source, header, rest = parts

        try:
            fm = yaml.safe_load(yaml_source) or {}
        except yaml.YAMLError as exc:
            _warn(p, f"invalid YAML frontmatter ({exc})")
            skipped += 1
            continue
        if not isinstance(fm, dict):
            _warn(p, "frontmatter is not a mapping")
            skipped += 1
            continue

        body = rest.lstrip("\n")

        # Don't touch entities that have real enrichment content.
        if fm.get("enrichment_status") in ("deep", "shallow") and "external_sources" in body:
            enriched += 1
            continue

        # Don't touch the seeded entities that had hand-curated bodies.
        # NOTE(review): this is a plain substring test, not a match against
        # STUB_RE, so a hand-written body that merely mentions "Phase 6" or
        # "Phase 7" is also replaced in full — confirm that is intended.
        if "Phase 6" not in body and "Phase 7" not in body:
            skipped += 1
            continue

        canonical = fm.get("canonical_name") or p.stem
        try:
            total = int(fm.get("total_mentions") or 0)
            docs = int(fm.get("documents_count") or 0)
        except (TypeError, ValueError):
            # Writing a made-up count would defeat the point of the message.
            _warn(p, "total_mentions/documents_count is not an integer")
            skipped += 1
            continue

        # Frontmatter verbatim, one blank line, then the regenerated body.
        new_full = header + "\n" + new_body(canonical, total, docs)
        if new_full == c:
            skipped += 1
            continue

        p.write_text(new_full, encoding="utf-8")
        updated += 1

    print(f"Updated: {updated}\nSkipped (no stub / hand-curated): {skipped}\nKept enriched: {enriched}")
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the rewrite only when executed as a script, never on import.
    main()
|