disclosure-bureau/scripts/maintain/41_strip_stubs.py
guto 4459bd17e4 phase-0: kill stubs, ship 20 curated anchor events, configure SMTP
- scripts/03-dedup-entities.py: stop emitting placeholder narrative ("Stub. Will
  be enriched in Phase 7"); write summary_status=none + null fields instead.
- scripts/maintain/41_strip_stubs.py: idempotent migration that cleaned the
  22,096 entity .md files (now zero stub strings in wiki/).
- scripts/synthesize/01_anchor_events.py: curated 20 anchor UAP events
  (Roswell, Nimitz Tic-Tac, Phoenix Lights, Operação Prato, AATIP, etc.) with
  bilingual Holmes-Watson narrative via claude -p --model sonnet
  (CLAUDE_CODE_OAUTH_TOKEN). All summary_status=curated, confidence=high.
- web/api/timeline + timeline-view: filter narrative-less events by default,
  render "curado" badge for hand-vetted ones, drop the date display alone.
- CLAUDE-schema-full.md: document the summary_status enum and the four states.
- docker-compose.yml: SMTP_HOST=mail.spacemail.com configured;
  GOTRUE_MAILER_AUTOCONFIRM flipped to false (real email confirmation working).
- .nirvana/outputs/.../systems-atelier/: 5 deliverables of the architecture
  audit that produced this roadmap.
2026-05-18 00:44:17 -03:00

194 lines
6.2 KiB
Python

#!/usr/bin/env python3
"""
41_strip_stubs.py — Eliminate "Will be enriched in Phase N" placeholders from
wiki/entities/<class>/*.md (only files directly inside each class folder).
For each entity markdown file:
Frontmatter:
- narrative_summary: "_Stub. Will be enriched in Phase 7._" → null
- narrative_summary_pt_br: "_Stub. Será enriquecido na Fase 7._" → null
- drop narrative_summary_confidence
- add summary_status: 'none'
- add summary_confidence: null
Body:
- Remove every line that consists solely of one of the body stubs:
"_Stub generated by entity dedup. Will be enriched in Phase 6._" or
"_Stub gerado pela deduplicação de entidades. Será enriquecido na Fase 6._"
- Keep all other content untouched (curated narrative is preserved).
Idempotent: a file already migrated (has `summary_status` field) is skipped.
Usage:
./41_strip_stubs.py # process all entities
./41_strip_stubs.py --class events # only one class
./41_strip_stubs.py --dry-run # report counts, don't write
./41_strip_stubs.py --verbose # list each touched file
"""
from __future__ import annotations
import argparse
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
try:
import yaml
except ImportError:
sys.stderr.write("pip3 install pyyaml\n")
sys.exit(1)
# Directory two levels above this script's folder (the project root when the
# script lives in scripts/maintain/).
UFO_ROOT = Path(__file__).resolve().parents[2]
# Folder that holds one sub-directory per entity class; main() scans it.
ENTITIES_BASE = UFO_ROOT / "wiki" / "entities"
# Markdown log that main() appends a run summary to after a real migration.
LOG_PATH = UFO_ROOT / "wiki" / "log.md"
# Placeholder values that older tooling wrote into the frontmatter fields
# narrative_summary / narrative_summary_pt_br.
STUB_EN = "_Stub. Will be enriched in Phase 7._"
STUB_PT = "_Stub. Será enriquecido na Fase 7._"

# Placeholder lines that older tooling wrote into the markdown body.
BODY_STUB_EN = "_Stub generated by entity dedup. Will be enriched in Phase 6._"
BODY_STUB_PT = "_Stub gerado pela deduplicação de entidades. Será enriquecido na Fase 6._"

# Matches a line made up of exactly one body stub. Because ``\s`` also matches
# newlines, the match may absorb blank lines adjacent to the stub.
BODY_STUB_LINE_RE = re.compile(
    r"^\s*(?:" + "|".join(map(re.escape, (BODY_STUB_EN, BODY_STUB_PT))) + r")\s*$",
    flags=re.MULTILINE,
)
def utc_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (second precision)."""
    now_utc = datetime.now(tz=timezone.utc)
    return now_utc.strftime("%Y-%m-%dT%H:%M:%SZ")
# Closing frontmatter fence: a line that holds only `---` (trailing blanks allowed).
_FM_FENCE_RE = re.compile(r"^---[ \t]*$", re.MULTILINE)


def split_md(path: Path) -> tuple[dict, str, bool]:
    """Split a markdown file into its YAML frontmatter and its body.

    Args:
        path: Markdown file to read (decoded as UTF-8).

    Returns:
        ``(frontmatter, body, had_frontmatter)``. When the file does not start
        with ``---``, has no closing ``---`` line, contains YAML that fails to
        parse, or contains YAML that is not a mapping, the result is
        ``({}, <raw file text>, False)``. Otherwise ``body`` is the text after
        the closing fence with leading newlines removed.
    """
    raw = path.read_text(encoding="utf-8")
    if not raw.startswith("---"):
        return {}, raw, False

    # The closing fence must sit on a line of its own. A plain substring search
    # would also stop at a `---` embedded in a frontmatter value and cut the
    # metadata short. Searching from offset 3 skips the opening fence, because
    # `^` only matches at real line starts, not at the search start offset.
    fence = _FM_FENCE_RE.search(raw, 3)
    if fence is None:
        return {}, raw, False

    try:
        fm = yaml.safe_load(raw[3 : fence.start()].strip()) or {}
    except yaml.YAMLError:
        return {}, raw, False

    # A scalar or list document is not usable as frontmatter; callers expect a dict.
    if not isinstance(fm, dict):
        return {}, raw, False

    body = raw[fence.end() :].lstrip("\n")
    return fm, body, True
def render_md(fm: dict, body: str) -> str:
    """Serialize ``fm`` as a ``---``-fenced YAML header followed by ``body``.

    Keys are emitted in their existing order, in block style, with non-ASCII
    characters written as-is. One newline is inserted between the closing fence
    and the body unless the body already begins with a newline.
    """
    header = yaml.dump(
        fm,
        allow_unicode=True,
        sort_keys=False,
        default_flow_style=False,
    )
    if body.startswith("\n"):
        spacer = ""
    else:
        spacer = "\n"
    return "---\n" + header + "---\n" + spacer + body
def clean_body(body: str) -> str:
    """Return ``body`` without stub lines and without long runs of blank lines.

    Every match of ``BODY_STUB_LINE_RE`` is deleted first; afterwards any run
    of three or more consecutive newlines is reduced to exactly two.
    """
    without_stubs = BODY_STUB_LINE_RE.sub("", body)
    return re.sub(r"\n{3,}", "\n\n", without_stubs)
def migrate_file(path: Path, dry_run: bool, verbose: bool) -> tuple[bool, bool]:
    """Strip stub placeholders from one entity file and tag it as migrated.

    Args:
        path: Entity markdown file to process.
        dry_run: When true, nothing is written to disk.
        verbose: When true, print the file's path relative to ``UFO_ROOT``
            (prefixed with ``(dry)`` in dry-run mode).

    Returns:
        ``(changed, already_migrated)``:

        * ``(False, False)`` — the file has no usable frontmatter; untouched.
        * ``(False, True)`` — ``summary_status`` is already present; untouched.
        * ``(True, False)`` — the file was rewritten, or would be in dry-run
          mode. This is the outcome for every file that passes the two checks
          above, because the status fields and ``last_lint`` are always added.
    """
    fm, body, had_fm = split_md(path)
    if not had_fm:
        return (False, False)

    # Idempotency: presence of summary_status means already migrated.
    if "summary_status" in fm:
        return (False, True)

    new_fm = dict(fm)  # shallow copy; keeps the original key order

    # Null out the placeholder texts; any other value is left as it is.
    if new_fm.get("narrative_summary") == STUB_EN:
        new_fm["narrative_summary"] = None
    if new_fm.get("narrative_summary_pt_br") == STUB_PT:
        new_fm["narrative_summary_pt_br"] = None

    # Drop the old confidence key (replaced by summary_confidence).
    new_fm.pop("narrative_summary_confidence", None)

    # Always add the status fields so later runs recognise the file as
    # migrated, even when it carried no stub at all.
    # NOTE(review): the status is "none" even when narrative_summary holds
    # real (non-stub) text — confirm that this is intended.
    new_fm["summary_status"] = "none"
    new_fm.setdefault("summary_confidence", None)

    # Touch last_lint so downstream tools see the change.
    new_fm["last_lint"] = utc_iso()

    new_body = clean_body(body)

    if dry_run:
        if verbose:
            print(f" (dry) {path.relative_to(UFO_ROOT)}")
        return (True, False)

    path.write_text(render_md(new_fm, new_body), encoding="utf-8")
    if verbose:
        print(f"{path.relative_to(UFO_ROOT)}")
    return (True, False)
def main() -> int:
    """Run the stub-stripping migration from the command line.

    Scans the ``*.md`` files directly inside each class folder under
    ``ENTITIES_BASE`` (or only the folder named by ``--class``), migrates each
    one with ``migrate_file``, prints a summary, and — unless ``--dry-run`` is
    set or nothing was migrated — appends a run record to ``LOG_PATH``.

    Returns:
        Process exit code: ``0`` on success, ``1`` when ``ENTITIES_BASE`` or
        the folder requested through ``--class`` does not exist.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--class", dest="cls", default=None,
                        help="restrict to one class folder under wiki/entities/")
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--verbose", action="store_true")
    args = parser.parse_args()

    if not ENTITIES_BASE.exists():
        sys.stderr.write(f"missing {ENTITIES_BASE}\n")
        return 1

    if args.cls:
        class_dir = ENTITIES_BASE / args.cls
        # A mistyped class name would otherwise scan nothing and still exit 0.
        if not class_dir.is_dir():
            sys.stderr.write(f"missing {class_dir}\n")
            return 1
        roots = [class_dir]
    else:
        # Sorted so the processing order (and verbose output) is deterministic.
        roots = sorted(d for d in ENTITIES_BASE.iterdir() if d.is_dir())

    total = 0
    migrated = 0
    skipped_already = 0
    for root in roots:
        for path in sorted(root.glob("*.md")):
            total += 1
            changed, already = migrate_file(path, args.dry_run, args.verbose)
            if already:
                skipped_already += 1
            elif changed:
                migrated += 1

    print()
    print(f" total scanned: {total}")
    print(f" migrated: {migrated}")
    print(f" already migrated: {skipped_already}")
    print(f" dry-run: {args.dry_run}")

    # Only record runs that actually rewrote files.
    if not args.dry_run and migrated > 0:
        LOG_PATH.parent.mkdir(parents=True, exist_ok=True)
        with LOG_PATH.open("a", encoding="utf-8") as f:
            f.write(
                f"\n## {utc_iso()} · STRIP_STUBS\n"
                f"- script: scripts/maintain/41_strip_stubs.py\n"
                f"- scanned: {total}\n"
                f"- migrated: {migrated}\n"
                f"- already_migrated: {skipped_already}\n"
            )
    return 0


if __name__ == "__main__":
    sys.exit(main())