195 lines
6.2 KiB
Python
195 lines
6.2 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""
|
||
|
|
41_strip_stubs.py — Eliminate "Will be enriched in Phase N" placeholders from
wiki/entities/<class>/*.md (the Markdown files directly inside each class folder).
|
||
|
|
|
||
|
|
For each entity markdown file:
|
||
|
|
Frontmatter:
|
||
|
|
- narrative_summary: "_Stub. Will be enriched in Phase 7._" → null
|
||
|
|
- narrative_summary_pt_br: "_Stub. Será enriquecido na Fase 7._" → null
|
||
|
|
- drop narrative_summary_confidence
|
||
|
|
- add summary_status: 'none'
|
||
|
|
- add summary_confidence: null
|
||
|
|
|
||
|
|
Body:
|
||
|
|
- Remove every line that consists solely of "_Stub generated by entity dedup.
Will be enriched in Phase 6._" / "_Stub gerado pela deduplicação de entidades.
Será enriquecido na Fase 6._" (surrounding whitespace is ignored); the runs of
blank lines left behind are collapsed.
|
||
|
|
- Keep all other content untouched (curated narrative is preserved).
|
||
|
|
|
||
|
|
Idempotent: a file already migrated (has `summary_status` field) is skipped.
|
||
|
|
|
||
|
|
Usage:
|
||
|
|
./41_strip_stubs.py # process all entities
|
||
|
|
./41_strip_stubs.py --class events # only one class
|
||
|
|
./41_strip_stubs.py --dry-run # report counts, don't write
|
||
|
|
./41_strip_stubs.py --verbose # list each touched file
|
||
|
|
"""
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
import argparse
|
||
|
|
import re
|
||
|
|
import sys
|
||
|
|
from datetime import datetime, timezone
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
try:
|
||
|
|
import yaml
|
||
|
|
except ImportError:
|
||
|
|
sys.stderr.write("pip3 install pyyaml\n")
|
||
|
|
sys.exit(1)
|
||
|
|
|
||
|
|
# Project root: two directory levels above the folder that holds this script.
UFO_ROOT = Path(__file__).resolve().parents[2]
_WIKI_DIR = UFO_ROOT / "wiki"
# Folder whose sub-directories ("classes") contain the entity Markdown files.
ENTITIES_BASE = _WIKI_DIR / "entities"
# Markdown log that main() appends a run summary to.
LOG_PATH = _WIKI_DIR / "log.md"
|
||
|
|
|
||
|
|
# Placeholder values compared against the frontmatter summary fields.
STUB_EN = "_Stub. Will be enriched in Phase 7._"
STUB_PT = "_Stub. Será enriquecido na Fase 7._"
# Placeholder lines stripped from the Markdown body.
BODY_STUB_EN = "_Stub generated by entity dedup. Will be enriched in Phase 6._"
BODY_STUB_PT = "_Stub gerado pela deduplicação de entidades. Será enriquecido na Fase 6._"

# A whole line that is one of the body stubs, give or take surrounding whitespace.
# NOTE: ``\s`` also matches newlines, so a match can swallow adjacent blank lines.
_BODY_STUB_ALTERNATIVES = "|".join(
    re.escape(stub) for stub in (BODY_STUB_EN, BODY_STUB_PT)
)
BODY_STUB_LINE_RE = re.compile(
    rf"^\s*(?:{_BODY_STUB_ALTERNATIVES})\s*$",
    re.MULTILINE,
)
|
||
|
|
|
||
|
|
|
||
|
|
def utc_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (second precision)."""
    now = datetime.now(tz=timezone.utc)
    return now.strftime("%Y-%m-%dT%H:%M:%SZ")
|
||
|
|
|
||
|
|
|
||
|
|
# Frontmatter fences must sit on their own line; a "---" embedded in a value
# (e.g. ``title: a---b``) is not a delimiter.
_FM_OPEN_RE = re.compile(r"\A---[ \t]*\n")
_FM_CLOSE_RE = re.compile(r"^---[ \t]*$", re.MULTILINE)


def split_md(path: Path) -> tuple[dict, str, bool]:
    """Split a Markdown file into YAML frontmatter and body.

    Args:
        path: file to read (decoded as UTF-8).

    Returns:
        ``(frontmatter, body, had_frontmatter)``. When the file has no opening
        ``---`` line, no closing ``---`` line, YAML that fails to parse, or YAML
        that is not a mapping, the result is ``({}, <entire file text>, False)``
        so the caller can leave the file alone. Otherwise ``body`` is the text
        after the closing fence with leading newlines removed.
    """
    raw = path.read_text(encoding="utf-8")

    opening = _FM_OPEN_RE.match(raw)
    if opening is None:
        return {}, raw, False

    # Search from the end of the opening line; with MULTILINE, "^" still only
    # matches at real line starts, so an empty frontmatter block is accepted.
    closing = _FM_CLOSE_RE.search(raw, opening.end())
    if closing is None:
        return {}, raw, False

    try:
        fm = yaml.safe_load(raw[opening.end():closing.start()]) or {}
    except yaml.YAMLError:
        return {}, raw, False

    # A document that merely starts with a "---" rule can parse as a scalar or
    # a list; callers treat the frontmatter as a dict, so reject anything else.
    if not isinstance(fm, dict):
        return {}, raw, False

    body = raw[closing.end():].lstrip("\n")
    return fm, body, True
|
||
|
|
|
||
|
|
|
||
|
|
def render_md(fm: dict, body: str) -> str:
    """Serialize ``fm`` as a ``---``-fenced YAML block followed by ``body``.

    Key order is kept as given (``sort_keys=False``) and non-ASCII text is
    written unescaped. Exactly one blank line separates the closing fence from
    the body unless the body already starts with a newline.
    """
    front = yaml.dump(
        fm,
        default_flow_style=False,
        sort_keys=False,
        allow_unicode=True,
    )
    pieces = ["---\n", front, "---\n"]
    if not body.startswith("\n"):
        pieces.append("\n")
    pieces.append(body)
    return "".join(pieces)
|
||
|
|
|
||
|
|
|
||
|
|
def clean_body(body: str) -> str:
    """Remove body stub lines and tidy the blank lines they leave behind.

    Lines matching ``BODY_STUB_LINE_RE`` are deleted. Only when at least one
    stub was removed are runs of three or more newlines collapsed to two (a
    single blank line). A body without stubs is returned unchanged, so
    hand-written content is never reformatted.

    Args:
        body: Markdown text following the frontmatter.

    Returns:
        The body with stub lines stripped, or ``body`` itself when no stub
        line was present.
    """
    stripped, removed = BODY_STUB_LINE_RE.subn("", body)
    if not removed:
        return body
    return re.sub(r"\n{3,}", "\n\n", stripped)
|
||
|
|
|
||
|
|
|
||
|
|
def migrate_file(path: Path, dry_run: bool, verbose: bool) -> tuple[bool, bool]:
    """Migrate one entity file away from the stub placeholders.

    Returns a ``(changed, already_migrated)`` pair:

    * ``(False, False)`` — ``split_md`` found no usable frontmatter; skipped.
    * ``(False, True)`` — the frontmatter already has ``summary_status``; skipped.
    * ``(True, False)`` — the file was rewritten (or would be, under ``dry_run``).

    Every file that gets past the two skips counts as changed, because
    ``summary_status`` and ``last_lint`` are always written, stub or not.
    """
    frontmatter, body, has_frontmatter = split_md(path)
    if not has_frontmatter:
        return (False, False)

    # Idempotency marker: a previous run already wrote summary_status.
    if "summary_status" in frontmatter:
        return (False, True)

    updated = dict(frontmatter)  # shallow copy; key order is kept

    # Null out summary fields that still hold the exact placeholder text.
    for key, stub in (
        ("narrative_summary", STUB_EN),
        ("narrative_summary_pt_br", STUB_PT),
    ):
        if updated.get(key) == stub:
            updated[key] = None

    # The old confidence key is superseded by summary_confidence.
    updated.pop("narrative_summary_confidence", None)

    updated["summary_status"] = "none"
    updated.setdefault("summary_confidence", None)
    # Refresh last_lint so downstream tools notice the rewrite.
    updated["last_lint"] = utc_iso()

    stripped_body = clean_body(body)

    if dry_run:
        if verbose:
            print(f" (dry) {path.relative_to(UFO_ROOT)}")
        return (True, False)

    path.write_text(render_md(updated, stripped_body), encoding="utf-8")
    if verbose:
        print(f" ✓ {path.relative_to(UFO_ROOT)}")
    return (True, False)
|
||
|
|
|
||
|
|
|
||
|
|
def main() -> int:
    """Command-line entry point: migrate entity files and print a summary.

    Options:
        --class NAME  restrict the run to ``wiki/entities/NAME``
        --dry-run     report counts without writing files or the log
        --verbose     list each touched file

    Returns:
        Process exit status: 0 on success, 1 when the entities folder or the
        requested class folder does not exist.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--class", dest="cls", default=None,
                        help="restrict to one class folder under wiki/entities/")
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--verbose", action="store_true")
    args = parser.parse_args()

    if not ENTITIES_BASE.exists():
        sys.stderr.write(f"missing {ENTITIES_BASE}\n")
        return 1

    if args.cls:
        class_dir = ENTITIES_BASE / args.cls
        # A mistyped class name would otherwise scan nothing and exit 0.
        if not class_dir.is_dir():
            sys.stderr.write(f"missing {class_dir}\n")
            return 1
        roots = [class_dir]
    else:
        # iterdir() order is arbitrary; sort so runs are reproducible.
        roots = sorted(d for d in ENTITIES_BASE.iterdir() if d.is_dir())

    total = 0
    migrated = 0
    skipped_already = 0
    for root in roots:
        for path in sorted(root.glob("*.md")):
            total += 1
            changed, already = migrate_file(path, args.dry_run, args.verbose)
            if already:
                skipped_already += 1
            elif changed:
                migrated += 1

    # Files that were neither migrated nor already migrated are the ones
    # migrate_file skipped because no usable frontmatter was found.
    unparsed = total - migrated - skipped_already

    print()
    print(f" total scanned: {total}")
    print(f" migrated: {migrated}")
    print(f" already migrated: {skipped_already}")
    if unparsed:
        print(f" skipped (no parseable frontmatter): {unparsed}")
    print(f" dry-run: {args.dry_run}")

    if not args.dry_run and migrated > 0:
        LOG_PATH.parent.mkdir(parents=True, exist_ok=True)
        with LOG_PATH.open("a", encoding="utf-8") as f:
            f.write(
                f"\n## {utc_iso()} · STRIP_STUBS\n"
                f"- script: scripts/maintain/41_strip_stubs.py\n"
                f"- scanned: {total}\n"
                f"- migrated: {migrated}\n"
                f"- already_migrated: {skipped_already}\n"
            )

    return 0
|
||
|
|
|
||
|
|
|
||
|
|
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|