disclosure-bureau/scripts/synthesize/30_rebuild_wiki_from_reextract.py
Luiz Gustavo a7e9dce6d2 rebuild entity layer from Sonnet-vision reextract pipeline
Add reextract pipeline (scripts/reextract/) that rebuilds doc-level entity
JSON from Sonnet-vision chunks via Opus, replacing the noisy per-page
extraction. Add synthesize scripts to regenerate wiki/entities from the 116
_reextract.json (30), aggregate missing page.md from chunks (31), and reprocess
805 pages the doc-rebuilder agent dropped on context overflow (32). Add
maintain scripts 43-56 for chunk-page sync, dedup, generic-entity marking, and
typed relation extraction.

Web: wire relations API + entity-relations component; entity/timeline/doc
pages consume the rebuilt layer.

Note: raw/, processing/, wiki/ remain gitignored (bulk data managed
separately); the 116 reextract JSONs and 7,798 rebuilt entity files live on
disk only. The 27 curated anchor events under wiki/entities/events/ are
preserved.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-21 12:20:24 -03:00

447 lines
18 KiB
Python

#!/usr/bin/env python3
"""
30_rebuild_wiki_from_reextract.py — Rebuild wiki/entities/ from scratch using
the 116 _reextract.json files as the SOLE source of truth.
Pipeline:
1. Load every raw/<doc>--subagent/_reextract.json
2. Load every raw/<doc>--subagent/_index.json (chunk_id → page map)
3. Cross-doc dedup:
person/org/loc: by canonical_name (lowercase, ASCII-fold)
event: by event_id (EV-YYYY-MM-DD-slug)
uap_object: per (event, observed_index) — never deduped cross-event
4. Generate IDs per CLAUDE.md regex
5. Write wiki/entities/{type}/<id>.md (clean frontmatter + EN/PT-BR body stubs)
6. Print summary
Does NOT touch DB. DB sync is a separate step.
Idempotent apart from timestamps: re-running with the same inputs produces the
same files, except for the `last_ingest` / `rebuilt_at` fields, which record the
time of the run.
"""
from __future__ import annotations
import json
import re
import sys
import unicodedata
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
import yaml
# Absolute project root; every other path used by this script hangs off it.
UFO = Path("/Users/guto/ufo")
# Per-document extraction output, one "<doc>--subagent" directory per document.
RAW = UFO.joinpath("raw")
# Destination tree for the generated entity markdown files.
ENT = UFO.joinpath("wiki", "entities")
# Version stamps copied verbatim into every generated frontmatter.
SCHEMA_VERSION = "0.1.0"
WIKI_VERSION = "0.1.0"
# UTC wall-clock timestamp captured once at import time, so every file written
# during a single run carries the same value.
NOW = datetime.now(tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
def canonicalize_name(name: str) -> str:
    """Turn a free-form name into a kebab-case, ASCII-folded identifier.

    Steps: NFKD-decompose, drop combining marks, lowercase, turn every
    character outside ``[a-z0-9-]`` into a dash, collapse dash runs and trim
    dashes from both ends. A result that starts with a digit is prefixed with
    ``x-``. Falsy input yields an empty string.
    """
    if not name:
        return ""
    decomposed = unicodedata.normalize("NFKD", name)
    folded = "".join(
        ch for ch in decomposed if unicodedata.combining(ch) == 0
    ).lower()
    dashed = re.sub(r"[^a-z0-9-]", "-", folded)
    slug = re.sub(r"-+", "-", dashed).strip("-")
    # slug[:1] is "" for an empty slug, and "".isdigit() is False.
    if slug[:1].isdigit():
        return "x-" + slug
    return slug
def event_id_from(label: str, date_start: str | None) -> str:
    """Build an ``EV-YYYY-MM-DD-slug`` identifier for an event.

    The slug is the canonicalized label cut to 40 characters (dashes trimmed
    after the cut), or ``unlabeled`` when nothing is left. ``date_start`` is
    matched against ``YYYY-MM-DD``, ``YYYY-MM`` and ``YYYY`` in that order;
    date components that are not available are written as ``XX`` / ``XXXX``.
    """
    slug = canonicalize_name(label or "")[:40].strip("-") or "unlabeled"
    date = date_start or ""

    full_date = re.match(r"^(\d{4})-(\d{2})-(\d{2})$", date)
    if full_date:
        year, month, day = full_date.groups()
        return f"EV-{year}-{month}-{day}-{slug}"

    year_month = re.match(r"^(\d{4})-(\d{2})$", date)
    if year_month:
        year, month = year_month.groups()
        return f"EV-{year}-{month}-XX-{slug}"

    year_only = re.match(r"^(\d{4})$", date)
    if year_only:
        (year,) = year_only.groups()
        return f"EV-{year}-XX-XX-{slug}"

    # Missing or unrecognised date format.
    return f"EV-XXXX-XX-XX-{slug}"
def uap_object_id(event_id: str, index: int) -> str:
    """Derive a UAP-object identifier from its parent event id and position.

    For ``EV-<year>-<mm>-<dd>-<slug>`` the result is
    ``OBJ-EV<year>-<SLUG>-<index>``, where ``<SLUG>`` is the upper-cased slug
    with everything but ``A-Z0-9`` removed, cut to 20 characters (``UNK`` when
    empty), and ``<index>`` is zero-padded to two digits. Any event id that
    does not have that shape yields ``OBJ-UNK-<index>``.

    NOTE(review): only the year and the (truncated) slug of the event id end
    up in the result, so two events that differ only in month/day, or only
    beyond the first 20 slug characters, map to the same object id.
    """
    if not event_id.startswith("EV-"):
        return f"OBJ-UNK-{index:02d}"

    # maxsplit=4 can leave dashes inside the last field; the join below puts
    # the slug back together regardless of how many dashes it contains.
    fields = event_id[3:].split("-", 4)
    if len(fields) < 4:
        return f"OBJ-UNK-{index:02d}"

    year = fields[0]
    slug = "-".join(fields[3:])
    compact = re.sub(r"[^A-Z0-9]", "", slug.upper())[:20]
    if not compact:
        compact = "UNK"
    return f"OBJ-EV{year}-{compact}-{index:02d}"
def dump_yaml(obj: dict) -> str:
    """Serialize *obj* as block-style YAML with surrounding whitespace stripped.

    Keys keep their insertion order, non-ASCII characters are emitted as-is,
    and the very large ``width`` keeps long values from being line-wrapped.
    """
    rendered = yaml.safe_dump(
        obj,
        sort_keys=False,
        allow_unicode=True,
        default_flow_style=False,
        width=10_000,
    )
    return rendered.strip()
def write_entity(path: Path, frontmatter: dict, body_title: str) -> None:
    """Write one entity markdown file, creating parent directories as needed.

    The file consists of the YAML frontmatter between ``---`` fences, an H1
    with *body_title*, and two empty description sections (EN and PT-BR).
    Any existing file at *path* is overwritten.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    sections = [
        f"---\n{dump_yaml(frontmatter)}\n---\n\n",
        f"# {body_title}\n\n",
        "## Description (EN)\n\n",
        "## Descrição (PT-BR)\n",
    ]
    path.write_text("".join(sections), encoding="utf-8")
def load_chunk_to_page(doc_id: str) -> dict[str, int]:
    """Load the chunk_id -> page mapping from a document's ``_index.json``.

    Reads ``RAW/<doc_id>--subagent/_index.json`` and maps every entry of its
    ``chunks`` list that has a truthy ``chunk_id`` and a non-null ``page``.

    Returns an empty dict when the file is absent. An unreadable, malformed or
    unexpectedly shaped file also yields an empty dict (best effort), but a
    warning is printed to stderr, because an empty map means every entity of
    this document ends up without page references.
    """
    idx_path = RAW / f"{doc_id}--subagent" / "_index.json"
    if not idx_path.is_file():
        return {}
    try:
        idx = json.loads(idx_path.read_text(encoding="utf-8"))
        return {
            c.get("chunk_id"): c.get("page")
            for c in (idx.get("chunks") or [])
            if c.get("chunk_id") and c.get("page") is not None
        }
    except (OSError, ValueError, TypeError, AttributeError) as exc:
        # OSError: read failure. ValueError: invalid JSON or invalid UTF-8.
        # TypeError / AttributeError: JSON parsed but is not shaped like
        # {"chunks": [{...}, ...]} (e.g. a list at top level, non-dict chunks,
        # or an unhashable chunk_id).
        print(f" warn {doc_id}: unusable _index.json ({exc}); page refs will be empty",
              file=sys.stderr)
        return {}
def page_refs_for(doc_id: str, chunks: list[str], chunk_to_page: dict[str, int]) -> list[str]:
    """Translate chunk ids into sorted, de-duplicated ``[[doc/pNNN]]`` links.

    Chunk ids that are absent from *chunk_to_page* (or map to ``None``) are
    ignored. Page numbers are coerced with ``int()`` and zero-padded to at
    least three digits. A falsy *chunks* yields an empty list.
    """
    looked_up = (chunk_to_page.get(chunk_id) for chunk_id in chunks or [])
    page_numbers = {int(page) for page in looked_up if page is not None}
    return [f"[[{doc_id}/p{number:03d}]]" for number in sorted(page_numbers)]
# ─────────────────────────────────────────────────────────────────────────────
# AGGREGATION
# ─────────────────────────────────────────────────────────────────────────────
class EntityBucket:
    """Aggregates one entity across multiple documents.

    Attributes:
        ent_id: identifier the bucket is stored under.
        canonical_name: display name given when the bucket was created.
        aliases: every distinct, non-blank surface form seen so far.
        first_class: first truthy class value passed to ``add_occurrence``.
        by_doc: ``doc_id -> {"chunks": sorted list, "raw": dict}``; ``raw`` is
            the first raw entity seen for that document.
        extra: free-form scratch space for type-specific data.
    """
    __slots__ = ("ent_id", "canonical_name", "aliases", "first_class",
                 "by_doc", "extra")

    def __init__(self, ent_id: str, canonical_name: str):
        self.ent_id = ent_id
        self.canonical_name = canonical_name
        self.aliases: set[str] = set()
        self.first_class: str | None = None
        # doc_id → {chunks: list, raw: dict}
        self.by_doc: dict[str, dict] = {}
        self.extra: dict = {}  # type-specific scratch (affiliation, geo_class, etc.)

    def add_occurrence(self, doc_id: str, raw_entity: dict, ent_class: str | None):
        """Record that *raw_entity* was seen in document *doc_id*.

        Args:
            doc_id: document the occurrence comes from.
            raw_entity: entity dict; ``name`` (or ``label``), ``aliases_in_doc``
                and ``evidence_chunks`` are read from it.
            ent_class: class of this occurrence; only the first truthy value
                across all calls is kept.
        """
        if self.first_class is None and ent_class:
            self.first_class = ent_class

        # A whitespace-only name/label must not become an empty alias, in line
        # with the filtering applied to aliases_in_doc below.
        primary = raw_entity.get("name") or raw_entity.get("label")
        if primary and primary.strip():
            self.aliases.add(primary.strip())
        for alias in raw_entity.get("aliases_in_doc") or []:
            if alias and alias.strip():
                self.aliases.add(alias.strip())

        # The first raw entity seen for a document is kept; later occurrences
        # in the same document only contribute their evidence chunks.
        entry = self.by_doc.setdefault(doc_id, {"chunks": [], "raw": raw_entity})
        evidence = raw_entity.get("evidence_chunks") or []
        entry["chunks"] = sorted(set(entry["chunks"]) | set(evidence))
def merge_dates(buckets: dict[str, EntityBucket], get_date) -> dict[str, dict]:
    """Collect, per bucket key, the distinct dates found in its raw entities.

    *get_date* is called with each per-document raw entity; every truthy value
    it returns is added to ``result[key]["dates"]`` (a set). Buckets for which
    no date is found do not appear in the result.
    """
    merged: dict[str, dict] = {}
    for key, bucket in buckets.items():
        for occurrence in bucket.by_doc.values():
            found = get_date(occurrence["raw"])
            if not found:
                continue
            merged.setdefault(key, {}).setdefault("dates", set()).add(found)
    return merged
def _collect_named(buckets: dict[str, EntityBucket], items, doc_id: str, class_key: str) -> None:
    """Fold one document's name-keyed entities (people/orgs/locations) into *buckets*."""
    for item in items or []:
        # Tolerate malformed entries instead of aborting the whole run.
        if not isinstance(item, dict):
            continue
        name = (item.get("name") or "").strip()
        if not name or name.lower() == "unknown":
            continue
        ent_id = canonicalize_name(name)
        if not ent_id:
            continue
        bucket = buckets.get(ent_id)
        if bucket is None:
            # The first spelling seen becomes the bucket's canonical name.
            bucket = buckets[ent_id] = EntityBucket(ent_id, name)
        bucket.add_occurrence(doc_id, item, item.get(class_key))


def aggregate_all() -> dict:
    """Walk all _reextract.json files. Return a structured aggregation.

    Every ``RAW/*--subagent/_reextract.json`` is read in sorted path order.
    Files that cannot be read or parsed, or whose top-level value is not a
    JSON object, are reported on stderr and skipped.

    Returns:
        A dict with keys ``docs_processed`` (int), ``people``,
        ``organizations``, ``locations``, ``events``, ``uap_objects`` (each a
        ``dict[str, EntityBucket]`` keyed by entity id), ``relations`` (list of
        raw relation dicts, each with ``doc_id`` added) and ``chunk_maps``
        (``doc_id -> chunk_id -> page``).
    """
    people: dict[str, EntityBucket] = {}
    orgs: dict[str, EntityBucket] = {}
    locs: dict[str, EntityBucket] = {}
    events: dict[str, EntityBucket] = {}
    # Keyed by uap_object_id(event_id, index): the same event seen in several
    # documents shares one bucket per object index.
    uap_objs: dict[str, EntityBucket] = {}
    relations: list[dict] = []
    docs_processed = 0
    chunk_maps: dict[str, dict[str, int]] = {}

    for jpath in sorted(RAW.glob("*--subagent/_reextract.json")):
        doc_id = jpath.parent.name.removesuffix("--subagent")
        try:
            data = json.loads(jpath.read_text(encoding="utf-8"))
        except (OSError, ValueError) as exc:
            # ValueError covers both invalid JSON and invalid UTF-8.
            print(f" skip {doc_id}: {exc}", file=sys.stderr)
            continue
        if not isinstance(data, dict):
            print(f" skip {doc_id}: top-level JSON value is not an object", file=sys.stderr)
            continue
        docs_processed += 1
        chunk_maps[doc_id] = load_chunk_to_page(doc_id)

        # people / organizations / locations share the same name-keyed dedup
        _collect_named(people, data.get("people"), doc_id, "person_class")
        _collect_named(orgs, data.get("organizations"), doc_id, "org_class")
        _collect_named(locs, data.get("locations"), doc_id, "geo_class")

        # events
        for event in data.get("events") or []:
            if not isinstance(event, dict):
                continue
            label = (event.get("label") or "").strip()
            if not label:
                continue
            eid = event_id_from(label, event.get("date_start"))
            bucket = events.get(eid)
            if bucket is None:
                bucket = events[eid] = EntityBucket(eid, label)
            bucket.add_occurrence(doc_id, event, event.get("event_class"))

            # uap_objects — never cross-event-deduped; inherit parent event's evidence_chunks
            event_chunks = event.get("evidence_chunks") or []
            for i, obj in enumerate(event.get("uap_objects_observed") or [], 1):
                if not isinstance(obj, dict):
                    continue
                uid = uap_object_id(eid, i)
                ubucket = uap_objs.get(uid)
                if ubucket is None:
                    ubucket = uap_objs[uid] = EntityBucket(uid, f"{label} — object {i}")
                obj_with_evidence = {**obj, "evidence_chunks": obj.get("evidence_chunks") or event_chunks}
                ubucket.add_occurrence(doc_id, obj_with_evidence, obj.get("shape"))
                ubucket.extra.setdefault("event_id", eid)

        # relations — collected raw, mapped to canonical IDs later
        for rel in data.get("relations") or []:
            if not isinstance(rel, dict):
                continue
            relations.append({"doc_id": doc_id, **rel})

    return {
        "docs_processed": docs_processed,
        "people": people, "organizations": orgs, "locations": locs,
        "events": events, "uap_objects": uap_objs,
        "relations": relations, "chunk_maps": chunk_maps,
    }
# ─────────────────────────────────────────────────────────────────────────────
# WRITERS
# ─────────────────────────────────────────────────────────────────────────────
def write_person(b: EntityBucket, chunk_maps: dict) -> None:
    """Write ``people/<id>.md`` for one aggregated person bucket.

    ``mentioned_in`` is the sorted union of page links over all documents;
    ``affiliations`` and ``roles`` are the sorted distinct non-blank values of
    ``affiliation`` / ``role_at_doc_date`` across the per-document raw
    entities. ``total_mentions`` counts evidence chunks, not pages.
    """
    page_links: set[str] = set()
    for doc_id, occ in b.by_doc.items():
        page_links.update(page_refs_for(doc_id, occ["chunks"], chunk_maps.get(doc_id, {})))
    mentioned_in = sorted(page_links)

    occurrences = list(b.by_doc.values())

    def distinct(field: str) -> list[str]:
        # Sorted distinct stripped values of *field*; blanks are dropped.
        values = {occ["raw"][field].strip() for occ in occurrences if occ["raw"].get(field)}
        values.discard("")
        return sorted(values)

    affiliations = distinct("affiliation")
    roles = distinct("role_at_doc_date")
    chunk_total = sum(len(occ["chunks"]) for occ in occurrences)

    fm = {
        "schema_version": SCHEMA_VERSION,
        "type": "entity",
        "entity_class": "person",
        "person_id": b.ent_id,
        "canonical_name": b.canonical_name,
        "aliases": sorted(b.aliases),
        "person_class": b.first_class,
        "affiliations": affiliations,
        "roles": roles,
        "mentioned_in": mentioned_in,
        "total_mentions": chunk_total,
        "documents_count": len(b.by_doc),
        "enrichment_status": "none",
        "last_ingest": NOW,
        "wiki_version": WIKI_VERSION,
        "source": "reextract-v1",
    }
    target = ENT / "people" / f"{b.ent_id}.md"
    write_entity(target, fm, b.canonical_name)
def write_org(b: EntityBucket, chunk_maps: dict) -> None:
    """Write ``organizations/<id>.md`` for one aggregated organization bucket.

    ``countries`` is the sorted set of distinct non-blank ``country`` values
    across the per-document raw entities; ``mentioned_in`` is the sorted union
    of page links; ``total_mentions`` counts evidence chunks.
    """
    page_links: set[str] = set()
    for doc_id, occ in b.by_doc.items():
        page_links.update(page_refs_for(doc_id, occ["chunks"], chunk_maps.get(doc_id, {})))
    mentioned_in = sorted(page_links)

    occurrences = list(b.by_doc.values())
    country_values = {
        occ["raw"]["country"].strip() for occ in occurrences if occ["raw"].get("country")
    }
    country_values.discard("")
    countries = sorted(country_values)
    chunk_total = sum(len(occ["chunks"]) for occ in occurrences)

    fm = {
        "schema_version": SCHEMA_VERSION,
        "type": "entity",
        "entity_class": "organization",
        "organization_id": b.ent_id,
        "canonical_name": b.canonical_name,
        "aliases": sorted(b.aliases),
        "org_class": b.first_class,
        "countries": countries,
        "mentioned_in": mentioned_in,
        "total_mentions": chunk_total,
        "documents_count": len(b.by_doc),
        "enrichment_status": "none",
        "last_ingest": NOW,
        "wiki_version": WIKI_VERSION,
        "source": "reextract-v1",
    }
    target = ENT / "organizations" / f"{b.ent_id}.md"
    write_entity(target, fm, b.canonical_name)
def write_location(b: EntityBucket, chunk_maps: dict) -> None:
    """Write ``locations/<id>.md`` for one aggregated location bucket.

    ``countries`` and ``regions_or_states`` are the sorted distinct non-blank
    values of ``country`` / ``region_or_state`` across the per-document raw
    entities; ``mentioned_in`` is the sorted union of page links;
    ``total_mentions`` counts evidence chunks.
    """
    page_links: set[str] = set()
    for doc_id, occ in b.by_doc.items():
        page_links.update(page_refs_for(doc_id, occ["chunks"], chunk_maps.get(doc_id, {})))
    mentioned_in = sorted(page_links)

    occurrences = list(b.by_doc.values())

    def distinct(field: str) -> list[str]:
        # Sorted distinct stripped values of *field*; blanks are dropped.
        values = {occ["raw"][field].strip() for occ in occurrences if occ["raw"].get(field)}
        values.discard("")
        return sorted(values)

    countries = distinct("country")
    regions = distinct("region_or_state")
    chunk_total = sum(len(occ["chunks"]) for occ in occurrences)

    fm = {
        "schema_version": SCHEMA_VERSION,
        "type": "entity",
        "entity_class": "location",
        "location_id": b.ent_id,
        "canonical_name": b.canonical_name,
        "aliases": sorted(b.aliases),
        "geo_class": b.first_class,
        "countries": countries,
        "regions_or_states": regions,
        "mentioned_in": mentioned_in,
        "total_mentions": chunk_total,
        "documents_count": len(b.by_doc),
        "enrichment_status": "none",
        "last_ingest": NOW,
        "wiki_version": WIKI_VERSION,
        "source": "reextract-v1",
    }
    target = ENT / "locations" / f"{b.ent_id}.md"
    write_entity(target, fm, b.canonical_name)
def write_event(b: EntityBucket, chunk_maps: dict) -> None:
    """Write ``events/<id>.md`` for one aggregated event bucket.

    Across the per-document raw entities: ``date_start`` is the smallest and
    ``date_end`` the largest value in string sort order; location names and
    geo classes are sorted distinct values; each narrative summary is the
    longest stripped candidate (the first one wins a tie), or ``None`` when
    all are empty. ``date_confidence`` is always written as ``None``.
    """
    page_links: set[str] = set()
    for doc_id, occ in b.by_doc.items():
        page_links.update(page_refs_for(doc_id, occ["chunks"], chunk_maps.get(doc_id, {})))
    mentioned_in = sorted(page_links)

    raws = [occ["raw"] for occ in b.by_doc.values()]

    def truthy_values(field: str) -> set:
        # Distinct truthy values of *field*, left exactly as stored.
        return {raw.get(field) for raw in raws if raw.get(field)}

    date_starts = sorted(truthy_values("date_start"))
    date_ends = sorted(truthy_values("date_end"))

    location_names = {name.strip() for name in truthy_values("primary_location_name")}
    location_names.discard("")
    primary_locs = sorted(location_names)

    geo_values = truthy_values("primary_location_geo_class")
    geo_values.discard(None)
    geos = sorted(geo_values)

    # narrative: take the longest non-empty
    def best(field):
        candidates = ((raw.get(field) or "").strip() for raw in raws)
        # max() returns the first of several equally long candidates.
        longest = max(candidates, key=len, default="")
        return longest or None

    first_start = date_starts[0] if date_starts else None
    last_end = date_ends[-1] if date_ends else None

    fm = {
        "schema_version": SCHEMA_VERSION,
        "type": "entity",
        "entity_class": "event",
        "event_id": b.ent_id,
        "canonical_name": b.canonical_name,
        "aliases": sorted(b.aliases),
        "event_class": b.first_class,
        "date_start": first_start,
        "date_end": last_end,
        "date_confidence": None,
        "primary_location_names": primary_locs,
        "primary_location_geo_classes": geos,
        "narrative_summary_en": best("narrative_summary"),
        "narrative_summary_pt_br": best("narrative_summary_pt_br"),
        "mentioned_in": mentioned_in,
        "total_mentions": sum(len(occ["chunks"]) for occ in b.by_doc.values()),
        "documents_count": len(b.by_doc),
        "enrichment_status": "none",
        "last_ingest": NOW,
        "wiki_version": WIKI_VERSION,
        "source": "reextract-v1",
    }
    target = ENT / "events" / f"{b.ent_id}.md"
    write_entity(target, fm, b.canonical_name)
def write_uap_object(b: EntityBucket, chunk_maps: dict) -> None:
    """Write ``uap-objects/<id>.md`` for one aggregated UAP-object bucket.

    The descriptive fields (shape, color, medium, ...) are copied from the raw
    entity of the first document recorded in the bucket only; ``event_id``
    comes from ``b.extra``. The bucket must hold at least one document.
    """
    page_links: set[str] = set()
    for doc_id, occ in b.by_doc.items():
        page_links.update(page_refs_for(doc_id, occ["chunks"], chunk_maps.get(doc_id, {})))
    mentioned_in = sorted(page_links)

    first_occurrence = next(iter(b.by_doc.values()))
    raw_first = first_occurrence["raw"]

    fm = {
        "schema_version": SCHEMA_VERSION,
        "type": "entity",
        "entity_class": "uap_object",
        "uap_object_id": b.ent_id,
        "canonical_name": b.canonical_name,
        "event_id": b.extra.get("event_id"),
    }
    # Descriptive attributes, in the order they appear in the frontmatter.
    for field in ("shape", "color", "medium", "size_estimate_m",
                  "altitude_ft", "speed_kts", "maneuver_notes"):
        fm[field] = raw_first.get(field)
    fm["mentioned_in"] = mentioned_in
    fm["total_mentions"] = sum(len(occ["chunks"]) for occ in b.by_doc.values())
    fm["documents_count"] = len(b.by_doc)
    fm["last_ingest"] = NOW
    fm["wiki_version"] = WIKI_VERSION
    fm["source"] = "reextract-v1"

    target = ENT / "uap-objects" / f"{b.ent_id}.md"
    write_entity(target, fm, b.canonical_name)
def main():
    """Run the rebuild: aggregate, write entity files, save the relations index.

    Prints a summary of each step to stdout. Entity files are written under
    ``ENT/<type>/`` and the raw relations are saved to ``ENT/_relations.json``.

    NOTE(review): nothing visible here removes files left over from earlier
    runs, and an existing file with the same id is overwritten — confirm this
    is the intended handling for hand-curated entity files.
    """
    print(f"[1/3] Aggregating from {RAW}/*--subagent/_reextract.json ...")
    agg = aggregate_all()
    print(f" docs processed: {agg['docs_processed']}")
    print(f" unique people: {len(agg['people'])}")
    print(f" unique orgs: {len(agg['organizations'])}")
    print(f" unique locs: {len(agg['locations'])}")
    print(f" unique events: {len(agg['events'])}")
    print(f" uap objects: {len(agg['uap_objects'])}")
    print(f" raw relations: {len(agg['relations'])}")

    print("\n[2/3] Writing entity markdown files ...")
    cmaps = agg["chunk_maps"]
    # (aggregation key, writer) pairs, in the order the counts are reported.
    writers = (
        ("people", write_person),
        ("organizations", write_org),
        ("locations", write_location),
        ("events", write_event),
        ("uap_objects", write_uap_object),
    )
    written = {}
    for kind, writer in writers:
        count = 0
        for bucket in agg[kind].values():
            writer(bucket, cmaps)
            count += 1
        written[kind] = count
    for k, n in written.items():
        print(f" {k}: {n}")

    print("\n[3/3] Saving relations index to wiki/entities/_relations.json (for downstream ingest)")
    rels_path = ENT / "_relations.json"
    # ENT is only created as a side effect of writing an entity file; make
    # sure it exists so a run that produced no entities does not fail here.
    rels_path.parent.mkdir(parents=True, exist_ok=True)
    payload = {
        "schema_version": SCHEMA_VERSION,
        "rebuilt_at": NOW,
        "count": len(agg["relations"]),
        "relations": agg["relations"],
    }
    rels_path.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")
    print(f" saved {len(agg['relations'])} relations to {rels_path}")
    print("\n✓ done.")


if __name__ == "__main__":
    main()