#!/usr/bin/env python3
"""
30_rebuild_wiki_from_reextract.py — Rebuild wiki/entities/ from scratch using
the 116 _reextract.json files as the SOLE source of truth.

Pipeline:
  1. Load every raw/{doc_id}--subagent/_reextract.json
  2. Load every raw/{doc_id}--subagent/_index.json (chunk_id → page map)
  3. Cross-doc dedup:
       person/org/loc: by canonical_name (lowercase, ASCII-fold)
       event:          by event_id (EV-YYYY-MM-DD-slug)
       uap_object:     per (event, observed_index) — never deduped cross-event
  4. Generate IDs per CLAUDE.md regex
  5. Write wiki/entities/{type}/{id}.md (clean frontmatter + EN/PT-BR body stubs)
  6. Print summary

Does NOT touch DB. DB sync is a separate step.

Idempotent: re-running with same inputs produces same outputs (deterministic),
except for the `last_ingest` / `rebuilt_at` timestamps, which record the time
of the run. Files of entities that no longer exist in the inputs are NOT
removed by this script.
"""
from __future__ import annotations

import json
import re
import sys
import unicodedata
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path

try:
    import yaml
except ImportError:  # reported by main(); keeps the pure helpers importable
    yaml = None

UFO = Path("/Users/guto/ufo")
RAW = UFO / "raw"
ENT = UFO / "wiki" / "entities"

SCHEMA_VERSION = "0.1.0"
WIKI_VERSION = "0.1.0"
NOW = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

# YYYY, YYYY-MM or YYYY-MM-DD; month and day groups are None when absent.
_DATE_RE = re.compile(r"^(\d{4})(?:-(\d{2})(?:-(\d{2}))?)?$")


def canonicalize_name(name: str) -> str:
    """Generic name → kebab-case ASCII-fold id (mirrors scripts/03-dedup-entities.py).

    Combining marks are dropped after NFKD decomposition, the result is
    lower-cased, every character outside ``[a-z0-9-]`` becomes ``-``, runs of
    ``-`` are collapsed and leading/trailing ``-`` removed. An id that would
    start with a digit is prefixed with ``x-``. Returns ``""`` for empty input.
    """
    if not name:
        return ""
    decomposed = unicodedata.normalize("NFKD", name)
    folded = "".join(c for c in decomposed if not unicodedata.combining(c)).lower()
    dashed = re.sub(r"[^a-z0-9-]", "-", folded)
    collapsed = re.sub(r"-+", "-", dashed).strip("-")
    if collapsed and collapsed[0].isdigit():
        collapsed = "x-" + collapsed
    return collapsed


def event_id_from(label: str, date_start: str | None) -> str:
    """Build an ``EV-YYYY-MM-DD-slug`` id from an event label and start date.

    The slug is the canonicalized label cut to 40 characters (``unlabeled``
    when empty). ``date_start`` may be ``YYYY-MM-DD``, ``YYYY-MM`` or ``YYYY``;
    missing components are written as ``XX`` / ``XXXX``. Any other value,
    including a non-string, is treated as an unknown date.
    """
    slug = canonicalize_name(label or "")[:40].strip("-") or "unlabeled"
    # A JSON number such as 1947 would make re.match raise TypeError.
    date = date_start if isinstance(date_start, str) else ""
    m = _DATE_RE.match(date)
    if m is None:
        return f"EV-XXXX-XX-XX-{slug}"
    year, month, day = m.group(1), m.group(2) or "XX", m.group(3) or "XX"
    return f"EV-{year}-{month}-{day}-{slug}"


def uap_object_id(event_id: str, index: int) -> str:
    """Build ``OBJ-EV{year}-{COMPACTSLUG}-{index:02d}`` from a parent event id.

    Only the year and the first 20 alphanumeric characters of the event slug
    are kept, so distinct events can map to the same object id. Ids that do
    not look like ``EV-y-m-d-slug`` yield ``OBJ-UNK-{index:02d}``.
    """
    if event_id.startswith("EV-"):
        parts = event_id[3:].split("-", 4)
        if len(parts) >= 4:
            year = parts[0]
            slug = "-".join(parts[3:])
            compact = re.sub(r"[^A-Z0-9]", "", slug.upper())[:20] or "UNK"
            return f"OBJ-EV{year}-{compact}-{index:02d}"
    return f"OBJ-UNK-{index:02d}"


def dump_yaml(obj: dict) -> str:
    """Stable YAML dump matching existing entity file style.

    Raises:
        RuntimeError: if PyYAML is not installed.
    """
    if yaml is None:
        raise RuntimeError("PyYAML is required to write entity files (pip install pyyaml)")
    return yaml.safe_dump(
        obj,
        sort_keys=False,
        allow_unicode=True,
        default_flow_style=False,
        width=10_000,
    ).strip()


def write_entity(path: Path, frontmatter: dict, body_title: str) -> None:
    """Write one entity markdown file: YAML frontmatter plus empty EN/PT-BR stubs.

    Parent directories are created as needed; an existing file is overwritten.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    content = (
        f"---\n{dump_yaml(frontmatter)}\n---\n\n"
        f"# {body_title}\n\n"
        f"## Description (EN)\n\n"
        f"## Descrição (PT-BR)\n"
    )
    path.write_text(content, encoding="utf-8")


def load_chunk_to_page(doc_id: str, raw_dir: Path | None = None) -> dict[str, int]:
    """Return the ``chunk_id → page`` map from ``{doc_id}--subagent/_index.json``.

    Args:
        doc_id: document id (directory name without the ``--subagent`` suffix).
        raw_dir: directory holding the ``*--subagent`` folders; defaults to RAW.

    Returns:
        The mapping, or ``{}`` when the index is missing, unreadable or has
        no usable ``chunks`` list. Unreadable files are reported on stderr;
        chunk entries without a chunk id or page are skipped.
    """
    base = RAW if raw_dir is None else raw_dir
    idx_path = base / f"{doc_id}--subagent" / "_index.json"
    if not idx_path.is_file():
        return {}
    try:
        idx = json.loads(idx_path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:  # ValueError covers JSON and UTF-8 decode errors
        print(f" warn {doc_id}: unreadable _index.json: {e}", file=sys.stderr)
        return {}
    chunks = idx.get("chunks") if isinstance(idx, dict) else None
    if not isinstance(chunks, list):
        return {}
    return {
        c["chunk_id"]: c["page"]
        for c in chunks
        if isinstance(c, dict)
        and isinstance(c.get("chunk_id"), (str, int))
        and c.get("chunk_id")
        and c.get("page") is not None
    }


def page_refs_for(doc_id: str, chunks: list[str], chunk_to_page: dict[str, int]) -> list[str]:
    """Turn chunk ids into sorted, de-duplicated ``[[doc_id/pNNN]]`` wiki links.

    Chunks without a known page are ignored.
    """
    pages = set()
    for chunk_id in chunks or []:
        page = chunk_to_page.get(chunk_id)
        if page is not None:
            pages.add(int(page))
    return [f"[[{doc_id}/p{p:03d}]]" for p in sorted(pages)]


# ─────────────────────────────────────────────────────────────────────────────
# AGGREGATION
# ─────────────────────────────────────────────────────────────────────────────

class EntityBucket:
    """Aggregates one entity across multiple documents."""

    __slots__ = ("ent_id", "canonical_name", "aliases", "first_class", "by_doc", "extra")

    def __init__(self, ent_id: str, canonical_name: str):
        self.ent_id = ent_id
        self.canonical_name = canonical_name
        self.aliases: set[str] = set()
        # first non-empty class seen for this entity
        self.first_class: str | None = None
        # doc_id → {chunks: list, raw: dict}
        self.by_doc: dict[str, dict] = {}
        self.extra: dict = {}  # type-specific scratch (affiliation, geo_class, etc.)

    def add_occurrence(self, doc_id: str, raw_entity: dict, ent_class: str | None) -> None:
        """Record one appearance of the entity in ``doc_id``.

        Keeps the first non-empty class, collects the name/label and
        ``aliases_in_doc`` as aliases, and merges ``evidence_chunks`` into the
        document's sorted chunk list. The ``raw`` dict stored for a document
        is the one from its first occurrence.
        """
        if self.first_class is None and ent_class:
            self.first_class = ent_class

        # Skip whitespace-only names so "" never ends up among the aliases.
        primary = (raw_entity.get("name") or raw_entity.get("label") or "").strip()
        if primary:
            self.aliases.add(primary)
        for alias in raw_entity.get("aliases_in_doc") or []:
            if alias and alias.strip():
                self.aliases.add(alias.strip())

        occ = self.by_doc.setdefault(doc_id, {"chunks": [], "raw": raw_entity})
        evidence = raw_entity.get("evidence_chunks") or []
        occ["chunks"] = sorted(set(occ["chunks"]) | set(evidence))


def merge_dates(buckets: dict[str, EntityBucket], get_date) -> dict[str, dict]:
    """Aggregate per-bucket dates from per-doc raw_entity. (For events only.)

    Returns ``{bucket_key: {"dates": set_of_dates}}`` for every bucket where
    ``get_date(raw)`` returned a truthy value for at least one document.
    """
    out: dict[str, dict] = {}
    for key, bucket in buckets.items():
        for occ in bucket.by_doc.values():
            d = get_date(occ["raw"])
            if d:
                out.setdefault(key, {}).setdefault("dates", set()).add(d)
    return out


def _dict_items(data: dict, key: str) -> list[dict]:
    """Return the dict entries of ``data[key]``, ignoring malformed ones."""
    items = data.get(key) or []
    if not isinstance(items, list):
        return []
    return [item for item in items if isinstance(item, dict)]


def _collect_named(items: list[dict], store: dict[str, EntityBucket],
                   doc_id: str, class_field: str) -> None:
    """Add name-keyed entities (people/orgs/locations) of one document to ``store``."""
    for raw in items:
        name = (raw.get("name") or "").strip()
        if not name or name.lower() == "unknown":
            continue
        ent_id = canonicalize_name(name)
        if not ent_id:
            continue
        bucket = store.get(ent_id)
        if bucket is None:
            # The first spelling seen becomes the canonical display name.
            bucket = store[ent_id] = EntityBucket(ent_id, name)
        bucket.add_occurrence(doc_id, raw, raw.get(class_field))


def aggregate_all(raw_dir: Path | None = None) -> dict:
    """Walk all _reextract.json files. Return a structured aggregation.

    Args:
        raw_dir: directory holding the ``*--subagent`` folders; defaults to RAW.

    Returns:
        A dict with ``docs_processed``, the ``people`` / ``organizations`` /
        ``locations`` / ``events`` / ``uap_objects`` bucket maps keyed by
        entity id, the raw ``relations`` list (each tagged with ``doc_id``)
        and ``chunk_maps`` (doc_id → chunk_id → page). Files that cannot be
        read or parsed, or whose top-level value is not an object, are
        reported on stderr and skipped; non-dict entries are ignored.
    """
    base = RAW if raw_dir is None else raw_dir
    people: dict[str, EntityBucket] = {}
    orgs: dict[str, EntityBucket] = {}
    locs: dict[str, EntityBucket] = {}
    events: dict[str, EntityBucket] = {}
    uap_objs: dict[str, EntityBucket] = {}  # per (doc, event, idx) — never deduped
    relations: list[dict] = []
    docs_processed = 0
    chunk_maps: dict[str, dict[str, int]] = {}

    # Sorted so that "first seen" choices (canonical name, class) are deterministic.
    for jpath in sorted(base.glob("*--subagent/_reextract.json")):
        doc_id = jpath.parent.name.removesuffix("--subagent")
        try:
            data = json.loads(jpath.read_text(encoding="utf-8"))
        except (OSError, ValueError) as e:
            print(f" skip {doc_id}: {e}", file=sys.stderr)
            continue
        if not isinstance(data, dict):
            print(f" skip {doc_id}: top-level JSON is not an object", file=sys.stderr)
            continue
        docs_processed += 1
        chunk_maps[doc_id] = load_chunk_to_page(doc_id, base)

        _collect_named(_dict_items(data, "people"), people, doc_id, "person_class")
        _collect_named(_dict_items(data, "organizations"), orgs, doc_id, "org_class")
        _collect_named(_dict_items(data, "locations"), locs, doc_id, "geo_class")

        # events
        for e in _dict_items(data, "events"):
            label = (e.get("label") or "").strip()
            if not label:
                continue
            eid = event_id_from(label, e.get("date_start"))
            bucket = events.get(eid)
            if bucket is None:
                bucket = events[eid] = EntityBucket(eid, label)
            bucket.add_occurrence(doc_id, e, e.get("event_class"))

            # uap_objects — never cross-event-deduped; inherit parent event's evidence_chunks
            event_chunks = e.get("evidence_chunks") or []
            # Index over the raw list so skipped entries still consume a number.
            for i, u in enumerate(e.get("uap_objects_observed") or [], 1):
                if not isinstance(u, dict):
                    continue
                uid = uap_object_id(eid, i)
                ubucket = uap_objs.get(uid)
                if ubucket is None:
                    ubucket = uap_objs[uid] = EntityBucket(uid, f"{label} — object {i}")
                u_with_evidence = {**u, "evidence_chunks": u.get("evidence_chunks") or event_chunks}
                ubucket.add_occurrence(doc_id, u_with_evidence, u.get("shape"))
                owner = ubucket.extra.setdefault("event_id", eid)
                if owner != eid:
                    # uap_object_id keeps only the year and 20 slug characters,
                    # so two events can share an id; make the merge visible.
                    print(f" warn {doc_id}: {uid} of {eid} collides with {owner}",
                          file=sys.stderr)

        # relations — collected raw, mapped to canonical IDs later
        for r in _dict_items(data, "relations"):
            relations.append({"doc_id": doc_id, **r})

    return {
        "docs_processed": docs_processed,
        "people": people,
        "organizations": orgs,
        "locations": locs,
        "events": events,
        "uap_objects": uap_objs,
        "relations": relations,
        "chunk_maps": chunk_maps,
    }


# ─────────────────────────────────────────────────────────────────────────────
# WRITERS
# ─────────────────────────────────────────────────────────────────────────────

def _header_fields(entity_class: str) -> dict:
    """Leading frontmatter keys shared by every entity file."""
    return {
        "schema_version": SCHEMA_VERSION,
        "type": "entity",
        "entity_class": entity_class,
    }


def _usage_fields(b: EntityBucket, chunk_maps: dict) -> dict:
    """Page references and mention counts of a bucket, in frontmatter order."""
    refs = {
        ref
        for doc_id, occ in b.by_doc.items()
        for ref in page_refs_for(doc_id, occ["chunks"], chunk_maps.get(doc_id, {}))
    }
    return {
        "mentioned_in": sorted(refs),
        "total_mentions": sum(len(occ["chunks"]) for occ in b.by_doc.values()),
        "documents_count": len(b.by_doc),
    }


def _trailer_fields(with_enrichment: bool = True) -> dict:
    """Trailing frontmatter keys; ``enrichment_status`` is optional."""
    fields: dict = {"enrichment_status": "none"} if with_enrichment else {}
    fields["last_ingest"] = NOW
    fields["wiki_version"] = WIKI_VERSION
    fields["source"] = "reextract-v1"
    return fields


def _distinct(b: EntityBucket, field: str, strip: bool = True) -> list:
    """Sorted distinct truthy values of ``field`` across the bucket's documents."""
    values = set()
    for occ in b.by_doc.values():
        value = occ["raw"].get(field)
        if not value:
            continue
        if strip:
            value = value.strip()
        if value:
            values.add(value)
    return sorted(values)


def _longest(b: EntityBucket, field: str) -> str | None:
    """Longest stripped value of ``field`` across documents (first wins ties), or None."""
    best_val = ""
    for occ in b.by_doc.values():
        candidate = (occ["raw"].get(field) or "").strip()
        if len(candidate) > len(best_val):
            best_val = candidate
    return best_val or None


def write_person(b: EntityBucket, chunk_maps: dict) -> None:
    """Write ``people/{id}.md`` for one aggregated person."""
    fm = {
        **_header_fields("person"),
        "person_id": b.ent_id,
        "canonical_name": b.canonical_name,
        "aliases": sorted(b.aliases),
        "person_class": b.first_class,
        "affiliations": _distinct(b, "affiliation"),
        "roles": _distinct(b, "role_at_doc_date"),
        **_usage_fields(b, chunk_maps),
        **_trailer_fields(),
    }
    write_entity(ENT / "people" / f"{b.ent_id}.md", fm, b.canonical_name)


def write_org(b: EntityBucket, chunk_maps: dict) -> None:
    """Write ``organizations/{id}.md`` for one aggregated organization."""
    fm = {
        **_header_fields("organization"),
        "organization_id": b.ent_id,
        "canonical_name": b.canonical_name,
        "aliases": sorted(b.aliases),
        "org_class": b.first_class,
        "countries": _distinct(b, "country"),
        **_usage_fields(b, chunk_maps),
        **_trailer_fields(),
    }
    write_entity(ENT / "organizations" / f"{b.ent_id}.md", fm, b.canonical_name)


def write_location(b: EntityBucket, chunk_maps: dict) -> None:
    """Write ``locations/{id}.md`` for one aggregated location."""
    fm = {
        **_header_fields("location"),
        "location_id": b.ent_id,
        "canonical_name": b.canonical_name,
        "aliases": sorted(b.aliases),
        "geo_class": b.first_class,
        "countries": _distinct(b, "country"),
        "regions_or_states": _distinct(b, "region_or_state"),
        **_usage_fields(b, chunk_maps),
        **_trailer_fields(),
    }
    write_entity(ENT / "locations" / f"{b.ent_id}.md", fm, b.canonical_name)


def write_event(b: EntityBucket, chunk_maps: dict) -> None:
    """Write ``events/{id}.md`` for one aggregated event.

    ``date_start`` is the smallest and ``date_end`` the largest value seen
    across documents; narratives take the longest non-empty text.
    """
    date_starts = _distinct(b, "date_start", strip=False)
    date_ends = _distinct(b, "date_end", strip=False)
    fm = {
        **_header_fields("event"),
        "event_id": b.ent_id,
        "canonical_name": b.canonical_name,
        "aliases": sorted(b.aliases),
        "event_class": b.first_class,
        "date_start": date_starts[0] if date_starts else None,
        "date_end": date_ends[-1] if date_ends else None,
        "date_confidence": None,
        "primary_location_names": _distinct(b, "primary_location_name"),
        "primary_location_geo_classes": _distinct(b, "primary_location_geo_class", strip=False),
        "narrative_summary_en": _longest(b, "narrative_summary"),
        "narrative_summary_pt_br": _longest(b, "narrative_summary_pt_br"),
        **_usage_fields(b, chunk_maps),
        **_trailer_fields(),
    }
    write_entity(ENT / "events" / f"{b.ent_id}.md", fm, b.canonical_name)


def write_uap_object(b: EntityBucket, chunk_maps: dict) -> None:
    """Write ``uap-objects/{id}.md``; attributes come from the first document's raw entry."""
    raw_first = next(iter(b.by_doc.values()))["raw"]
    fm = {
        **_header_fields("uap_object"),
        "uap_object_id": b.ent_id,
        "canonical_name": b.canonical_name,
        "event_id": b.extra.get("event_id"),
        "shape": raw_first.get("shape"),
        "color": raw_first.get("color"),
        "medium": raw_first.get("medium"),
        "size_estimate_m": raw_first.get("size_estimate_m"),
        "altitude_ft": raw_first.get("altitude_ft"),
        "speed_kts": raw_first.get("speed_kts"),
        "maneuver_notes": raw_first.get("maneuver_notes"),
        **_usage_fields(b, chunk_maps),
        **_trailer_fields(with_enrichment=False),
    }
    write_entity(ENT / "uap-objects" / f"{b.ent_id}.md", fm, b.canonical_name)


def main() -> None:
    """Aggregate every document, write all entity files and the relations index."""
    if yaml is None:
        sys.exit("error: PyYAML is required (pip install pyyaml)")

    print(f"[1/3] Aggregating from {RAW}/*--subagent/_reextract.json ...")
    agg = aggregate_all()
    print(f" docs processed: {agg['docs_processed']}")
    print(f" unique people: {len(agg['people'])}")
    print(f" unique orgs: {len(agg['organizations'])}")
    print(f" unique locs: {len(agg['locations'])}")
    print(f" unique events: {len(agg['events'])}")
    print(f" uap objects: {len(agg['uap_objects'])}")
    print(f" raw relations: {len(agg['relations'])}")

    print(f"\n[2/3] Writing entity markdown files ...")
    cmaps = agg["chunk_maps"]
    writers = {
        "people": write_person,
        "organizations": write_org,
        "locations": write_location,
        "events": write_event,
        "uap_objects": write_uap_object,
    }
    written = {}
    for kind, writer in writers.items():
        for b in agg[kind].values():
            writer(b, cmaps)
        written[kind] = len(agg[kind])
    for k, n in written.items():
        print(f" {k}: {n}")

    print(f"\n[3/3] Saving relations index to wiki/entities/_relations.json (for downstream ingest)")
    # No entity file may have been written, so the directory may not exist yet.
    ENT.mkdir(parents=True, exist_ok=True)
    rels_path = ENT / "_relations.json"
    rels_path.write_text(json.dumps({
        "schema_version": SCHEMA_VERSION,
        "rebuilt_at": NOW,
        "count": len(agg["relations"]),
        "relations": agg["relations"],
    }, indent=2, ensure_ascii=False), encoding="utf-8")
    print(f" saved {len(agg['relations'])} relations to {rels_path}")
    print(f"\n✓ done.")


if __name__ == "__main__":
    main()