# disclosure-bureau/scripts/synthesize/30_rebuild_wiki_from_reextract.py

#!/usr/bin/env python3
"""
30_rebuild_wiki_from_reextract.py — Rebuild wiki/entities/ from scratch using
the 116 _reextract.json files as the SOLE source of truth.

Pipeline:
  1. Load every raw/<doc>--subagent/_reextract.json
  2. Load every raw/<doc>--subagent/_index.json (chunk_id → page map)
  3. Cross-doc dedup:
       person/org/loc:  by canonical_name (lowercase, ASCII-fold)
       event:           by event_id (EV-YYYY-MM-DD-slug)
       uap_object:      per (event, observed_index) — never deduped cross-event
  4. Generate IDs per CLAUDE.md regex
  5. Write wiki/entities/{type}/<id>.md (clean frontmatter + EN/PT-BR body stubs)
  6. Save the raw relations to wiki/entities/_relations.json
  7. Print summary

Does NOT touch DB. DB sync is a separate step.
Idempotent: re-running with the same inputs produces the same outputs, apart from
the `last_ingest` / `rebuilt_at` timestamps, which record the time of the run.
"""
from __future__ import annotations
import json
import re
import sys
import unicodedata
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path

import yaml

# Absolute root of the working tree; every input and output path hangs off it.
UFO = Path("/Users/guto/ufo")
# Input side: per-document raw/<doc>--subagent/ folders are read from here.
RAW = UFO / "raw"
# Output side: entity markdown files and _relations.json are written under here.
ENT = UFO / "wiki" / "entities"

# SCHEMA_VERSION is stamped into entity frontmatter and _relations.json;
# WIKI_VERSION is stamped into entity frontmatter.
SCHEMA_VERSION = "0.1.0"
WIKI_VERSION = "0.1.0"
# UTC timestamp taken once at import time, so every file from one run shares it.
NOW = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")


def canonicalize_name(name: str) -> str:
    """Turn a display name into a lowercase, ASCII-folded, kebab-case id.

    Accents are removed by NFKD-decomposing the text and dropping the combining
    marks. Every remaining character outside ``[a-z0-9-]`` becomes a dash, dash
    runs collapse to one, and leading/trailing dashes are trimmed. An id that
    would start with a digit is prefixed with ``x-``. A falsy ``name`` (and a
    name with no usable characters) yields the empty string.
    """
    if not name:
        return ""
    decomposed = unicodedata.normalize("NFKD", name)
    without_marks = [ch for ch in decomposed if not unicodedata.combining(ch)]
    folded = "".join(without_marks).lower()
    dashed = re.sub(r"[^a-z0-9-]", "-", folded)
    slug = re.sub(r"-+", "-", dashed).strip("-")
    # slug[:1] is "" for an empty slug, and "".isdigit() is False.
    if slug[:1].isdigit():
        return "x-" + slug
    return slug


def event_id_from(label: str, date_start: str | None) -> str:
    """Build the ``EV-YYYY-MM-DD-slug`` event id from a label and a start date.

    The slug is the canonicalized label cut to 40 characters (with any dash
    left dangling by the cut removed), or ``unlabeled`` when nothing remains.
    Date parts missing from ``date_start`` are written as ``X`` placeholders:
    ``"1947-07"`` gives ``EV-1947-07-XX-...``, ``"1947"`` gives
    ``EV-1947-XX-XX-...`` and any other value gives ``EV-XXXX-XX-XX-...``.

    ``date_start`` is coerced to text and stripped before matching, so a
    non-string value (for example an integer year ``1947``) or a
    whitespace-padded string is handled instead of raising ``TypeError`` or
    silently losing its date.
    """
    slug = canonicalize_name(label or "")[:40].strip("-") or "unlabeled"
    date = "" if date_start is None else str(date_start).strip()
    # One pattern for YYYY, YYYY-MM and YYYY-MM-DD; unmatched groups are None.
    m = re.fullmatch(r"(\d{4})(?:-(\d{2})(?:-(\d{2}))?)?", date)
    if m is None:
        return f"EV-XXXX-XX-XX-{slug}"
    year, month, day = m.groups()
    return f"EV-{year}-{month or 'XX'}-{day or 'XX'}-{slug}"


def uap_object_id(event_id: str, index: int) -> str:
    """Derive a UAP-object id from its parent event id and a 1-based index.

    For an id shaped ``EV-<year>-<mm>-<dd>-<slug>`` the result is
    ``OBJ-EV<year>-<SLUG>-<index>``, where ``<SLUG>`` is the upper-cased slug
    with everything but ``A-Z0-9`` removed and cut to 20 characters (``UNK``
    when nothing is left), and the index is zero-padded to two digits. Any
    other ``event_id`` gives ``OBJ-UNK-<index>``.

    The month and day of the event are not part of the id, so two events in
    the same year whose slugs share the same first 20 alphanumerics map to the
    same object ids.
    """
    if not event_id.startswith("EV-"):
        return f"OBJ-UNK-{index:02d}"
    # maxsplit=3 keeps the whole slug, dashes included, in the fourth field.
    fields = event_id[3:].split("-", 3)
    if len(fields) < 4:
        return f"OBJ-UNK-{index:02d}"
    year, slug = fields[0], fields[3]
    compact = re.sub(r"[^A-Z0-9]", "", slug.upper())[:20]
    if not compact:
        compact = "UNK"
    return f"OBJ-EV{year}-{compact}-{index:02d}"


def dump_yaml(obj: dict) -> str:
    """Render *obj* as block-style YAML text with surrounding whitespace removed.

    Keys keep their insertion order (``sort_keys=False``), non-ASCII characters
    are allowed through (``allow_unicode=True``), and the very large ``width``
    is there to keep long values on a single line.
    """
    rendered = yaml.safe_dump(
        obj,
        default_flow_style=False,
        allow_unicode=True,
        sort_keys=False,
        width=10_000,
    )
    return rendered.strip()


def write_entity(path: Path, frontmatter: dict, body_title: str) -> None:
    """Write one entity markdown file, creating parent directories as needed.

    The file consists of the YAML frontmatter between ``---`` fences, an H1
    with ``body_title``, and two empty section headings (EN and PT-BR). An
    existing file at ``path`` is overwritten.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    lines = [
        "---",
        dump_yaml(frontmatter),
        "---",
        "",
        f"# {body_title}",
        "",
        "## Description (EN)",
        "",
        "## Descrição (PT-BR)",
        "",  # makes the joined text end with a newline
    ]
    path.write_text("\n".join(lines), encoding="utf-8")


def load_chunk_to_page(doc_id: str) -> dict[str, int]:
    """Return the chunk_id → page map from ``raw/<doc_id>--subagent/_index.json``.

    Best-effort: a missing index yields ``{}`` silently. An index that exists
    but cannot be read or parsed, or whose top-level shape is unexpected,
    yields ``{}`` with a warning on stderr so the resulting lack of page
    references is visible. Individual ``chunks`` entries that are not objects,
    or that lack a usable ``chunk_id`` or a ``page``, are skipped without
    discarding the rest of the map. Page values are stored as found.
    """
    idx_path = RAW / f"{doc_id}--subagent" / "_index.json"
    if not idx_path.is_file():
        return {}
    try:
        idx = json.loads(idx_path.read_text(encoding="utf-8"))
    except (OSError, ValueError, RecursionError) as exc:
        # ValueError covers both JSONDecodeError and UnicodeDecodeError.
        print(f"  warn {doc_id}: cannot read _index.json: {exc}", file=sys.stderr)
        return {}

    if not isinstance(idx, dict):
        print(f"  warn {doc_id}: _index.json is not a JSON object", file=sys.stderr)
        return {}
    chunks = idx.get("chunks") or []
    if not isinstance(chunks, list):
        print(f"  warn {doc_id}: _index.json 'chunks' is not a list", file=sys.stderr)
        return {}

    mapping: dict[str, int] = {}
    for entry in chunks:
        if not isinstance(entry, dict):
            continue
        chunk_id = entry.get("chunk_id")
        page = entry.get("page")
        # Only hashable scalar ids can serve as lookup keys.
        if isinstance(chunk_id, (str, int)) and chunk_id and page is not None:
            mapping[chunk_id] = page
    return mapping


def page_refs_for(doc_id: str, chunks: list[str], chunk_to_page: dict[str, int]) -> list[str]:
    """Return sorted, de-duplicated ``[[<doc_id>/pNNN]]`` links for *chunks*.

    Each chunk id is looked up in ``chunk_to_page``; ids with no page are
    ignored. Pages are converted with ``int()`` and zero-padded to three
    digits. A falsy ``chunks`` gives an empty list.
    """
    looked_up = (chunk_to_page.get(chunk_id) for chunk_id in (chunks or []))
    page_numbers = {int(page) for page in looked_up if page is not None}
    return [f"[[{doc_id}/p{number:03d}]]" for number in sorted(page_numbers)]


# ─────────────────────────────────────────────────────────────────────────────
# AGGREGATION
# ─────────────────────────────────────────────────────────────────────────────

class EntityBucket:
    """Accumulator for everything seen about one entity across documents."""
    __slots__ = ("ent_id", "canonical_name", "aliases", "first_class",
                 "by_doc", "extra")

    def __init__(self, ent_id: str, canonical_name: str):
        """Start an empty bucket for the entity with the given id and display name."""
        self.ent_id = ent_id
        self.canonical_name = canonical_name
        # Every surface form seen so far (names/labels plus aliases_in_doc).
        self.aliases: set[str] = set()
        # First truthy class passed to add_occurrence; later ones are ignored.
        self.first_class: str | None = None
        # doc_id → {"chunks": sorted evidence chunk ids, "raw": first raw dict for that doc}
        self.by_doc: dict[str, dict] = {}
        # Free-form, type-specific scratch space filled in by callers.
        self.extra: dict = {}

    def add_occurrence(self, doc_id: str, raw_entity: dict, ent_class: str | None):
        """Record one appearance of the entity in document *doc_id*.

        The entity's ``name`` (or, failing that, ``label``) and every non-blank
        ``aliases_in_doc`` entry are added to ``aliases``. The first raw dict
        seen for a document is kept as that document's ``raw``; later calls for
        the same document only merge their ``evidence_chunks`` into ``chunks``.
        """
        if ent_class and self.first_class is None:
            self.first_class = ent_class

        display = raw_entity.get("name") or raw_entity.get("label")
        if display:
            self.aliases.add(display.strip())
        self.aliases.update(
            alias.strip()
            for alias in (raw_entity.get("aliases_in_doc") or [])
            if alias and alias.strip()
        )

        entry = self.by_doc.get(doc_id)
        if entry is None:
            entry = self.by_doc[doc_id] = {"chunks": [], "raw": raw_entity}
        evidence = raw_entity.get("evidence_chunks") or []
        entry["chunks"] = sorted(set(entry["chunks"]).union(evidence))


def merge_dates(buckets: dict[str, EntityBucket], get_date) -> dict[str, dict]:
    """Collect the distinct dates found in each bucket's per-document raw dicts.

    ``get_date`` is called with every occurrence's ``raw`` dict; truthy results
    are gathered into ``{bucket_key: {"dates": set_of_values}}``. Buckets that
    produce no truthy date do not appear in the result.
    """
    merged: dict[str, dict] = {}
    for key, bucket in buckets.items():
        for occurrence in bucket.by_doc.values():
            value = get_date(occurrence["raw"])
            if not value:
                continue
            if key not in merged:
                merged[key] = {}
            merged[key].setdefault("dates", set()).add(value)
    return merged


def _collect_named_entities(items, buckets: dict[str, EntityBucket],
                            doc_id: str, class_key: str) -> None:
    """Fold one document's people/organizations/locations list into *buckets*.

    Entries are keyed by the canonicalized ``name``. Entries that are not
    objects, have no string name, are named "unknown" (any case), or whose
    name canonicalizes to nothing are skipped. ``class_key`` names the field
    holding the entity's class (e.g. ``person_class``).
    """
    for item in items if isinstance(items, list) else []:
        if not isinstance(item, dict):
            continue
        raw_name = item.get("name")
        name = raw_name.strip() if isinstance(raw_name, str) else ""
        if not name or name.lower() == "unknown":
            continue
        ent_id = canonicalize_name(name)
        if not ent_id:
            continue
        bucket = buckets.setdefault(ent_id, EntityBucket(ent_id, name))
        bucket.add_occurrence(doc_id, item, item.get(class_key))


def aggregate_all() -> dict:
    """Walk every ``raw/*--subagent/_reextract.json`` and aggregate its entities.

    Files are visited in sorted path order. A file that cannot be read or
    parsed, or whose top level is not a JSON object, is reported on stderr and
    skipped. Malformed (non-object) entries inside a file are skipped one by
    one rather than aborting the run.

    Returns a dict with:
      docs_processed  number of files successfully loaded
      people / organizations / locations / events / uap_objects
                      id → EntityBucket
      relations       raw relation dicts, each tagged with its ``doc_id``
      chunk_maps      doc_id → chunk_id → page map
    """
    people: dict[str, EntityBucket] = {}
    orgs:   dict[str, EntityBucket] = {}
    locs:   dict[str, EntityBucket] = {}
    events: dict[str, EntityBucket] = {}
    # Keyed by uap_object_id(event_id, index): one bucket per (event, index),
    # shared by every document that reports that event.
    uap_objs: dict[str, EntityBucket] = {}
    relations: list[dict] = []
    docs_processed = 0
    chunk_maps: dict[str, dict[str, int]] = {}

    for jpath in sorted(RAW.glob("*--subagent/_reextract.json")):
        doc_id = jpath.parent.name.removesuffix("--subagent")
        try:
            data = json.loads(jpath.read_text(encoding="utf-8"))
        except Exception as exc:
            # Deliberately broad: one bad file must not stop the rebuild.
            print(f"  skip {doc_id}: {exc}", file=sys.stderr)
            continue
        if not isinstance(data, dict):
            print(f"  skip {doc_id}: top-level JSON is {type(data).__name__}, expected object",
                  file=sys.stderr)
            continue
        docs_processed += 1
        chunk_maps[doc_id] = load_chunk_to_page(doc_id)

        _collect_named_entities(data.get("people"), people, doc_id, "person_class")
        _collect_named_entities(data.get("organizations"), orgs, doc_id, "org_class")
        _collect_named_entities(data.get("locations"), locs, doc_id, "geo_class")

        # events
        raw_events = data.get("events")
        for ev in raw_events if isinstance(raw_events, list) else []:
            if not isinstance(ev, dict):
                continue
            raw_label = ev.get("label")
            label = raw_label.strip() if isinstance(raw_label, str) else ""
            if not label:
                continue
            eid = event_id_from(label, ev.get("date_start"))
            bucket = events.setdefault(eid, EntityBucket(eid, label))
            bucket.add_occurrence(doc_id, ev, ev.get("event_class"))

            # uap_objects — an object without its own evidence_chunks inherits
            # the parent event's. The index counts every list position, so a
            # skipped malformed entry still consumes its number.
            event_chunks = ev.get("evidence_chunks") or []
            for i, u in enumerate(ev.get("uap_objects_observed") or [], 1):
                if not isinstance(u, dict):
                    continue
                uid = uap_object_id(eid, i)
                ubucket = uap_objs.setdefault(uid, EntityBucket(uid, f"{label} — object {i}"))
                u_with_evidence = {**u, "evidence_chunks": u.get("evidence_chunks") or event_chunks}
                ubucket.add_occurrence(doc_id, u_with_evidence, u.get("shape"))
                ubucket.extra.setdefault("event_id", eid)

        # relations — collected raw, mapped to canonical IDs later
        for r in data.get("relations") or []:
            if not isinstance(r, dict):
                continue
            relations.append({"doc_id": doc_id, **r})

    return {
        "docs_processed": docs_processed,
        "people": people, "organizations": orgs, "locations": locs,
        "events": events, "uap_objects": uap_objs,
        "relations": relations, "chunk_maps": chunk_maps,
    }


# ─────────────────────────────────────────────────────────────────────────────
# WRITERS
# ─────────────────────────────────────────────────────────────────────────────

def write_person(b: EntityBucket, chunk_maps: dict) -> None:
    """Write ``people/<person_id>.md`` under ENT for one aggregated person.

    ``chunk_maps`` maps doc_id → chunk_id → page and is used to turn each
    document's evidence chunks into page links. Affiliations and roles are the
    sorted, stripped, non-empty values found across the per-document raw dicts.
    """
    occurrences = list(b.by_doc.values())

    page_links: set[str] = set()
    for doc_id, occ in b.by_doc.items():
        doc_pages = chunk_maps.get(doc_id, {})
        page_links.update(page_refs_for(doc_id, occ["chunks"], doc_pages))

    def distinct(field: str) -> list[str]:
        # Sorted, stripped, non-empty values of one raw field across documents.
        seen = {(occ["raw"].get(field) or "").strip() for occ in occurrences}
        seen.discard("")
        return sorted(seen)

    affiliations = distinct("affiliation")
    roles = distinct("role_at_doc_date")
    chunk_total = sum(len(occ["chunks"]) for occ in occurrences)

    # Key order is the order in which the frontmatter is emitted.
    fm = {
        "schema_version": SCHEMA_VERSION,
        "type": "entity",
        "entity_class": "person",
        "person_id": b.ent_id,
        "canonical_name": b.canonical_name,
        "aliases": sorted(b.aliases),
        "person_class": b.first_class,
        "affiliations": affiliations,
        "roles": roles,
        "mentioned_in": sorted(page_links),
        "total_mentions": chunk_total,
        "documents_count": len(occurrences),
        "enrichment_status": "none",
        "last_ingest": NOW,
        "wiki_version": WIKI_VERSION,
        "source": "reextract-v1",
    }
    target = ENT / "people" / f"{b.ent_id}.md"
    write_entity(target, fm, b.canonical_name)


def write_org(b: EntityBucket, chunk_maps: dict) -> None:
    """Write ``organizations/<organization_id>.md`` under ENT for one organization.

    ``chunk_maps`` maps doc_id → chunk_id → page and is used to turn each
    document's evidence chunks into page links. ``countries`` holds the sorted,
    stripped, non-empty ``country`` values found across the raw dicts.
    """
    occurrences = list(b.by_doc.values())

    page_links: set[str] = set()
    for doc_id, occ in b.by_doc.items():
        doc_pages = chunk_maps.get(doc_id, {})
        page_links.update(page_refs_for(doc_id, occ["chunks"], doc_pages))

    country_set = {(occ["raw"].get("country") or "").strip() for occ in occurrences}
    country_set.discard("")
    chunk_total = sum(len(occ["chunks"]) for occ in occurrences)

    # Key order is the order in which the frontmatter is emitted.
    fm = {
        "schema_version": SCHEMA_VERSION,
        "type": "entity",
        "entity_class": "organization",
        "organization_id": b.ent_id,
        "canonical_name": b.canonical_name,
        "aliases": sorted(b.aliases),
        "org_class": b.first_class,
        "countries": sorted(country_set),
        "mentioned_in": sorted(page_links),
        "total_mentions": chunk_total,
        "documents_count": len(occurrences),
        "enrichment_status": "none",
        "last_ingest": NOW,
        "wiki_version": WIKI_VERSION,
        "source": "reextract-v1",
    }
    target = ENT / "organizations" / f"{b.ent_id}.md"
    write_entity(target, fm, b.canonical_name)


def write_location(b: EntityBucket, chunk_maps: dict) -> None:
    """Write ``locations/<location_id>.md`` under ENT for one aggregated location.

    ``chunk_maps`` maps doc_id → chunk_id → page and is used to turn each
    document's evidence chunks into page links. Countries and regions are the
    sorted, stripped, non-empty values found across the per-document raw dicts.
    """
    occurrences = list(b.by_doc.values())

    page_links: set[str] = set()
    for doc_id, occ in b.by_doc.items():
        doc_pages = chunk_maps.get(doc_id, {})
        page_links.update(page_refs_for(doc_id, occ["chunks"], doc_pages))

    def distinct(field: str) -> list[str]:
        # Sorted, stripped, non-empty values of one raw field across documents.
        seen = {(occ["raw"].get(field) or "").strip() for occ in occurrences}
        seen.discard("")
        return sorted(seen)

    countries = distinct("country")
    regions = distinct("region_or_state")
    chunk_total = sum(len(occ["chunks"]) for occ in occurrences)

    # Key order is the order in which the frontmatter is emitted.
    fm = {
        "schema_version": SCHEMA_VERSION,
        "type": "entity",
        "entity_class": "location",
        "location_id": b.ent_id,
        "canonical_name": b.canonical_name,
        "aliases": sorted(b.aliases),
        "geo_class": b.first_class,
        "countries": countries,
        "regions_or_states": regions,
        "mentioned_in": sorted(page_links),
        "total_mentions": chunk_total,
        "documents_count": len(occurrences),
        "enrichment_status": "none",
        "last_ingest": NOW,
        "wiki_version": WIKI_VERSION,
        "source": "reextract-v1",
    }
    target = ENT / "locations" / f"{b.ent_id}.md"
    write_entity(target, fm, b.canonical_name)


def write_event(b: EntityBucket, chunk_maps: dict) -> None:
    """Write ``events/<event_id>.md`` under ENT for one aggregated event.

    ``chunk_maps`` maps doc_id → chunk_id → page and is used to turn each
    document's evidence chunks into page links. Across the per-document raw
    dicts: ``date_start`` is the smallest start date, ``date_end`` the largest
    end date, location names/geo classes are sorted distinct values, and each
    narrative field takes the longest non-empty text (the first one on a tie).
    ``date_confidence`` is always written as null.
    """
    raws = [occ["raw"] for occ in b.by_doc.values()]

    page_links: set[str] = set()
    for doc_id, occ in b.by_doc.items():
        doc_pages = chunk_maps.get(doc_id, {})
        page_links.update(page_refs_for(doc_id, occ["chunks"], doc_pages))

    def truthy_values(field: str) -> list:
        # Sorted distinct truthy values of one raw field, taken as-is.
        found = set()
        for raw in raws:
            value = raw.get(field)
            if value:
                found.add(value)
        return sorted(found)

    def longest_text(field: str):
        # max() returns the first of several equally long candidates.
        texts = [(raw.get(field) or "").strip() for raw in raws]
        return max(texts, key=len, default="") or None

    date_starts = truthy_values("date_start")
    date_ends = truthy_values("date_end")
    location_names = {(raw.get("primary_location_name") or "").strip() for raw in raws}
    location_names.discard("")
    geos = truthy_values("primary_location_geo_class")
    chunk_total = sum(len(occ["chunks"]) for occ in b.by_doc.values())

    # Key order is the order in which the frontmatter is emitted.
    fm = {
        "schema_version": SCHEMA_VERSION,
        "type": "entity",
        "entity_class": "event",
        "event_id": b.ent_id,
        "canonical_name": b.canonical_name,
        "aliases": sorted(b.aliases),
        "event_class": b.first_class,
        "date_start": date_starts[0] if date_starts else None,
        "date_end": date_ends[-1] if date_ends else None,
        "date_confidence": None,
        "primary_location_names": sorted(location_names),
        "primary_location_geo_classes": geos,
        "narrative_summary_en": longest_text("narrative_summary"),
        "narrative_summary_pt_br": longest_text("narrative_summary_pt_br"),
        "mentioned_in": sorted(page_links),
        "total_mentions": chunk_total,
        "documents_count": len(raws),
        "enrichment_status": "none",
        "last_ingest": NOW,
        "wiki_version": WIKI_VERSION,
        "source": "reextract-v1",
    }
    target = ENT / "events" / f"{b.ent_id}.md"
    write_entity(target, fm, b.canonical_name)


def write_uap_object(b: EntityBucket, chunk_maps: dict) -> None:
    """Write ``uap-objects/<uap_object_id>.md`` under ENT for one UAP object.

    ``chunk_maps`` maps doc_id → chunk_id → page and is used to turn each
    document's evidence chunks into page links. The descriptive fields (shape,
    color, ...) are taken from the raw dict of the first document recorded in
    the bucket; the parent event id comes from ``b.extra``.
    """
    page_links: set[str] = set()
    for doc_id, occ in b.by_doc.items():
        doc_pages = chunk_maps.get(doc_id, {})
        page_links.update(page_refs_for(doc_id, occ["chunks"], doc_pages))

    first_doc = next(iter(b.by_doc))
    raw_first = b.by_doc[first_doc]["raw"]
    chunk_total = sum(len(occ["chunks"]) for occ in b.by_doc.values())

    # Built in three steps; insertion order is the emitted frontmatter order.
    fm = {
        "schema_version": SCHEMA_VERSION,
        "type": "entity",
        "entity_class": "uap_object",
        "uap_object_id": b.ent_id,
        "canonical_name": b.canonical_name,
        "event_id": b.extra.get("event_id"),
    }
    for field in ("shape", "color", "medium", "size_estimate_m",
                  "altitude_ft", "speed_kts", "maneuver_notes"):
        fm[field] = raw_first.get(field)
    fm.update({
        "mentioned_in": sorted(page_links),
        "total_mentions": chunk_total,
        "documents_count": len(b.by_doc),
        "last_ingest": NOW,
        "wiki_version": WIKI_VERSION,
        "source": "reextract-v1",
    })
    target = ENT / "uap-objects" / f"{b.ent_id}.md"
    write_entity(target, fm, b.canonical_name)


def main():
    """Run the rebuild: aggregate, write entity files, then save the relations.

    Progress and per-type counts are printed to stdout. The raw relations are
    written to ``_relations.json`` under ENT; that directory is created first
    if needed, because a run that writes no entity file never creates it and
    the relations write would otherwise fail.
    """
    print(f"[1/3] Aggregating from {RAW}/*--subagent/_reextract.json ...")
    agg = aggregate_all()
    print(f"      docs processed:  {agg['docs_processed']}")
    print(f"      unique people:   {len(agg['people'])}")
    print(f"      unique orgs:     {len(agg['organizations'])}")
    print(f"      unique locs:     {len(agg['locations'])}")
    print(f"      unique events:   {len(agg['events'])}")
    print(f"      uap objects:     {len(agg['uap_objects'])}")
    print(f"      raw relations:   {len(agg['relations'])}")

    print("\n[2/3] Writing entity markdown files ...")
    cmaps = agg["chunk_maps"]
    # Order here is both the write order and the order of the printed counts.
    writers = (
        ("people", write_person),
        ("organizations", write_org),
        ("locations", write_location),
        ("events", write_event),
        ("uap_objects", write_uap_object),
    )
    written = {}
    for key, writer in writers:
        count = 0
        for bucket in agg[key].values():
            writer(bucket, cmaps)
            count += 1
        written[key] = count
    for key, count in written.items():
        print(f"      {key}: {count}")

    print("\n[3/3] Saving relations index to wiki/entities/_relations.json (for downstream ingest)")
    rels_path = ENT / "_relations.json"
    # write_entity only creates directories for files it writes, so ENT may
    # not exist yet when nothing was aggregated.
    rels_path.parent.mkdir(parents=True, exist_ok=True)
    rels_path.write_text(json.dumps({
        "schema_version": SCHEMA_VERSION,
        "rebuilt_at": NOW,
        "count": len(agg["relations"]),
        "relations": agg["relations"],
    }, indent=2, ensure_ascii=False), encoding="utf-8")
    print(f"      saved {len(agg['relations'])} relations to {rels_path}")
    print("\n✓ done.")


# Script entry point: run the rebuild only when executed directly, not on import.
if __name__ == "__main__":
    main()