Add reextract pipeline (scripts/reextract/) that rebuilds doc-level entity JSON from Sonnet-vision chunks via Opus, replacing the noisy per-page extraction. Add synthesize scripts to regenerate wiki/entities from the 116 _reextract.json (30), aggregate missing page.md from chunks (31), and reprocess 805 pages the doc-rebuilder agent dropped on context overflow (32). Add maintain scripts 43-56 for chunk-page sync, dedup, generic-entity marking, and typed relation extraction. Web: wire relations API + entity-relations component; entity/timeline/doc pages consume the rebuilt layer. Note: raw/, processing/, wiki/ remain gitignored (bulk data managed separately); the 116 reextract JSONs and 7,798 rebuilt entity files live on disk only. The 27 curated anchor events under wiki/entities/events/ are preserved. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
447 lines
18 KiB
Python
447 lines
18 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
30_rebuild_wiki_from_reextract.py — Rebuild wiki/entities/ from scratch using
|
|
the 116 _reextract.json files as the SOLE source of truth.
|
|
|
|
Pipeline:
|
|
1. Load every raw/<doc>--subagent/_reextract.json
|
|
2. Load every raw/<doc>--subagent/_index.json (chunk_id → page map)
|
|
3. Cross-doc dedup:
|
|
person/org/loc: by canonical_name (lowercase, ASCII-fold)
|
|
event: by event_id (EV-YYYY-MM-DD-slug)
|
|
uap_object: per (event, observed_index) — never deduped cross-event
|
|
4. Generate IDs per CLAUDE.md regex
|
|
5. Write wiki/entities/{type}/<id>.md (clean frontmatter + EN/PT-BR body stubs)
|
|
6. Print summary
|
|
|
|
Does NOT touch DB. DB sync is a separate step.
|
|
Idempotent: re-running with the same inputs produces the same outputs (deterministic), except for the run timestamp written to last_ingest / rebuilt_at.
|
|
"""
|
|
from __future__ import annotations

import json
import os
import re
import sys
import unicodedata
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path

import yaml
|
|
|
|
# Project root. Defaults to the original hard-coded checkout location; set the
# UFO_ROOT environment variable to run the rebuild against a checkout that
# lives somewhere else (an empty value falls back to the default).
UFO = Path(os.environ.get("UFO_ROOT") or "/Users/guto/ufo")
# Per-document "<doc>--subagent" folders are globbed from here.
RAW = UFO / "raw"
# Entity markdown files and _relations.json are written below this directory.
ENT = UFO / "wiki" / "entities"

# Version strings stamped into every generated frontmatter block.
SCHEMA_VERSION = "0.1.0"
WIKI_VERSION = "0.1.0"
# Single UTC timestamp taken at import time; every file written during one run
# carries the same value (as last_ingest / rebuilt_at).
NOW = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
|
|
|
|
|
def canonicalize_name(name: str) -> str:
    """Turn a free-form name into a kebab-case, ASCII-folded identifier.

    Mirrors scripts/03-dedup-entities.py (per the original author's note).

    Steps: NFKD-decompose and drop combining marks, lowercase, replace every
    character outside ``[a-z0-9-]`` with a hyphen, collapse hyphen runs, trim
    leading/trailing hyphens, and prefix ``x-`` when the result would start
    with a digit.

    Returns an empty string for a falsy *name* or when nothing survives the
    clean-up.
    """
    if not name:
        return ""
    decomposed = unicodedata.normalize("NFKD", name)
    folded = "".join(ch for ch in decomposed if not unicodedata.combining(ch)).lower()
    hyphenated = re.sub(r"[^a-z0-9-]", "-", folded)
    slug = re.sub(r"-+", "-", hyphenated).strip("-")
    # An empty slice is never a digit, so the empty slug falls through as-is.
    return "x-" + slug if slug[:1].isdigit() else slug
|
|
|
|
|
|
def event_id_from(label: str, date_start: str | None) -> str:
    """Build an ``EV-<year>-<month>-<day>-<slug>`` identifier for an event.

    The slug is the canonicalized *label*, cut to 40 characters and trimmed of
    hyphens, or ``unlabeled`` when nothing is left. *date_start* may be a full
    ``YYYY-MM-DD``, a ``YYYY-MM`` or a bare ``YYYY``; missing components are
    written as ``XX``. Any other value (including ``None``) yields
    ``EV-XXXX-XX-XX-<slug>``.
    """
    slug = canonicalize_name(label or "")[:40].strip("-") or "unlabeled"
    date = date_start or ""
    # Most specific shape first; each entry pairs a pattern with the
    # placeholders needed to pad the match out to year-month-day.
    shapes = (
        (r"^(\d{4})-(\d{2})-(\d{2})$", ()),
        (r"^(\d{4})-(\d{2})$", ("XX",)),
        (r"^(\d{4})$", ("XX", "XX")),
    )
    for pattern, padding in shapes:
        hit = re.match(pattern, date)
        if hit:
            return "-".join(("EV", *hit.groups(), *padding, slug))
    return f"EV-XXXX-XX-XX-{slug}"
|
|
|
|
|
|
def uap_object_id(event_id: str, index: int) -> str:
    """Derive a UAP-object identifier from its parent event ID and position.

    For an ``EV-<year>-<month>-<day>-<slug>`` event the result is
    ``OBJ-EV<year>-<COMPACT>-<NN>``, where ``COMPACT`` is the upper-cased slug
    with everything but ``A-Z0-9`` removed and cut to 20 characters (``UNK``
    when empty) and ``NN`` is *index* zero-padded to two digits. Event IDs
    that lack the ``EV-`` prefix or have fewer than four hyphen-separated
    parts after it produce ``OBJ-UNK-<NN>``.

    NOTE(review): month and day are not part of the result and the slug is
    truncated, so two different events from the same year can map to the same
    object ID — confirm whether that is acceptable.
    """
    # maxsplit=4 keeps year, month, day and the first slug word separate; any
    # remaining slug text stays together in the fifth piece.
    pieces = event_id[3:].split("-", 4) if event_id.startswith("EV-") else []
    if len(pieces) < 4:
        return f"OBJ-UNK-{index:02d}"
    year = pieces[0]
    slug = "-".join(pieces[3:])
    compact = re.sub(r"[^A-Z0-9]", "", slug.upper())[:20] or "UNK"
    return f"OBJ-EV{year}-{compact}-{index:02d}"
|
|
|
|
|
|
def dump_yaml(obj: dict) -> str:
    """Serialize *obj* to YAML text in the style used by the entity files.

    Keys keep their insertion order, non-ASCII characters are written as-is,
    block style is forced, and the very large ``width`` is there to keep long
    values on one line. Surrounding whitespace (including the trailing
    newline) is stripped from the result.
    """
    rendered = yaml.safe_dump(
        obj,
        allow_unicode=True,
        default_flow_style=False,
        sort_keys=False,
        width=10_000,
    )
    return rendered.strip()
|
|
|
|
|
|
def write_entity(path: Path, frontmatter: dict, body_title: str) -> None:
    """Write one entity markdown file to *path*, creating parent directories.

    The file consists of the YAML *frontmatter* between ``---`` fences, an H1
    heading with *body_title*, and two empty description sections (English
    and Brazilian Portuguese). An existing file at *path* is overwritten.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    sections = [
        f"---\n{dump_yaml(frontmatter)}\n---\n\n",
        f"# {body_title}\n\n",
        "## Description (EN)\n\n",
        "## Descrição (PT-BR)\n",
    ]
    path.write_text("".join(sections), encoding="utf-8")
|
|
|
|
|
|
def load_chunk_to_page(doc_id: str) -> dict[str, int]:
    """Load the chunk_id → page mapping from ``raw/<doc_id>--subagent/_index.json``.

    Only entries of the index's ``chunks`` list that have a truthy
    ``chunk_id`` and a non-``None`` ``page`` are kept.

    Returns an empty dict when the index file does not exist. An index that
    exists but cannot be read, parsed or traversed also yields an empty dict,
    but a warning is printed to stderr so the resulting loss of page
    references for this document is visible.
    """
    idx_path = RAW / f"{doc_id}--subagent" / "_index.json"
    if not idx_path.is_file():
        return {}
    try:
        idx = json.loads(idx_path.read_text(encoding="utf-8"))
        return {
            c.get("chunk_id"): c.get("page")
            for c in (idx.get("chunks") or [])
            if c.get("chunk_id") and c.get("page") is not None
        }
    except Exception as exc:
        # Deliberately broad: this is a best-effort lookup and a malformed
        # index must not abort the whole rebuild. It is reported rather than
        # swallowed, because an empty map silently empties `mentioned_in`.
        print(f" warn {doc_id}: unusable _index.json ({exc}); page refs will be empty",
              file=sys.stderr)
        return {}
|
|
|
|
|
|
def page_refs_for(doc_id: str, chunks: list[str], chunk_to_page: dict[str, int]) -> list[str]:
    """Translate chunk IDs into sorted, de-duplicated ``[[doc/pNNN]]`` wiki links.

    Chunks missing from *chunk_to_page* (or mapped to ``None``) are ignored.
    Page values are coerced with ``int()`` and zero-padded to at least three
    digits. A falsy *chunks* yields an empty list.
    """
    looked_up = (chunk_to_page.get(chunk_id) for chunk_id in chunks or [])
    page_numbers = {int(page) for page in looked_up if page is not None}
    return [f"[[{doc_id}/p{number:03d}]]" for number in sorted(page_numbers)]
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# AGGREGATION
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
class EntityBucket:
    """Aggregates one entity across multiple documents.

    Attributes (fixed by ``__slots__``):
        ent_id: identifier the bucket is stored under.
        canonical_name: display name supplied when the bucket was created.
        aliases: every distinct, non-blank surface form seen so far.
        first_class: first truthy class value passed to ``add_occurrence``.
        by_doc: doc_id → ``{"chunks": sorted list, "raw": dict}``; ``raw`` is
            the first raw entity seen for that document.
        extra: type-specific scratch space (affiliation, geo_class, etc.).
    """

    __slots__ = ("ent_id", "canonical_name", "aliases", "first_class",
                 "by_doc", "extra")

    def __init__(self, ent_id: str, canonical_name: str):
        self.ent_id = ent_id
        self.canonical_name = canonical_name
        self.aliases: set[str] = set()
        self.first_class: str | None = None
        # doc_id → {chunks: list, raw: dict}
        self.by_doc: dict[str, dict] = {}
        self.extra: dict = {}  # type-specific scratch (affiliation, geo_class, etc.)

    def add_occurrence(self, doc_id: str, raw_entity: dict, ent_class: str | None):
        """Record one appearance of this entity in document *doc_id*.

        Args:
            doc_id: document the occurrence comes from.
            raw_entity: extracted entity dict; ``name``/``label``,
                ``aliases_in_doc`` and ``evidence_chunks`` are read from it.
            ent_class: class value for the entity; only the first truthy one
                ever passed is retained.
        """
        if self.first_class is None and ent_class:
            self.first_class = ent_class

        # Strip before testing so a whitespace-only name cannot add an empty
        # alias — the same rule already applied to `aliases_in_doc` below.
        primary = (raw_entity.get("name") or raw_entity.get("label") or "").strip()
        if primary:
            self.aliases.add(primary)
        for alias in raw_entity.get("aliases_in_doc") or []:
            if alias and alias.strip():
                self.aliases.add(alias.strip())

        # The first raw entity seen for a document is kept; later occurrences
        # in the same document only contribute their evidence chunks.
        occurrence = self.by_doc.setdefault(doc_id, {"chunks": [], "raw": raw_entity})
        evidence = raw_entity.get("evidence_chunks") or []
        occurrence["chunks"] = sorted(set(occurrence["chunks"]) | set(evidence))
|
|
|
|
|
|
def merge_dates(buckets: dict[str, EntityBucket], get_date) -> dict[str, dict]:
    """Collect the distinct dates found in each bucket's per-document raw entities.

    *get_date* is called with every occurrence's ``raw`` dict; truthy results
    are gathered into a set. The return value maps bucket key →
    ``{"dates": set}`` and only contains buckets that produced at least one
    date. (Intended for events only, per the original note.)
    """
    merged: dict[str, dict] = {}
    for key, bucket in buckets.items():
        for occurrence in bucket.by_doc.values():
            date = get_date(occurrence["raw"])
            if not date:
                continue
            merged.setdefault(key, {}).setdefault("dates", set()).add(date)
    return merged
|
|
|
|
|
|
def _collect_named(entries, target: dict, doc_id: str, class_field: str) -> None:
    """Fold one document's people/organizations/locations list into *target*.

    Entries that are not dicts, have a blank name, are named "unknown"
    (case-insensitive), or canonicalize to an empty ID are skipped.
    """
    for entry in entries or []:
        if not isinstance(entry, dict):
            continue
        name = (entry.get("name") or "").strip()
        if not name or name.lower() == "unknown":
            continue
        ent_id = canonicalize_name(name)
        if not ent_id:
            continue
        bucket = target.get(ent_id)
        if bucket is None:
            # The first spelling seen becomes the bucket's canonical name.
            bucket = target[ent_id] = EntityBucket(ent_id, name)
        bucket.add_occurrence(doc_id, entry, entry.get(class_field))


def aggregate_all() -> dict:
    """Walk all _reextract.json files. Return a structured aggregation.

    Files are visited in sorted path order. A file that cannot be read or
    parsed, or whose top-level JSON value is not an object, is reported on
    stderr and skipped; it does not count towards ``docs_processed``.

    Returns a dict with the keys ``docs_processed`` (int), ``people``,
    ``organizations``, ``locations``, ``events``, ``uap_objects`` (each a
    mapping of entity ID → EntityBucket), ``relations`` (list of raw relation
    dicts, each tagged with its ``doc_id``) and ``chunk_maps``
    (doc_id → chunk_id → page).
    """
    people: dict[str, EntityBucket] = {}
    orgs: dict[str, EntityBucket] = {}
    locs: dict[str, EntityBucket] = {}
    events: dict[str, EntityBucket] = {}
    # Keyed by the generated object ID only (event ID + 1-based position), so
    # the same event reported by several documents shares its object buckets.
    # NOTE(review): uap_object_id() keeps only the year and a truncated slug of
    # the event ID, so objects of distinct events can land in one bucket —
    # confirm against the "never deduped cross-event" intent.
    uap_objs: dict[str, EntityBucket] = {}
    relations: list[dict] = []
    docs_processed = 0
    chunk_maps: dict[str, dict[str, int]] = {}

    for jpath in sorted(RAW.glob("*--subagent/_reextract.json")):
        doc_id = jpath.parent.name.removesuffix("--subagent")
        try:
            data = json.loads(jpath.read_text(encoding="utf-8"))
        except Exception as e:
            print(f" skip {doc_id}: {e}", file=sys.stderr)
            continue
        if not isinstance(data, dict):
            # Valid JSON of the wrong shape would otherwise raise on .get().
            print(f" skip {doc_id}: top-level JSON is not an object", file=sys.stderr)
            continue
        docs_processed += 1
        chunk_maps[doc_id] = load_chunk_to_page(doc_id)

        _collect_named(data.get("people"), people, doc_id, "person_class")
        _collect_named(data.get("organizations"), orgs, doc_id, "org_class")
        _collect_named(data.get("locations"), locs, doc_id, "geo_class")

        # events
        for e in data.get("events") or []:
            if not isinstance(e, dict):
                continue
            label = (e.get("label") or "").strip()
            if not label:
                continue
            eid = event_id_from(label, e.get("date_start"))
            bucket = events.get(eid)
            if bucket is None:
                bucket = events[eid] = EntityBucket(eid, label)
            bucket.add_occurrence(doc_id, e, e.get("event_class"))

            # uap_objects — inherit the parent event's evidence_chunks when an
            # object carries none of its own
            event_chunks = e.get("evidence_chunks") or []
            for i, u in enumerate(e.get("uap_objects_observed") or [], 1):
                if not isinstance(u, dict):
                    continue
                uid = uap_object_id(eid, i)
                ubucket = uap_objs.get(uid)
                if ubucket is None:
                    ubucket = uap_objs[uid] = EntityBucket(uid, f"{label} — object {i}")
                u_with_evidence = {**u, "evidence_chunks": u.get("evidence_chunks") or event_chunks}
                ubucket.add_occurrence(doc_id, u_with_evidence, u.get("shape"))
                ubucket.extra.setdefault("event_id", eid)

        # relations — collected raw, mapped to canonical IDs later
        for r in data.get("relations") or []:
            if not isinstance(r, dict):
                continue
            relations.append({"doc_id": doc_id, **r})

    return {
        "docs_processed": docs_processed,
        "people": people, "organizations": orgs, "locations": locs,
        "events": events, "uap_objects": uap_objs,
        "relations": relations, "chunk_maps": chunk_maps,
    }
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# WRITERS
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
def write_person(b: EntityBucket, chunk_maps: dict) -> None:
    """Write the markdown file for one person to ``people/<id>.md``.

    The frontmatter combines the bucket's identity fields with the sorted
    page references, distinct affiliations and roles gathered from every
    document the person appears in.
    """
    occurrences = b.by_doc

    page_refs: set[str] = set()
    for doc_id, occ in occurrences.items():
        page_refs.update(page_refs_for(doc_id, occ["chunks"], chunk_maps.get(doc_id, {})))
    mentioned_in = sorted(page_refs)

    def distinct(field: str) -> list[str]:
        # Sorted, stripped, non-empty values of `field` across all documents.
        values = set()
        for occ in occurrences.values():
            value = occ["raw"].get(field)
            if value:
                values.add(value.strip())
        values.discard("")
        return sorted(values)

    affiliations = distinct("affiliation")
    roles = distinct("role_at_doc_date")
    chunk_counts = [len(occ["chunks"]) for occ in occurrences.values()]

    # Key order is the order the frontmatter is written in.
    fm = {
        "schema_version": SCHEMA_VERSION,
        "type": "entity",
        "entity_class": "person",
        "person_id": b.ent_id,
        "canonical_name": b.canonical_name,
        "aliases": sorted(b.aliases),
        "person_class": b.first_class,
        "affiliations": affiliations,
        "roles": roles,
        "mentioned_in": mentioned_in,
        "total_mentions": sum(chunk_counts),
        "documents_count": len(occurrences),
        "enrichment_status": "none",
        "last_ingest": NOW,
        "wiki_version": WIKI_VERSION,
        "source": "reextract-v1",
    }
    target = ENT / "people" / f"{b.ent_id}.md"
    write_entity(target, fm, b.canonical_name)
|
|
|
|
|
|
def write_org(b: EntityBucket, chunk_maps: dict) -> None:
    """Write the markdown file for one organization to ``organizations/<id>.md``.

    The frontmatter combines the bucket's identity fields with the sorted
    page references and the distinct countries reported for the organization
    across all documents.
    """
    occurrences = b.by_doc

    page_refs: set[str] = set()
    for doc_id, occ in occurrences.items():
        page_refs.update(page_refs_for(doc_id, occ["chunks"], chunk_maps.get(doc_id, {})))
    mentioned_in = sorted(page_refs)

    # Stripped, non-empty country values, de-duplicated across documents.
    country_values = set()
    for occ in occurrences.values():
        country = occ["raw"].get("country")
        if country:
            country_values.add(country.strip())
    country_values.discard("")
    countries = sorted(country_values)

    chunk_counts = [len(occ["chunks"]) for occ in occurrences.values()]

    # Key order is the order the frontmatter is written in.
    fm = {
        "schema_version": SCHEMA_VERSION,
        "type": "entity",
        "entity_class": "organization",
        "organization_id": b.ent_id,
        "canonical_name": b.canonical_name,
        "aliases": sorted(b.aliases),
        "org_class": b.first_class,
        "countries": countries,
        "mentioned_in": mentioned_in,
        "total_mentions": sum(chunk_counts),
        "documents_count": len(occurrences),
        "enrichment_status": "none",
        "last_ingest": NOW,
        "wiki_version": WIKI_VERSION,
        "source": "reextract-v1",
    }
    target = ENT / "organizations" / f"{b.ent_id}.md"
    write_entity(target, fm, b.canonical_name)
|
|
|
|
|
|
def write_location(b: EntityBucket, chunk_maps: dict) -> None:
    """Write the markdown file for one location to ``locations/<id>.md``.

    The frontmatter combines the bucket's identity fields with the sorted
    page references and the distinct countries and regions/states reported
    for the location across all documents.
    """
    occurrences = b.by_doc

    page_refs: set[str] = set()
    for doc_id, occ in occurrences.items():
        page_refs.update(page_refs_for(doc_id, occ["chunks"], chunk_maps.get(doc_id, {})))
    mentioned_in = sorted(page_refs)

    def distinct(field: str) -> list[str]:
        # Sorted, stripped, non-empty values of `field` across all documents.
        values = set()
        for occ in occurrences.values():
            value = occ["raw"].get(field)
            if value:
                values.add(value.strip())
        values.discard("")
        return sorted(values)

    countries = distinct("country")
    regions = distinct("region_or_state")
    chunk_counts = [len(occ["chunks"]) for occ in occurrences.values()]

    # Key order is the order the frontmatter is written in.
    fm = {
        "schema_version": SCHEMA_VERSION,
        "type": "entity",
        "entity_class": "location",
        "location_id": b.ent_id,
        "canonical_name": b.canonical_name,
        "aliases": sorted(b.aliases),
        "geo_class": b.first_class,
        "countries": countries,
        "regions_or_states": regions,
        "mentioned_in": mentioned_in,
        "total_mentions": sum(chunk_counts),
        "documents_count": len(occurrences),
        "enrichment_status": "none",
        "last_ingest": NOW,
        "wiki_version": WIKI_VERSION,
        "source": "reextract-v1",
    }
    target = ENT / "locations" / f"{b.ent_id}.md"
    write_entity(target, fm, b.canonical_name)
|
|
|
|
|
|
def write_event(b: EntityBucket, chunk_maps: dict) -> None:
    """Write the markdown file for one event to ``events/<id>.md``.

    Across all documents mentioning the event, the frontmatter takes the
    smallest ``date_start``, the largest ``date_end``, the distinct primary
    location names and geo classes, and — for each narrative field — the
    longest non-empty text. ``date_confidence`` is always written as null.
    """
    occurrences = b.by_doc

    page_refs: set[str] = set()
    for doc_id, occ in occurrences.items():
        page_refs.update(page_refs_for(doc_id, occ["chunks"], chunk_maps.get(doc_id, {})))
    mentioned_in = sorted(page_refs)

    def truthy_values(field: str) -> list:
        # Sorted distinct truthy values of `field`, taken verbatim (no strip).
        return sorted({
            occ["raw"].get(field)
            for occ in occurrences.values()
            if occ["raw"].get(field)
        })

    date_starts = truthy_values("date_start")
    date_ends = truthy_values("date_end")

    location_names = set()
    for occ in occurrences.values():
        location_name = occ["raw"].get("primary_location_name")
        if location_name:
            location_names.add(location_name.strip())
    location_names.discard("")
    primary_locs = sorted(location_names)

    geos = truthy_values("primary_location_geo_class")

    def best(field: str):
        # Longest stripped text for `field`; max() returns the first of
        # several equally long candidates. None when every candidate is empty.
        candidates = [(occ["raw"].get(field) or "").strip() for occ in occurrences.values()]
        return max(candidates, key=len, default="") or None

    chunk_counts = [len(occ["chunks"]) for occ in occurrences.values()]

    # Key order is the order the frontmatter is written in.
    fm = {
        "schema_version": SCHEMA_VERSION,
        "type": "entity",
        "entity_class": "event",
        "event_id": b.ent_id,
        "canonical_name": b.canonical_name,
        "aliases": sorted(b.aliases),
        "event_class": b.first_class,
        "date_start": date_starts[0] if date_starts else None,
        "date_end": date_ends[-1] if date_ends else None,
        "date_confidence": None,
        "primary_location_names": primary_locs,
        "primary_location_geo_classes": geos,
        "narrative_summary_en": best("narrative_summary"),
        "narrative_summary_pt_br": best("narrative_summary_pt_br"),
        "mentioned_in": mentioned_in,
        "total_mentions": sum(chunk_counts),
        "documents_count": len(occurrences),
        "enrichment_status": "none",
        "last_ingest": NOW,
        "wiki_version": WIKI_VERSION,
        "source": "reextract-v1",
    }
    target = ENT / "events" / f"{b.ent_id}.md"
    write_entity(target, fm, b.canonical_name)
|
|
|
|
|
|
def write_uap_object(b: EntityBucket, chunk_maps: dict) -> None:
    """Write the markdown file for one UAP object to ``uap-objects/<id>.md``.

    The descriptive fields (shape, color, medium, size, altitude, speed,
    maneuver notes) are copied from the raw entity of the first document in
    the bucket; the parent event ID comes from the bucket's ``extra`` dict.
    """
    occurrences = b.by_doc

    page_refs: set[str] = set()
    for doc_id, occ in occurrences.items():
        page_refs.update(page_refs_for(doc_id, occ["chunks"], chunk_maps.get(doc_id, {})))
    mentioned_in = sorted(page_refs)

    raw_first = next(iter(occurrences.values()))["raw"]
    # Copied as-is from the first occurrence, in frontmatter order.
    described = {
        field: raw_first.get(field)
        for field in (
            "shape",
            "color",
            "medium",
            "size_estimate_m",
            "altitude_ft",
            "speed_kts",
            "maneuver_notes",
        )
    }
    chunk_counts = [len(occ["chunks"]) for occ in occurrences.values()]

    # Key order is the order the frontmatter is written in.
    fm = {
        "schema_version": SCHEMA_VERSION,
        "type": "entity",
        "entity_class": "uap_object",
        "uap_object_id": b.ent_id,
        "canonical_name": b.canonical_name,
        "event_id": b.extra.get("event_id"),
        **described,
        "mentioned_in": mentioned_in,
        "total_mentions": sum(chunk_counts),
        "documents_count": len(occurrences),
        "last_ingest": NOW,
        "wiki_version": WIKI_VERSION,
        "source": "reextract-v1",
    }
    target = ENT / "uap-objects" / f"{b.ent_id}.md"
    write_entity(target, fm, b.canonical_name)
|
|
|
|
|
|
def main():
    """Run the rebuild: aggregate, write entity files, save the relations index.

    1. Aggregate every ``_reextract.json`` and print per-type counts.
    2. Write one markdown file per aggregated entity.
    3. Dump the raw relations to ``wiki/entities/_relations.json``.
    """
    print(f"[1/3] Aggregating from {RAW}/*--subagent/_reextract.json ...")
    agg = aggregate_all()
    print(f" docs processed: {agg['docs_processed']}")
    print(f" unique people: {len(agg['people'])}")
    print(f" unique orgs: {len(agg['organizations'])}")
    print(f" unique locs: {len(agg['locations'])}")
    print(f" unique events: {len(agg['events'])}")
    print(f" uap objects: {len(agg['uap_objects'])}")
    print(f" raw relations: {len(agg['relations'])}")

    print(f"\n[2/3] Writing entity markdown files ...")
    cmaps = agg["chunk_maps"]
    # Aggregation key → writer, in the order the entity types are written.
    writers = (
        ("people", write_person),
        ("organizations", write_org),
        ("locations", write_location),
        ("events", write_event),
        ("uap_objects", write_uap_object),
    )
    written = {}
    for key, writer in writers:
        written[key] = 0
        for bucket in agg[key].values():
            writer(bucket, cmaps)
            written[key] += 1
    for k, n in written.items():
        print(f" {k}: {n}")

    print(f"\n[3/3] Saving relations index to wiki/entities/_relations.json (for downstream ingest)")
    rels_path = ENT / "_relations.json"
    # The entity writers create their own sub-directories, but when nothing
    # was written (e.g. no input documents on a fresh checkout) the entities
    # directory may not exist yet and write_text() would fail.
    rels_path.parent.mkdir(parents=True, exist_ok=True)
    rels_path.write_text(json.dumps({
        "schema_version": SCHEMA_VERSION,
        "rebuilt_at": NOW,
        "count": len(agg["relations"]),
        "relations": agg["relations"],
    }, indent=2, ensure_ascii=False), encoding="utf-8")
    print(f" saved {len(agg['relations'])} relations to {rels_path}")
    print(f"\n✓ done.")
|
|
|
|
|
|
# Run the rebuild only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|