disclosure-bureau/scripts/maintain/56_extract_relations.py
Luiz Gustavo a7e9dce6d2 rebuild entity layer from Sonnet-vision reextract pipeline
Add reextract pipeline (scripts/reextract/) that rebuilds doc-level entity
JSON from Sonnet-vision chunks via Opus, replacing the noisy per-page
extraction. Add synthesize scripts to regenerate wiki/entities from the 116
_reextract.json (30), aggregate missing page.md from chunks (31), and reprocess
805 pages the doc-rebuilder agent dropped on context overflow (32). Add
maintain scripts 43-56 for chunk-page sync, dedup, generic-entity marking, and
typed relation extraction.

Web: wire relations API + entity-relations component; entity/timeline/doc
pages consume the rebuilt layer.

Note: raw/, processing/, wiki/ remain gitignored (bulk data managed
separately); the 116 reextract JSONs and 7,798 rebuilt entity files live on
disk only. The 27 curated anchor events under wiki/entities/events/ are
preserved.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-21 12:20:24 -03:00

211 lines
7.9 KiB
Python

#!/usr/bin/env python3
"""
Extract typed relations from the YAML front matter of existing pages.

For each wiki/pages/<doc>/p<NNN>.md, examine the structured fields that Haiku
extracted (events[], people[], organizations[], primary_location, uap_objects[])
and produce relations:

    observers in events   → (person, witnessed, event)
    primary_location      → (event, occurred_at, location)
    uap_objects on page   → (first event on page, involves_uap, uap_object)
    every event on page   → (event, documented_in, document)
    every person on page  → (person, mentioned_by, document)

ID mapping mirrors the logic in scripts/03-dedup-entities.py:

    - person:     slugify(name with one leading title removed) → person_id
    - event:      EV-YYYY-MM-DD-slug(label)
    - location:   slugify(canonical location name)
    - uap_object: OBJ-EV<year>-<EVENT>-NN
"""
from __future__ import annotations
import os
import re
import sys
import unicodedata
from datetime import datetime
from pathlib import Path
import psycopg
import yaml
# Root of the wiki tree. Defaults to the original author's checkout; set the
# DISCLOSURE_WIKI_DIR environment variable to run against another location.
WIKI = Path(os.environ.get("DISCLOSURE_WIKI_DIR") or "/Users/guto/ufo/wiki")
# Page files are looked up recursively below this directory.
PAGES_BASE = WIKI / "pages"
def ascii_fold(s: str) -> str:
    """Return *s* with combining marks removed after NFD decomposition.

    Accented letters are reduced to their base character (``"é"`` becomes
    ``"e"``); characters without a decomposition pass through unchanged.
    """
    decomposed = unicodedata.normalize("NFD", s)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars)
def slugify(s: str) -> str:
    """Turn *s* into a lowercase, hyphen-separated ASCII slug.

    The text is accent-folded and lowercased, every run of characters outside
    ``a-z0-9`` becomes a single hyphen, and leading/trailing hyphens are
    dropped. The result is empty when *s* has no ASCII letters or digits.
    """
    folded = ascii_fold(s).lower()
    hyphenated = re.sub(r"[^a-z0-9]+", "-", folded)
    return hyphenated.strip("-")
def person_id(name: str) -> str | None:
name = (name or "").strip()
if not name: return None
# Strip role prefixes like "Dr.", "Major", "Special Agent"
name = re.sub(
r"^(Mr|Mrs|Ms|Dr|Prof|Sr|Sra|Major|Maj|Col|Colonel|Lt|Lieutenant|Capt|Captain|"
r"Gen|General|Sgt|Sergeant|Agent|Special Agent|SA|Director|Deputy|Rev|Reverend|"
r"Inspector|Det|Detective)\.?\s+", "", name, flags=re.IGNORECASE,
)
return slugify(name) or None
def event_id(label: str, date: str | None) -> str | None:
label = (label or "").strip()
if not label: return None
# Parse year-month-day from date or default to XXXX-XX-XX
y, m, d = "XXXX", "XX", "XX"
if date:
ms = re.search(r"(\d{4})(?:-(\d{2}))?(?:-(\d{2}))?", str(date))
if ms:
y = ms.group(1)
m = ms.group(2) or "XX"
d = ms.group(3) or "XX"
return f"EV-{y}-{m}-{d}-{slugify(label)}"
def location_id(name: str) -> str | None:
name = (name or "").strip()
if not name: return None
return slugify(name)
def parse_page_yaml(path: Path) -> dict | None:
try:
text = path.read_text(encoding="utf-8")
if not text.startswith("---"): return None
return yaml.safe_load(text.split("---")[1]) or {}
except Exception:
return None
def main() -> int:
dburl = os.environ.get("DATABASE_URL") or os.environ.get("SUPABASE_DB_URL")
if not dburl: sys.exit("DATABASE_URL not set")
print("Scanning page YAMLs ...")
rows: list[tuple[str, str, str, str, str, str, str, str]] = []
pages_processed = 0
for f in PAGES_BASE.rglob("p*.md"):
fm = parse_page_yaml(f)
if not fm: continue
pages_processed += 1
# Derive doc_id/page_id from path
try:
rel = f.relative_to(PAGES_BASE)
doc_id = str(rel.parent)
page_id = f"{doc_id}/{f.stem}"
except ValueError:
continue
doc_ref = f"[[{page_id}]]"
ents = fm.get("entities_extracted") or {}
events = ents.get("events") or fm.get("events") or []
people = ents.get("people") or fm.get("people") or []
locations_list = ents.get("locations") or fm.get("locations") or []
primary_loc = fm.get("primary_location")
uap_objs = ents.get("uap_objects") or fm.get("uap_objects") or []
# Materialize event_ids on this page
page_event_ids: list[str] = []
for ev in events:
if not isinstance(ev, dict): continue
label = ev.get("label") or ev.get("name")
date = ev.get("date") or ev.get("date_start")
eid = event_id(label, date)
if eid: page_event_ids.append(eid)
# 1. observers in events → witnessed
for ev in events:
if not isinstance(ev, dict): continue
eid = event_id(ev.get("label") or ev.get("name"), ev.get("date") or ev.get("date_start"))
if not eid: continue
observers = ev.get("observers") or []
for obs in observers:
obs_name = obs if isinstance(obs, str) else (obs.get("name") if isinstance(obs, dict) else None)
pid = person_id(obs_name)
if pid:
rows.append(("person", pid, "witnessed", "event", eid, doc_ref, "high", "yaml"))
# 2. people on page mentioned_by document
for p in people:
pname = p if isinstance(p, str) else (p.get("name") if isinstance(p, dict) else None)
pid = person_id(pname)
if pid:
# Use page_id as doc, treating it as a "document" target
rows.append(("person", pid, "mentioned_by", "document", doc_id, doc_ref, "high", "yaml"))
# 3. primary_location relates page events
if primary_loc:
lid = location_id(primary_loc if isinstance(primary_loc, str)
else (primary_loc.get("name") if isinstance(primary_loc, dict) else None))
for eid in page_event_ids:
if lid:
rows.append(("event", eid, "occurred_at", "location", lid, doc_ref, "medium", "yaml"))
# 4. uap_objects in events → involves_uap
if page_event_ids and uap_objs:
first_event = page_event_ids[0]
year_match = re.search(r"EV-(\d{4})-", first_event)
year_token = year_match.group(1) if year_match else "XXXX"
event_slug = first_event.split("-", 4)[-1].upper()
for i, obj in enumerate(uap_objs, 1):
obj_id = f"OBJ-EV{year_token}-{event_slug}-{i:02d}"
rows.append(("event", first_event, "involves_uap", "uap_object", obj_id, doc_ref, "medium", "yaml"))
# 5. events on page → documented_in
for eid in page_event_ids:
rows.append(("event", eid, "documented_in", "document", doc_id, doc_ref, "high", "yaml"))
print(f"Pages processed: {pages_processed}")
print(f"Relations extracted: {len(rows)}")
# Dedupe (same source/relation/target/evidence — keep highest confidence)
seen: set[tuple] = set()
deduped: list[tuple] = []
for r in rows:
key = (r[0], r[1], r[2], r[3], r[4], r[5])
if key in seen: continue
seen.add(key)
deduped.append(r)
print(f"Relations after dedup: {len(deduped)}")
if not deduped:
return 0
with psycopg.connect(dburl) as conn:
with conn.cursor() as cur:
cur.execute("CREATE TEMP TABLE _rel (LIKE public.relations INCLUDING DEFAULTS)")
with cur.copy("""COPY _rel
(source_class, source_id, relation_type,
target_class, target_id, evidence_ref,
confidence, extracted_by)
FROM STDIN""") as cp:
for r in deduped: cp.write_row(r)
cur.execute("""
INSERT INTO public.relations
(source_class, source_id, relation_type,
target_class, target_id, evidence_ref,
confidence, extracted_by)
SELECT source_class, source_id, relation_type,
target_class, target_id, evidence_ref,
confidence, extracted_by
FROM _rel
ON CONFLICT DO NOTHING
""")
print(f" inserted (after ON CONFLICT): {cur.rowcount}")
cur.execute("SELECT relation_type, COUNT(*) FROM public.relations GROUP BY relation_type ORDER BY 2 DESC")
print("\n=== Relation counts ===")
for t, n in cur.fetchall():
print(f" {n:>7} {t}")
conn.commit()
return 0
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())