Add reextract pipeline (scripts/reextract/) that rebuilds doc-level entity JSON from Sonnet-vision chunks via Opus, replacing the noisy per-page extraction. Add synthesize scripts to regenerate wiki/entities from the 116 _reextract.json (30), aggregate missing page.md from chunks (31), and reprocess 805 pages the doc-rebuilder agent dropped on context overflow (32). Add maintain scripts 43-56 for chunk-page sync, dedup, generic-entity marking, and typed relation extraction. Web: wire relations API + entity-relations component; entity/timeline/doc pages consume the rebuilt layer. Note: raw/, processing/, wiki/ remain gitignored (bulk data managed separately); the 116 reextract JSONs and 7,798 rebuilt entity files live on disk only. The 27 curated anchor events under wiki/entities/events/ are preserved. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
211 lines
7.9 KiB
Python
211 lines
7.9 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Extract typed relations from existing page YAMLs.
|
|
|
|
For each wiki/pages/<doc>/p<NNN>.md, examines the structured fields that Haiku
|
|
extracted (events[], people[], organizations[], primary_location, uap_objects[])
|
|
and produces relations:
|
|
|
|
observers in events → (person, witnessed, event)
|
|
primary_location → (event, occurred_at, location)
|
|
uap_objects in event → (event, involves_uap, uap_object)
|
|
every event on page → (event, documented_in, document)
|
|
every person on page → (person, mentioned_by, document)
|
|
|
|
ID mapping mirrors scripts/03-dedup-entities.py logic:
|
|
- person: slugify(name) → person_id
|
|
- event: EV-YYYY-MM-DD-slug(label)
|
|
- location: slugify(canonical location name)
|
|
- uap_object: OBJ-EV<year>-<EVENT>-NN
|
|
"""
|
|
from __future__ import annotations
|
|
import os
|
|
import re
|
|
import sys
|
|
import unicodedata
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
import psycopg
|
|
import yaml
|
|
|
|
# Root of the wiki tree. Defaults to the original author's checkout; set the
# UFO_WIKI_DIR environment variable to run the script against another location.
WIKI = Path(os.environ.get("UFO_WIKI_DIR", "/Users/guto/ufo/wiki"))

# Directory scanned recursively by main() for per-page markdown files.
PAGES_BASE = WIKI / "pages"
|
|
|
|
|
|
def ascii_fold(s: str) -> str:
    """Return *s* with combining marks (accents, diacritics) removed.

    The string is decomposed with NFD and every code point whose combining
    class is non-zero is dropped. Characters that have no decomposition are
    left untouched, so the result is not guaranteed to be pure ASCII.
    """
    decomposed = unicodedata.normalize("NFD", s)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars)
|
|
|
|
|
|
def slugify(s: str) -> str:
    """Turn *s* into a lowercase, hyphen-separated slug.

    Accents are folded away first, then every run of characters outside
    ``a-z0-9`` collapses to a single hyphen; leading and trailing hyphens are
    trimmed. The result may be an empty string.
    """
    lowered = ascii_fold(s).lower()
    hyphenated = re.sub(r"[^a-z0-9]+", "-", lowered)
    return hyphenated.strip("-")
|
|
|
|
|
|
def person_id(name: str) -> str | None:
    """Derive a person ID (slug) from a display name.

    A single leading title or rank (e.g. "Dr.", "Major", "Special Agent") is
    removed, case-insensitively, when it is followed by whitespace; the
    remainder is slugified.

    Returns:
        The slug, or ``None`` when the name is empty/``None`` or slugifies to
        an empty string.
    """
    cleaned = (name or "").strip()
    if not cleaned:
        return None

    # Only one prefix is stripped: the pattern is anchored at the start and
    # applied once.
    title_prefix = (
        r"^(Mr|Mrs|Ms|Dr|Prof|Sr|Sra|Major|Maj|Col|Colonel|Lt|Lieutenant|Capt|Captain|"
        r"Gen|General|Sgt|Sergeant|Agent|Special Agent|SA|Director|Deputy|Rev|Reverend|"
        r"Inspector|Det|Detective)\.?\s+"
    )
    without_title = re.sub(title_prefix, "", cleaned, flags=re.IGNORECASE)

    slug = slugify(without_title)
    return slug if slug else None
|
|
|
|
|
|
def event_id(label: str, date: str | None) -> str | None:
    """Build an event ID of the form ``EV-YYYY-MM-DD-<slug(label)>``.

    The first ``YYYY[-MM[-DD]]`` pattern found in ``str(date)`` supplies the
    date parts; any part that is missing is written as ``XXXX`` / ``XX``.

    Returns:
        The event ID, or ``None`` when the label is empty/``None``. If the
        label slugifies to an empty string the ID ends with a trailing hyphen.
    """
    text = (label or "").strip()
    if not text:
        return None

    # Placeholders used when the date is absent or only partially known.
    year, month, day = "XXXX", "XX", "XX"

    found = re.search(r"(\d{4})(?:-(\d{2}))?(?:-(\d{2}))?", str(date)) if date else None
    if found is not None:
        year = found.group(1)
        month = found.group(2) or "XX"
        day = found.group(3) or "XX"

    return f"EV-{year}-{month}-{day}-{slugify(text)}"
|
|
|
|
|
|
def location_id(name: str) -> str | None:
    """Derive a location ID by slugifying the location name.

    Returns:
        The slug (possibly an empty string if nothing slug-worthy remains),
        or ``None`` when the name is empty/``None`` after trimming.
    """
    trimmed = (name or "").strip()
    return slugify(trimmed) if trimmed else None
|
|
|
|
|
|
def parse_page_yaml(path: Path) -> dict | None:
    """Load the YAML front matter of a page file.

    The file must begin with ``---``. The front matter runs up to the next
    line that consists solely of ``---`` (or to end of file when there is no
    closing delimiter), so a ``---`` embedded inside a YAML value no longer
    truncates the block.

    Args:
        path: Markdown file to read (UTF-8).

    Returns:
        The front-matter mapping (``{}`` when the block is empty), or ``None``
        when the file cannot be read, has no front matter, contains invalid
        YAML, or the front matter is not a mapping.
    """
    try:
        text = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        return None
    if not text.startswith("---"):
        return None

    # Skip the opening delimiter, then look for a closing delimiter that
    # occupies a whole line rather than the first "---" substring anywhere.
    body = text[3:]
    closing = re.search(r"^---\s*$", body, flags=re.MULTILINE)
    front_matter = body[: closing.start()] if closing else body

    try:
        data = yaml.safe_load(front_matter)
    except yaml.YAMLError:
        return None

    if data is None:
        return {}
    # Callers use .get() on the result; anything but a mapping is unusable.
    return data if isinstance(data, dict) else None
|
|
|
|
|
|
# Ordering used to pick the stronger of two otherwise identical relations.
_CONFIDENCE_RANK = {"low": 0, "medium": 1, "high": 2}


def _as_list(value: object) -> list:
    """Coerce a YAML field to a list (None → [], list/tuple → list, other → [value])."""
    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    # A bare scalar or mapping is treated as a single item; iterating a string
    # directly would otherwise yield one "entity" per character.
    return [value]


def _entity_name(item: object) -> str | None:
    """Return the name of an entity given as a string or a mapping with a ``name`` key."""
    if isinstance(item, dict):
        item = item.get("name")
    return item if isinstance(item, str) else None


def _page_relations(fm: dict, doc_id: str, page_id: str) -> list[tuple[str, ...]]:
    """Build every relation row implied by one page's front matter."""
    doc_ref = f"[[{page_id}]]"
    ents = fm.get("entities_extracted")
    if not isinstance(ents, dict):
        ents = {}

    def field(key: str) -> list:
        # Prefer the nested entities_extracted block, fall back to top level.
        return _as_list(ents.get(key) or fm.get(key))

    rows: list[tuple[str, ...]] = []

    # Resolve each event's ID once and keep the mapping alongside it.
    page_events: list[tuple[str, dict]] = []
    for ev in field("events"):
        if not isinstance(ev, dict):
            continue
        label = ev.get("label") or ev.get("name")
        if not isinstance(label, str):
            continue
        eid = event_id(label, ev.get("date") or ev.get("date_start"))
        if eid:
            page_events.append((eid, ev))

    # 1. observers in events → witnessed
    for eid, ev in page_events:
        for obs in _as_list(ev.get("observers")):
            pid = person_id(_entity_name(obs))
            if pid:
                rows.append(("person", pid, "witnessed", "event", eid, doc_ref, "high", "yaml"))

    # 2. people on page → mentioned_by document
    #    (target is the document ID; the page itself is the evidence_ref)
    for p in field("people"):
        pid = person_id(_entity_name(p))
        if pid:
            rows.append(("person", pid, "mentioned_by", "document", doc_id, doc_ref, "high", "yaml"))

    # 3. primary_location applies to every event on the page
    lid = location_id(_entity_name(fm.get("primary_location")))
    if lid:
        for eid, _ in page_events:
            rows.append(("event", eid, "occurred_at", "location", lid, doc_ref, "medium", "yaml"))

    # 4. uap_objects → involves_uap, attached to the first event on the page
    uap_objs = field("uap_objects")
    if page_events and uap_objs:
        first_event = page_events[0][0]
        year_match = re.search(r"EV-(\d{4})-", first_event)
        year_token = year_match.group(1) if year_match else "XXXX"
        # "EV-YYYY-MM-DD-<slug>": everything after the fourth hyphen is the slug.
        event_slug = first_event.split("-", 4)[-1].upper()
        for i in range(1, len(uap_objs) + 1):
            obj_id = f"OBJ-EV{year_token}-{event_slug}-{i:02d}"
            rows.append(("event", first_event, "involves_uap", "uap_object", obj_id, doc_ref, "medium", "yaml"))

    # 5. events on page → documented_in
    for eid, _ in page_events:
        rows.append(("event", eid, "documented_in", "document", doc_id, doc_ref, "high", "yaml"))

    return rows


def _dedupe_relations(rows: list[tuple[str, ...]]) -> list[tuple[str, ...]]:
    """Collapse rows sharing source/relation/target/evidence, keeping the highest confidence."""
    best: dict[tuple[str, ...], tuple[str, ...]] = {}
    for row in rows:
        key = row[:6]
        kept = best.get(key)
        if kept is None or _CONFIDENCE_RANK.get(row[6], -1) > _CONFIDENCE_RANK.get(kept[6], -1):
            best[key] = row
    return list(best.values())


def _store_relations(dburl: str, rows: list[tuple[str, ...]]) -> None:
    """Bulk-load rows into public.relations via a temp table and print per-type counts."""
    with psycopg.connect(dburl) as conn:
        with conn.cursor() as cur:
            # Stage through a temp table so COPY can be used and conflicts are
            # resolved by a single INSERT ... ON CONFLICT DO NOTHING.
            cur.execute("CREATE TEMP TABLE _rel (LIKE public.relations INCLUDING DEFAULTS)")
            with cur.copy("""COPY _rel
                (source_class, source_id, relation_type,
                 target_class, target_id, evidence_ref,
                 confidence, extracted_by)
                FROM STDIN""") as cp:
                for r in rows:
                    cp.write_row(r)
            cur.execute("""
                INSERT INTO public.relations
                    (source_class, source_id, relation_type,
                     target_class, target_id, evidence_ref,
                     confidence, extracted_by)
                SELECT source_class, source_id, relation_type,
                       target_class, target_id, evidence_ref,
                       confidence, extracted_by
                FROM _rel
                ON CONFLICT DO NOTHING
            """)
            print(f" inserted (after ON CONFLICT): {cur.rowcount}")
            cur.execute("SELECT relation_type, COUNT(*) FROM public.relations GROUP BY relation_type ORDER BY 2 DESC")
            print("\n=== Relation counts ===")
            for t, n in cur.fetchall():
                print(f" {n:>7} {t}")
        conn.commit()


def main() -> int:
    """Scan page front matter, derive typed relations and insert them into Postgres.

    The connection string is read from ``DATABASE_URL`` (or ``SUPABASE_DB_URL``);
    the script exits with an error message if neither is set.

    Returns:
        0 on success (including when no relations were found).
    """
    dburl = os.environ.get("DATABASE_URL") or os.environ.get("SUPABASE_DB_URL")
    if not dburl:
        sys.exit("DATABASE_URL not set")

    print("Scanning page YAMLs ...")
    rows: list[tuple[str, ...]] = []
    pages_processed = 0
    # NOTE(review): "p*.md" also matches names such as "page.md"; confirm
    # whether only p<NNN>.md files are meant to be scanned.
    # Sorted so that row order (and therefore dedup/insert order) is reproducible.
    for f in sorted(PAGES_BASE.rglob("p*.md")):
        fm = parse_page_yaml(f)
        if not fm or not isinstance(fm, dict):
            continue

        rel = f.relative_to(PAGES_BASE)
        if len(rel.parts) < 2:
            # File sits directly in the pages root: there is no <doc> directory
            # to use as the document ID.
            continue
        pages_processed += 1

        doc_id = rel.parent.as_posix()
        page_id = f"{doc_id}/{f.stem}"
        rows.extend(_page_relations(fm, doc_id, page_id))

    print(f"Pages processed: {pages_processed}")
    print(f"Relations extracted: {len(rows)}")

    deduped = _dedupe_relations(rows)
    print(f"Relations after dedup: {len(deduped)}")

    if not deduped:
        return 0

    _store_relations(dburl, deduped)
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    raise SystemExit(main())
|