#!/usr/bin/env python3 """ Extract typed relations from existing page YAMLs. For each wiki/pages//p.md, examines the structured fields the Haiku extracted (events[], people[], organizations[], primary_location, uap_objects[]) and produces relations: observers in events → (person, witnessed, event) primary_location → (event, occurred_at, location) uap_objects in event → (event, involves_uap, uap_object) every event on page → (event, documented_in, document) every person on page → (person, mentioned_by, document) ID mapping mirrors scripts/03-dedup-entities.py logic: - person: slugify(name) → person_id - event: EV-YYYY-MM-DD-slug(label) - location: slugify(canonical location name) - uap_object: OBJ-EV--NN """ from __future__ import annotations import os import re import sys import unicodedata from datetime import datetime from pathlib import Path import psycopg import yaml WIKI = Path("/Users/guto/ufo/wiki") PAGES_BASE = WIKI / "pages" def ascii_fold(s: str) -> str: return "".join(c for c in unicodedata.normalize("NFD", s) if not unicodedata.combining(c)) def slugify(s: str) -> str: s = ascii_fold(s).lower() s = re.sub(r"[^a-z0-9]+", "-", s).strip("-") return s def person_id(name: str) -> str | None: name = (name or "").strip() if not name: return None # Strip role prefixes like "Dr.", "Major", "Special Agent" name = re.sub( r"^(Mr|Mrs|Ms|Dr|Prof|Sr|Sra|Major|Maj|Col|Colonel|Lt|Lieutenant|Capt|Captain|" r"Gen|General|Sgt|Sergeant|Agent|Special Agent|SA|Director|Deputy|Rev|Reverend|" r"Inspector|Det|Detective)\.?\s+", "", name, flags=re.IGNORECASE, ) return slugify(name) or None def event_id(label: str, date: str | None) -> str | None: label = (label or "").strip() if not label: return None # Parse year-month-day from date or default to XXXX-XX-XX y, m, d = "XXXX", "XX", "XX" if date: ms = re.search(r"(\d{4})(?:-(\d{2}))?(?:-(\d{2}))?", str(date)) if ms: y = ms.group(1) m = ms.group(2) or "XX" d = ms.group(3) or "XX" return f"EV-{y}-{m}-{d}-{slugify(label)}" def location_id(name: str) -> str | None: name = (name or "").strip() if not name: return None return slugify(name) def parse_page_yaml(path: Path) -> dict | None: try: text = path.read_text(encoding="utf-8") if not text.startswith("---"): return None return yaml.safe_load(text.split("---")[1]) or {} except Exception: return None def main() -> int: dburl = os.environ.get("DATABASE_URL") or os.environ.get("SUPABASE_DB_URL") if not dburl: sys.exit("DATABASE_URL not set") print("Scanning page YAMLs ...") rows: list[tuple[str, str, str, str, str, str, str, str]] = [] pages_processed = 0 for f in PAGES_BASE.rglob("p*.md"): fm = parse_page_yaml(f) if not fm: continue pages_processed += 1 # Derive doc_id/page_id from path try: rel = f.relative_to(PAGES_BASE) doc_id = str(rel.parent) page_id = f"{doc_id}/{f.stem}" except ValueError: continue doc_ref = f"[[{page_id}]]" ents = fm.get("entities_extracted") or {} events = ents.get("events") or fm.get("events") or [] people = ents.get("people") or fm.get("people") or [] locations_list = ents.get("locations") or fm.get("locations") or [] primary_loc = fm.get("primary_location") uap_objs = ents.get("uap_objects") or fm.get("uap_objects") or [] # Materialize event_ids on this page page_event_ids: list[str] = [] for ev in events: if not isinstance(ev, dict): continue label = ev.get("label") or ev.get("name") date = ev.get("date") or ev.get("date_start") eid = event_id(label, date) if eid: page_event_ids.append(eid) # 1. observers in events → witnessed for ev in events: if not isinstance(ev, dict): continue eid = event_id(ev.get("label") or ev.get("name"), ev.get("date") or ev.get("date_start")) if not eid: continue observers = ev.get("observers") or [] for obs in observers: obs_name = obs if isinstance(obs, str) else (obs.get("name") if isinstance(obs, dict) else None) pid = person_id(obs_name) if pid: rows.append(("person", pid, "witnessed", "event", eid, doc_ref, "high", "yaml")) # 2. people on page mentioned_by document for p in people: pname = p if isinstance(p, str) else (p.get("name") if isinstance(p, dict) else None) pid = person_id(pname) if pid: # Use page_id as doc, treating it as a "document" target rows.append(("person", pid, "mentioned_by", "document", doc_id, doc_ref, "high", "yaml")) # 3. primary_location relates page events if primary_loc: lid = location_id(primary_loc if isinstance(primary_loc, str) else (primary_loc.get("name") if isinstance(primary_loc, dict) else None)) for eid in page_event_ids: if lid: rows.append(("event", eid, "occurred_at", "location", lid, doc_ref, "medium", "yaml")) # 4. uap_objects in events → involves_uap if page_event_ids and uap_objs: first_event = page_event_ids[0] year_match = re.search(r"EV-(\d{4})-", first_event) year_token = year_match.group(1) if year_match else "XXXX" event_slug = first_event.split("-", 4)[-1].upper() for i, obj in enumerate(uap_objs, 1): obj_id = f"OBJ-EV{year_token}-{event_slug}-{i:02d}" rows.append(("event", first_event, "involves_uap", "uap_object", obj_id, doc_ref, "medium", "yaml")) # 5. events on page → documented_in for eid in page_event_ids: rows.append(("event", eid, "documented_in", "document", doc_id, doc_ref, "high", "yaml")) print(f"Pages processed: {pages_processed}") print(f"Relations extracted: {len(rows)}") # Dedupe (same source/relation/target/evidence — keep highest confidence) seen: set[tuple] = set() deduped: list[tuple] = [] for r in rows: key = (r[0], r[1], r[2], r[3], r[4], r[5]) if key in seen: continue seen.add(key) deduped.append(r) print(f"Relations after dedup: {len(deduped)}") if not deduped: return 0 with psycopg.connect(dburl) as conn: with conn.cursor() as cur: cur.execute("CREATE TEMP TABLE _rel (LIKE public.relations INCLUDING DEFAULTS)") with cur.copy("""COPY _rel (source_class, source_id, relation_type, target_class, target_id, evidence_ref, confidence, extracted_by) FROM STDIN""") as cp: for r in deduped: cp.write_row(r) cur.execute(""" INSERT INTO public.relations (source_class, source_id, relation_type, target_class, target_id, evidence_ref, confidence, extracted_by) SELECT source_class, source_id, relation_type, target_class, target_id, evidence_ref, confidence, extracted_by FROM _rel ON CONFLICT DO NOTHING """) print(f" inserted (after ON CONFLICT): {cur.rowcount}") cur.execute("SELECT relation_type, COUNT(*) FROM public.relations GROUP BY relation_type ORDER BY 2 DESC") print("\n=== Relation counts ===") for t, n in cur.fetchall(): print(f" {n:>7} {t}") conn.commit() return 0 if __name__ == "__main__": sys.exit(main())