282 lines
10 KiB
Python
282 lines
10 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""
|
||
|
|
57_load_relations_from_json.py — Build typed relations for public.relations from
|
||
|
|
the reextract data, using ONLY verifiable references (Locard / absolute
|
||
|
|
provenance — no fuzzy guessing).
|
||
|
|
|
||
|
|
Two sources, combined and deduped:
|
||
|
|
|
||
|
|
A. STRUCTURAL relations derived from each raw/<doc>--subagent/_reextract.json
|
||
|
|
events[] (deterministic event_id, names resolved against real entities):
|
||
|
|
- event.observers[] → (person, witnessed, event)
|
||
|
|
- event (each) → (event, documented_in, document)
|
||
|
|
- event.uap_objects_observed → (event, involves_uap, uap_object)
|
||
|
|
- event.primary_location → (event, occurred_at, location) [substring
|
||
|
|
match against this doc's locations[]]
|
||
|
|
- people[] (each) → (person, mentioned_by, document)
|
||
|
|
|
||
|
|
B. EXPLICIT relations[] from the same JSON that resolve EXACTLY (both
|
||
|
|
endpoints found in the entity name→id index): captures person↔org
|
||
|
|
(employed_by, signed, authored, commanded), etc.
|
||
|
|
|
||
|
|
ID generation mirrors scripts/synthesize/30_rebuild_wiki_from_reextract.py so
|
||
|
|
event_id / person_id / uap_object_id match the entities table exactly.
|
||
|
|
|
||
|
|
Run (DATABASE_URL must point at target Postgres):
|
||
|
|
DATABASE_URL=postgresql://... python3 scripts/maintain/57_load_relations_from_json.py [--truncate]
|
||
|
|
"""
|
||
|
|
from __future__ import annotations
|
||
|
|
import json
|
||
|
|
import os
|
||
|
|
import re
|
||
|
|
import sys
|
||
|
|
import unicodedata
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
import psycopg
|
||
|
|
import yaml
|
||
|
|
|
||
|
|
# Repository root; the UFO_ROOT environment variable overrides the default.
UFO = Path(os.getenv("UFO_ROOT", "/Users/guto/ufo"))
# Per-document extraction output directories live under raw/.
RAW = UFO.joinpath("raw")
# One sub-directory per entity class lives under wiki/entities/.
ENT = UFO.joinpath("wiki", "entities")

# Entity class → directory name under ENT. Insertion order matters: it is the
# order in which classes are indexed and reported.
CLASS_DIR = {
    "person": "people",
    "organization": "organizations",
    "location": "locations",
    "event": "events",
    "uap_object": "uap-objects",
}
|
||
|
|
|
||
|
|
|
||
|
|
# ── ID generation (mirror of synthesize/30) ─────────────────────────────────
|
||
|
|
def canonicalize_name(name: str) -> str:
    """Turn a display name into a lowercase, dash-separated ASCII slug.

    Combining marks are removed after NFD decomposition, every character
    outside ``[a-z0-9-]`` becomes a dash, runs of dashes collapse to one and
    leading/trailing dashes are trimmed. A slug that would start with a digit
    is prefixed with ``x-``. A falsy *name* yields the empty string.
    """
    if not name:
        return ""

    decomposed = unicodedata.normalize("NFD", name)
    without_marks = "".join(
        ch for ch in decomposed if unicodedata.combining(ch) == 0
    )
    dashed = re.sub(r"[^a-z0-9-]", "-", without_marks.lower())
    slug = re.sub(r"-+", "-", dashed).strip("-")

    # Slugs must not begin with a digit.
    if slug[:1].isdigit():
        return "x-" + slug
    return slug
|
||
|
|
|
||
|
|
|
||
|
|
def event_id_from(label: str, date_start: str | None) -> str:
    """Derive the deterministic ``EV-…`` identifier for an event.

    The slug is the canonicalized *label* cut to 40 characters (``unlabeled``
    when nothing is left). The date part is taken from *date_start* when it is
    ``YYYY-MM-DD``, ``YYYY-MM`` or ``YYYY``; components that are not given are
    written as ``XX`` (``XXXX`` for the year).
    """
    slug = canonicalize_name(label or "")[:40].strip("-") or "unlabeled"
    date = date_start or ""

    full_date = re.match(r"^(\d{4})-(\d{2})-(\d{2})$", date)
    if full_date:
        year, month, day = full_date.groups()
        return f"EV-{year}-{month}-{day}-{slug}"

    year_month = re.match(r"^(\d{4})-(\d{2})$", date)
    if year_month:
        year, month = year_month.groups()
        return f"EV-{year}-{month}-XX-{slug}"

    year_only = re.match(r"^(\d{4})$", date)
    if year_only:
        return f"EV-{year_only.group(1)}-XX-XX-{slug}"

    # Missing or unrecognised date format.
    return f"EV-XXXX-XX-XX-{slug}"
|
||
|
|
|
||
|
|
|
||
|
|
def uap_object_id(event_id: str, index: int) -> str:
    """Build the ``OBJ-…`` identifier of the *index*-th object of an event.

    For an id shaped ``EV-<year>-<mm>-<dd>-<slug>`` the result is
    ``OBJ-EV<year>-<COMPACT>-<index>``, where ``COMPACT`` is the upper-cased
    slug reduced to ``A-Z0-9`` and cut to 20 characters (``UNK`` when empty).
    Any other *event_id* yields ``OBJ-UNK-<index>``. The index is zero-padded
    to two digits.
    """
    fallback = f"OBJ-UNK-{index:02d}"
    if not event_id.startswith("EV-"):
        return fallback

    # At most five pieces: year, month, day and the slug (possibly split once).
    pieces = event_id[3:].split("-", 4)
    if len(pieces) < 4:
        return fallback

    year = pieces[0]
    slug = "-".join(pieces[3:])
    compact = re.sub(r"[^A-Z0-9]", "", slug.upper())[:20] or "UNK"
    return f"OBJ-EV{year}-{compact}-{index:02d}"
|
||
|
|
|
||
|
|
|
||
|
|
def lower(s: str | None) -> str:
    """Return *s* stripped and lower-cased, for use as a name lookup key.

    ``None`` and any other non-string value (numbers, lists, … as they can
    come out of loosely typed YAML/JSON) yield the empty string instead of
    raising, so callers can treat them as "no name".
    """
    if not isinstance(s, str):
        return ""
    return s.strip().lower()
|
||
|
|
|
||
|
|
|
||
|
|
def parse_frontmatter(path: Path) -> dict | None:
    """Read the YAML front matter of the file at *path*.

    Returns the parsed mapping (``{}`` when the front matter is empty), or
    ``None`` when the file cannot be read, does not start with ``---``, fails
    to parse, or parses to something that is not a mapping.
    """
    try:
        text = path.read_text(encoding="utf-8")
        if not text.startswith("---"):
            return None
        # The front matter is the text between the first two "---" markers.
        loaded = yaml.safe_load(text.split("---", 2)[1]) or {}
    except Exception:
        # Deliberately best-effort: an unreadable or malformed file is treated
        # the same as a file without front matter.
        return None
    # A list or scalar is valid YAML but not usable front matter; returning it
    # would break callers that rely on the dict | None contract.
    return loaded if isinstance(loaded, dict) else None
|
||
|
|
|
||
|
|
|
||
|
|
def build_name_index() -> dict[str, dict[str, str]]:
    """Per class: {name_or_alias_lower: entity_id} from real entity files.

    Every ``*.md`` file in a class directory contributes its file stem as the
    entity id. The id itself is always registered as a key, so the entity is
    present among the index values even when its front matter is missing or
    all of its names are already taken. ``canonical_name`` and ``aliases``
    from the front matter are then added under their lower-cased form. On a
    key collision the first file wins; files are visited in sorted order so
    the winner does not depend on filesystem ordering.
    """
    index: dict[str, dict[str, str]] = {c: {} for c in CLASS_DIR}
    for cls, dirname in CLASS_DIR.items():
        d = ENT / dirname
        if not d.is_dir():
            continue
        names = index[cls]
        # glob() order is arbitrary; sort to make first-wins deterministic.
        for f in sorted(d.glob("*.md")):
            eid = f.stem
            names.setdefault(eid, eid)

            fm = parse_frontmatter(f)
            if not fm or not isinstance(fm, dict):
                continue

            aliases = fm.get("aliases")
            if isinstance(aliases, str):
                # "aliases: Foo" parses as a scalar, not a one-item list.
                aliases = [aliases]
            elif not isinstance(aliases, (list, tuple)):
                aliases = []

            for n in [fm.get("canonical_name"), *aliases]:
                if not isinstance(n, str):
                    # YAML may yield numbers, dates or nulls here.
                    continue
                k = lower(n)
                if k:
                    names.setdefault(k, eid)
    return index
|
||
|
|
|
||
|
|
|
||
|
|
def entity_id_sets(index) -> dict[str, set]:
    """Collect, per class, the distinct entity ids that occur as index values."""
    id_sets: dict[str, set] = {}
    for cls, name_map in index.items():
        id_sets[cls] = set(name_map.values())
    return id_sets
|
||
|
|
|
||
|
|
|
||
|
|
def main() -> int:
    """Collect relation rows from every ``_reextract.json`` and load them.

    Steps:
      1. Build the name→id index from the entity files.
      2. For each ``raw/<doc>--subagent/_reextract.json`` emit structural
         relations (from ``events``, ``locations`` and ``people``) and the
         explicit ``relations`` whose two endpoints both resolve.
      3. Deduplicate on (source class, source id, type, target class,
         target id), keeping the first occurrence.
      4. COPY the rows into a temporary table and insert them into
         ``public.relations`` with ``ON CONFLICT DO NOTHING``.

    Passing ``--truncate`` on the command line empties ``public.relations``
    first (in the same transaction). Nothing is written, and nothing is
    truncated, when no relation row was produced.

    Malformed input is skipped rather than aborting the run: unreadable or
    non-object JSON files are reported on stderr, and non-object list items or
    non-string names inside a document are ignored.

    Returns:
        0 on completion.

    Raises:
        SystemExit: if neither DATABASE_URL nor SUPABASE_DB_URL is set.
    """
    truncate = "--truncate" in sys.argv
    dburl = os.environ.get("DATABASE_URL") or os.environ.get("SUPABASE_DB_URL")
    if not dburl:
        sys.exit("DATABASE_URL not set")

    print("Building name→id index from wiki/entities ...")
    index = build_name_index()
    ids = entity_id_sets(index)
    for cls in CLASS_DIR:
        print(f" {cls}: {len(index[cls])} keys / {len(ids[cls])} ids")

    rows: list[tuple] = []

    def text(value) -> str:
        # Stripped string, or "" for None / non-string JSON values.
        return value.strip() if isinstance(value, str) else ""

    def as_list(value) -> list:
        # JSON arrays only; anything else is treated as "no items".
        return value if isinstance(value, list) else []

    def add(sc, sid, rtype, tc, tid, doc_id, conf):
        # Append one relation row; both endpoint ids are required.
        if not (sid and tid):
            return
        ev = f"[[{doc_id}]]" if doc_id else None
        rows.append((sc, sid, rtype, tc, tid, ev, conf, "reextract"))

    def resolve(cls, name):
        # Documents are referenced by their id; everything else must be found
        # in the name index of its class.
        if cls == "document":
            return text(name) or None
        key = text(name).lower()
        if not key:
            return None
        return index.get(cls, {}).get(key)

    n_docs = 0
    n_skipped = 0
    for jf in sorted(RAW.glob("*--subagent/_reextract.json")):
        doc_id = jf.parent.name.removesuffix("--subagent")
        try:
            d = json.loads(jf.read_text(encoding="utf-8"))
        except Exception as exc:
            # Best-effort: one bad file must not stop the load, but say so.
            n_skipped += 1
            print(f" WARN skipping {jf}: {exc}", file=sys.stderr)
            continue
        if not isinstance(d, dict):
            n_skipped += 1
            print(f" WARN skipping {jf}: top-level JSON value is not an object",
                  file=sys.stderr)
            continue
        n_docs += 1

        # locations declared in this doc (clean names) → for substring match
        doc_locs = []
        for loc in as_list(d.get("locations")):
            if not isinstance(loc, dict):
                continue
            nm = text(loc.get("name"))
            lid = canonicalize_name(nm)
            if nm and lid in ids["location"]:
                doc_locs.append((nm.lower(), lid))
        # longest names first (more specific match)
        doc_locs.sort(key=lambda x: -len(x[0]))

        # A. structural from events[]
        for e in as_list(d.get("events")):
            if not isinstance(e, dict):
                continue
            label = text(e.get("label"))
            if not label:
                continue
            date_start = e.get("date_start")
            if not isinstance(date_start, str):
                # A non-string date cannot match any date pattern.
                date_start = None
            eid = event_id_from(label, date_start)
            if eid not in ids["event"]:
                continue  # event entity must exist
            conf = e.get("confidence") or "medium"

            # event documented_in document
            add("event", eid, "documented_in", "document", doc_id, doc_id, "high")

            # observers witnessed event
            for o in as_list(e.get("observers")):
                nm = text(o.get("name") if isinstance(o, dict) else o)
                key = nm.lower()
                if not key or key == "unknown":
                    continue
                pid = index["person"].get(key)
                if not pid:
                    # Fall back to the canonical slug when it is a known id.
                    slug = canonicalize_name(nm)
                    pid = slug if slug in ids["person"] else None
                if pid:
                    add("person", pid, "witnessed", "event", eid, doc_id, conf)

            # uap_objects involves_uap — the position in the list (1-based,
            # counting skipped items too) is part of the object id.
            for i, u in enumerate(as_list(e.get("uap_objects_observed")), 1):
                if not isinstance(u, dict):
                    continue
                oid = uap_object_id(eid, i)
                if oid in ids["uap_object"]:
                    add("event", eid, "involves_uap", "uap_object", oid, doc_id, conf)

            # event occurred_at location (substring match of doc locations)
            ploc = text(e.get("primary_location_name")).lower()
            if ploc:
                for lname, lid in doc_locs:
                    if lname and lname in ploc:
                        add("event", eid, "occurred_at", "location", lid, doc_id, "medium")
                        break

        # people mentioned_by document
        for p in as_list(d.get("people")):
            if not isinstance(p, dict):
                continue
            key = text(p.get("name")).lower()
            if key and key != "unknown":
                pid = index["person"].get(key)
                if pid:
                    add("person", pid, "mentioned_by", "document", doc_id, doc_id, "medium")

        # B. explicit relations[] that resolve exactly
        for r in as_list(d.get("relations")):
            if not isinstance(r, dict):
                continue
            sc, tc, rtype = r.get("source_class"), r.get("target_class"), r.get("type")
            if not all(isinstance(v, str) and v for v in (sc, tc, rtype)):
                continue
            # Relations that repeat a structural one from section A are not
            # filtered here; the dedupe pass below drops exact duplicates.
            sid = resolve(sc, r.get("source_name"))
            tid = resolve(tc, r.get("target_name"))
            if sid and tid:
                add(sc, sid, rtype, tc, tid, doc_id, r.get("confidence") or "medium")

    print(f"\nProcessed {n_docs} docs; raw relation rows: {len(rows)}")
    if n_skipped:
        print(f"Skipped {n_skipped} unreadable or malformed JSON files", file=sys.stderr)

    # dedupe by (source, type, target) — keep first (evidence may vary)
    seen: set[tuple] = set()
    deduped: list[tuple] = []
    for row in rows:
        key = row[:5]
        if key in seen:
            continue
        seen.add(key)
        deduped.append(row)
    print(f"After dedup: {len(deduped)}")
    if not deduped:
        # Nothing to load: leave the table untouched, even with --truncate.
        return 0

    with psycopg.connect(dburl) as conn:
        with conn.cursor() as cur:
            if truncate:
                cur.execute("TRUNCATE public.relations")
                print(" TRUNCATEd public.relations")
            # Stage through a temp table so the final insert can skip rows
            # that conflict with ones already present.
            cur.execute("CREATE TEMP TABLE _rel (LIKE public.relations INCLUDING DEFAULTS)")
            with cur.copy(
                """COPY _rel (source_class, source_id, relation_type,
                              target_class, target_id, evidence_ref,
                              confidence, extracted_by) FROM STDIN"""
            ) as cp:
                for row in deduped:
                    cp.write_row(row)
            cur.execute(
                """INSERT INTO public.relations
                       (source_class, source_id, relation_type,
                        target_class, target_id, evidence_ref,
                        confidence, extracted_by)
                   SELECT source_class, source_id, relation_type,
                          target_class, target_id, evidence_ref,
                          confidence, extracted_by
                   FROM _rel ON CONFLICT DO NOTHING"""
            )
            print(f"Inserted (after ON CONFLICT): {cur.rowcount}")
            cur.execute(
                "SELECT relation_type, COUNT(*) FROM public.relations GROUP BY relation_type ORDER BY 2 DESC"
            )
            print("\n=== Relation counts in DB ===")
            for t, n in cur.fetchall():
                print(f" {n:>7} {t}")
        conn.commit()
    return 0
|
||
|
|
|
||
|
|
|
||
|
|
# Script entry point: the return value of main() becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|