#!/usr/bin/env python3
"""
06-graph-export.py — Export a graph JSON of the wiki for client-side viz

Walks wiki/ and case/, builds:
  nodes:
    - one per document
    - one per page
    - one per entity (person, organization, location, event, uap_object,
      vehicle, operation, concept)
    - one per case artifact (gap, evidence, witness, hypothesis, profile,
      relation, case, timeline)
  edges:
    - document → page (contains)
    - document → key entity (key-entity, via key_entities)
    - document → gap (flags-gap, via gaps_flagged)
    - entity → page (mentioned-in, via mentioned_in[]; weight = mention_count)
    - event → location (occurred-at), event → uap_object (observed-uap)
    - uap_object → event (observed-in-event)
    - gap → detected_in[] target (detected-in)
    - relation → nodes[] members (relates)

Duplicate edges are collapsed and edges whose endpoints are not exported
nodes are dropped before the file is written.

Output: wiki/graph.json

The JSON shape is friendly to Cytoscape / Sigma.js / react-flow. Each node
carries `type`, `label`, and `data` with the frontmatter fields useful for
filters (entity_class, collection, country, date, etc.).

Usage:
  ./06-graph-export.py
  ./06-graph-export.py --out /path/to/graph.json
"""
from __future__ import annotations

import argparse
import json
import re
import sys
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path

try:
    import yaml
except ImportError:
    sys.stderr.write("Missing pyyaml. Run: pip3 install pyyaml\n")
    sys.exit(1)

UFO_ROOT = Path("/Users/guto/ufo")
WIKI_BASE = UFO_ROOT / "wiki"
CASE_BASE = UFO_ROOT / "case"
DEFAULT_OUT = WIKI_BASE / "graph.json"
LOG_PATH = WIKI_BASE / "log.md"

# [[target]] or [[target|display text]]; group 1 is the target.
WIKI_LINK_RE = re.compile(r"\[\[([^\]|]+)(?:\|[^\]]+)?\]\]")

# A frontmatter delimiter must sit on a line of its own, so a "---" that
# appears inside a YAML value is never mistaken for the closing marker.
_FRONTMATTER_END_RE = re.compile(r"^---[ \t]*$", re.MULTILINE)

# Page targets look like "<doc-id>/pNNN".
_PAGE_TARGET_RE = re.compile(r"^([a-z0-9][a-z0-9-]*)/p\d{3}$")


def utc_now_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ``."""
    return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")


def read_md(path: Path) -> tuple[dict, str]:
    """Split a markdown file into ``(frontmatter, body)``.

    The frontmatter is the YAML between a leading ``---`` and the next line
    that consists solely of ``---``.

    Returns:
        ``(mapping, body)``. The mapping is empty when the file has no
        frontmatter, when the closing delimiter is missing, when the YAML is
        invalid, or when the YAML is not a mapping (callers rely on ``.get``).
        The body is the full text when no frontmatter block was found,
        otherwise everything after the closing delimiter.
    """
    text = path.read_text(encoding="utf-8")
    if not text.startswith("---"):
        return {}, text

    closing = _FRONTMATTER_END_RE.search(text, 3)
    if closing is None:
        return {}, text

    body = text[closing.end():]
    try:
        meta = yaml.safe_load(text[3:closing.start()].strip())
    except yaml.YAMLError:
        return {}, body
    return (meta if isinstance(meta, dict) else {}), body


def parse_wiki_link(target: str) -> tuple[str, str] | None:
    """Extract ``(namespace, id)`` from a wiki-link target string.

    * ``doc-id/pNNN``  → ``("page", "doc-id/pNNN")``
    * ``ns/rest``      → ``(ns, rest)``
    * ``doc-id``       → ``("document", "doc-id")``

    Returns None for unrecognized targets: blank strings and targets with an
    empty namespace or id part (``"/x"``, ``"x/"``).
    """
    t = target.strip()
    if not t:
        return None
    if _PAGE_TARGET_RE.match(t):
        return ("page", t)

    ns, sep, rest = t.partition("/")
    if not sep:
        # Bare doc_id
        return ("document", t)
    if not ns or not rest:
        return None
    return (ns, rest)


def node_id_from_link(target: str) -> str | None:
    """Compute the canonical node id (``namespace:id``) for a wiki-link target.

    Returns None when the target cannot be parsed.
    """
    parsed = parse_wiki_link(target)
    if not parsed:
        return None
    ns, rest = parsed
    return f"{ns}:{rest}"


def make_node(node_id: str, ntype: str, label: str, **extra) -> dict:
    """Build a node dict; keyword arguments, if any, are stored under ``data``."""
    n = {"id": node_id, "type": ntype, "label": label}
    if extra:
        n["data"] = extra
    return n


def make_edge(source: str, target: str, kind: str, weight: float = 1.0) -> dict:
    """Build an edge dict with ``source``, ``target``, ``kind`` and ``weight``."""
    return {"source": source, "target": target, "kind": kind, "weight": weight}


# ----------------------------------------------------------------------
def collect_documents(graph: dict):
    """Add one node per document file plus its outgoing edges.

    Reads ``wiki/documents/*.md`` and, for every file whose frontmatter has
    ``type: document``, appends to ``graph`` in place:

    * a ``document:<doc_id>`` node (``doc_id`` falls back to the file stem),
    * ``contains`` edges from ``pages[].page_id``,
    * ``key-entity`` edges from every list under ``key_entities``,
    * ``flags-gap`` edges from ``gaps_flagged``.

    Values that are not strings containing a ``[[wiki-link]]`` are skipped.
    """

    def add_link_edges(source: str, refs, kind: str) -> None:
        # One edge per ref that holds a resolvable wiki-link.
        for ref in refs or []:
            if not isinstance(ref, str):
                continue
            link = WIKI_LINK_RE.search(ref)
            if link is None:
                continue
            target = node_id_from_link(link.group(1))
            if target:
                graph["edges"].append(make_edge(source, target, kind))

    docs_dir = WIKI_BASE / "documents"
    for p in sorted(docs_dir.glob("*.md")):
        fm, _ = read_md(p)
        if fm.get("type") != "document":
            continue

        # `or` rather than a .get default: a key present with a null value
        # must fall back too, otherwise the id becomes "document:None".
        doc_id = fm.get("doc_id") or p.stem
        node_id = f"document:{doc_id}"
        graph["nodes"][node_id] = make_node(
            node_id,
            "document",
            fm.get("canonical_title") or doc_id,
            collection=fm.get("collection"),
            document_class=fm.get("document_class"),
            page_count=fm.get("page_count"),
            content_classification=fm.get("content_classification"),
            document_date=fm.get("document_date"),
            highest_classification=fm.get("highest_classification"),
        )

        # document → page edges
        page_links = [
            entry.get("page_id")
            for entry in (fm.get("pages") or [])
            if isinstance(entry, dict)
        ]
        add_link_edges(node_id, page_links, "contains")

        # document → key entities (one list of refs per entity class)
        key_entities = fm.get("key_entities")
        if isinstance(key_entities, dict):
            for refs in key_entities.values():
                add_link_edges(node_id, refs, "key-entity")

        # gaps_flagged
        add_link_edges(node_id, fm.get("gaps_flagged"), "flags-gap")


def collect_pages(graph: dict):
    """Add one ``page:<page_id>`` node per page file under ``wiki/pages``.

    Files without ``type: page`` or without a ``page_id`` are skipped. No
    edges are created here: resolving ``entities_extracted`` to canonical
    entity ids would need the alias-match lookup, and the entity files'
    ``mentioned_in[]`` already carries who-mentions-who, so those edges are
    emitted from the entity side instead.
    """
    for p in sorted((WIKI_BASE / "pages").rglob("*.md")):
        fm, _ = read_md(p)
        if fm.get("type") != "page":
            continue
        page_id = fm.get("page_id")
        if not page_id:
            continue

        # None does not accept a format spec, so substitute the placeholder
        # before zero-padding the page number.
        page_number = fm.get("page_number")
        shown_number = "?" if page_number is None else page_number
        label = f"{fm.get('doc_id') or '?'} p{shown_number:>03}"

        node_id = f"page:{page_id}"
        graph["nodes"][node_id] = make_node(
            node_id,
            "page",
            label,
            page_number=page_number,
            page_type=fm.get("page_type"),
            content_classification=fm.get("content_classification"),
            language_detected=fm.get("language_detected"),
        )
# entity_class → frontmatter key that holds the entity's id.
_ENTITY_ID_FIELDS = {
    "person": "person_id",
    "organization": "organization_id",
    "location": "location_id",
    "event": "event_id",
    "uap_object": "uap_object_id",
    "vehicle": "vehicle_id",
    "operation": "operation_id",
    "concept": "concept_id",
}

# entity_class → node-id namespace (also used as the node `type`).
_ENTITY_NAMESPACES = {
    "person": "people",
    "organization": "org",
    "location": "loc",
    "event": "event",
    "uap_object": "uap",
    "vehicle": "vehicle",
    "operation": "op",
    "concept": "concept",
}

# case-artifact frontmatter `type` → node-id namespace.
_CASE_NAMESPACES = {
    "gap": "gap",
    "evidence": "evidence",
    "witness_analysis": "witness",
    "hypothesis": "hypothesis",
    "actor_profile": "profile",
    "relation": "relation",
    "case_report": "case",
    "residual_uncertainty": "case",
    "timeline": "timeline",
}

# case-artifact `type` → frontmatter key holding its id. Types missing here
# (residual_uncertainty) are identified by file stem.
_CASE_ID_FIELDS = {
    "gap": "gap_id",
    "evidence": "evidence_id",
    "witness_analysis": "witness_id",
    "hypothesis": "hypothesis_id",
    "actor_profile": "actor_profile_id",
    "relation": "relation_id",
    "case_report": "case_id",
    "timeline": "scope_id",
}


def _add_ref_edges(graph: dict, source: str, refs, kind: str, weight: float = 1.0) -> None:
    """Append one ``kind`` edge from ``source`` per ref holding a wiki-link.

    Refs that are not strings, contain no ``[[...]]`` link, or whose target
    cannot be turned into a node id are skipped.
    """
    for ref in refs or []:
        if not isinstance(ref, str):
            continue
        link = WIKI_LINK_RE.search(ref)
        if link is None:
            continue
        target = node_id_from_link(link.group(1))
        if target:
            graph["edges"].append(make_edge(source, target, kind, weight))


def _json_default(value):
    """Serialize the date values YAML produces; reject everything else.

    PyYAML turns unquoted ISO dates (e.g. ``document_date: 1952-07-19``) into
    ``datetime.date`` objects, which ``json.dumps`` cannot encode by itself.
    """
    # Local import: the module header only imports datetime/timezone.
    from datetime import date

    if isinstance(value, date):  # datetime is a subclass of date
        return value.isoformat()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


def _log_output_path(out_path: Path) -> str:
    """Return ``out_path`` relative to UFO_ROOT when inside it, else absolute.

    ``Path.relative_to`` raises ValueError for paths outside the root (and for
    relative paths), which is exactly what ``--out /path/to/graph.json`` gives.
    """
    resolved = out_path.resolve()
    try:
        return str(resolved.relative_to(UFO_ROOT.resolve()))
    except ValueError:
        return str(resolved)


def collect_entities(graph: dict):
    """Add one node per entity file under ``wiki/entities`` plus its edges.

    Files need ``type: entity`` and a known ``entity_class``; the id comes
    from the class-specific ``*_id`` field, falling back to the file stem.
    Edges appended to ``graph`` in place:

    * ``mentioned-in`` from ``mentioned_in[].page`` (weight = mention_count,
      1 when absent or null),
    * for events: ``occurred-at`` (primary_location) and ``observed-uap``
      (uap_objects),
    * for uap_objects: ``observed-in-event`` (observed_in_event).
    """
    entities_root = WIKI_BASE / "entities"
    for p in sorted(entities_root.rglob("*.md")):
        fm, _ = read_md(p)
        if not isinstance(fm, dict) or fm.get("type") != "entity":
            continue

        ec = fm.get("entity_class")
        id_field = _ENTITY_ID_FIELDS.get(ec)
        ns = _ENTITY_NAMESPACES.get(ec)
        if not id_field or not ns:
            continue

        eid = fm.get(id_field) or p.stem
        node_id = f"{ns}:{eid}"
        graph["nodes"][node_id] = make_node(
            node_id,
            ns,
            fm.get("canonical_name") or eid,
            entity_class=ec,
            aliases=fm.get("aliases"),
            country=fm.get("country"),
            location_type=fm.get("location_type"),
            organization_type=fm.get("organization_type"),
            shape=fm.get("shape"),
            color=fm.get("color"),
            event_class=fm.get("event_class"),
            date_start=fm.get("date_start"),
            concept_class=fm.get("concept_class"),
            total_mentions=fm.get("total_mentions"),
            enrichment_status=fm.get("enrichment_status"),
        )

        # entity → page edges (via mentioned_in[])
        for mention in fm.get("mentioned_in") or []:
            if not isinstance(mention, dict):
                continue
            count = mention.get("mention_count")
            _add_ref_edges(
                graph,
                node_id,
                [mention.get("page")],
                "mentioned-in",
                weight=1 if count is None else count,
            )

        # event-specific links
        if ec == "event":
            _add_ref_edges(graph, node_id, [fm.get("primary_location")], "occurred-at")
            _add_ref_edges(graph, node_id, fm.get("uap_objects"), "observed-uap")

        # uap_object → event
        if ec == "uap_object":
            _add_ref_edges(graph, node_id, [fm.get("observed_in_event")], "observed-in-event")


def collect_case_artifacts(graph: dict):
    """Add gaps, evidence, witnesses, hypotheses, profiles, relations.

    Walks every ``*.md`` under ``case/``. Files whose frontmatter ``type`` is
    a known artifact type become a ``<namespace>:<id>`` node; the id comes
    from the type-specific id field, falling back to the file stem. Gaps get
    ``detected-in`` edges from ``detected_in[]`` and relations get ``relates``
    edges from ``nodes[]``.
    """
    for p in sorted(CASE_BASE.rglob("*.md")):
        fm, _ = read_md(p)
        if not isinstance(fm, dict):
            continue
        t = fm.get("type")
        if t not in _CASE_NAMESPACES:
            continue
        ns = _CASE_NAMESPACES[t]

        # ID detection
        id_field = _CASE_ID_FIELDS.get(t)
        eid = (fm.get(id_field) if id_field else None) or p.stem
        node_id = f"{ns}:{eid}"
        graph["nodes"][node_id] = make_node(
            node_id,
            ns,
            fm.get("canonical_title") or eid,
            t=t,  # key name is part of the exported JSON shape
            severity=fm.get("severity"),
            evidence_grade=fm.get("evidence_grade"),
            status=fm.get("status"),
            verdict=fm.get("verdict"),
            connection_strength=fm.get("connection_strength"),
        )

        # gap → detected_in (pages)
        if t == "gap":
            _add_ref_edges(graph, node_id, fm.get("detected_in"), "detected-in")

        # relation → nodes[]
        if t == "relation":
            _add_ref_edges(graph, node_id, fm.get("nodes"), "relates")


# ----------------------------------------------------------------------
def main():
    """Build the graph, write it as JSON, print a summary and append to the log.

    Steps: run the four collectors, drop duplicate edges (same source, target
    and kind; the first occurrence wins), drop edges whose endpoints are not
    exported nodes, flatten the node dict to a list sorted by (type, id),
    write the JSON to ``--out`` and append an entry to ``LOG_PATH``.
    """
    ap = argparse.ArgumentParser(description="Export wiki graph (nodes + edges) as JSON.")
    ap.add_argument("--out", default=str(DEFAULT_OUT), help=f"output path (default: {DEFAULT_OUT})")
    args = ap.parse_args()

    graph: dict = {
        "generated_at": utc_now_iso(),
        "wiki_version": "0.1.0",
        "nodes": {},  # dict keyed by node_id for dedup, flattened to list at end
        "edges": [],
    }

    collect_documents(graph)
    collect_pages(graph)
    collect_entities(graph)
    collect_case_artifacts(graph)

    # Dedup edges; dicts keep insertion order, so the first occurrence wins.
    unique_edges: dict = {}
    for e in graph["edges"]:
        unique_edges.setdefault((e["source"], e["target"], e["kind"]), e)

    # Filter out edges pointing to nodes we don't have (broken refs)
    valid_ids = set(graph["nodes"])
    kept_edges = [
        e for e in unique_edges.values()
        if e["source"] in valid_ids and e["target"] in valid_ids
    ]
    dropped = len(unique_edges) - len(kept_edges)
    graph["edges"] = kept_edges

    nodes_list = sorted(graph["nodes"].values(), key=lambda n: (n["type"], n["id"]))
    graph["nodes"] = nodes_list

    # Summary by type
    by_type: dict[str, int] = defaultdict(int)
    for n in nodes_list:
        by_type[n["type"]] += 1
    edges_by_kind: dict[str, int] = defaultdict(int)
    for e in kept_edges:
        edges_by_kind[e["kind"]] += 1

    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(
        json.dumps(graph, indent=2, ensure_ascii=False, default=_json_default),
        encoding="utf-8",
    )

    print(f"Graph written to {out_path}", flush=True)
    print(f" nodes total: {len(nodes_list)}", flush=True)
    for t, n in sorted(by_type.items()):
        print(f" {t}: {n}", flush=True)
    print(f" edges total: {len(kept_edges)} (dropped {dropped} dangling)", flush=True)
    for k, n in sorted(edges_by_kind.items()):
        print(f" {k}: {n}", flush=True)

    # Resolve the displayed path before opening the log so nothing can fail
    # between the header line and the rest of the entry.
    logged_output = _log_output_path(out_path)
    with open(LOG_PATH, "a", encoding="utf-8") as fh:
        fh.write(f"\n## {utc_now_iso()} — GRAPH EXPORT\n")
        fh.write("- operator: archivist\n- script: scripts/06-graph-export.py\n")
        fh.write(f"- output: {logged_output}\n")
        fh.write(f"- nodes: {len(nodes_list)}\n- edges: {len(kept_edges)}\n")


if __name__ == "__main__":
    main()