#!/usr/bin/env python3
"""
20_entity_summary.py — Synthesise narrative_summary EN+PT-BR for entities with
total_mentions >= threshold, via Claude Code OAuth subprocess (Sonnet).

Strategy per entity:
  1. Query DB for top-K verbatim chunk snippets that mention the entity
     (selected via public.entity_mentions + public.chunks). K=10 default.
  2. Build a Holmes-Watson voice prompt with the entity's canonical_name,
     class, alias list, and the verbatim snippets.
  3. Call `claude -p --model sonnet --output-format json` → JSON with
     narrative_summary + narrative_summary_pt_br.
  4. Update wiki/entities/<class-folder>/<entity_id>.md frontmatter:
       - narrative_summary, narrative_summary_pt_br
       - summary_status: 'synthesized'
       - summary_confidence: 'medium'
       - last_lint: now()
       - total_mentions, documents_count (refreshed from the DB row)

Idempotent: entities with summary_status in {'synthesized','curated','red_teamed'}
are skipped. Re-run safely advances any new ones.

Throttle: 1 entity at a time (sequential). Max 20x plan: 5h window.

Usage:
  ./20_entity_summary.py --min-mentions 20 --limit 200   # top entities
  ./20_entity_summary.py --classes person,organization   # subset
  ./20_entity_summary.py --dry-run --limit 5             # preview
"""
from __future__ import annotations

import argparse
import json
import os
import re
import subprocess
import sys
import time
from datetime import datetime, timezone
from pathlib import Path

try:
    import yaml
    import psycopg
except ImportError as e:
    sys.stderr.write(f"pip3 install pyyaml psycopg[binary]  # missing: {e}\n")
    sys.exit(1)

UFO_ROOT = Path(__file__).resolve().parents[2]
ENTITIES_BASE = UFO_ROOT / "wiki" / "entities"
LOG_PATH = UFO_ROOT / "wiki" / "log.md"
DATABASE_URL = os.getenv("DATABASE_URL") or os.getenv("SUPABASE_DB_URL")

# Map DB entity_class → filesystem folder
CLASS_FOLDER = {
    "person": "people",
    "organization": "organizations",
    "location": "locations",
    "event": "events",
    "uap_object": "uap-objects",
    "vehicle": "vehicles",
    "operation": "operations",
    "concept": "concepts",
}

# summary_status values that mark an entity as already handled.
DONE_STATUSES = ("synthesized", "curated", "red_teamed")

# Strips a leading ```/```json fence and a trailing ``` fence from model output.
_FENCE_RE = re.compile(r"^```(?:json)?\s*|\s*```$", flags=re.MULTILINE)


def utc_iso() -> str:
    """Return the current UTC time as an ISO-8601 string with a 'Z' suffix."""
    return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")


PROMPT_TEMPLATE = """You are writing an encyclopedic entry for an investigative UAP/UFO wiki ("The Disclosure Bureau").

Voice rules:
- Holmes–Watson narrator: precise, fact-dense, no hype, no breathless language.
- Open with what this entity is and how it figures in the corpus. Cite who/where/when. Optionally mention notable patterns across the snippets.
- 3–6 sentences. No editorial speculation beyond what the snippets support.
- Original-language verbatim quotes stay as-is; the EN summary is in English, the PT-BR summary in Brazilian Portuguese (with full UTF-8 accents).
- If snippets contradict each other or are sparse, say so plainly.
- NEVER include placeholder text like "Will be enriched in Phase N", "[REDACTED]", or markdown headings — pure prose only.

ENTITY:
- Class: {entity_class}
- Canonical name: {name}
- Aliases: {aliases}
- Total mentions across corpus: {total_mentions}
- Documents that mention it: {documents_count}

TOP {n_snippets} VERBATIM SNIPPETS FROM THE CORPUS:
{snippets}

OUTPUT (STRICT JSON, no markdown fences, no commentary):
{{
  "narrative_summary": "",
  "narrative_summary_pt_br": ""
}}"""


def _extract_json_object(text: str) -> dict:
    """Pull the summary JSON object out of the model's textual reply.

    Tries the fence-stripped text as a whole first; if that is not a JSON
    object, scans for the first embedded object carrying a
    "narrative_summary" key.

    Raises:
        RuntimeError: if no suitable JSON object can be decoded.
    """
    cleaned = _FENCE_RE.sub("", text.strip()).strip()
    try:
        parsed = json.loads(cleaned)
    except json.JSONDecodeError:
        parsed = None
    if isinstance(parsed, dict):
        return parsed

    # The model wrapped the JSON in commentary: try decoding from each '{'.
    # raw_decode stops at the end of the first complete value, so trailing
    # prose (even prose containing braces) does not break the parse.
    decoder = json.JSONDecoder()
    start = cleaned.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(cleaned[start:])
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict) and "narrative_summary" in candidate:
            return candidate
        start = cleaned.find("{", start + 1)
    raise RuntimeError(f"no JSON object in claude output: {cleaned[:300]}")


def call_sonnet(prompt: str, timeout_s: int = 180) -> dict:
    """Run `claude -p --model sonnet --output-format json` and parse the reply.

    Args:
        prompt: full prompt text, fed to the subprocess on stdin.
        timeout_s: seconds to wait for the subprocess before giving up.

    Returns:
        The JSON object produced by the model (expected to carry
        "narrative_summary" and "narrative_summary_pt_br"; the caller
        validates the values).

    Raises:
        RuntimeError: on a missing binary, timeout, non-zero exit,
            unparseable envelope, or a reply without a JSON object.
    """
    try:
        res = subprocess.run(
            ["claude", "-p", "--model", "sonnet", "--output-format", "json"],
            input=prompt,
            capture_output=True,
            text=True,
            # Force UTF-8 so accented PT-BR text survives regardless of locale.
            encoding="utf-8",
            timeout=timeout_s,
            check=False,
        )
    except subprocess.TimeoutExpired as exc:
        raise RuntimeError(f"claude subprocess timed out after {timeout_s}s") from exc
    except FileNotFoundError as exc:
        raise RuntimeError("claude executable not found on PATH") from exc

    if res.returncode != 0:
        raise RuntimeError(f"claude exit {res.returncode}: {res.stderr[:300]}")

    try:
        env = json.loads(res.stdout)
    except json.JSONDecodeError as exc:
        raise RuntimeError(f"unparseable claude envelope: {exc} :: {res.stdout[:300]}") from exc
    if not isinstance(env, dict):
        raise RuntimeError(f"unexpected claude envelope type: {type(env).__name__}")

    # The reply text may live under any of these envelope keys.
    txt = env.get("result") or env.get("response") or env.get("content") or ""
    if not isinstance(txt, str):
        raise RuntimeError(f"unexpected claude payload type: {type(txt).__name__}")
    return _extract_json_object(txt)


def load_md(path: Path) -> tuple[dict, str]:
    """Split a markdown file into (frontmatter dict, body).

    A file that does not start with '---' is returned as ({}, raw text).

    Raises:
        ValueError: if the frontmatter fence is never closed or the YAML
            between the fences is not a mapping. Raising (rather than
            guessing) keeps a malformed page from being rewritten.
    """
    raw = path.read_text(encoding="utf-8")
    if not raw.startswith("---"):
        return {}, raw

    lines = raw.splitlines(keepends=True)
    # The closing fence must be a line of its own at column 0; '---' inside a
    # value or an indented block scalar must not terminate the frontmatter.
    closing = next(
        (i for i, line in enumerate(lines[1:], 1) if line.rstrip() == "---"),
        None,
    )
    if closing is None:
        raise ValueError(f"unterminated frontmatter in {path}")

    # Drop the leading '---' of the opening line; keep anything after it,
    # as the original slice raw[3:end] did.
    header = lines[0][3:] + "".join(lines[1:closing])
    fm = yaml.safe_load(header.strip()) or {}
    if not isinstance(fm, dict):
        raise ValueError(f"frontmatter is not a mapping in {path}")
    body = "".join(lines[closing + 1:]).lstrip("\n")
    return fm, body


def write_md(path: Path, fm: dict, body: str) -> None:
    """Write frontmatter + body back to *path* as a markdown file.

    The content goes to a sibling temp file first and is then moved over the
    target with os.replace, so an interrupted run cannot leave a
    half-written page behind.
    """
    yaml_str = yaml.dump(fm, allow_unicode=True, sort_keys=False, default_flow_style=False)
    sep = "" if body.startswith("\n") else "\n"
    tmp_path = path.with_name(f"{path.name}.tmp")
    try:
        tmp_path.write_text(f"---\n{yaml_str}---\n{sep}{body}", encoding="utf-8")
        os.replace(tmp_path, path)
    finally:
        # After a successful replace the temp file is gone; this only fires
        # when the write or the replace failed.
        if tmp_path.exists():
            tmp_path.unlink()


def fetch_top_entities(conn, min_mentions: int, limit: int,
                       classes: list[str] | None,
                       require_status_none: bool) -> list[tuple]:
    """Fetch candidate entities ordered by total_mentions (descending).

    Args:
        conn: open psycopg connection.
        min_mentions: lower bound (inclusive) on total_mentions.
        limit: maximum number of rows to return.
        classes: optional list of entity_class values to restrict to.
        require_status_none: accepted for interface compatibility but NOT
            applied here; the summary_status check is done per entity in
            synthesise_one against the markdown frontmatter.

    Returns:
        Rows of (entity_pk, entity_class, entity_id, canonical_name,
        aliases, total_mentions, documents_count).
    """
    sql = """
        SELECT entity_pk, entity_class, entity_id, canonical_name,
               COALESCE(aliases, ARRAY[]::text[]) AS aliases,
               total_mentions, documents_count
        FROM public.entities
        WHERE total_mentions >= %s
    """
    params: list = [min_mentions]
    if classes:
        sql += " AND entity_class = ANY(%s)"
        params.append(classes)
    # entity_pk tie-break keeps the candidate set stable across re-runs.
    sql += " ORDER BY total_mentions DESC, entity_pk LIMIT %s"
    params.append(limit)
    with conn.cursor() as cur:
        cur.execute(sql, params)
        return cur.fetchall()


def fetch_snippets(conn, entity_pk: int, k: int = 10) -> list[str]:
    """Top-K longest chunks (by content length) mentioning the entity.

    Each returned string is a bullet of the form
    "- (<doc_id> p<page> · <type>) <text>", with the text capped at 600
    characters. Chunks whose PT and EN content are both empty are dropped,
    so fewer than *k* snippets may be returned.
    """
    # EXISTS (semi-join) instead of JOIN: a chunk that has several mention
    # rows for the same entity must be returned once, not once per mention.
    sql = """
        SELECT c.content_pt, c.content_en, c.doc_id, c.page, c.type
        FROM public.chunks c
        WHERE EXISTS (
            SELECT 1
            FROM public.entity_mentions em
            WHERE em.chunk_pk = c.chunk_pk
              AND em.entity_pk = %s
        )
        ORDER BY GREATEST(COALESCE(LENGTH(c.content_pt),0),
                          COALESCE(LENGTH(c.content_en),0)) DESC,
                 c.chunk_pk
        LIMIT %s
    """
    with conn.cursor() as cur:
        cur.execute(sql, (entity_pk, k))
        rows = cur.fetchall()

    out = []
    for pt, en, doc, page, typ in rows:
        # Prefer PT content, but fall back to EN when PT is empty or only
        # whitespace.
        body = (pt or "").strip() or (en or "").strip()
        if not body:
            continue
        # Cap each snippet so the prompt stays compact
        body = body[:600]
        out.append(f"- ({doc} p{page} · {typ}) {body}")
    return out


def resolve_path(entity_class: str, entity_id: str) -> Path:
    """Return the wiki markdown path for an entity.

    Raises:
        ValueError: if *entity_class* has no folder in CLASS_FOLDER.
    """
    folder = CLASS_FOLDER.get(entity_class)
    if not folder:
        raise ValueError(f"unknown entity_class {entity_class}")
    return ENTITIES_BASE / folder / f"{entity_id}.md"


def synthesise_one(conn, row, dry_run: bool, verbose: bool) -> str:
    """Synthesise and persist the narrative summaries for one entity.

    Args:
        conn: open psycopg connection (used to fetch snippets).
        row: one row as returned by fetch_top_entities.
        dry_run: build the prompt but neither call the model nor write.
        verbose: print a progress line before calling the model.

    Returns:
        A status message: starts with "ok" on success (or dry-run),
        "skipped" when no model call was needed, or "empty/short output"
        when the model reply was unusable.
    """
    (entity_pk, entity_class, entity_id, canonical_name,
     aliases, total_mentions, documents_count) = row

    path = resolve_path(entity_class, entity_id)
    if not path.exists():
        return "skipped (file missing)"

    fm, body = load_md(path)
    status = fm.get("summary_status")
    if status in DONE_STATUSES:
        return f"skipped (already {status})"

    snippets = fetch_snippets(conn, entity_pk, k=10)
    if not snippets:
        return "skipped (no snippets)"

    # Ignore NULL/empty alias entries, which would break str.join.
    alias_list = [a for a in (aliases or []) if a][:8]
    prompt = PROMPT_TEMPLATE.format(
        entity_class=entity_class,
        name=canonical_name,
        aliases=", ".join(alias_list) or "—",
        total_mentions=total_mentions,
        documents_count=documents_count,
        n_snippets=len(snippets),
        snippets="\n".join(snippets),
    )

    if dry_run:
        return f"ok (dry — {len(snippets)} snippets, {len(prompt)} chars prompt)"

    if verbose:
        print(f"    → calling sonnet ({len(snippets)} snippets, {len(prompt)} chars)...", flush=True)

    out = call_sonnet(prompt)
    narr_en = (out.get("narrative_summary") or "").strip()
    narr_pt = (out.get("narrative_summary_pt_br") or "").strip()
    # Anything under 40 characters is treated as a failed generation.
    if len(narr_en) < 40 or len(narr_pt) < 40:
        return f"empty/short output (en={len(narr_en)}, pt={len(narr_pt)})"

    fm["narrative_summary"] = narr_en
    fm["narrative_summary_pt_br"] = narr_pt
    fm["summary_status"] = "synthesized"
    fm["summary_confidence"] = "medium"
    fm["last_lint"] = utc_iso()
    # Refresh canonical mention counts from DB so the wiki agrees with retrieval
    fm["total_mentions"] = int(total_mentions)
    # documents_count may be NULL; do not discard a finished summary over it.
    if documents_count is not None:
        fm["documents_count"] = int(documents_count)

    write_md(path, fm, body)
    return "ok"


def main() -> int:
    """CLI entry point: select candidates, synthesise each, append to the log.

    Returns:
        Process exit code: 1 when no database URL is configured, else 0.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--min-mentions", type=int, default=20)
    p.add_argument("--limit", type=int, default=200)
    p.add_argument("--classes", default=None,
                   help="comma-separated subset (e.g. 'person,organization,location')")
    p.add_argument("--dry-run", action="store_true")
    p.add_argument("--verbose", action="store_true")
    p.add_argument("--sleep", type=float, default=0.5,
                   help="seconds between calls (respect Max 20x rate)")
    args = p.parse_args()

    if not DATABASE_URL:
        sys.stderr.write("DATABASE_URL not set\n")
        return 1

    classes = None
    if args.classes:
        # Drop empty tokens from inputs such as "person,,location" or "person,".
        classes = [c.strip() for c in args.classes.split(",") if c.strip()] or None
    if classes:
        unknown = [c for c in classes if c not in CLASS_FOLDER]
        if unknown:
            # A typo here would otherwise just yield zero candidates.
            sys.stderr.write(f"warning: unknown entity class(es): {', '.join(unknown)}\n")

    print(f"connecting → {DATABASE_URL.split('@')[-1]}")
    # autocommit: the script only reads from the DB, so there is no reason to
    # hold a transaction open across the slow model calls, and a single
    # failed query must not abort every later one.
    with psycopg.connect(DATABASE_URL, autocommit=True) as conn:
        rows = fetch_top_entities(conn, args.min_mentions, args.limit, classes,
                                  require_status_none=True)
        print(f"candidates: {len(rows)} (min_mentions={args.min_mentions}, limit={args.limit})")

        done = 0
        skipped = 0
        errors = 0
        for i, row in enumerate(rows, 1):
            _, entity_class, entity_id, canonical_name, _, total_mentions, _ = row
            # canonical_name may be NULL; building the label must never crash
            # the run.
            label = (f"[{i:>3}/{len(rows)}] {entity_class}/{entity_id} "
                     f"({total_mentions}m) — {(canonical_name or '')[:40]}")
            try:
                msg = synthesise_one(conn, row, args.dry_run, args.verbose)
            except Exception as e:
                errors += 1
                msg = None
                print(f"  ✗ {label} — ERROR: {e}", flush=True)
            else:
                if msg.startswith("ok"):
                    done += 1
                    print(f"  ✓ {label} — {msg}", flush=True)
                else:
                    skipped += 1
                    print(f"  · {label} — {msg}", flush=True)

            # Throttle only when a model call may have happened: skipped
            # entities cost nothing, while errors (e.g. timeouts) likely did.
            may_have_called = msg is None or not msg.startswith("skipped")
            if not args.dry_run and args.sleep > 0 and may_have_called:
                time.sleep(args.sleep)

    print(f"\ndone={done} skipped={skipped} errors={errors}")

    if not args.dry_run and done > 0:
        with LOG_PATH.open("a", encoding="utf-8") as f:
            f.write(
                f"\n## {utc_iso()} · SYNTHESIZE_ENTITY_SUMMARIES\n"
                f"- script: scripts/synthesize/20_entity_summary.py\n"
                f"- min_mentions: {args.min_mentions}\n"
                f"- limit: {args.limit}\n"
                f"- synthesised: {done}\n"
                f"- skipped: {skipped}\n"
                f"- errors: {errors}\n"
                f"- model: claude-sonnet (via CLAUDE_CODE_OAUTH_TOKEN)\n"
            )
    return 0


if __name__ == "__main__":
    sys.exit(main())