disclosure-bureau/scripts/17-enrich-entities.py

#!/usr/bin/env python3
"""
17-enrich-entities.py — Fase 6 — Enrichment externo de entidades

Para cada entidade em wiki/entities/<class>/<id>.md:
  - total_mentions >= 3  → enrichment_status: deep   (WebSearch + WebFetch + >=2 sources)
  - total_mentions 1-2   → enrichment_status: shallow (1 query + conhecimento interno)
  - total_mentions == 0  → enrichment_status: none   (skip)

Usa Claude CLI (`claude -p --model haiku`) com tools WebSearch e WebFetch,
mesmo padrão de OAuth/plano Max que 02-vision-page.py.

Pede ao modelo JSON estruturado com:
  - biographical_summary EN + PT-BR
  - external_sources[] (URL + título + publisher + key_facts + reliability_band)
  - additional_aliases, verified_facts
  - class-specific (dates pessoa, org_type, coordinates loc, etc.)

Atualiza:
  - frontmatter: enrichment_status, external_sources, last_enriched, +campos específicos
  - corpo: insere/atualiza seção "## Enrichment (EN)" + "## Enriquecimento (PT-BR)"
    PRESERVANDO descrição original (mantém marcador `<!-- enrichment:start -->` ...
    `<!-- enrichment:end -->` para idempotência)

Idempotente:
  - pula se `last_enriched` < ENRICHMENT_TTL_DAYS atrás (a menos que --force)
  - re-rodar não duplica seção (substitui entre marcadores)

Wrap em ThreadPoolExecutor por entidade (timeout 240s) — evita hang do CLI.

Uso:
  ./17-enrich-entities.py --all [--workers 3] [--force] [--max N] [--tier deep|shallow|all]
  ./17-enrich-entities.py --class people   # apenas pessoas
  ./17-enrich-entities.py --entity-id j-edgar-hoover
"""
from __future__ import annotations

import argparse
import concurrent.futures
import json
import re
import subprocess
import sys
import threading
import time
from datetime import datetime, timedelta, timezone
from pathlib import Path

try:
    import yaml
except ImportError:
    sys.stderr.write("Missing pyyaml. pip3 install pyyaml\n")
    sys.exit(1)


# Filesystem layout. Every path used by this script is derived from UFO_ROOT.
# NOTE(review): UFO_ROOT is an absolute path on one specific machine — confirm
# (or adjust) before running the script anywhere else.
UFO_ROOT = Path("/Users/guto/ufo")
ENTITIES_BASE = UFO_ROOT / "wiki" / "entities"  # one sub-folder per entity class
LOG_PATH = UFO_ROOT / "wiki" / "log.md"         # run summaries are appended here by main()

MODEL = "haiku"             # value passed to `claude --model`
WIKI_VERSION = "0.1.0"      # not referenced elsewhere in this file
ENRICHMENT_TTL_DAYS = 30    # entities enriched more recently than this are skipped unless --force
DEFAULT_WORKERS = 3         # default for --workers (thread pool size)
DEFAULT_TIMEOUT_S = 240     # default for --timeout (per-entity CLI timeout, seconds)
DEEP_THRESHOLD = 3   # >= 3 mentions = deep tier

# Markers delimiting the generated section inside an entity body; everything
# between them is replaced on each run so re-running does not duplicate it.
ENRICH_START = "<!-- enrichment:start -->"
ENRICH_END = "<!-- enrichment:end -->"

# Class folder names under wiki/entities/
ENTITY_DIRS = ["people", "organizations", "locations", "events",
               "uap-objects", "vehicles", "operations", "concepts"]

# Serializes console output coming from the worker threads (see safe_print).
_print_lock = threading.Lock()


def safe_print(*args, **kwargs):
    """Print while holding the module-wide lock so worker output does not interleave.

    Accepts the same positional and keyword arguments as the built-in
    ``print``. Output is flushed by default; a caller may override that by
    passing ``flush`` explicitly.
    """
    # setdefault, rather than a hard-coded flush=True in the call, avoids a
    # "got multiple values for keyword argument 'flush'" TypeError when the
    # caller supplies flush itself.
    kwargs.setdefault("flush", True)
    with _print_lock:
        print(*args, **kwargs)


def utc_now_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (second precision)."""
    now = datetime.now(tz=timezone.utc)
    # isoformat() renders the UTC offset as "+00:00"; the file format uses "Z".
    return now.isoformat(timespec="seconds").replace("+00:00", "Z")


def read_md(path: Path) -> tuple[dict, str]:
    """Read a markdown file and split it into (frontmatter, body).

    Args:
        path: Markdown file, read as UTF-8.

    Returns:
        ``(frontmatter, body)``. The frontmatter is ``{}`` when the file does
        not start with ``---``, when no closing ``---`` line exists (the body
        is then the whole file), or when the YAML is invalid or is not a
        mapping (the body is then the text after the closing fence). Leading
        newlines are stripped from the body.
    """
    content = path.read_text(encoding="utf-8")
    if not content.startswith("---"):
        return {}, content

    # The closing fence must be a line of its own. A plain substring search
    # would also stop at a "---" embedded in a value (for instance a URL slug
    # such as ".../a---b" stored under external_sources) and cut the
    # frontmatter short.
    after_open = content[3:]
    closing = re.search(r"^---[ \t]*$", after_open, re.MULTILINE)
    if closing is None:
        return {}, content

    raw_frontmatter = after_open[:closing.start()].strip()
    body = after_open[closing.end():].lstrip("\n")
    try:
        parsed = yaml.safe_load(raw_frontmatter)
    except yaml.YAMLError:
        return {}, body
    # safe_load returns None for empty input and may return a scalar or list;
    # callers rely on a dict.
    return (parsed if isinstance(parsed, dict) else {}), body


def write_md(path: Path, fm: dict, body: str) -> bool:
    """Serialize ``fm`` + ``body`` to ``path``, skipping the write if nothing changed.

    The frontmatter is dumped as block-style YAML with key order preserved and
    non-ASCII characters kept as-is. One blank line separates the closing
    ``---`` from the body unless the body already starts with a newline.

    Returns:
        True when the file was (re)written, False when the existing content
        was already identical.
    """
    front = yaml.dump(fm, allow_unicode=True, sort_keys=False, default_flow_style=False)
    gap = "" if body.startswith("\n") else "\n"
    rendered = f"---\n{front}---\n{gap}{body}"

    already_current = path.exists() and path.read_text(encoding="utf-8") == rendered
    if already_current:
        return False
    path.write_text(rendered, encoding="utf-8")
    return True


def extract_json(text: str) -> dict:
    """Parse the JSON value contained in a model response.

    A leading/trailing ``` fence (optionally tagged ``json``) is stripped and
    the remainder is parsed directly. If that fails, the JSON value starting
    at the first ``{`` is decoded and anything after it is ignored, so prose
    before or after the object is tolerated.

    Args:
        text: Raw response text.

    Returns:
        The decoded JSON value (an object in the fallback path).

    Raises:
        json.JSONDecodeError: if the text contains no ``{`` or the value
            starting at the first ``{`` is not valid JSON.
    """
    cleaned = text.strip()
    cleaned = re.sub(r"^```(?:json)?\s*", "", cleaned)
    cleaned = re.sub(r"\s*```$", "", cleaned)

    # Fast path: the whole response is JSON.
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        pass

    start = cleaned.find("{")
    if start == -1:
        raise json.JSONDecodeError("no { in response", cleaned, 0)

    # raw_decode is string-aware: braces inside string values (e.g. "}" in a
    # summary) do not end the object early, which a naive depth counter gets
    # wrong. It also stops at the end of the object, ignoring trailing text.
    obj, _end = json.JSONDecoder().raw_decode(cleaned, start)
    return obj


def is_stale(last_enriched: str | datetime | None) -> bool:
    """Tell whether an entity's enrichment is missing or older than the TTL.

    Args:
        last_enriched: The ``last_enriched`` frontmatter value. Normally a
            ``YYYY-MM-DDTHH:MM:SSZ`` string; a ``datetime`` is also accepted
            (presumably what the YAML loader yields when the timestamp was
            written unquoted — verify against real files).

    Returns:
        True when the value is empty, of an unsupported type, not in the
        expected format, or more than ENRICHMENT_TTL_DAYS days old.
    """
    if not last_enriched:
        return True

    if isinstance(last_enriched, datetime):
        # Treat a naive datetime as UTC so the subtraction below never mixes
        # naive and aware values.
        ts = last_enriched if last_enriched.tzinfo is not None else last_enriched.replace(tzinfo=timezone.utc)
    elif isinstance(last_enriched, str):
        try:
            ts = datetime.strptime(last_enriched, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
        except ValueError:
            return True
    else:
        # Anything else (date, int, ...) cannot be interpreted: re-enrich
        # instead of crashing with a TypeError inside strptime.
        return True

    return (datetime.now(timezone.utc) - ts) > timedelta(days=ENRICHMENT_TTL_DAYS)


def build_prompt(entity_class: str, fm: dict, tier: str) -> str:
    """Build the enrichment prompt sent to the model for one entity.

    Args:
        entity_class: The entity's class (``person``, ``organization``, ...);
            selects the guidance paragraph and is echoed in the rules.
        fm: Entity frontmatter; ``canonical_name`` (falling back to
            ``entity_id``), ``aliases`` (first 8) and ``total_mentions`` are read.
        tier: ``"deep"`` selects the multi-query research protocol; any other
            value selects the single-query one.

    Returns:
        The full prompt text, including the JSON schema the model must follow.
    """
    canonical_name = fm.get("canonical_name") or fm.get("entity_id") or "?"
    total_mentions = fm.get("total_mentions", 0)

    # Per-class guidance; unknown classes get a generic instruction.
    guidance_by_class = {
        "person": "Look up biographical info, role, organization, dates of activity. Distinguish from people with same name (disambiguation).",
        "organization": "Look up organization type, founding date, country, mission, leadership. Note any UAP/UFO involvement.",
        "location": "Look up coordinates (decimal lat/lon), country, region, type (city/airbase/sea/etc.), notable UAP-related history if any.",
        "event": "Look up historical accounts of this event — date, location, official statements, primary sources.",
        "uap_object": "External enrichment usually not applicable. Mark enrichment_status: none and explain why in summary.",
        "vehicle": "Look up vehicle/aircraft model, operator, specs (if applicable).",
        "operation": "Look up operation type (program/task-force/exercise), agency, date range, public knowledge.",
        "concept": "Look up canonical definition, legal/scientific context, related programs.",
    }
    guidance = guidance_by_class.get(entity_class, "Look up authoritative info; cite sources.")

    if tier == "deep":
        research_protocol = (
            "Use the WebSearch tool with 2-4 queries to find authoritative sources "
            "(Wikipedia, official government sites, peer-reviewed sources, established news outlets). "
            "Use WebFetch on the 2-3 best results to extract key facts. "
            "Provide >=2 distinct sources in external_sources[]."
        )
    else:
        research_protocol = (
            "Use the WebSearch tool with 1 query to confirm/disambiguate. "
            "Rely primarily on your own pretraining knowledge for the summary, but cite the 1 web source "
            "in external_sources[] (if found). External_sources may be empty if no reliable source surfaced."
        )

    # At most 8 aliases are shown, one bullet per line.
    known_aliases = (fm.get("aliases") or [])[:8]
    alias_lines = "\n".join([f"  - {alias}" for alias in known_aliases]) or "  (none)"

    return f"""You are an OSINT analyst for the Investigation Bureau — enriching one entity from a US Department of War UAP/UFO archive.

ENTITY CONTEXT:
- Class: {entity_class}
- Canonical name: {canonical_name}
- Aliases / variants in corpus:
{alias_lines}
- Total mentions across corpus: {total_mentions}
- Tier: {tier} (>= {DEEP_THRESHOLD} mentions = deep)

GUIDANCE:
{guidance}

RESEARCH PROTOCOL:
{research_protocol}

Output ONE JSON object only (no markdown fence, no commentary, no preamble). Schema:

{{
  "enrichment_status": "{tier}",
  "disambiguation_note": "Brief note distinguishing from similar names (e.g., 'NOT to be confused with X who is Y'). Empty string if not applicable.",
  "biographical_summary_en": "3-6 sentences English. Focus on identity, role, period of activity, UAP relevance (if any). If genuinely cannot identify the entity (too generic, no public record), say so explicitly.",
  "biographical_summary_pt_br": "Same content in Brazilian Portuguese (pt-br, NOT European Portuguese). Preserve UTF-8 accents (ç, ã, é, etc.). Keep proper nouns and English-language verbatim quotes in English.",
  "additional_aliases": ["any alternative names, transliterations, common nicknames not already in the aliases list"],
  "verified_facts": [
    {{ "fact": "single verifiable claim", "source_url": "URL where it was found", "confidence_band": "high|medium|low" }}
  ],
  "external_sources": [
    {{ "url": "https://...", "title": "Page title", "publisher": "Wikipedia | NYT | DoD | etc.", "accessed_at": "{utc_now_iso()}", "key_facts": ["short fact 1", "short fact 2"], "reliability_band": "high|medium|low" }}
  ],
  "class_specific": {{
    "person":       {{"dates": {{"born": "YYYY-MM-DD or null", "died": "YYYY-MM-DD or null"}}, "primary_role": "...", "primary_organization": "..."}},
    "organization": {{"organization_type": "intelligence-agency|military-branch|civilian-agency|private-company|ngo|other", "country": "ISO-2 or descriptor", "founded": "YYYY or null"}},
    "location":     {{"coordinates": {{"lat": 0.0, "lon": 0.0}}, "location_type": "city|airbase|sea|...", "country": ["ISO-2 codes"]}},
    "event":        {{"date_start": "YYYY-MM-DD or YYYY or null", "primary_location": "...", "event_class": "uap-encounter|disclosure|legal-filing|other"}},
    "uap_object":   {{"note": "External enrichment usually not applicable for UAP objects."}},
    "vehicle":      {{"vehicle_class": "aircraft|ship|...", "operator": "...", "model": "..."}},
    "operation":    {{"operation_type": "military-operation|research-program|task-force|exercise|other", "status": "active|concluded|classified|unknown"}},
    "concept":      {{"concept_class": "legal-instrument|phenomenon-type|doctrine|scientific-term|jargon|program-name|other", "definition_short_en": "1 sentence", "definition_short_pt_br": "1 frase em pt-br"}}
  }}
}}

Rules:
- Provide ONLY the class_specific entry for `{entity_class}`. Other class entries can be omitted.
- If the entity is impossible to identify externally (generic descriptor, common name, redacted), set `external_sources: []` and explain in `biographical_summary_en`.
- ALWAYS preserve UTF-8 accents in PT-BR. Brazilian Portuguese, NOT European.
- Output ONLY the JSON. No fence, no preamble.
"""


def call_claude(prompt: str, timeout: int = DEFAULT_TIMEOUT_S) -> tuple[dict, dict]:
    """Run the claude CLI (WebSearch + WebFetch enabled) and parse its answer.

    The subprocess runs in a helper thread so this function returns control
    after ``timeout`` seconds even if the CLI hangs; the subprocess itself is
    killed by its own ``timeout + 30`` limit.

    Args:
        prompt: Prompt text passed as the CLI's positional argument.
        timeout: Seconds to wait for the CLI before giving up.

    Returns:
        ``(enriched, meta)``: the JSON object produced by the model, and a
        dict with ``duration_ms``, ``total_cost_usd``, ``num_turns`` and
        ``session_id`` copied from the CLI's JSON envelope.

    Raises:
        RuntimeError: on timeout, non-zero exit code, an error reported by
            the CLI, or a model answer that is not a JSON object.
        json.JSONDecodeError: if the CLI output or the model answer cannot be
            parsed as JSON.
    """

    def _run():
        cmd = [
            "claude", "-p", "--model", MODEL,
            "--output-format", "json",
            "--max-turns", "8",
            "--allowedTools", "WebSearch,WebFetch",
            "--",
            prompt,
        ]
        return subprocess.run(cmd, capture_output=True, text=True, timeout=timeout + 30, check=False)

    # The executor is deliberately NOT used as a context manager: leaving a
    # `with` block calls shutdown(wait=True), which would block until the hung
    # subprocess finishes and defeat the timeout below.
    ex = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    try:
        future = ex.submit(_run)
        try:
            res = future.result(timeout=timeout)
        except concurrent.futures.TimeoutError as exc:
            raise RuntimeError(f"claude CLI hung > {timeout}s — aborted") from exc
    finally:
        # Do not wait for the helper thread; if it is still running, the
        # subprocess's own timeout ends it shortly afterwards.
        ex.shutdown(wait=False)

    if res.returncode != 0:
        raise RuntimeError(f"claude CLI rc={res.returncode}: {res.stderr[-1000:]}")
    cli_out = json.loads(res.stdout)
    # "result" may be present but null; normalize to a string before slicing/parsing.
    result_text = cli_out.get("result") or ""
    if cli_out.get("is_error"):
        raise RuntimeError(f"claude reported error: {str(result_text)[:300]}")
    enriched = extract_json(result_text)
    if not isinstance(enriched, dict):
        # Downstream code calls .get() on the payload; fail here with a clear message.
        raise RuntimeError(f"claude returned JSON of type {type(enriched).__name__}, expected an object")
    meta = {
        "duration_ms": cli_out.get("duration_ms"),
        "total_cost_usd": cli_out.get("total_cost_usd"),
        "num_turns": cli_out.get("num_turns"),
        "session_id": cli_out.get("session_id"),
    }
    return enriched, meta


def _select_class_payload(class_specific, entity_class) -> dict:
    """Return the class-specific field dict that applies to ``entity_class``."""
    if not isinstance(class_specific, dict):
        return {}

    # 1. Preferred: the entry keyed by the entity's own class. The model may
    #    return entries for several classes; picking "the first dict" would
    #    then select the wrong one.
    own = class_specific.get(entity_class) if isinstance(entity_class, str) else None
    if isinstance(own, dict) and own:
        return own

    # 2. Already flat: the fields sit directly in class_specific. This must be
    #    checked before the generic fallback, because flat payloads contain
    #    nested dicts themselves ("dates", "coordinates") that would otherwise
    #    be mistaken for the wrapper.
    flat_fields = (
        "dates", "primary_role", "primary_organization",
        "organization_type", "country", "founded",
        "coordinates", "location_type",
        "date_start", "primary_location", "event_class",
        "vehicle_class", "operator", "model",
        "operation_type", "status",
        "concept_class", "definition_short_en", "definition_short_pt_br",
    )
    if any(field in class_specific for field in flat_fields):
        return class_specific

    # 3. Fallback: a single wrapper under an unexpected key — take the first
    #    non-empty nested dict.
    for value in class_specific.values():
        if isinstance(value, dict) and value:
            return value
    return {}


def merge_into_frontmatter(fm: dict, enriched: dict, tier: str, now_iso: str) -> dict:
    """Merge an enrichment result into ``fm`` in place and return ``fm``.

    Args:
        fm: Entity frontmatter; ``entity_class`` decides which class-specific
            fields are merged.
        enriched: JSON object returned by the model.
        tier: Fallback for ``enrichment_status`` when the model omits it.
        now_iso: Timestamp stored as ``last_enriched``.

    Merge rules:
        - ``external_sources`` and ``verified_facts`` are replaced.
        - ``disambiguation_note`` is replaced only by a non-empty value.
        - ``aliases`` becomes the sorted union of existing and additional
          (non-blank string) aliases.
        - person and concept fields overwrite existing values; organization,
          location, vehicle and operation fields only fill gaps; event fields
          fill gaps and replace the placeholders ``"NA"`` / ``"uap-encounter"``.
    """
    cls = fm.get("entity_class")
    payload = _select_class_payload(enriched.get("class_specific"), cls)

    fm["enrichment_status"] = enriched.get("enrichment_status") or tier
    fm["last_enriched"] = now_iso

    # external_sources (replace, not append — we want a fresh enrichment)
    fm["external_sources"] = enriched.get("external_sources") or []
    fm["disambiguation_note"] = enriched.get("disambiguation_note") or fm.get("disambiguation_note", "")
    fm["verified_facts"] = enriched.get("verified_facts") or []

    # Aliases: union
    merged_aliases = set(fm.get("aliases") or [])
    merged_aliases.update(
        alias.strip()
        for alias in (enriched.get("additional_aliases") or [])
        if isinstance(alias, str) and alias.strip()
    )
    fm["aliases"] = sorted(merged_aliases)

    if not payload:
        return fm

    def fill_missing(keys):
        # Copy a field only when the frontmatter has no (truthy) value yet.
        for key in keys:
            if payload.get(key) and not fm.get(key):
                fm[key] = payload[key]

    # Class-specific merges
    if cls == "person":
        for key in ("dates", "primary_role", "primary_organization"):
            if payload.get(key):
                fm[key] = payload[key]
    elif cls == "organization":
        fill_missing(("organization_type", "country", "founded"))
    elif cls == "location":
        fill_missing(("coordinates", "location_type", "country"))
    elif cls == "event":
        for key in ("date_start", "primary_location", "event_class"):
            value = payload.get(key)
            # "NA" / "uap-encounter" are treated as placeholders and may be overwritten.
            if value and (not fm.get(key) or fm.get(key) in ("NA", "uap-encounter", None)):
                fm[key] = value
    elif cls == "vehicle":
        fill_missing(("vehicle_class", "operator", "model"))
    elif cls == "operation":
        fill_missing(("operation_type", "status"))
    elif cls == "concept":
        # The English definition is stored under the unsuffixed key.
        for source_key, target_key in (
            ("concept_class", "concept_class"),
            ("definition_short_en", "definition_short"),
            ("definition_short_pt_br", "definition_short_pt_br"),
        ):
            if payload.get(source_key):
                fm[target_key] = payload[source_key]

    return fm


def upsert_enrichment_section(body: str, enriched: dict) -> str:
    """Insert or replace the bilingual enrichment section of an entity body.

    The section is wrapped in ENRICH_START / ENRICH_END. If a marked section
    already exists it is replaced; otherwise the section is inserted before
    the first "## Appearances in Corpus" heading, or appended at the end.

    Args:
        body: Markdown body without frontmatter.
        enriched: Model result; the summaries, ``disambiguation_note`` and
            ``external_sources`` are rendered.

    Returns:
        The updated body.
    """
    en = (enriched.get("biographical_summary_en") or "").strip()
    pt = (enriched.get("biographical_summary_pt_br") or "").strip()
    disamb = (enriched.get("disambiguation_note") or "").strip()
    sources = enriched.get("external_sources") or []
    if not isinstance(sources, list):
        sources = []

    section_lines = [ENRICH_START, "## Enrichment (EN)", ""]
    if disamb:
        section_lines.extend([f"> **Disambiguation:** {disamb}", ""])
    section_lines.extend([en or "_No external enrichment available._", "", "## Enriquecimento (PT-BR)", ""])
    if disamb:
        section_lines.extend([f"> **Desambiguação:** {disamb}", ""])
    section_lines.extend([pt or "_Sem enriquecimento externo disponível._", ""])

    # Render one bullet per well-formed source; malformed entries (non-dict)
    # are skipped instead of aborting the whole entity.
    source_lines = []
    for s in sources:
        if not isinstance(s, dict):
            continue
        url = s.get("url") or ""
        title = s.get("title") or ""
        pub = s.get("publisher") or ""
        rel = s.get("reliability_band") or "?"
        facts = s.get("key_facts") or []
        if isinstance(facts, str):
            facts = [facts]
        key = "; ".join(str(fact) for fact in facts)
        line = f"- [{title or url}]({url}) · _{pub}_ · reliability: `{rel}`"
        if key:
            line += f" — {key}"
        source_lines.append(line)
    if source_lines:
        section_lines.extend(["## External Sources", "", *source_lines, ""])

    section_lines.append(ENRICH_END)
    new_section = "\n".join(section_lines) + "\n"

    # If markers exist, replace between them
    if ENRICH_START in body and ENRICH_END in body:
        pattern = re.compile(re.escape(ENRICH_START) + r".*?" + re.escape(ENRICH_END) + r"\n?", re.DOTALL)
        # A callable replacement inserts the text literally; passing the string
        # itself would make re interpret backslashes in the model output as
        # escapes/group references.
        updated, replaced = pattern.subn(lambda _match: new_section, body)
        if replaced:
            return updated
        # Both markers present but not forming a start...end pair: fall
        # through and insert a fresh section.

    # Otherwise insert before "## Appearances in Corpus" if present, else append
    marker = "## Appearances in Corpus"
    if marker in body:
        # Only the first occurrence, so a repeated heading does not yield
        # several copies of the section.
        return body.replace(marker, new_section + "\n" + marker, 1)
    if not body.endswith("\n"):
        body += "\n"
    return body + "\n" + new_section


def list_entity_files(class_filter: str | None, entity_id_filter: str | None) -> list[Path]:
    """Collect entity markdown paths under ENTITIES_BASE.

    Args:
        class_filter: Restrict the search to this class folder; a falsy value
            means every folder in ENTITY_DIRS.
        entity_id_filter: Keep only files whose stem equals this id; a falsy
            value keeps every file.

    Returns:
        Matching ``*.md`` paths, sorted within each folder, folders in
        ENTITY_DIRS order. Missing folders are ignored.
    """
    folders = ENTITY_DIRS if not class_filter else [class_filter]
    found: list[Path] = []
    for folder_name in folders:
        folder = ENTITIES_BASE / folder_name
        if folder.exists():
            found.extend(
                md_path
                for md_path in sorted(folder.glob("*.md"))
                if not entity_id_filter or md_path.stem == entity_id_filter
            )
    return found


def tier_for(total_mentions: int) -> str:
    """Map a mention count to its enrichment tier: ``deep``, ``shallow`` or ``none``."""
    # Thresholds are checked from highest to lowest; the first one reached wins.
    for minimum, label in ((DEEP_THRESHOLD, "deep"), (1, "shallow")):
        if total_mentions >= minimum:
            return label
    return "none"


def process_entity(path: Path, *, force: bool, tier_filter: str, timeout: int) -> tuple[str, str, float]:
    """Enrich a single entity file.

    Args:
        path: Entity markdown file.
        force: Re-enrich even when ``last_enriched`` is still fresh.
        tier_filter: ``"all"`` or the only tier that should be processed.
        timeout: Seconds allowed for the CLI call.

    Returns:
        ``(action, tier, cost_usd)`` where action is ``written``,
        ``unchanged`` or one of the ``skip-*`` reasons.
    """
    frontmatter, body = read_md(path)

    # Guard clauses: every reason to leave the file untouched.
    if not frontmatter:
        return ("skip-no-fm", "none", 0.0)
    entity_class = frontmatter.get("entity_class")
    if not entity_class:
        return ("skip-no-class", "none", 0.0)
    tier = tier_for(int(frontmatter.get("total_mentions") or 0))
    if tier == "none":
        return ("skip-zero", tier, 0.0)
    if tier_filter not in ("all", tier):
        return ("skip-tier-filter", tier, 0.0)
    if not (force or is_stale(frontmatter.get("last_enriched"))):
        return ("skip-fresh", tier, 0.0)

    prompt = build_prompt(entity_class, frontmatter, tier)
    started = time.time()
    enriched, meta = call_claude(prompt, timeout=timeout)
    elapsed = time.time() - started

    # Work on a copy of the frontmatter; the file is only touched by write_md.
    updated_fm = merge_into_frontmatter(dict(frontmatter), enriched, tier, utc_now_iso())
    updated_body = upsert_enrichment_section(body, enriched)
    changed = write_md(path, updated_fm, updated_body)
    cost = float(meta.get("total_cost_usd") or 0.0)

    safe_print(f"  {'✓' if changed else '·'} {path.parent.name}/{path.stem} ({tier}, {elapsed:.1f}s, ${cost:.4f})")
    action = "written" if changed else "unchanged"
    return (action, tier, cost)


def main():
    """CLI entry point: select entities, enrich them in a thread pool, log a summary.

    Requires one of --all, --class or --entity-id. Prints per-run statistics
    and, when at least one entity was written or failed, appends a run entry
    to LOG_PATH.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--all", action="store_true", help="enrich every entity (or use --class / --entity-id)")
    parser.add_argument("--class", dest="class_filter", choices=ENTITY_DIRS, help="restrict to one class")
    parser.add_argument("--entity-id", help="restrict to one entity stem (filename without .md)")
    parser.add_argument("--tier", choices=["all", "deep", "shallow"], default="all")
    parser.add_argument("--workers", type=int, default=DEFAULT_WORKERS)
    parser.add_argument("--timeout", type=int, default=DEFAULT_TIMEOUT_S)
    parser.add_argument("--force", action="store_true", help="re-enrich even if last_enriched is fresh")
    parser.add_argument("--max", type=int, default=0, help="limit to N entities (0 = no limit)")
    args = parser.parse_args()

    if not any((args.all, args.class_filter, args.entity_id)):
        parser.error("provide --all, --class, or --entity-id")

    targets = list_entity_files(args.class_filter, args.entity_id)
    if args.max:
        # The cap applies to the candidate list, before any skip decision.
        targets = targets[:args.max]
    if not targets:
        print("No entities found.", file=sys.stderr)
        return

    print(f"Enriching {len(targets)} entit(y/ies) with {args.workers} workers, tier={args.tier}, "
          f"force={args.force}", flush=True)

    stats = dict.fromkeys(
        ("written", "unchanged", "skip-fresh", "skip-tier-filter",
         "skip-zero", "skip-no-fm", "skip-no-class", "errors"),
        0,
    )
    total_cost = 0.0
    run_started = time.time()

    with concurrent.futures.ThreadPoolExecutor(max_workers=args.workers) as pool:
        jobs = {}
        for entity_path in targets:
            job = pool.submit(process_entity, entity_path, force=args.force,
                              tier_filter=args.tier, timeout=args.timeout)
            jobs[job] = entity_path
        for job in concurrent.futures.as_completed(jobs):
            entity_path = jobs[job]
            try:
                action, _tier, cost = job.result()
                stats[action] = stats.get(action, 0) + 1
                total_cost += cost
            except Exception as e:
                # One failing entity must not abort the run; count it and go on.
                stats["errors"] += 1
                safe_print(f"  ✗ {entity_path.parent.name}/{entity_path.stem}: {type(e).__name__}: {e}")

    elapsed = time.time() - run_started
    print(f"\nDone in {elapsed:.0f}s. Stats: {stats} · total_cost=${total_cost:.2f}", flush=True)

    # Runs that neither wrote nor failed leave no trace in the log.
    if not (stats.get("written") or stats.get("errors")):
        return
    with open(LOG_PATH, "a", encoding="utf-8") as log_file:
        entry = (
            f"\n## {utc_now_iso()} — ENRICH (Phase 6)\n"
            f"- operator: profiler\n- script: scripts/17-enrich-entities.py\n"
            f"- tier_filter: {args.tier}\n- workers: {args.workers}\n"
            f"- written: {stats.get('written', 0)}\n"
            f"- unchanged: {stats.get('unchanged', 0)}\n"
            f"- skipped_fresh: {stats.get('skip-fresh', 0)}\n"
            f"- errors: {stats.get('errors', 0)}\n"
            f"- total_cost_usd: {total_cost:.4f}\n"
        )
        log_file.write(entry)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()