disclosure-bureau/scripts/21-reextract-entities-gemini.py

#!/usr/bin/env python3
"""
21-reextract-entities-gemini.py — Phase 2: rewrite ONLY `entities_extracted` in
each page.md using Gemini 3.0 Flash with explicit anti-fragmentation rules.

Fixes the Haiku extraction bugs:
  - "Gudauta Base" was split into location "Gudauta" + organization "Base"
  - "Chief Tereoken" was split into "Chief" + "Tereoken"
  - Bare common nouns ("Base", "Chief", "Department") promoted to standalone entities
  - Variants of same entity ("FBI" / "F-B-I" / "Federal Bureau") not normalized at source

Preserves everything else in the page.md frontmatter.

Usage:
  ./21-reextract-entities-gemini.py --all --workers 20
  ./21-reextract-entities-gemini.py --doc-id <id>
  ./21-reextract-entities-gemini.py --page <doc-id>/p007  # quick test
"""
from __future__ import annotations

import argparse
import concurrent.futures
import json
import os
import re
import sys
import time
from datetime import datetime, timezone
from pathlib import Path

try:
    import yaml
except ImportError:
    sys.stderr.write("pip3 install pyyaml\n"); sys.exit(1)

try:
    from google import genai
    from google.genai import types as genai_types
except ImportError:
    sys.stderr.write("pip3 install google-genai\n"); sys.exit(1)


# Root of the working tree. Defaults to the original hard-coded checkout; set
# the UFO_ROOT environment variable to run the script against another location.
UFO_ROOT = Path(os.environ.get("UFO_ROOT") or "/Users/guto/ufo")
# Page Markdown files live at <PAGES>/<doc-id>/pNNN.md.
PAGES = UFO_ROOT / "wiki" / "pages"
# Page images sent to the model live at <PNG_BASE>/<doc-id>/p-NNN.png.
PNG_BASE = UFO_ROOT / "processing" / "png"

# Model used when --model is not given on the command line.
DEFAULT_MODEL = "gemini-3-flash-preview"
# Maximum time (seconds) to wait for a single model call before giving up.
TIMEOUT_S = 180


# Instruction text sent to the model together with each page image (see
# call_gemini). It asks for a single JSON object whose top-level key is
# "entities_extracted"; process_page reads exactly that key from the reply.
# This is a runtime string: any edit changes what the model is asked to do.
PROMPT = """You are an OSINT investigator extracting entities from one page of a US Department of War declassified UAP/UFO document. A prior extraction pass made systematic mistakes; your job is to do it right.

CRITICAL RULES — NON-NEGOTIABLE:

1. **Compound names stay together.** "Gudauta Base" is ONE location, not two entities "Gudauta" + "Base". "Chief Tereoken" is ONE person, not "Chief" + "Tereoken". "FBI Seattle Field Office" is ONE organization. NEVER split a compound name into separate entities.

2. **Titles + names are ONE person.** "Chief Tereoken", "LCDR Smith", "Mr. Johnson", "Capt. Davis", "Mrs. Anderson", "Dr. Hynek", "General Marshall" — each is ONE single person entity, with the title as part of the canonical name.

3. **NEVER extract bare common nouns as entities.** Skip: "Chief", "Base", "Department", "Office", "Agent", "Bureau", "Captain", "Officer", "File", "Subject", "Memo", "Letter", "Report", "Page", "Bag", "Stamp", "Signature", "Carbon Copy". These are only meaningful when COMBINED with a proper name.

4. **Normalize variants at the source.** "F.B.I.", "F-B-I", "FBI", "Federal Bureau of Investigation" → all return as the SINGLE canonical form "Federal Bureau of Investigation" (with "FBI" added to aliases).

5. **Distinguish entity types precisely:**
   - `locations`: physical places (cities, countries, military bases, geographic features). "Gudauta Base" → location (it's a military base). "Adapazari, Turkey" → location.
   - `organizations`: institutions, agencies, branches, companies. "FBI", "USAF", "CIA Foreign Branch".
   - `people`: humans with names (titles ok). "J. Edgar Hoover", "Chief Tereoken".
   - `events`: dated incidents with a date or short label. "Tic-Tac Nimitz 2004", "Roswell 1947".
   - `uap_objects`: described UAP themselves. Shape + color + size description.
   - `vehicles`: aircraft, ships, vehicles by model/name. "USS Princeton", "F-18".
   - `operations`: programs, missions, protocols by name. "Project Blue Book", "Operation Mainbrace".
   - `concepts`: legal/scientific/jargon. "FOIA exemption (b)(1)", "GENTEXT", "compartmentalization".

Output ONE JSON object only (no markdown fence, no preamble) with this exact schema:

{
  "entities_extracted": {
    "people": [{"name": "Full canonical name with title", "role_in_page": "subject|witness|author|signer|mentioned", "aliases": ["alt spellings"]}],
    "organizations": [{"name": "Canonical org name", "aliases": ["FBI", "F.B.I."], "type": "intelligence-agency|military-branch|civilian-agency|...|other"}],
    "locations": [{"name": "Canonical place name including any qualifier (Gudauta Base, not just Gudauta)", "type": "city|region|country|sea|strait|airbase|naval-base|mountain|desert|building|other"}],
    "events": [{"label": "Short distinctive label", "date": "YYYY-MM-DD|YYYY|NA"}],
    "uap_objects": [{"shape": "...", "color": "...", "size_estimate": "..."}],
    "vehicles": [{"name": "...", "class": "aircraft|ship|submarine|spacecraft|satellite|ground|other"}],
    "operations": [{"name": "...", "type": "military-operation|reporting-protocol|research-program|task-force|foia-disclosure|other"}],
    "concepts": [{"name": "...", "class": "legal-instrument|phenomenon-type|doctrine|scientific-term|jargon|program-name|other"}]
  }
}

If a category has no entries, return an empty array. PRESERVE original spelling (do not translate names). Output ONLY the JSON."""


def utc_now_iso() -> str:
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``."""
    moment = datetime.now(tz=timezone.utc)
    stamp = moment.strftime("%Y-%m-%dT%H:%M:%SZ")
    return stamp


def read_fm(path: Path) -> tuple[dict, str]:
    """Split a Markdown file into its YAML frontmatter and its body.

    The file is expected to open with a ``---`` fence and to close the
    frontmatter with another ``---`` on a line of its own.

    Args:
        path: File to read (decoded as UTF-8).

    Returns:
        ``(frontmatter, body)``. ``frontmatter`` is ``{}`` when the file has
        no opening fence, no closing fence, YAML that fails to parse, or YAML
        whose top level is not a mapping. When either fence is missing the
        whole file content is returned as ``body``; otherwise ``body`` is the
        text after the closing fence with leading newlines removed.
    """
    c = path.read_text(encoding="utf-8")
    if not c.startswith("---"):
        return {}, c
    # The closing fence must be a line of its own. A plain substring search
    # would stop at the first "---" that happens to occur inside a YAML value
    # and silently truncate the frontmatter. Starting at offset 3 skips the
    # opening fence ("^" still only matches at true line starts).
    close = re.compile(r"^---[ \t]*$", re.MULTILINE).search(c, 3)
    if close is None:
        return {}, c
    try:
        fm = yaml.safe_load(c[3:close.start()].strip()) or {}
    except yaml.YAMLError:
        fm = {}
    # A scalar or list at the top level is not usable frontmatter; callers
    # rely on dict methods such as .get().
    if not isinstance(fm, dict):
        fm = {}
    return fm, c[close.end():].lstrip("\n")


def write_fm(path: Path, fm: dict, body: str) -> None:
    """Write ``fm`` as YAML frontmatter followed by ``body`` to ``path``.

    The frontmatter is dumped in block style with key order preserved and
    non-ASCII characters kept as-is. The output is UTF-8 and overwrites any
    existing file.
    """
    frontmatter = yaml.dump(
        fm,
        default_flow_style=False,
        sort_keys=False,
        allow_unicode=True,
    )
    # Leave one blank line between the closing fence and the body, reusing the
    # body's own leading newline when it already has one.
    if body.startswith("\n"):
        gap = "\n"
    else:
        gap = "\n\n"
    document = f"---\n{frontmatter}---{gap}{body}"
    path.write_text(document, encoding="utf-8")


def call_gemini(client, png_path: Path, model: str, max_tokens: int = 32768):
    """Send one page image plus ``PROMPT`` to the model and return the reply text.

    Args:
        client: Object exposing ``models.generate_content(...)``.
        png_path: PNG file whose bytes are attached to the request.
        model: Model name passed through to ``generate_content``.
        max_tokens: Value used for ``max_output_tokens``.

    Returns:
        The ``.text`` attribute of the response.

    Raises:
        RuntimeError: If no response arrives within ``TIMEOUT_S`` seconds.
        Exception: Anything raised by the request itself is re-raised as-is.
    """
    content = [
        genai_types.Part.from_bytes(data=png_path.read_bytes(), mime_type="image/png"),
        PROMPT,
    ]

    def _call():
        return client.models.generate_content(
            model=model,
            contents=content,
            config=genai_types.GenerateContentConfig(
                response_mime_type="application/json",
                temperature=0.1,
                max_output_tokens=max_tokens,
            ),
        )

    # Deliberately NOT a `with` block: leaving the context manager calls
    # shutdown(wait=True), which would block on the hung request and make the
    # timeout below ineffective. shutdown(wait=False) lets the caller move on.
    # NOTE(review): the abandoned worker thread keeps running until the
    # request returns, so a request that never returns may still delay
    # interpreter exit.
    ex = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    try:
        future = ex.submit(_call)
        try:
            return future.result(timeout=TIMEOUT_S).text
        except concurrent.futures.TimeoutError:
            raise RuntimeError(f"Gemini hung >{TIMEOUT_S}s") from None
    finally:
        ex.shutdown(wait=False)


def parse_json_lenient(text: str) -> dict:
    """Parse JSON from ``text``, tolerating a surrounding Markdown code fence.

    Surrounding whitespace is trimmed, then a leading fence (three backticks,
    optionally followed by ``json``) and a trailing fence are removed before
    decoding.

    Returns:
        Whatever ``json.loads`` yields for the cleaned text. The annotation
        says ``dict``, but any JSON value is passed through unchanged.

    Raises:
        json.JSONDecodeError: If the cleaned text is not valid JSON.
    """
    payload = text.strip()
    for fence in (r"^```(?:json)?\s*", r"\s*```$"):
        payload = re.sub(fence, "", payload)
    return json.loads(payload)


def _count_entities(extracted) -> int:
    """Return the total number of list entries across all entity categories.

    Anything that is not a mapping of category -> list counts as zero, so a
    malformed old or new value cannot break the progress report.
    """
    if not isinstance(extracted, dict):
        return 0
    return sum(len(v) for v in extracted.values() if isinstance(v, list))


def _display_path(page_md: Path) -> str:
    """Return ``page_md`` relative to ``UFO_ROOT`` when possible, else as given."""
    try:
        return str(page_md.relative_to(UFO_ROOT))
    except ValueError:
        # Absolute paths outside UFO_ROOT are accepted via --pages-file.
        return str(page_md)


def process_page(client, page_md: Path, model: str) -> str:
    """Re-extract entities for one page and rewrite its frontmatter in place.

    Only ``entities_extracted``, ``last_entity_extraction_model`` and
    ``last_entity_extraction_at`` are set on the frontmatter; every other key
    and the body are written back as read.

    Args:
        client: Client object handed to ``call_gemini``.
        page_md: Page Markdown file carrying ``doc_id`` and ``page_number``
            in its frontmatter.
        model: Model name to call; also recorded in the frontmatter.

    Returns:
        A status string:
        ``"ok"`` (file rewritten), ``"no-fm"`` (no usable frontmatter),
        ``"bad-fm"`` (missing or invalid ``doc_id``/``page_number``),
        ``"no-png"`` (page image not found), ``"bad-json"`` (no attempt
        produced parseable JSON), ``"bad-shape"`` (JSON lacks an
        ``entities_extracted`` object) or ``"error"`` (the model call failed).
    """
    fm, body = read_fm(page_md)
    if not fm or not isinstance(fm, dict):
        return "no-fm"
    doc_id = fm.get("doc_id", "")
    try:
        # `or 0` covers an explicit null; the except covers non-numeric text.
        page_num = int(fm.get("page_number") or 0)
    except (TypeError, ValueError):
        return "bad-fm"
    if not doc_id or not page_num:
        return "bad-fm"
    padded = f"{page_num:03d}"
    png = PNG_BASE / doc_id / f"p-{padded}.png"
    if not png.exists():
        return "no-png"

    # Two attempts with progressively higher token budgets: an unparseable
    # reply is retried once with a larger budget before giving up.
    revision = None
    for tok in (32768, 65536):
        try:
            raw = call_gemini(client, png, model, max_tokens=tok)
            revision = parse_json_lenient(raw)
            break
        except json.JSONDecodeError:
            continue
        except Exception as e:
            sys.stderr.write(f"  ✗ {_display_path(page_md)}: {type(e).__name__}: {e}\n")
            return "error"
    if revision is None:
        return "bad-json"

    # Valid JSON is not necessarily an object (e.g. a top-level list).
    if not isinstance(revision, dict):
        return "bad-shape"
    ee = revision.get("entities_extracted")
    if not isinstance(ee, dict):
        return "bad-shape"

    # Quick stats for reporting
    old_n = _count_entities(fm.get("entities_extracted"))
    new_n = _count_entities(ee)

    fm["entities_extracted"] = ee
    fm["last_entity_extraction_model"] = model
    fm["last_entity_extraction_at"] = utc_now_iso()
    write_fm(page_md, fm, body)

    print(f"  ✓ {_display_path(page_md)}: entities {old_n} → {new_n}", flush=True)
    return "ok"


def main():
    """Parse CLI arguments, select the target pages and process them in parallel.

    Target selection uses the first option given, in this order: ``--page``,
    ``--doc-id``, ``--pages-file``, ``--all``. ``--max`` (when non-zero)
    truncates the selected list. Selected files that do not exist are reported
    on stderr and tallied under ``missing`` rather than silently dropped.

    Exits with status 1 when no API key is found in ``GEMINI_API_KEY`` or
    ``GOOGLE_API_KEY``, and with an argparse usage error when the arguments
    are invalid.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--doc-id")
    ap.add_argument("--page", help="<doc-id>/pNNN")
    ap.add_argument("--all", action="store_true")
    ap.add_argument("--pages-file")
    ap.add_argument("--model", default=DEFAULT_MODEL)
    ap.add_argument("--workers", type=int, default=20)
    ap.add_argument("--max", type=int, default=0)
    args = ap.parse_args()

    # ThreadPoolExecutor rejects max_workers < 1 with a bare ValueError;
    # surface it as a normal usage error instead.
    if args.workers < 1:
        ap.error("--workers must be >= 1")

    api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        sys.stderr.write("✗ GEMINI_API_KEY not set\n"); sys.exit(1)
    client = genai.Client(api_key=api_key)

    if args.page:
        parts = args.page.split("/")
        if len(parts) != 2:
            ap.error("--page must be <doc-id>/pNNN")
        targets = [PAGES / parts[0] / f"{parts[1]}.md"]
    elif args.doc_id:
        targets = sorted((PAGES / args.doc_id).glob("p*.md"))
    elif args.pages_file:
        # One path per line; absolute paths are used as-is, anything else is
        # resolved against UFO_ROOT. Blank lines are ignored.
        targets = []
        for line in Path(args.pages_file).read_text(encoding="utf-8").splitlines():
            entry = line.strip()
            if not entry:
                continue
            targets.append(Path(entry) if entry.startswith("/") else UFO_ROOT / entry)
    elif args.all:
        targets = sorted(PAGES.glob("*/p*.md"))
    else:
        ap.error("provide --doc-id, --page, --all, or --pages-file")

    if args.max:
        targets = targets[:args.max]

    # Separate out files that do not exist so the banner and the final stats
    # reflect what is actually processed.
    existing, missing = [], []
    for p in targets:
        (existing if p.exists() else missing).append(p)
    for p in missing:
        sys.stderr.write(f"✗ {p}: page file not found, skipping\n")
    targets = existing

    print(f"Re-extracting entities from {len(targets)} page(s) with {args.model} ({args.workers} workers)")
    stats = {"ok": 0, "error": 0, "no-png": 0, "no-fm": 0, "bad-fm": 0, "bad-json": 0, "bad-shape": 0,
             "missing": len(missing)}

    with concurrent.futures.ThreadPoolExecutor(max_workers=args.workers) as pool:
        futs = {pool.submit(process_page, client, p, args.model): p for p in targets}
        for fut in concurrent.futures.as_completed(futs):
            try:
                r = fut.result()
                stats[r] = stats.get(r, 0) + 1
            except Exception as e:
                # Top-level boundary: one failing page must not abort the run.
                sys.stderr.write(f"✗ {futs[fut]}: {e}\n")
                stats["error"] += 1

    print(f"\nDone. {stats}")


# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()