#!/usr/bin/env python3
"""
21-reextract-entities-gemini.py — Phase 2: rewrite ONLY `entities_extracted`
in each page.md using Gemini 3.0 Flash with explicit anti-fragmentation rules.

Fixes the Haiku extraction bugs:
  - "Gudauta Base" was split into location "Gudauta" + organization "Base"
  - "Chief Tereoken" was split into "Chief" + "Tereoken"
  - Bare common nouns ("Base", "Chief", "Department") promoted to standalone
    entities
  - Variants of same entity ("FBI" / "F-B-I" / "Federal Bureau") not
    normalized at source

Preserves everything else in the page.md frontmatter.

Usage:
    ./21-reextract-entities-gemini.py --all --workers 20
    ./21-reextract-entities-gemini.py --doc-id DOC_ID
    ./21-reextract-entities-gemini.py --page DOC_ID/p007   # quick test
"""
from __future__ import annotations

import argparse
import concurrent.futures
import json
import os
import re
import sys
import time
from datetime import datetime, timezone
from pathlib import Path

try:
    import yaml
except ImportError:
    sys.stderr.write("pip3 install pyyaml\n")
    sys.exit(1)

try:
    from google import genai
    from google.genai import types as genai_types
except ImportError:
    sys.stderr.write("pip3 install google-genai\n")
    sys.exit(1)

UFO_ROOT = Path("/Users/guto/ufo")
PAGES = UFO_ROOT / "wiki" / "pages"
PNG_BASE = UFO_ROOT / "processing" / "png"

DEFAULT_MODEL = "gemini-3-flash-preview"
TIMEOUT_S = 180

# A frontmatter fence is a line consisting solely of `---` (trailing blanks
# and a CR tolerated). Anchoring to a full line keeps a `---` that merely
# appears inside a YAML value from being mistaken for the closing fence.
_FM_FENCE_RE = re.compile(r"^---[ \t]*\r?$", re.MULTILINE)

PROMPT = """You are an OSINT investigator extracting entities from one page of a US Department of War declassified UAP/UFO document. A prior extraction pass made systematic mistakes; your job is to do it right.

CRITICAL RULES — NON-NEGOTIABLE:

1. **Compound names stay together.** "Gudauta Base" is ONE location, not two entities "Gudauta" + "Base". "Chief Tereoken" is ONE person, not "Chief" + "Tereoken". "FBI Seattle Field Office" is ONE organization. NEVER split a compound name into separate entities.

2. **Titles + names are ONE person.** "Chief Tereoken", "LCDR Smith", "Mr. Johnson", "Capt. Davis", "Mrs. Anderson", "Dr. Hynek", "General Marshall" — each is ONE single person entity, with the title as part of the canonical name.

3. **NEVER extract bare common nouns as entities.** Skip: "Chief", "Base", "Department", "Office", "Agent", "Bureau", "Captain", "Officer", "File", "Subject", "Memo", "Letter", "Report", "Page", "Bag", "Stamp", "Signature", "Carbon Copy". These are only meaningful when COMBINED with a proper name.

4. **Normalize variants at the source.** "F.B.I.", "F-B-I", "FBI", "Federal Bureau of Investigation" → all return as the SINGLE canonical form "Federal Bureau of Investigation" (with "FBI" added to aliases).

5. **Distinguish entity types precisely:**
   - `locations`: physical places (cities, countries, military bases, geographic features). "Gudauta Base" → location (it's a military base). "Adapazari, Turkey" → location.
   - `organizations`: institutions, agencies, branches, companies. "FBI", "USAF", "CIA Foreign Branch".
   - `people`: humans with names (titles ok). "J. Edgar Hoover", "Chief Tereoken".
   - `events`: dated incidents with a date or short label. "Tic-Tac Nimitz 2004", "Roswell 1947".
   - `uap_objects`: described UAP themselves. Shape + color + size description.
   - `vehicles`: aircraft, ships, vehicles by model/name. "USS Princeton", "F-18".
   - `operations`: programs, missions, protocols by name. "Project Blue Book", "Operation Mainbrace".
   - `concepts`: legal/scientific/jargon. "FOIA exemption (b)(1)", "GENTEXT", "compartmentalization".

Output ONE JSON object only (no markdown fence, no preamble) with this exact schema:

{
  "entities_extracted": {
    "people": [{"name": "Full canonical name with title", "role_in_page": "subject|witness|author|signer|mentioned", "aliases": ["alt spellings"]}],
    "organizations": [{"name": "Canonical org name", "aliases": ["FBI", "F.B.I."], "type": "intelligence-agency|military-branch|civilian-agency|...|other"}],
    "locations": [{"name": "Canonical place name including any qualifier (Gudauta Base, not just Gudauta)", "type": "city|region|country|sea|strait|airbase|naval-base|mountain|desert|building|other"}],
    "events": [{"label": "Short distinctive label", "date": "YYYY-MM-DD|YYYY|NA"}],
    "uap_objects": [{"shape": "...", "color": "...", "size_estimate": "..."}],
    "vehicles": [{"name": "...", "class": "aircraft|ship|submarine|spacecraft|satellite|ground|other"}],
    "operations": [{"name": "...", "type": "military-operation|reporting-protocol|research-program|task-force|foia-disclosure|other"}],
    "concepts": [{"name": "...", "class": "legal-instrument|phenomenon-type|doctrine|scientific-term|jargon|program-name|other"}]
  }
}

If a category has no entries, return an empty array. PRESERVE original spelling (do not translate names).
Output ONLY the JSON."""


def utc_now_iso() -> str:
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``."""
    return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")


def read_fm(path: Path) -> tuple[dict, str]:
    """Split a page file into its YAML frontmatter and its body.

    Args:
        path: Markdown file to read (decoded as UTF-8).

    Returns:
        ``(frontmatter, body)``. ``frontmatter`` is ``{}`` when the file does
        not start with ``---``, has no closing fence, contains invalid YAML,
        or the YAML is not a mapping. When no frontmatter block is found the
        body is the whole file; otherwise it is the text after the closing
        fence with leading newlines removed.
    """
    text = path.read_text(encoding="utf-8")
    if not text.startswith("---"):
        return {}, text

    # Start past the opening fence; `^` only matches right after a newline,
    # so the opening `---` itself can never be picked as the closing fence.
    closing = _FM_FENCE_RE.search(text, 3)
    if closing is None:
        return {}, text

    try:
        fm = yaml.safe_load(text[3:closing.start()].strip()) or {}
    except yaml.YAMLError:
        fm = {}
    if not isinstance(fm, dict):
        # A scalar or list here would break every later `fm.get(...)`.
        fm = {}
    return fm, text[closing.end():].lstrip("\n")


def write_fm(path: Path, fm: dict, body: str) -> None:
    """Write ``fm`` as YAML frontmatter followed by ``body`` to ``path``.

    The content goes to a temporary sibling file first and is then moved
    over ``path`` with ``os.replace``, so a failed write (for example a full
    disk) cannot leave the page truncated.

    Args:
        path: Destination page file.
        fm: Frontmatter mapping; key order is preserved in the output.
        body: Markdown body placed after the closing fence.
    """
    new_yaml = yaml.dump(fm, allow_unicode=True, sort_keys=False, default_flow_style=False)
    sep = "\n" if body.startswith("\n") else "\n\n"
    payload = f"---\n{new_yaml}---{sep}{body}"

    # Dot-prefixed so the page globs (`p*.md`) never pick up a leftover file.
    tmp = path.with_name(f".{path.name}.tmp")
    try:
        tmp.write_text(payload, encoding="utf-8")
        os.replace(tmp, path)
    except BaseException:
        if tmp.exists():
            tmp.unlink()
        raise


def call_gemini(client, png_path: Path, model: str, max_tokens: int = 32768) -> str | None:
    """Send one page image plus ``PROMPT`` to Gemini and return the reply text.

    Args:
        client: Client object exposing ``models.generate_content``.
        png_path: PNG image of the page.
        model: Model name passed through to the API.
        max_tokens: Value used for ``max_output_tokens``.

    Returns:
        The ``text`` attribute of the response.

    Raises:
        RuntimeError: If no response arrived within ``TIMEOUT_S`` seconds.
        Exception: Whatever the API call itself raised.
    """
    content = [
        genai_types.Part.from_bytes(data=png_path.read_bytes(), mime_type="image/png"),
        PROMPT,
    ]

    def _call():
        return client.models.generate_content(
            model=model,
            contents=content,
            config=genai_types.GenerateContentConfig(
                response_mime_type="application/json",
                temperature=0.1,
                max_output_tokens=max_tokens,
            ),
        )

    # Deliberately not a `with` block: leaving one calls shutdown(wait=True),
    # which blocks until the worker finishes and would make the timeout
    # below ineffective for a call that really hangs.
    ex = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    try:
        future = ex.submit(_call)
        try:
            return future.result(timeout=TIMEOUT_S).text
        except concurrent.futures.TimeoutError:
            raise RuntimeError(f"Gemini hung >{TIMEOUT_S}s") from None
    finally:
        # NOTE: the abandoned worker keeps running in the background; the
        # interpreter may still wait for it at exit.
        ex.shutdown(wait=False)


def parse_json_lenient(text: str) -> object:
    """Parse JSON text, tolerating a surrounding Markdown code fence.

    Args:
        text: Raw model output.

    Returns:
        The decoded JSON value. This is not guaranteed to be a dict, so
        callers must check the type before using it as one.

    Raises:
        json.JSONDecodeError: If the text is not valid JSON after the fence
            has been stripped.
    """
    t = text.strip()
    t = re.sub(r"^```(?:json)?\s*", "", t)
    t = re.sub(r"\s*```$", "", t)
    return json.loads(t)


def _display_path(path: Path) -> str:
    """Return ``path`` relative to ``UFO_ROOT`` when possible, else unchanged."""
    try:
        return str(path.relative_to(UFO_ROOT))
    except ValueError:
        return str(path)


def _count_entities(ee: object) -> int:
    """Return the total number of list entries across all categories of ``ee``."""
    if not isinstance(ee, dict):
        return 0
    return sum(len(v) for v in ee.values() if isinstance(v, (list, tuple)))


def process_page(client, page_md: Path, model: str) -> str:
    """Re-extract entities for one page and rewrite its frontmatter.

    Only ``entities_extracted``, ``last_entity_extraction_model`` and
    ``last_entity_extraction_at`` are set; all other frontmatter keys and the
    body are written back as read.

    Args:
        client: Gemini client passed to ``call_gemini``.
        page_md: Page Markdown file to update.
        model: Model name to use and to record in the frontmatter.

    Returns:
        One status string: ``"ok"``, ``"no-fm"``, ``"bad-fm"``, ``"no-png"``,
        ``"bad-json"``, ``"bad-shape"`` or ``"error"``.
    """
    fm, body = read_fm(page_md)
    if not fm:
        return "no-fm"

    rel = _display_path(page_md)

    doc_id = fm.get("doc_id", "")
    try:
        page_num = int(fm.get("page_number", 0))
    except (TypeError, ValueError):
        # A null or non-numeric page_number is a frontmatter problem.
        return "bad-fm"
    if not doc_id or not page_num:
        return "bad-fm"

    padded = f"{page_num:03d}"
    png = PNG_BASE / str(doc_id) / f"p-{padded}.png"
    if not png.exists():
        return "no-png"

    # Two attempts with progressively higher token budgets
    revision = None
    for tok in (32768, 65536):
        try:
            raw = call_gemini(client, png, model, max_tokens=tok)
            # A missing text payload is treated like unparseable JSON so the
            # larger budget gets a chance.
            revision = parse_json_lenient(raw or "")
            break
        except json.JSONDecodeError:
            continue
        except Exception as e:
            sys.stderr.write(f" ✗ {rel}: {type(e).__name__}: {e}\n")
            return "error"
    if revision is None:
        return "bad-json"

    if not isinstance(revision, dict):
        return "bad-shape"
    ee = revision.get("entities_extracted")
    if not isinstance(ee, dict):
        return "bad-shape"

    # Quick stats for reporting
    old_n = _count_entities(fm.get("entities_extracted"))
    new_n = _count_entities(ee)

    fm["entities_extracted"] = ee
    fm["last_entity_extraction_model"] = model
    fm["last_entity_extraction_at"] = utc_now_iso()
    write_fm(page_md, fm, body)

    print(f" ✓ {rel}: entities {old_n} → {new_n}", flush=True)
    return "ok"


def main() -> None:
    """Parse CLI arguments, select the target pages and process them in parallel.

    Target selection precedence: ``--page``, then ``--doc-id``, then
    ``--pages-file``, then ``--all``. A per-status summary is printed at the
    end.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--doc-id")
    ap.add_argument("--page", help="DOC_ID/pNNN")
    ap.add_argument("--all", action="store_true")
    ap.add_argument("--pages-file")
    ap.add_argument("--model", default=DEFAULT_MODEL)
    ap.add_argument("--workers", type=int, default=20)
    ap.add_argument("--max", type=int, default=0)
    args = ap.parse_args()

    api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        sys.stderr.write("✗ GEMINI_API_KEY not set\n")
        sys.exit(1)
    client = genai.Client(api_key=api_key)

    if args.page:
        parts = args.page.split("/")
        # Both halves are required: an empty doc id would point at a file
        # that cannot exist and the run would silently do nothing.
        if len(parts) != 2 or not all(parts):
            ap.error("--page must be DOC_ID/pNNN")
        targets = [PAGES / parts[0] / f"{parts[1]}.md"]
    elif args.doc_id:
        targets = sorted((PAGES / args.doc_id).glob("p*.md"))
    elif args.pages_file:
        entries = (
            ln.strip()
            for ln in Path(args.pages_file).read_text(encoding="utf-8").splitlines()
        )
        targets = [
            Path(entry) if entry.startswith("/") else UFO_ROOT / entry
            for entry in entries
            if entry
        ]
    elif args.all:
        targets = sorted(PAGES.glob("*/p*.md"))
    else:
        ap.error("provide --doc-id, --page, --all, or --pages-file")

    if args.max:
        targets = targets[:args.max]

    stats = {"ok": 0, "error": 0, "no-png": 0, "no-fm": 0,
             "bad-fm": 0, "bad-json": 0, "bad-shape": 0, "missing": 0}

    # Report targets that do not exist instead of dropping them silently.
    existing = []
    for p in targets:
        if p.exists():
            existing.append(p)
        else:
            sys.stderr.write(f"✗ {p}: file not found\n")
            stats["missing"] += 1

    # ThreadPoolExecutor rejects max_workers <= 0.
    workers = max(1, args.workers)
    print(f"Re-extracting entities from {len(existing)} page(s) with {args.model} ({workers} workers)")

    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as pool:
        futs = {pool.submit(process_page, client, p, args.model): p for p in existing}
        for fut in concurrent.futures.as_completed(futs):
            try:
                r = fut.result()
                stats[r] = stats.get(r, 0) + 1
            except Exception as e:
                sys.stderr.write(f"✗ {futs[fut]}: {e}\n")
                stats["error"] += 1

    print(f"\nDone. {stats}")


if __name__ == "__main__":
    main()