disclosure-bureau/scripts/21-reextract-entities-gemini.py

243 lines
9.9 KiB
Python
Executable file

#!/usr/bin/env python3
"""
21-reextract-entities-gemini.py — Phase 2: rewrite ONLY `entities_extracted` in
each page.md using Gemini 3.0 Flash with explicit anti-fragmentation rules.
Fixes the Haiku extraction bugs:
- "Gudauta Base" was split into location "Gudauta" + organization "Base"
- "Chief Tereoken" was split into "Chief" + "Tereoken"
- Bare common nouns ("Base", "Chief", "Department") promoted to standalone entities
- Variants of same entity ("FBI" / "F-B-I" / "Federal Bureau") not normalized at source
Preserves everything else in the page.md frontmatter.
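Usage (requires GEMINI_API_KEY or GOOGLE_API_KEY in the environment):
./21-reextract-entities-gemini.py --all --workers 20
./21-reextract-entities-gemini.py --doc-id <id>
./21-reextract-entities-gemini.py --page <doc-id>/p007 # quick test
./21-reextract-entities-gemini.py --pages-file pages.txt # one page.md path per line
./21-reextract-entities-gemini.py --all --max 50 --model <model> # cap page count / override model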
"""
from __future__ import annotations
import argparse
import concurrent.futures
import json
import os
import re
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
try:
import yaml
except ImportError:
sys.stderr.write("pip3 install pyyaml\n"); sys.exit(1)
try:
from google import genai
from google.genai import types as genai_types
except ImportError:
sys.stderr.write("pip3 install google-genai\n"); sys.exit(1)
# Root of the local archive checkout; every other path below is derived from it.
UFO_ROOT = Path("/Users/guto/ufo")
# One sub-directory per document, holding the pNNN.md page files that get rewritten.
PAGES = UFO_ROOT.joinpath("wiki", "pages")
# Page images sent to the model, laid out as <doc-id>/p-NNN.png.
PNG_BASE = UFO_ROOT.joinpath("processing", "png")
# Model name used unless --model overrides it on the command line.
DEFAULT_MODEL = "gemini-3-flash-preview"
# Upper bound, in seconds, on how long one generate_content call is awaited.
TIMEOUT_S = 180
# Instruction text sent alongside each page image (see call_gemini). It spells
# out the anti-fragmentation rules and the JSON schema the reply must follow;
# process_page reads back only the top-level "entities_extracted" object.
PROMPT = """You are an OSINT investigator extracting entities from one page of a US Department of War declassified UAP/UFO document. A prior extraction pass made systematic mistakes; your job is to do it right.
CRITICAL RULES — NON-NEGOTIABLE:
1. **Compound names stay together.** "Gudauta Base" is ONE location, not two entities "Gudauta" + "Base". "Chief Tereoken" is ONE person, not "Chief" + "Tereoken". "FBI Seattle Field Office" is ONE organization. NEVER split a compound name into separate entities.
2. **Titles + names are ONE person.** "Chief Tereoken", "LCDR Smith", "Mr. Johnson", "Capt. Davis", "Mrs. Anderson", "Dr. Hynek", "General Marshall" — each is ONE single person entity, with the title as part of the canonical name.
3. **NEVER extract bare common nouns as entities.** Skip: "Chief", "Base", "Department", "Office", "Agent", "Bureau", "Captain", "Officer", "File", "Subject", "Memo", "Letter", "Report", "Page", "Bag", "Stamp", "Signature", "Carbon Copy". These are only meaningful when COMBINED with a proper name.
4. **Normalize variants at the source.** "F.B.I.", "F-B-I", "FBI", "Federal Bureau of Investigation" → all return as the SINGLE canonical form "Federal Bureau of Investigation" (with "FBI" added to aliases).
5. **Distinguish entity types precisely:**
- `locations`: physical places (cities, countries, military bases, geographic features). "Gudauta Base" → location (it's a military base). "Adapazari, Turkey" → location.
- `organizations`: institutions, agencies, branches, companies. "FBI", "USAF", "CIA Foreign Branch".
- `people`: humans with names (titles ok). "J. Edgar Hoover", "Chief Tereoken".
- `events`: dated incidents with a date or short label. "Tic-Tac Nimitz 2004", "Roswell 1947".
- `uap_objects`: described UAP themselves. Shape + color + size description.
- `vehicles`: aircraft, ships, vehicles by model/name. "USS Princeton", "F-18".
- `operations`: programs, missions, protocols by name. "Project Blue Book", "Operation Mainbrace".
- `concepts`: legal/scientific/jargon. "FOIA exemption (b)(1)", "GENTEXT", "compartmentalization".
Output ONE JSON object only (no markdown fence, no preamble) with this exact schema:
{
"entities_extracted": {
"people": [{"name": "Full canonical name with title", "role_in_page": "subject|witness|author|signer|mentioned", "aliases": ["alt spellings"]}],
"organizations": [{"name": "Canonical org name", "aliases": ["FBI", "F.B.I."], "type": "intelligence-agency|military-branch|civilian-agency|...|other"}],
"locations": [{"name": "Canonical place name including any qualifier (Gudauta Base, not just Gudauta)", "type": "city|region|country|sea|strait|airbase|naval-base|mountain|desert|building|other"}],
"events": [{"label": "Short distinctive label", "date": "YYYY-MM-DD|YYYY|NA"}],
"uap_objects": [{"shape": "...", "color": "...", "size_estimate": "..."}],
"vehicles": [{"name": "...", "class": "aircraft|ship|submarine|spacecraft|satellite|ground|other"}],
"operations": [{"name": "...", "type": "military-operation|reporting-protocol|research-program|task-force|foia-disclosure|other"}],
"concepts": [{"name": "...", "class": "legal-instrument|phenomenon-type|doctrine|scientific-term|jargon|program-name|other"}]
}
}
If a category has no entries, return an empty array. PRESERVE original spelling (do not translate names). Output ONLY the JSON."""
def utc_now_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (second resolution)."""
    now = datetime.now(tz=timezone.utc)
    return f"{now:%Y-%m-%dT%H:%M:%SZ}"
def read_fm(path: Path) -> tuple[dict, str]:
    """Split a page file into its YAML frontmatter mapping and its body.

    Args:
        path: Markdown file to read (decoded as UTF-8).

    Returns:
        ``(frontmatter, body)``. When the file does not start with ``---`` or
        has no closing fence, the result is ``({}, <entire file text>)``. When
        the fenced YAML is invalid or is not a mapping, the frontmatter is
        ``{}`` and the body is the text after the closing fence with leading
        newlines removed.
    """
    text = path.read_text(encoding="utf-8")
    if not text.startswith("---"):
        return {}, text

    # The closing fence has to be a line of its own. A plain substring search
    # would also stop at a "---" embedded in a YAML value (e.g. "a---b"),
    # silently truncating the frontmatter and pushing the remainder into the
    # body. Searching from offset 3 skips the opening fence: with a start
    # position, "^" still only matches right after a newline.
    fence = re.compile(r"^---[ \t]*$", re.MULTILINE).search(text, 3)
    if fence is None:
        return {}, text

    try:
        fm = yaml.safe_load(text[3:fence.start()].strip()) or {}
    except yaml.YAMLError:
        fm = {}
    # Callers use ``fm.get(...)``; a top-level list or scalar is not usable.
    if not isinstance(fm, dict):
        fm = {}
    return fm, text[fence.end():].lstrip("\n")
def write_fm(path: Path, fm: dict, body: str) -> None:
    """Write ``fm`` as a YAML frontmatter block followed by ``body`` to ``path``.

    Keys are emitted in their existing order, in block style, with non-ASCII
    characters left unescaped. The file is written as UTF-8.
    """
    frontmatter = yaml.dump(
        fm,
        allow_unicode=True,
        sort_keys=False,
        default_flow_style=False,
    )
    # A body that already opens with a newline needs only one more line break
    # after the closing fence; otherwise insert a blank line.
    if body.startswith("\n"):
        gap = "\n"
    else:
        gap = "\n\n"
    document = "".join(("---\n", frontmatter, "---", gap, body))
    path.write_text(document, encoding="utf-8")
def call_gemini(client, png_path: Path, model: str, max_tokens: int = 32768):
    """Send one page image plus PROMPT to the model and return the reply text.

    Args:
        client: Object exposing ``models.generate_content`` (created in main
            via ``genai.Client``).
        png_path: PNG file whose bytes are attached as the image part.
        model: Model name passed through to ``generate_content``.
        max_tokens: Value used for ``max_output_tokens``.

    Returns:
        The ``.text`` attribute of the response.

    Raises:
        RuntimeError: If no response arrives within ``TIMEOUT_S`` seconds.
        Exception: Anything raised by the underlying request is propagated.
    """
    content = [
        genai_types.Part.from_bytes(data=png_path.read_bytes(), mime_type="image/png"),
        PROMPT,
    ]

    def _call():
        return client.models.generate_content(
            model=model,
            contents=content,
            config=genai_types.GenerateContentConfig(
                response_mime_type="application/json",
                temperature=0.1,
                max_output_tokens=max_tokens,
            ),
        )

    # Deliberately NOT used as a context manager: leaving a ``with`` block
    # calls shutdown(wait=True), which would block on the hung request and
    # make the timeout below ineffective.
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    future = executor.submit(_call)
    try:
        return future.result(timeout=TIMEOUT_S).text
    except concurrent.futures.TimeoutError as err:
        raise RuntimeError(f"Gemini hung >{TIMEOUT_S}s") from err
    finally:
        # Release the executor without waiting. A request that is still in
        # flight cannot be interrupted and simply finishes in the background.
        executor.shutdown(wait=False)
def parse_json_lenient(text: str) -> dict:
    """Decode a JSON reply, tolerating a surrounding markdown code fence.

    Surrounding whitespace is trimmed, then an opening fence (three backticks,
    optionally followed by ``json``) and a closing fence are removed before the
    remainder is handed to ``json.loads``.

    Raises:
        json.JSONDecodeError: If what is left is not valid JSON.
    """
    payload = text.strip()
    # Order matters: strip the leading fence first, then the trailing one.
    for fence_pattern in (r"^```(?:json)?\s*", r"\s*```$"):
        payload = re.sub(fence_pattern, "", payload)
    return json.loads(payload)
def _display_path(path: Path) -> str:
    """Return ``path`` relative to UFO_ROOT when it lies beneath it, else as given."""
    try:
        return str(path.relative_to(UFO_ROOT))
    except ValueError:
        return str(path)


def _count_entities(extracted) -> int:
    """Count entries across all list-valued categories of an entity mapping."""
    if not isinstance(extracted, dict):
        return 0
    return sum(len(items) for items in extracted.values() if isinstance(items, list))


def process_page(client, page_md: Path, model: str) -> str:
    """Re-extract entities for one page file and rewrite its frontmatter.

    Only ``entities_extracted``, ``last_entity_extraction_model`` and
    ``last_entity_extraction_at`` are changed; other frontmatter keys and the
    body are written back as read.

    Args:
        client: Gemini client forwarded to ``call_gemini``.
        page_md: Page markdown file to update.
        model: Model name, also recorded in the frontmatter.

    Returns:
        A status string: ``"ok"``, ``"no-fm"`` (no usable frontmatter),
        ``"bad-fm"`` (missing or invalid ``doc_id`` / ``page_number``),
        ``"no-png"`` (page image not found), ``"bad-json"`` (reply never
        parsed), ``"bad-shape"`` (reply lacks an ``entities_extracted``
        object) or ``"error"`` (request failed; details go to stderr).
    """
    fm, body = read_fm(page_md)
    if not fm or not isinstance(fm, dict):
        return "no-fm"

    doc_id = fm.get("doc_id", "")
    try:
        page_num = int(fm.get("page_number") or 0)
    except (TypeError, ValueError):
        # A non-numeric page number is a frontmatter problem, not a crash.
        return "bad-fm"
    if not doc_id or not page_num:
        return "bad-fm"

    # str() guards against a doc_id that YAML parsed as a non-string scalar.
    png = PNG_BASE / str(doc_id) / f"p-{page_num:03d}.png"
    if not png.exists():
        return "no-png"

    # Two attempts with progressively higher token budgets: an unparseable
    # reply triggers one retry with a larger output limit.
    revision = None
    for budget in (32768, 65536):
        try:
            raw = call_gemini(client, png, model, max_tokens=budget)
            revision = parse_json_lenient(raw)
            break
        except json.JSONDecodeError:
            continue
        except Exception as e:
            sys.stderr.write(f"{_display_path(page_md)}: {type(e).__name__}: {e}\n")
            return "error"
    if revision is None:
        return "bad-json"
    # Valid JSON that is not an object (e.g. a bare list) has no ``.get``.
    if not isinstance(revision, dict):
        return "bad-shape"
    ee = revision.get("entities_extracted")
    if not isinstance(ee, dict):
        return "bad-shape"

    # Quick stats for reporting
    old_n = _count_entities(fm.get("entities_extracted"))
    new_n = _count_entities(ee)

    fm["entities_extracted"] = ee
    fm["last_entity_extraction_model"] = model
    fm["last_entity_extraction_at"] = utc_now_iso()
    write_fm(page_md, fm, body)

    # Separate the two counts; printed back-to-back they read as one number.
    print(f"{_display_path(page_md)}: entities {old_n} -> {new_n}", flush=True)
    return "ok"
def main():
    """Parse the command line, select page files and re-extract them in parallel.

    Target selection takes the first option present, in this order: ``--page``,
    ``--doc-id``, ``--pages-file``, ``--all``. ``--max`` truncates the selected
    list. Targets that do not exist on disk are reported on stderr and skipped.
    A per-status summary is printed at the end.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--doc-id")
    ap.add_argument("--page", help="<doc-id>/pNNN")
    ap.add_argument("--all", action="store_true")
    ap.add_argument("--pages-file")
    ap.add_argument("--model", default=DEFAULT_MODEL)
    ap.add_argument("--workers", type=int, default=20)
    ap.add_argument("--max", type=int, default=0)
    args = ap.parse_args()

    # Fail with a usage error instead of a ValueError from the thread pool.
    if args.workers < 1:
        ap.error("--workers must be >= 1")

    api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        sys.stderr.write("✗ GEMINI_API_KEY not set\n"); sys.exit(1)
    client = genai.Client(api_key=api_key)

    if args.page:
        parts = args.page.split("/")
        if len(parts) != 2:
            ap.error("--page must be <doc-id>/pNNN")
        targets = [PAGES / parts[0] / f"{parts[1]}.md"]
    elif args.doc_id:
        targets = sorted((PAGES / args.doc_id).glob("p*.md"))
    elif args.pages_file:
        # One path per line; relative entries are resolved against UFO_ROOT.
        targets = [Path(line.strip() if line.strip().startswith("/") else UFO_ROOT / line.strip())
                   for line in Path(args.pages_file).read_text().splitlines() if line.strip()]
    elif args.all:
        targets = sorted(PAGES.glob("*/p*.md"))
    else:
        ap.error("provide --doc-id, --page, --all, or --pages-file")
    if args.max:
        targets = targets[:args.max]

    # Report targets that are not on disk instead of dropping them silently,
    # so a mistyped --page or a stale --pages-file entry is visible.
    existing = []
    missing = 0
    for p in targets:
        if p.exists():
            existing.append(p)
        else:
            missing += 1
            sys.stderr.write(f"{p}: page file not found, skipping\n")

    print(f"Re-extracting entities from {len(existing)} page(s) with {args.model} ({args.workers} workers)")
    stats = {"ok": 0, "error": 0, "no-png": 0, "no-fm": 0, "bad-fm": 0, "bad-json": 0, "bad-shape": 0}
    if missing:
        stats["missing"] = missing

    with concurrent.futures.ThreadPoolExecutor(max_workers=args.workers) as pool:
        futs = {pool.submit(process_page, client, p, args.model): p for p in existing}
        for fut in concurrent.futures.as_completed(futs):
            try:
                r = fut.result()
                stats[r] = stats.get(r, 0) + 1
            except Exception as e:
                # One failing page must not abort the rest of the batch.
                sys.stderr.write(f"{futs[fut]}: {e}\n")
                stats["error"] += 1
    print(f"\nDone. {stats}")


if __name__ == "__main__":
    main()