disclosure-bureau/scripts/20-reanalyze-vision-gemini.py

289 lines
11 KiB
Python
Executable file

#!/usr/bin/env python3
"""
20-reanalyze-vision-gemini.py — Fallback re-vision via Gemini 3.1 Pro.
When Haiku exaggerates (e.g., claims "45% redaction-heavy" on a clearly
readable page), this script re-analyzes via Gemini 3.1 Pro and rewrites the
page.md frontmatter (vision_description, vision_description_pt_br, redactions).
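Targets:
--doc-id <id> --page p173 → single page
--doc-id <id> → entire doc
--flagged → all pages with flags: ["vision-redaction-mismatch"]
--redaction-heavy → all pages whose content_classification includes "redaction-heavy"
--all → every page (slow + costly; use sparingly)
--pages-file <file> → pages listed one per line (absolute, or relative to /Users/guto/ufo)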
Anti-hang: ThreadPoolExecutor + future.result(timeout=180s) per memory
`feedback-gemini-sdk-hangs.md`.
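Output: overwrites page.md frontmatter fields (vision_description,
vision_description_pt_br, redactions, content_classification, page_type).
Preserves everything else, except that it removes the
"vision-redaction-mismatch" flag and adds `last_reanalysis_model`,
`last_reanalysis_at` and `last_reanalysis_confidence`.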
Usage:
GEMINI_API_KEY=... ./20-reanalyze-vision-gemini.py --doc-id <id> --page p173
"""
from __future__ import annotations
import argparse
import concurrent.futures
import json
import os
import re
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
try:
import yaml
except ImportError:
sys.stderr.write("pip3 install pyyaml\n"); sys.exit(1)
try:
from google import genai
from google.genai import types as genai_types
except ImportError:
sys.stderr.write("pip3 install google-genai\n"); sys.exit(1)
# Absolute root directory; every other path below is derived from it.
UFO_ROOT = Path("/Users/guto/ufo")
# Per-page Markdown files, globbed elsewhere in this file as <doc_id>/p*.md.
PAGES = UFO_ROOT / "wiki" / "pages"
# Page images, looked up elsewhere in this file as <doc_id>/p-NNN.png.
PNG_BASE = UFO_ROOT / "processing" / "png"
# Model tried first; main() overwrites this global with the --model argument.
DEFAULT_MODEL = "gemini-3.1-pro-preview"
# Models tried, in order, after a call to the current model fails.
FALLBACK = ["gemini-3-pro-preview", "gemini-3.1-flash-lite"]
# Seconds to wait on a single Gemini call before treating it as hung.
TIMEOUT_S = 180
# Instruction text sent to the model together with the page image. It asks for
# a single JSON object; process_page() reads the keys vision_description,
# vision_description_pt_br, redactions, content_classification, page_type and
# reanalysis_confidence from the parsed reply.
PROMPT = """You are re-analyzing one page of a US Department of War declassified UAP/UFO document. A previous Haiku pass produced an EXAGGERATED description (it claimed heavy redaction coverage when the actual page was largely readable). Your job: produce a PRECISE replacement.
GROUND RULES:
- Count redactions EXACTLY. Each redaction is a solid black bar / opaque cover blocking specific text.
- Do NOT call a page "heavy redaction" unless >30% of its visible area is genuinely obscured.
- For each redaction, return a tight bbox (normalized 0..1 coords) that covers ONLY the black bar, not the whole line.
- If the page has NO redactions, return an empty array. If it has thin strips, give them small bboxes.
Output ONE JSON object (no fence, no preamble):
{
"vision_description": "2-5 sentences in English. Describe what is actually visible: layout, content category, classification markings, any redaction precisely quantified. Use plain language, no hyperbole.",
"vision_description_pt_br": "Mesmo conteúdo em português brasileiro (pt-br). Preserve acentos UTF-8. Mantenha citações verbatim do documento em inglês (não traduza texto que está dentro do documento).",
"redactions": [
{"code": "(b)(1) 1.4(a)|(b)(3)|(b)(6)|other", "description": "what field/text was obscured", "bbox": {"x": 0.0, "y": 0.0, "w": 0.0, "h": 0.0}, "text_inferred": null}
],
"content_classification": ["text-only"|"contains-photos"|"contains-sketches"|"contains-diagrams"|"contains-maps"|"contains-tables"|"contains-signatures"|"contains-stamps"|"redaction-heavy"|"mixed"|"blank"],
"page_type": "cover|toc|body|signature|photo|sketch|map|stamp|blank|appendix|redaction-heavy|table-page|mixed",
"reanalysis_confidence": 0.0
}
"""
def utc_now_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ``.

    The value has one-second resolution and always ends in a literal ``Z``.
    """
    now = datetime.now(timezone.utc).replace(microsecond=0)
    # isoformat() renders the UTC offset as "+00:00"; swap it for the "Z" suffix.
    return now.isoformat().replace("+00:00", "Z")
def read_fm(path: Path) -> tuple[dict, str]:
    """Split a page file into its YAML frontmatter and the remaining body.

    Args:
        path: File to read (decoded as UTF-8).

    Returns:
        ``(frontmatter, body)``. ``frontmatter`` is ``{}`` and ``body`` is the
        whole file when the file does not start with ``---`` or has no closing
        ``---`` line. ``frontmatter`` is also ``{}`` when the YAML is invalid
        or is not a mapping; in that case ``body`` is still the text after the
        closing delimiter, with leading newlines removed.
    """
    content = path.read_text(encoding="utf-8")
    if not content.startswith("---"):
        return {}, content
    # The closing delimiter must be a line consisting solely of "---". A plain
    # substring search would also stop at a "---" inside a YAML value (for
    # example in a description) and silently truncate the frontmatter.
    closing = re.compile(r"^---[ \t]*\r?$", re.MULTILINE).search(content, 3)
    if closing is None:
        return {}, content
    try:
        fm = yaml.safe_load(content[3:closing.start()].strip()) or {}
    except yaml.YAMLError:
        fm = {}
    if not isinstance(fm, dict):
        # A scalar or list document is not usable frontmatter; callers rely
        # on dict methods such as .get().
        fm = {}
    return fm, content[closing.end():].lstrip("\n")
def write_fm(path: Path, fm: dict, body: str) -> None:
    """Serialize ``fm`` as YAML frontmatter and write it plus ``body`` to ``path``.

    Keys keep their insertion order, non-ASCII characters are written as-is,
    and the file is encoded as UTF-8. Any existing file content is replaced.
    """
    frontmatter = yaml.dump(
        fm,
        default_flow_style=False,
        sort_keys=False,
        allow_unicode=True,
    )
    # A body that already starts with a newline gets one newline after the
    # closing delimiter; any other body gets two.
    gap = "\n\n" if not body.startswith("\n") else "\n"
    document = "".join(("---\n", frontmatter, "---", gap, body))
    path.write_text(document, encoding="utf-8")
def call_gemini(client, png_path: Path, model: str, attempt: int = 1):
    """Send the page image plus PROMPT to Gemini, with a timeout and model fallback.

    Args:
        client: Object exposing ``models.generate_content(...)``.
        png_path: PNG file whose bytes are sent as the image part.
        model: Model name to try first.
        attempt: 1-based attempt number. ``attempt=N`` means the first ``N-1``
            entries of FALLBACK are considered already used and are skipped.

    Returns:
        ``(response_text, model_name)`` for the first model that answers with
        non-empty text.

    Raises:
        Exception: Whatever the last model tried raised. A call that exceeds
            TIMEOUT_S or returns empty text is reported as ``RuntimeError``.
    """
    content = [
        genai_types.Part.from_bytes(data=png_path.read_bytes(), mime_type="image/png"),
        PROMPT,
    ]
    config = genai_types.GenerateContentConfig(
        response_mime_type="application/json",
        temperature=0.2,
        max_output_tokens=16384,  # bumped iteratively (4096 → 8192 → 16384) for verbose pages
    )
    candidates = [model, *FALLBACK[max(attempt - 1, 0):]]
    last_index = len(candidates) - 1
    for index, candidate in enumerate(candidates):
        # Deliberately not a `with` block: leaving one calls
        # shutdown(wait=True), which would block on the hung call and defeat
        # the timeout below.
        executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
        try:
            future = executor.submit(
                client.models.generate_content,
                model=candidate,
                contents=content,
                config=config,
            )
            try:
                resp = future.result(timeout=TIMEOUT_S)
            except concurrent.futures.TimeoutError as exc:
                raise RuntimeError(f"Gemini hung >{TIMEOUT_S}s") from exc
            text = resp.text
            if not text:
                # No usable payload; treat it like a failure so the next
                # model gets a chance.
                raise RuntimeError(f"{candidate} returned an empty response")
            return text, candidate
        except Exception as e:
            if index == last_index:
                raise
            sys.stderr.write(
                f"{candidate} failed ({type(e).__name__}); fallback → {candidates[index + 1]}\n"
            )
        finally:
            # Do not wait for the worker; an abandoned call keeps running in
            # its background thread until the SDK call returns.
            executor.shutdown(wait=False)
def parse_json_lenient(text: str) -> dict:
    """Parse a model reply into a JSON object, tolerating common wrappers.

    Handles, in order: a surrounding Markdown code fence, prose before or
    after the object (the outermost ``{...}`` span is parsed), and a
    one-element list wrapping the object.

    Args:
        text: Raw reply text. ``None`` or an empty string is treated as
            invalid JSON.

    Returns:
        The parsed JSON object as a dict.

    Raises:
        json.JSONDecodeError: If no JSON object can be recovered, including
            when the reply parses to something other than an object.
    """
    t = (text or "").strip()
    t = re.sub(r"^```(?:json)?\s*", "", t)
    t = re.sub(r"\s*```$", "", t)
    try:
        parsed = json.loads(t)
    except json.JSONDecodeError:
        # Fall back to the outermost brace-delimited span so that a preamble
        # or trailing remark does not discard an otherwise valid object.
        start, stop = t.find("{"), t.rfind("}")
        if start < 0 or stop <= start:
            raise
        parsed = json.loads(t[start:stop + 1])
    if isinstance(parsed, list) and len(parsed) == 1 and isinstance(parsed[0], dict):
        parsed = parsed[0]
    if not isinstance(parsed, dict):
        # Keep the declared return type; callers use dict methods on the result.
        raise json.JSONDecodeError("expected a JSON object", t, 0)
    return parsed
def process_page(client, page_md: Path, dry_run: bool) -> str:
    """Re-analyze one page with Gemini and rewrite its frontmatter.

    Args:
        client: Gemini client passed through to call_gemini().
        page_md: Page Markdown file to update.
        dry_run: When true, print the before/after summary but do not write.

    Returns:
        A status string:
        ``"no-fm"``    frontmatter missing or unusable
        ``"bad-fm"``   ``doc_id`` / ``page_number`` missing or invalid
        ``"no-png"``   page image not found
        ``"error"``    every model failed
        ``"bad-json"`` reply could not be parsed into a JSON object
        ``"dry"``      dry run, nothing written
        ``"ok"``       frontmatter rewritten
    """
    fm, body = read_fm(page_md)
    if not fm or not isinstance(fm, dict):
        return "no-fm"
    doc_id = fm.get("doc_id", "")
    try:
        # page_number may be absent, null or non-numeric in hand-edited files.
        page_num = int(fm.get("page_number") or 0)
    except (TypeError, ValueError):
        return "bad-fm"
    if not doc_id or not page_num:
        return "bad-fm"
    png = PNG_BASE / str(doc_id) / f"p-{page_num:03d}.png"
    if not png.exists():
        return "no-png"

    try:
        label = page_md.relative_to(UFO_ROOT)
    except ValueError:
        # The page lives outside UFO_ROOT (absolute path from --pages-file).
        label = page_md
    # Show the configured model instead of a fixed name, since --model can
    # override it.
    print(f"{label} ({DEFAULT_MODEL})", flush=True)

    t0 = time.time()
    try:
        raw, model_used = call_gemini(client, png, DEFAULT_MODEL)
    except Exception as e:
        print(f" ✗ Gemini failed: {type(e).__name__}: {e}", flush=True)
        return "error"
    dt = time.time() - t0

    if not isinstance(raw, str):
        print(" ✗ JSON parse failed: empty response", flush=True)
        return "bad-json"
    try:
        revision = parse_json_lenient(raw)
    except json.JSONDecodeError as e:
        print(f" ✗ JSON parse failed: {e}; raw[:200]={raw[:200]!r}", flush=True)
        return "bad-json"
    if not isinstance(revision, dict):
        print(f" ✗ JSON parse failed: expected an object; raw[:200]={raw[:200]!r}", flush=True)
        return "bad-json"

    # Before/after summary
    old_n = len(fm.get("redactions") or [])
    new_n = len(revision.get("redactions") or [])
    old_desc = (fm.get("vision_description") or "")[:90]
    new_desc = (revision.get("vision_description") or "")[:90]
    # Separator added: without it the two counts ran together (5 and 12 → "512").
    print(f" redactions: {old_n} → {new_n}")
    print(f" OLD desc: {old_desc}")
    print(f" NEW desc: {new_desc}")
    if dry_run:
        return "dry"

    # Apply revision: text fields are only replaced by non-empty values.
    for key in ("vision_description", "vision_description_pt_br"):
        if revision.get(key):
            fm[key] = revision[key]
    if "redactions" in revision:
        # A JSON null becomes an empty list rather than None.
        fm["redactions"] = revision["redactions"] or []
    for key in ("content_classification", "page_type"):
        if revision.get(key):
            fm[key] = revision[key]
    fm["last_reanalysis_model"] = model_used
    fm["last_reanalysis_at"] = utc_now_iso()
    fm["last_reanalysis_confidence"] = revision.get("reanalysis_confidence")

    # Remove the mismatch flag now that it's been corrected
    flags = list(fm.get("flags") or [])
    if "vision-redaction-mismatch" in flags:
        flags.remove("vision-redaction-mismatch")
        fm["flags"] = flags

    write_fm(page_md, fm, body)
    print(f" ✓ wrote (took {dt:.1f}s)", flush=True)
    return "ok"
def main():
    """Command-line entry point: select target pages and re-analyze each one.

    Target selection uses the first matching option in this order:
    ``--doc-id`` with ``--page``, ``--doc-id``, ``--flagged``,
    ``--redaction-heavy``, ``--all``, ``--pages-file``. Exits with an error
    when GEMINI_API_KEY is unset or no target option is given. Prints a
    per-status count when finished.
    """
    global DEFAULT_MODEL
    ap = argparse.ArgumentParser()
    ap.add_argument("--doc-id")
    ap.add_argument("--page", help="specific page stem, e.g. p173 (requires --doc-id)")
    ap.add_argument("--flagged", action="store_true", help="all pages with vision-redaction-mismatch")
    ap.add_argument("--redaction-heavy", action="store_true", help="all pages currently classified redaction-heavy (re-triage)")
    ap.add_argument("--all", action="store_true")
    ap.add_argument("--pages-file", help="newline-separated list of page paths (relative to /Users/guto/ufo/ or absolute)")
    ap.add_argument("--model", default=DEFAULT_MODEL, help=f"override model (default {DEFAULT_MODEL})")
    ap.add_argument("--workers", type=int, default=1, help="parallel workers (raise for Flash Lite, keep 1 for Pro free tier)")
    ap.add_argument("--max", type=int, default=0, help="cap targets (0 = unlimited)")
    ap.add_argument("--dry-run", action="store_true")
    args = ap.parse_args()
    if args.page and not args.doc_id:
        # Otherwise --page would be silently ignored and another selector used.
        ap.error("--page requires --doc-id")

    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        sys.stderr.write("✗ GEMINI_API_KEY not set\n")
        sys.exit(1)
    client = genai.Client(api_key=api_key)

    if args.doc_id and args.page:
        targets = [PAGES / args.doc_id / f"{args.page}.md"]
    elif args.doc_id:
        targets = sorted((PAGES / args.doc_id).glob("p*.md"))
    elif args.flagged:
        # Sorted so that --max always selects the same pages.
        targets = sorted(
            p for p in PAGES.glob("*/p*.md")
            if "vision-redaction-mismatch" in (read_fm(p)[0].get("flags") or [])
        )
    elif args.redaction_heavy:
        targets = sorted(
            p for p in PAGES.glob("*/p*.md")
            if "redaction-heavy" in (read_fm(p)[0].get("content_classification") or [])
        )
    elif args.all:
        targets = sorted(PAGES.glob("*/p*.md"))
    elif args.pages_file:
        targets = []
        for line in Path(args.pages_file).read_text(encoding="utf-8").splitlines():
            s = line.strip()
            if not s:
                continue
            candidate = Path(s)
            targets.append(candidate if candidate.is_absolute() else UFO_ROOT / s)
    else:
        ap.error("provide --doc-id (+ --page), --flagged, --redaction-heavy, --all, or --pages-file")

    if args.max > 0:
        targets = targets[:args.max]

    # process_page() reads the model from this module-level name.
    DEFAULT_MODEL = args.model
    print(f"Processing {len(targets)} page(s) with {DEFAULT_MODEL} ({args.workers} worker(s))...")
    stats = {"ok": 0, "error": 0, "dry": 0, "no-png": 0, "no-fm": 0, "bad-fm": 0, "bad-json": 0}

    # Missing files are counted the same way for both execution modes.
    existing = [p for p in targets if p.exists()]
    stats["no-fm"] += len(targets) - len(existing)

    if args.workers <= 1:
        for p in existing:
            try:
                r = process_page(client, p, args.dry_run)
            except Exception as e:
                # Same policy as the parallel branch: log, count, keep going.
                sys.stderr.write(f"{p}: {e}\n")
                stats["error"] += 1
                continue
            stats[r] = stats.get(r, 0) + 1
    else:
        with concurrent.futures.ThreadPoolExecutor(max_workers=args.workers) as pool:
            futs = {pool.submit(process_page, client, p, args.dry_run): p for p in existing}
            for fut in concurrent.futures.as_completed(futs):
                try:
                    r = fut.result()
                except Exception as e:
                    sys.stderr.write(f"{futs[fut]}: {e}\n")
                    stats["error"] += 1
                else:
                    stats[r] = stats.get(r, 0) + 1
    print(f"\nDone. {stats}")


if __name__ == "__main__":
    main()