289 lines
11 KiB
Python
Executable file
289 lines
11 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
20-reanalyze-vision-gemini.py — Fallback re-vision via Gemini 3.1 Pro.
|
|
|
|
When Haiku exaggerates (e.g., claims "45% redaction-heavy" on a clearly
|
|
readable page), this script re-analyzes via Gemini 3.1 Pro and rewrites the
|
|
page.md frontmatter (vision_description, vision_description_pt_br, redactions).
|
|
|
|
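Targets:
  --doc-id <id> --page p173   → single page
  --doc-id <id>               → entire doc
  --flagged                   → all pages with flags: ["vision-redaction-mismatch"]
  --redaction-heavy           → all pages whose content_classification includes "redaction-heavy"
  --all                       → every page (slow + costly; use sparingly)
  --pages-file <path>         → pages listed one per line (absolute, or relative to the UFO root)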
|
|
|
|
Anti-hang: ThreadPoolExecutor + future.result(timeout=180s) per memory
|
|
`feedback-gemini-sdk-hangs.md`.
|
|
|
|
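Output: overwrites page.md frontmatter fields (vision_description,
vision_description_pt_br, redactions, content_classification, page_type).
Preserves everything else. Adds `last_reanalysis_model`, `last_reanalysis_at`
and `last_reanalysis_confidence`, and removes the `vision-redaction-mismatch`
flag from `flags` once the page has been rewritten.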
|
|
|
|
Usage:
|
|
GEMINI_API_KEY=... ./20-reanalyze-vision-gemini.py --doc-id <id> --page p173
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import concurrent.futures
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
import time
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
try:
|
|
import yaml
|
|
except ImportError:
|
|
sys.stderr.write("pip3 install pyyaml\n"); sys.exit(1)
|
|
|
|
try:
|
|
from google import genai
|
|
from google.genai import types as genai_types
|
|
except ImportError:
|
|
sys.stderr.write("pip3 install google-genai\n"); sys.exit(1)
|
|
|
|
|
|
# Root of the archive working tree. Defaults to the original checkout location;
# set the UFO_ROOT environment variable to run the script against another copy.
UFO_ROOT = Path(os.environ.get("UFO_ROOT") or "/Users/guto/ufo")
# Per-page markdown files with YAML front matter: <PAGES>/<doc_id>/p*.md
PAGES = UFO_ROOT / "wiki" / "pages"
# Rendered page images: <PNG_BASE>/<doc_id>/p-NNN.png
PNG_BASE = UFO_ROOT / "processing" / "png"

# Model tried first; main() rebinds this when --model is given.
DEFAULT_MODEL = "gemini-3.1-pro-preview"
# Models tried, in order, when the previous attempt raises.
FALLBACK = ["gemini-3-pro-preview", "gemini-3.1-flash-lite"]
# Seconds to wait for a single generate_content call before giving up on it.
TIMEOUT_S = 180
|
|
|
|
|
|
PROMPT = """You are re-analyzing one page of a US Department of War declassified UAP/UFO document. A previous Haiku pass produced an EXAGGERATED description (it claimed heavy redaction coverage when the actual page was largely readable). Your job: produce a PRECISE replacement.
|
|
|
|
GROUND RULES:
|
|
- Count redactions EXACTLY. Each redaction is a solid black bar / opaque cover blocking specific text.
|
|
- Do NOT call a page "heavy redaction" unless >30% of its visible area is genuinely obscured.
|
|
- For each redaction, return a tight bbox (normalized 0..1 coords) that covers ONLY the black bar, not the whole line.
|
|
- If the page has NO redactions, return an empty array. If it has thin strips, give them small bboxes.
|
|
|
|
Output ONE JSON object (no fence, no preamble):
|
|
|
|
{
|
|
"vision_description": "2-5 sentences in English. Describe what is actually visible: layout, content category, classification markings, any redaction precisely quantified. Use plain language, no hyperbole.",
|
|
"vision_description_pt_br": "Mesmo conteúdo em português brasileiro (pt-br). Preserve acentos UTF-8. Mantenha citações verbatim do documento em inglês (não traduza texto que está dentro do documento).",
|
|
"redactions": [
|
|
{"code": "(b)(1) 1.4(a)|(b)(3)|(b)(6)|other", "description": "what field/text was obscured", "bbox": {"x": 0.0, "y": 0.0, "w": 0.0, "h": 0.0}, "text_inferred": null}
|
|
],
|
|
"content_classification": ["text-only"|"contains-photos"|"contains-sketches"|"contains-diagrams"|"contains-maps"|"contains-tables"|"contains-signatures"|"contains-stamps"|"redaction-heavy"|"mixed"|"blank"],
|
|
"page_type": "cover|toc|body|signature|photo|sketch|map|stamp|blank|appendix|redaction-heavy|table-page|mixed",
|
|
"reanalysis_confidence": 0.0
|
|
}
|
|
"""
|
|
|
|
|
|
def utc_now_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (second precision)."""
    now = datetime.now(tz=timezone.utc)
    return f"{now:%Y-%m-%dT%H:%M:%S}Z"
|
|
|
|
|
|
def read_fm(path: Path) -> tuple[dict, str]:
    """Split a markdown file into its YAML front matter and its body.

    Args:
        path: markdown file to read (decoded as UTF-8).

    Returns:
        ``(front_matter, body)``. ``front_matter`` is ``{}`` when the file has
        no front matter, when the YAML is invalid, or when it does not parse
        to a mapping. When there is no opening or no closing ``---`` fence,
        ``body`` is the whole file unchanged; otherwise it is the text after
        the closing fence with leading newlines removed.
    """
    text = path.read_text(encoding="utf-8")
    if not text.startswith("---"):
        return {}, text
    # The closing fence must start a line. Searching for a bare "---" would
    # also match dashes inside a value (e.g. a description containing "---")
    # and cut the front matter short.
    end = text.find("\n---", 3)
    if end < 0:
        return {}, text
    try:
        fm = yaml.safe_load(text[3:end].strip()) or {}
    except yaml.YAMLError:
        fm = {}
    # Callers use fm.get(...); a scalar or list document is not usable front matter.
    if not isinstance(fm, dict):
        fm = {}
    # Skip the newline plus the three dashes of the closing fence.
    return fm, text[end + 4:].lstrip("\n")
|
|
|
|
|
|
def write_fm(path: Path, fm: dict, body: str) -> None:
    """Write *fm* as a YAML front-matter block followed by *body* to *path*.

    The mapping is dumped in block style with key order preserved and
    non-ASCII characters kept as-is. The file is written as UTF-8 and
    replaces any existing content.
    """
    front = yaml.dump(fm, allow_unicode=True, sort_keys=False, default_flow_style=False)
    # Keep exactly one blank line between the closing fence and the body.
    gap = "\n\n" if not body.startswith("\n") else "\n"
    document = "".join(("---\n", front, "---", gap, body))
    path.write_text(document, encoding="utf-8")
|
|
|
|
|
|
def call_gemini(client, png_path: Path, model: str, attempt: int = 1):
    """Send one page image plus PROMPT to Gemini, with a hard timeout and fallbacks.

    Args:
        client: google-genai client; only ``client.models.generate_content`` is used.
        png_path: PNG of the page to analyze; its bytes are sent inline.
        model: model name to try on this attempt.
        attempt: 1-based attempt counter; attempt ``n`` falls back to
            ``FALLBACK[n - 1]`` when it fails.

    Returns:
        ``(response_text, model_name)`` where ``model_name`` is the model that
        actually answered.

    Raises:
        RuntimeError: the last attempt did not answer within ``TIMEOUT_S`` seconds.
        Exception: whatever the last attempt raised once FALLBACK is exhausted.
    """
    content = [
        genai_types.Part.from_bytes(data=png_path.read_bytes(), mime_type="image/png"),
        PROMPT,
    ]

    def _call():
        return client.models.generate_content(
            model=model,
            contents=content,
            config=genai_types.GenerateContentConfig(
                response_mime_type="application/json",
                temperature=0.2,
                max_output_tokens=16384,  # bumped iteratively (4096 → 8192 → 16384) for verbose pages
            ),
        )

    # Deliberately NOT `with ThreadPoolExecutor(...)`: leaving a `with` block
    # calls shutdown(wait=True), which would block on the hung worker and
    # defeat the timeout below.
    pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    try:
        future = pool.submit(_call)
        try:
            resp = future.result(timeout=TIMEOUT_S)
        except concurrent.futures.TimeoutError:
            raise RuntimeError(f"Gemini hung >{TIMEOUT_S}s") from None
        finally:
            # Release the executor without waiting. A stuck worker thread
            # cannot be killed; it is simply abandoned here.
            pool.shutdown(wait=False)
        return resp.text, model
    except Exception as e:
        if attempt <= len(FALLBACK):
            next_m = FALLBACK[attempt - 1]
            sys.stderr.write(f" ⚠ {model} failed ({type(e).__name__}); fallback → {next_m}\n")
            return call_gemini(client, png_path, next_m, attempt + 1)
        raise
|
|
|
|
|
|
def parse_json_lenient(text: str) -> dict:
    """Parse a model reply as a JSON object, tolerating a Markdown code fence.

    Leading/trailing whitespace and an optional surrounding ``` or ```json
    fence are removed before parsing. A one-element array wrapping an object
    is unwrapped to that object.

    Args:
        text: raw reply text; ``None`` is treated as an empty reply.

    Returns:
        The parsed JSON object.

    Raises:
        json.JSONDecodeError: the text is not valid JSON, or its top-level
            value is not an object (callers rely on getting a dict).
    """
    t = (text or "").strip()
    t = re.sub(r"^```(?:json)?\s*", "", t)
    t = re.sub(r"\s*```$", "", t)
    parsed = json.loads(t)
    if isinstance(parsed, list) and len(parsed) == 1 and isinstance(parsed[0], dict):
        parsed = parsed[0]
    if not isinstance(parsed, dict):
        # Same exception type as a syntax error so existing handlers still apply.
        raise json.JSONDecodeError("expected a JSON object", t, 0)
    return parsed
|
|
|
|
|
|
def process_page(client, page_md: Path, dry_run: bool) -> str:
    """Re-analyze one page with Gemini and rewrite its front matter.

    Args:
        client: google-genai client, passed through to call_gemini().
        page_md: page markdown file whose front matter holds ``doc_id`` and
            ``page_number``; the matching image is
            ``PNG_BASE/<doc_id>/p-<NNN>.png``.
        dry_run: when true, call the model and print the before/after summary
            but leave the file untouched.

    Returns:
        A status string: ``"no-fm"`` (no usable front matter), ``"bad-fm"``
        (missing or invalid ``doc_id`` / ``page_number``), ``"no-png"``,
        ``"error"`` (model call failed), ``"bad-json"`` (reply empty, not a
        JSON object, or without usable fields), ``"dry"`` or ``"ok"``.
    """
    fm, body = read_fm(page_md)
    if not isinstance(fm, dict) or not fm:
        return "no-fm"
    doc_id = fm.get("doc_id", "")
    try:
        # page_number may be missing, null or a non-numeric string.
        page_num = int(fm.get("page_number") or 0)
    except (TypeError, ValueError):
        return "bad-fm"
    if not doc_id or not page_num:
        return "bad-fm"
    # str(): a purely numeric doc_id is parsed from YAML as an int.
    png = PNG_BASE / str(doc_id) / f"p-{page_num:03d}.png"
    if not png.exists():
        return "no-png"

    # Pages given by absolute path may live outside UFO_ROOT.
    try:
        label = page_md.relative_to(UFO_ROOT)
    except ValueError:
        label = page_md
    # Show the model actually requested (--model may override the default).
    print(f" → {label} ({DEFAULT_MODEL})", flush=True)
    t0 = time.time()
    try:
        raw, model_used = call_gemini(client, png, DEFAULT_MODEL)
    except Exception as e:
        print(f" ✗ Gemini failed: {type(e).__name__}: {e}", flush=True)
        return "error"
    dt = time.time() - t0

    if not raw:
        # Nothing to parse (no text in the response).
        print(f" ✗ JSON parse failed: empty response from {model_used}", flush=True)
        return "bad-json"
    try:
        revision = parse_json_lenient(raw)
    except json.JSONDecodeError as e:
        print(f" ✗ JSON parse failed: {e}; raw[:200]={raw[:200]!r}", flush=True)
        return "bad-json"
    if not isinstance(revision, dict):
        print(f" ✗ JSON parse failed: expected an object; raw[:200]={raw[:200]!r}", flush=True)
        return "bad-json"
    if not revision.get("vision_description") and "redactions" not in revision:
        # Without a description or a redaction list there is nothing to
        # correct, so the page (and its mismatch flag) must stay as it is.
        print(f" ✗ JSON parse failed: no usable fields; raw[:200]={raw[:200]!r}", flush=True)
        return "bad-json"

    # Before/after summary
    old_n = len(fm.get("redactions") or [])
    new_n = len(revision.get("redactions") or [])
    old_desc = (fm.get("vision_description") or "")[:90]
    new_desc = (revision.get("vision_description") or "")[:90]
    print(f" redactions: {old_n} → {new_n}")
    print(f" OLD desc: {old_desc}…")
    print(f" NEW desc: {new_desc}…")

    if dry_run:
        return "dry"

    # Apply revision: text fields are replaced only when the model supplied a
    # non-empty value; redactions are replaced whenever the key is present so
    # that an empty list can clear stale entries.
    for key in ("vision_description", "vision_description_pt_br"):
        if revision.get(key):
            fm[key] = revision[key]
    if "redactions" in revision:
        fm["redactions"] = revision["redactions"] if revision["redactions"] is not None else []
    for key in ("content_classification", "page_type"):
        if revision.get(key):
            fm[key] = revision[key]

    fm["last_reanalysis_model"] = model_used
    fm["last_reanalysis_at"] = utc_now_iso()
    fm["last_reanalysis_confidence"] = revision.get("reanalysis_confidence")

    # Remove the mismatch flag now that it's been corrected
    flags = list(fm.get("flags") or [])
    if "vision-redaction-mismatch" in flags:
        flags.remove("vision-redaction-mismatch")
        fm["flags"] = flags

    write_fm(page_md, fm, body)
    print(f" ✓ wrote (took {dt:.1f}s)", flush=True)
    return "ok"
|
|
|
|
|
|
def main():
    """Command-line entry point: pick target pages and re-analyze each one.

    Exactly one target mode is used, in this priority order: ``--doc-id`` with
    ``--page``, ``--doc-id`` alone, ``--flagged``, ``--redaction-heavy``,
    ``--all``, ``--pages-file``. Requires ``GEMINI_API_KEY`` in the
    environment. Prints a per-status count when done.

    A failure on one page is reported on stderr and counted as ``"error"``;
    it never aborts the remaining pages, whether running sequentially or
    with ``--workers`` > 1.
    """
    global DEFAULT_MODEL
    ap = argparse.ArgumentParser()
    ap.add_argument("--doc-id")
    ap.add_argument("--page", help="specific page stem, e.g. p173 (requires --doc-id)")
    ap.add_argument("--flagged", action="store_true", help="all pages with vision-redaction-mismatch")
    ap.add_argument("--redaction-heavy", action="store_true", help="all pages currently classified redaction-heavy (re-triage)")
    ap.add_argument("--all", action="store_true")
    ap.add_argument("--pages-file", help=f"newline-separated list of page paths (relative to {UFO_ROOT}/ or absolute)")
    ap.add_argument("--model", default=DEFAULT_MODEL, help=f"override model (default {DEFAULT_MODEL})")
    ap.add_argument("--workers", type=int, default=1, help="parallel workers (raise for Flash Lite, keep 1 for Pro free tier)")
    ap.add_argument("--max", type=int, default=0, help="cap targets (0 = unlimited)")
    ap.add_argument("--dry-run", action="store_true")
    args = ap.parse_args()

    # --page alone would otherwise be ignored and a broader mode used instead.
    if args.page and not args.doc_id:
        ap.error("--page requires --doc-id")

    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        sys.stderr.write("✗ GEMINI_API_KEY not set\n")
        sys.exit(1)
    client = genai.Client(api_key=api_key)

    def _pages_where(field: str, value: str) -> list:
        """Return every page (sorted) whose front-matter *field* contains *value*."""
        hits = []
        # Sorted so that --max always selects the same pages.
        for page in sorted(PAGES.glob("*/p*.md")):
            fm, _ = read_fm(page)
            if isinstance(fm, dict) and value in (fm.get(field) or []):
                hits.append(page)
        return hits

    if args.doc_id and args.page:
        targets = [PAGES / args.doc_id / f"{args.page}.md"]
    elif args.doc_id:
        targets = sorted((PAGES / args.doc_id).glob("p*.md"))
    elif args.flagged:
        targets = _pages_where("flags", "vision-redaction-mismatch")
    elif args.redaction_heavy:
        targets = _pages_where("content_classification", "redaction-heavy")
    elif args.all:
        targets = sorted(PAGES.glob("*/p*.md"))
    elif args.pages_file:
        targets = []
        for line in Path(args.pages_file).read_text(encoding="utf-8").splitlines():
            s = line.strip()
            if not s:
                continue
            p = Path(s)
            targets.append(p if p.is_absolute() else UFO_ROOT / s)
    else:
        ap.error("provide --doc-id (+ --page), --flagged, --redaction-heavy, --all, or --pages-file")

    # Only a positive cap limits the list; a negative value would otherwise
    # slice pages off the end.
    if args.max > 0:
        targets = targets[:args.max]

    # process_page() reads the module-level DEFAULT_MODEL.
    DEFAULT_MODEL = args.model

    print(f"Processing {len(targets)} page(s) with {DEFAULT_MODEL} ({args.workers} worker(s))...")
    stats = {"ok": 0, "error": 0, "dry": 0, "no-png": 0, "no-fm": 0, "bad-fm": 0, "bad-json": 0}

    def _run(page: Path) -> str:
        """Process one target and always return a status string."""
        if not page.exists():
            sys.stderr.write(f"⚠ missing page file: {page}\n")
            return "no-fm"
        try:
            return process_page(client, page, args.dry_run)
        except Exception as e:
            sys.stderr.write(f"✗ {page}: {e}\n")
            return "error"

    if args.workers <= 1:
        results = map(_run, targets)
        for r in results:
            stats[r] = stats.get(r, 0) + 1
    else:
        with concurrent.futures.ThreadPoolExecutor(max_workers=args.workers) as pool:
            for r in pool.map(_run, targets):
                stats[r] = stats.get(r, 0) + 1

    print(f"\nDone. {stats}")
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|