# disclosure-bureau/scripts/13-analyze-loose-images.py

#!/usr/bin/env python3
"""
13-analyze-loose-images.py — Sherlock analysis for standalone images in raw/

Targets the loose image files in /Users/guto/ufo/raw/ that are NOT bundled in
PDFs (so they don't go through Phase 2 conversion). Currently:

  FBI-Photo-A1.png .. FBI-Photo-A8.png  (8 PNGs)
  NASA-UAP-VM1-Apollo-12-1969.jpg .. NASA-UAP-VM6-Apollo-17-1972.jpg  (6 JPGs)

For each, calls Gemini 3.1 Pro with a Sherlock-style prompt to extract:
  - forensic_description (rich English)
  - forensic_description_pt_br (Brazilian Portuguese)
  - what_is_visible, classification_markings, redactions
  - UAP morphology if applicable
  - sherlock_observations (Holmes/Poirot/Dupin/Locard lenses)
  - entities_extracted (people, places, equipment, UAP objects)
  - quality_signals + flags

Output:
  processing/image-analysis/<image-id>.json    (raw analysis)
  wiki/images-direct/<image-id>.md             (bilingual frontmatter + body)

Usage:
  ./13-analyze-loose-images.py                    # all
  ./13-analyze-loose-images.py --image <name>     # single file
  ./13-analyze-loose-images.py --max-files N      # cap for testing
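  ./13-analyze-loose-images.py --force            # re-run even if outputs already exist
  ./13-analyze-loose-images.py --model <name>     # override the default Gemini model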
"""
from __future__ import annotations

import argparse
import hashlib
import json
import os
import re
import sys
import time
import unicodedata
from datetime import datetime, timezone
from pathlib import Path

try:
    from google import genai
    from google.genai import types as genai_types
except ImportError:
    sys.stderr.write("Missing google-genai. pip3 install google-genai\n")
    sys.exit(1)

try:
    import yaml
except ImportError:
    sys.stderr.write("Missing pyyaml. pip3 install pyyaml\n")
    sys.exit(1)


# Hard-coded absolute project root; every other path below is derived from it.
UFO_ROOT = Path("/Users/guto/ufo")
# Directory scanned for standalone image files.
RAW_DIR = UFO_ROOT / "raw"
# Destination of the per-image "<image-id>.json" analysis dumps.
ANALYSIS_DIR = UFO_ROOT / "processing" / "image-analysis"
# Destination of the per-image "<image-id>.md" wiki pages.
WIKI_IMAGES_DIR = UFO_ROOT / "wiki" / "images-direct"
# Markdown log that main() appends a run summary to.
LOG_PATH = UFO_ROOT / "wiki" / "log.md"

# Default value of the --model CLI option.
DEFAULT_MODEL = "gemini-3.1-pro-preview"
# Fallback model names consulted by call_gemini_image when a request fails.
FALLBACK = ["gemini-3-pro-preview", "gemini-3.1-flash-lite"]
# Version strings written into the front matter of every generated wiki page.
SCHEMA_VERSION = "0.1.0"
WIKI_VERSION = "0.1.0"


# Prompt sent alongside every image. It fixes the JSON shape the model is asked
# to return; render_image_md reads the top-level keys declared here
# (image_overview, forensic_description*, executive_summary_*, entities_extracted,
# sherlock_observations, anomalies_detected, quality_signals, flags, ...), so the
# two must be kept in sync when the schema changes.
SHERLOCK_IMAGE_PROMPT = """You are an evidence officer in the Investigation Bureau analyzing a single standalone image released by the U.S. government as part of a UAP/UFO disclosure (war.gov/ufo). Apply the combined methodologies of Sherlock Holmes (observation + elimination), Hercule Poirot (psychology of behavior), Auguste Dupin (ratiocination), and Edmond Locard (trace evidence).

Output ONE JSON object (no markdown fence, no preamble):

{
  "image_overview": {
    "primary_subject": "what the image is fundamentally showing, one sentence",
    "camera_perspective": "ground | aerial | satellite | cockpit | underwater | unknown",
    "sensor_or_medium": "color_photograph | bw_photograph | infrared_FLIR | radar_screen | sketch_handdrawn | document_scan | screen_capture | unknown",
    "platform_inferred": "F/A-18 | helicopter | observer-handheld | naval ship | satellite | unknown",
    "estimated_era": "1940s-50s | 1960s-70s | 1980s-90s | 2000s | 2010s | 2020s | unknown"
  },

  "forensic_description": "Comprehensive English description, 8-15 sentences. Describe everything: composition, persons present, equipment, geography/landmarks, atmospheric conditions, any text/labels visible, any UAP and its morphology, photographic anomalies (lens flare, dust spot, motion blur), any visible processing marks (scanner artifacts, fold lines, redaction tape). Cite verbatim any text visible on the image.",

  "forensic_description_pt_br": "Same content as forensic_description, translated to Brazilian Portuguese (pt-br). Brazilian vocabulary and spelling. Preserve UTF-8 accents. Keep verbatim quoted text in original language.",

  "classification_markings_visible": [
    {"level": "UNCLASSIFIED | CUI | CONFIDENTIAL | SECRET | TOP SECRET", "caveats": ["NOFORN"], "location_on_image": "header | footer | corner | watermark | stamp"}
  ],

  "redactions_visible": [
    {"code": "(b)(1) 1.4(a) | (b)(3) | (b)(6) | other_or_unknown", "description": "what is being obscured", "bbox_normalized": {"x": 0.0, "y": 0.0, "w": 0.0, "h": 0.0}}
  ],

  "entities_extracted": {
    "people": [{"label": "Subject 1 | Officer X", "role": "subject | photographer | bystander | unknown", "facing_camera": true}],
    "organizations": [{"name": "FBI | USAF | NASA | ...", "evidence_for": "patch visible | uniform | logo | scanner stamp"}],
    "locations": [{"name": "where", "evidence_for": "landmark | sign | coordinates"}],
    "events": [{"label": "...", "date": "YYYY-MM-DD | YYYY | NA"}],
    "uap_objects": [{"shape": "sphere | disc | triangle | cylinder | cube | elongated-ellipsoid | cigar | irregular | unknown", "color": "...", "size_estimate": "...", "motion_descriptors": []}],
    "vehicles": [{"name": "...", "class": "aircraft | ship | submarine | spacecraft | satellite | ground | other"}],
    "equipment_visible": [{"name": "binoculars | sensor pod | camera | ...", "purpose": "..."}],
    "concepts": [{"name": "FOIA exemption | sketch | photograph | ...", "class": "legal-instrument | jargon | scientific-term | other"}]
  },

  "uap_observation_fields": {
    "shape": "...",
    "color": "...",
    "size_estimate": "...",
    "altitude_ft": null,
    "speed_kts": null,
    "bearing_deg": null,
    "distance_nm": null,
    "coordinates": {"lat": null, "lon": null, "raw_text": "..."},
    "morphology_notes": "describe any details of the apparent object",
    "kinematic_anomalies": "anything physics-defying inferable from the still"
  },

  "sherlock_observations": [
    {
      "detective_lens": "holmes | poirot | dupin | locard",
      "observation": "Non-obvious detail. e.g. 'The shadow direction does not match the apparent sun angle suggested by the object highlight, indicating either composite imaging or a light source different from the sun.'",
      "implication": "why this matters investigatively",
      "confidence_band": "high | medium | low | speculation"
    }
  ],

  "anomalies_detected": [
    {
      "kind": "photographic_artifact | optical_illusion | film_processing | hoax_indicator | inconsistency | unredacted_slip | morphological_anomaly",
      "description": "...",
      "candidate_explanations": ["lens-flare", "double-exposure", "physical-object", "post-processing", "atmospheric"],
      "confidence_band": "high | medium | low | speculation"
    }
  ],

  "executive_summary_en": "3-5 sentence English summary suitable for citation in a chat reply.",
  "executive_summary_pt_br": "Same in Brazilian Portuguese (pt-br).",

  "quality_signals": {
    "image_quality_overall": "high | medium | low",
    "resolution_apparent": "high | medium | low",
    "redaction_density": "none | light | heavy | full-blackout",
    "completeness": "complete | truncated | partial",
    "extraction_confidence": "high | medium | low"
  },

  "flags": ["sketch-handdrawn", "redaction-heavy", "low-resolution", "monochrome", "darkened", "scanner-artifact", "fold-marks", "stamp-overlay"]
}

Rules:
- Output ONLY the JSON. No fence. No preamble.
- Empty arrays / null for not applicable. Never omit keys.
- ALL extracted text in ORIGINAL language. Do NOT translate.
- ONLY `forensic_description_pt_br` and `executive_summary_pt_br` are translated to Brazilian Portuguese.
- bbox_normalized is 0..1 (x,y,w,h) relative to the image.
- Aim for ≥4 sherlock_observations including subtle photographic details.
- For anomalies, list ≥3 candidate explanations including a mundane one.
"""


def filename_to_image_id(name: str) -> str:
    """Derive a lowercase, hyphen-separated identifier from an image filename.

    The text after the last ``.`` is dropped, accents are removed via NFKD
    decomposition, every character outside ``[a-z0-9-]`` becomes a hyphen,
    hyphen runs are collapsed and trimmed from both ends, and ``img-`` is
    prepended when the result would otherwise begin with a digit.
    """
    stem, dot, _extension = name.rpartition(".")
    if not dot:
        # No "." anywhere: the whole name is the stem.
        stem = name

    decomposed = unicodedata.normalize("NFKD", stem)
    unaccented = "".join(filter(lambda ch: not unicodedata.combining(ch), decomposed))

    slug = re.sub(r"[^a-z0-9-]", "-", unaccented.lower())
    slug = re.sub(r"-+", "-", slug).strip("-")

    # slug[:1] is "" for an empty slug, and "".isdigit() is False.
    return "img-" + slug if slug[:1].isdigit() else slug


def sha256_file(p: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *p*.

    The file is streamed in 64 KiB chunks so it never has to fit in memory.
    """
    digest = hashlib.sha256()
    with open(p, "rb") as stream:
        while True:
            block = stream.read(65536)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()


def utc_now_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (second precision)."""
    current = datetime.now(timezone.utc)
    return f"{current:%Y-%m-%dT%H:%M:%SZ}"


def call_gemini_image(client, image_path: Path, model: str, attempt: int = 1, timeout: int = 180):
    """Send one image plus the Sherlock prompt to Gemini, with a timeout and model fallback.

    Args:
        client: A ``genai.Client``; only ``files.upload``, ``files.get`` and
            ``models.generate_content`` are used.
        image_path: Image to analyze. Files above 19 MiB go through the Files
            API; smaller ones are sent inline (presumably to stay under the
            API's inline request size limit — confirm against current docs).
        model: Model name to try on this attempt.
        attempt: 1 for the caller's model; ``n + 1`` while trying ``FALLBACK[n - 1]``.
            Callers should leave this at its default.
        timeout: Seconds allowed for the generation call, and separately for
            server-side processing of an uploaded file.

    Returns:
        ``(text, model)``: the non-empty response text and the name of the
        model that produced it.

    Raises:
        Exception: The error from the last model tried, once the requested
            model and every entry of ``FALLBACK`` have failed.
    """
    import threading

    try:
        if image_path.stat().st_size > 19 * 1024 * 1024:
            file = client.files.upload(file=str(image_path))
            # Bound the polling loop: an upload stuck in PROCESSING would
            # otherwise block the whole batch forever.
            deadline = time.monotonic() + timeout
            while file.state.name == "PROCESSING":
                if time.monotonic() >= deadline:
                    raise RuntimeError(f"Gemini upload still PROCESSING after {timeout}s")
                time.sleep(2)
                file = client.files.get(name=file.name)
            content = [file, SHERLOCK_IMAGE_PROMPT]
        else:
            mime = "image/png" if image_path.suffix.lower() == ".png" else "image/jpeg"
            content = [
                genai_types.Part.from_bytes(data=image_path.read_bytes(), mime_type=mime),
                SHERLOCK_IMAGE_PROMPT,
            ]

        outcome: dict = {}

        def _call():
            try:
                outcome["resp"] = client.models.generate_content(
                    model=model,
                    contents=content,
                    config=genai_types.GenerateContentConfig(
                        response_mime_type="application/json",
                        temperature=0.2,
                        max_output_tokens=16384,
                    ),
                )
            except BaseException as exc:  # handed back to the calling thread below
                outcome["error"] = exc

        # A daemon thread is used instead of `with ThreadPoolExecutor(...)`:
        # leaving that `with` block calls shutdown(wait=True), which waits for
        # the hung request and therefore defeats the timeout. A daemon thread
        # can simply be abandoned and does not keep the interpreter alive.
        worker = threading.Thread(target=_call, daemon=True)
        worker.start()
        worker.join(timeout)
        if worker.is_alive():
            raise RuntimeError(f"Gemini hung >{timeout}s — treating as failure")
        if "error" in outcome:
            raise outcome["error"]

        text = outcome["resp"].text
        if not text:
            # The response carried no text; treat it as a failure so the next
            # model is tried instead of handing None/"" to the caller.
            raise RuntimeError("Gemini returned an empty response")
        return text, model
    except Exception as e:
        # `<=` so that every entry of FALLBACK is tried: attempt 1 is the
        # requested model, attempt n + 1 is FALLBACK[n - 1].
        if attempt <= len(FALLBACK):
            next_m = FALLBACK[attempt - 1]
            print(f"  ⚠ {model} failed ({e}); fallback {next_m}", flush=True)
            return call_gemini_image(client, image_path, next_m, attempt + 1, timeout)
        raise


def render_image_md(image_id: str, image_path: Path, analysis: dict, meta: dict, now_iso: str) -> str:
    """Render the wiki page for one image: YAML front matter plus a Markdown body.

    Args:
        image_id: Identifier used in the page title and front matter.
        image_path: Source image; it is read here to record its SHA-256 and size.
        analysis: Parsed model output. Missing or null keys render as empty
            sections rather than raising.
        meta: Run metadata; only ``meta["model"]`` is read.
        now_iso: Timestamp written to ``analyzed_at`` and ``last_ingest``.

    Returns:
        The full page text: ``---``-delimited front matter followed by the body.

    Raises:
        OSError: If ``image_path`` cannot be read.
    """

    def _entries(key: str) -> list:
        """Return the dict items of a list-valued analysis field; anything else is dropped."""
        value = analysis.get(key)
        if not isinstance(value, list):
            return []
        return [item for item in value if isinstance(item, dict)]

    def _field(entry: dict, key: str, default: str) -> str:
        """Return ``entry[key]`` as text, or *default* when it is missing or null."""
        value = entry.get(key)
        return default if value is None else str(value)

    def _candidates(entry: dict) -> str:
        """Join candidate explanations, tolerating null, scalar, or non-string values."""
        raw = entry.get("candidate_explanations")
        if raw is None:
            raw = []
        elif not isinstance(raw, (list, tuple)):
            raw = [raw]
        return ", ".join(str(item) for item in raw)

    fm = {
        "schema_version": SCHEMA_VERSION,
        "type": "image",
        "image_id": image_id,
        "image_class": "standalone",  # vs "page-extract"
        "original_filename": image_path.name,
        "raw_path": f"../../raw/{image_path.name}",
        "sha256": sha256_file(image_path),
        "size_bytes": image_path.stat().st_size,
        "vision_model": meta.get("model"),
        "analyzed_at": now_iso,
        # Flatten image_overview into overview_* keys at the top level.
        **{f"overview_{k}": v for k, v in (analysis.get("image_overview") or {}).items()},
        "uap_observation_fields": analysis.get("uap_observation_fields"),
        "classification_markings_visible": analysis.get("classification_markings_visible") or [],
        "redactions_visible": analysis.get("redactions_visible") or [],
        "entities_extracted": analysis.get("entities_extracted") or {},
        "anomalies_detected": analysis.get("anomalies_detected") or [],
        "sherlock_observations": analysis.get("sherlock_observations") or [],
        "forensic_description": analysis.get("forensic_description", ""),
        "forensic_description_pt_br": analysis.get("forensic_description_pt_br", ""),
        "executive_summary_en": analysis.get("executive_summary_en", ""),
        "executive_summary_pt_br": analysis.get("executive_summary_pt_br", ""),
        "quality_signals": analysis.get("quality_signals") or {},
        "flags": analysis.get("flags") or [],
        "last_ingest": now_iso,
        "wiki_version": WIKI_VERSION,
    }
    yaml_str = yaml.dump(fm, allow_unicode=True, sort_keys=False, default_flow_style=False)
    header = f"""# Image Analysis — {image_id}

> Source: `raw/{image_path.name}` · Gemini model: `{meta.get("model")}` · Analyzed: {now_iso}

![image](../../raw/{image_path.name})

## Executive Summary (EN)

{(analysis.get("executive_summary_en") or "").strip()}

## Sumário Executivo (PT-BR)

{(analysis.get("executive_summary_pt_br") or "").strip()}

## Forensic Description (EN)

{(analysis.get("forensic_description") or "").strip()}

## Descrição Forense (PT-BR)

{(analysis.get("forensic_description_pt_br") or "").strip()}

## Sherlock Observations

"""
    # Collect the pieces and join once instead of growing a string with `+=`.
    pieces = [header]
    for o in _entries("sherlock_observations"):
        pieces.append(
            f"- **[{_field(o, 'detective_lens', '?')}]** {_field(o, 'observation', '')}\n"
            f"  - _Implication:_ {_field(o, 'implication', '')}\n"
            f"  - _Confidence:_ `{_field(o, 'confidence_band', '?')}`\n\n"
        )

    pieces.append("## Anomalies Detected\n\n")
    for a in _entries("anomalies_detected"):
        pieces.append(
            f"- **{_field(a, 'kind', '?')}**: {_field(a, 'description', '')}\n"
            f"  - _Candidates:_ {_candidates(a)}\n"
            f"  - _Confidence:_ `{_field(a, 'confidence_band', '?')}`\n\n"
        )

    body = "".join(pieces)
    return f"---\n{yaml_str}---\n\n{body}"


def process_image(client, image_path: Path, model: str, force: bool) -> bool:
    """Analyze one image and write its JSON dump and wiki page.

    Args:
        client: Gemini client passed through to ``call_gemini_image``.
        image_path: Image file to analyze.
        model: Model name to try first.
        force: When false, an image whose JSON and Markdown outputs both exist
            is skipped and reported as a success.

    Returns:
        True when the image was skipped or both outputs were written; False on
        any failure. A failure is printed and never raised, so one bad image
        cannot abort a batch.
    """
    image_id = filename_to_image_id(image_path.name)
    if not image_id:
        # An empty id would produce outputs literally named ".json" / ".md".
        print(f"  ✗ cannot derive an image id from {image_path.name!r}", flush=True)
        return False
    json_out = ANALYSIS_DIR / f"{image_id}.json"
    md_out = WIKI_IMAGES_DIR / f"{image_id}.md"

    if not force and json_out.exists() and md_out.exists():
        print(f"  skip {image_id} (already processed)", flush=True)
        return True

    if not image_path.is_file():
        # Fail fast: otherwise the missing file is retried against every model.
        print(f"  ✗ {image_path} not found or not a regular file", flush=True)
        return False

    json_out.parent.mkdir(parents=True, exist_ok=True)
    md_out.parent.mkdir(parents=True, exist_ok=True)

    print(f"\n=== {image_path.name} → {image_id} ===", flush=True)
    t0 = time.time()
    try:
        text, model_used = call_gemini_image(client, image_path, model)
    except Exception as e:
        print(f"  ✗ generation failed: {e}", flush=True)
        return False

    # Guard against a None reply: it becomes "" and is reported as a parse failure.
    text = (text or "").strip()
    if text.startswith("```"):
        # Tolerate a Markdown code fence even though the prompt forbids one.
        text = re.sub(r"^```(?:json)?\s*", "", text)
        text = re.sub(r"\s*```$", "", text)

    raw_dump = json_out.with_suffix(".raw.txt")
    try:
        analysis = json.loads(text)
    except json.JSONDecodeError as e:
        print(f"  ✗ JSON parse failed: {e}", flush=True)
        raw_dump.write_text(text, encoding="utf-8")
        return False
    if not isinstance(analysis, dict):
        # Valid JSON but not an object (e.g. a list); rendering needs a dict.
        print(f"  ✗ JSON parse failed: expected an object, got {type(analysis).__name__}", flush=True)
        raw_dump.write_text(text, encoding="utf-8")
        return False

    meta = {"model": model_used, "duration_seconds": round(time.time() - t0, 1)}
    try:
        json_out.write_text(
            json.dumps({"analysis": analysis, "meta": meta}, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )
        md = render_image_md(image_id, image_path, analysis, meta, utc_now_iso())
        md_out.write_text(md, encoding="utf-8")
    except Exception as e:
        # Per-image boundary: report and let the caller move on to the next image.
        print(f"  ✗ writing outputs failed: {e}", flush=True)
        return False
    print(f"  ✓ {image_id} ({time.time() - t0:.1f}s)", flush=True)
    return True


def find_loose_images() -> list[Path]:
    """Return the image files located directly in ``RAW_DIR``, sorted by path.

    A file qualifies when its extension is ``.png``, ``.jpg`` or ``.jpeg``,
    compared case-insensitively so that e.g. ``SCAN.JPG`` is picked up too;
    ``call_gemini_image`` already lowercases the suffix when choosing a MIME type.
    Subdirectories are not searched, and directories are never returned.
    An absent ``RAW_DIR`` yields an empty list.
    """
    if not RAW_DIR.is_dir():
        return []
    extensions = {".png", ".jpg", ".jpeg"}
    return sorted(
        entry
        for entry in RAW_DIR.iterdir()
        if entry.is_file() and entry.suffix.lower() in extensions
    )


def main():
    """CLI entry point: analyze the selected images and append a summary to the log.

    Options:
        --image      Process only this filename inside ``RAW_DIR``.
        --max-files  Cap the number of images; ``0`` processes none.
        --model      Model to try first (defaults to ``DEFAULT_MODEL``).
        --force      Passed through to ``process_image`` as its ``force`` flag.

    Exits with status 2 when ``GEMINI_API_KEY`` is unset or ``--max-files`` is
    negative. Per-image failures are listed but do not change the exit status.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--image", help="single image filename in raw/")
    ap.add_argument("--max-files", type=int, default=None)
    ap.add_argument("--model", default=DEFAULT_MODEL)
    ap.add_argument("--force", action="store_true")
    args = ap.parse_args()
    if args.max_files is not None and args.max_files < 0:
        # A negative cap would silently slice from the end of the list.
        ap.error("--max-files must be >= 0")

    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        sys.stderr.write("GEMINI_API_KEY not set\n")
        sys.exit(2)
    client = genai.Client(api_key=api_key)

    targets = [RAW_DIR / args.image] if args.image else find_loose_images()
    # Compare against None, not truthiness, so that `--max-files 0` is honoured.
    if args.max_files is not None:
        targets = targets[: args.max_files]

    print(f"Processing {len(targets)} image(s) with {args.model}")
    ok = 0
    fail = []
    for p in targets:
        if process_image(client, p, args.model, args.force):
            ok += 1
        else:
            fail.append(p.name)

    print(f"\nDone. ok={ok}, failed={len(fail)}")
    if fail:
        print("failed:", fail)
    if ok > 0:
        # Only runs with at least one success are logged.
        LOG_PATH.parent.mkdir(parents=True, exist_ok=True)
        with open(LOG_PATH, "a", encoding="utf-8") as fh:
            fh.write(
                f"\n## {utc_now_iso()} — IMAGE ANALYSIS (Phase 4.7)\n"
                f"- operator: archivist + evidence-officer\n- script: scripts/13-analyze-loose-images.py\n"
                f"- model: {args.model}\n- images_ok: {ok}\n- images_failed: {len(fail)}\n"
            )


# Run the CLI only when this file is executed as a script.
if __name__ == "__main__":
    main()