# disclosure-bureau/scripts/11-generate-case-images.py

#!/usr/bin/env python3
"""
11-generate-case-images.py — Generate "case images" (Nano Banana + Codex) per entity

For each completed video (wiki/videos/<id>.md) OR document (wiki/documents/<id>.md),
generate conceptual images representing the case, using the executive summary
and UAP observation fields as the prompt seed:

  processing/case-images/<entity-id>/case-nanobanana.png
  processing/case-images/<entity-id>/case-codex.png
  processing/case-images/<entity-id>/investigation-diagram.png  (videos with extracted frames only)

These are "what the case might look like" reproductions — NOT evidence, NOT
real-data reconstructions. They are speculative visualizations for the chat UI
to display alongside citations (the future Sherlock chat app).

Every image is recorded under `case_images` in the entity markdown frontmatter,
tagged `synthetic: true`, and accompanied by a `case_images_warnings` block.

Usage:
  ./11-generate-case-images.py --kind videos              # process all wiki/videos/*.md
  ./11-generate-case-images.py --kind documents           # process all wiki/documents/*.md
  ./11-generate-case-images.py --kind both                # both
  ./11-generate-case-images.py --entity-id dod-111689005  # single entity (video or doc)
  ./11-generate-case-images.py --skip-codex               # only Nano Banana (cheaper)
  ./11-generate-case-images.py --skip-nano                # only Codex
  ./11-generate-case-images.py --force                    # re-generate
"""
from __future__ import annotations

import argparse
import os
import re
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path

try:
    import yaml
except ImportError:
    sys.stderr.write("Missing pyyaml. Run: pip3 install pyyaml\n")
    sys.exit(1)

try:
    from google import genai
    from PIL import Image as PILImage
    from io import BytesIO
except ImportError:
    sys.stderr.write("Missing google-genai or pillow. Run: pip3 install google-genai pillow\n")
    sys.exit(1)


# Root of the UFO archive. Defaults to the original author's checkout; set the
# UFO_ROOT environment variable to run the script against another location
# without editing this file.
UFO_ROOT = Path(os.environ.get("UFO_ROOT") or "/Users/guto/ufo")
# Model id passed to the Gemini SDK on the multi-image Nano Banana path.
NANO_BANANA_MODEL = "gemini-3-pro-image-preview"
# Entity markdown files (one per video / document).
WIKI_VIDEOS_DIR = UFO_ROOT / "wiki" / "videos"
WIKI_DOCS_DIR = UFO_ROOT / "wiki" / "documents"
# Output directory: one sub-directory per entity id.
CASE_IMAGES_DIR = UFO_ROOT / "processing" / "case-images"
# Input directory: extracted video frames, one sub-directory per video id.
FRAMES_DIR = UFO_ROOT / "processing" / "uap-frames"
# Append-only run log written at the end of main().
LOG_PATH = UFO_ROOT / "wiki" / "log.md"

# External skill script invoked through `uv run` for the zero/one-image path.
NANO_BANANA_SCRIPT = Path.home() / ".claude" / "skills" / "nano-banana-pro" / "scripts" / "generate_image.py"


def find_best_frame(video_id: str) -> Path | None:
    """Pick a single representative frame JPEG for a video.

    Looks in ``FRAMES_DIR/<video_id>`` and returns the first file (in sorted
    name order) whose name contains the highest-priority keyword; keywords are
    tried in the order listed below. Falls back to the first JPEG when no
    keyword matches, and returns ``None`` when the directory is missing or
    holds no ``.jpg`` files.
    """
    frame_dir = FRAMES_DIR / video_id
    if not frame_dir.exists():
        return None

    frames = sorted(frame_dir.glob("*.jpg"))
    if not frames:
        return None

    # NOTE(review): these keywords look like truncated "-mid"/"-first"/"-last"
    # labels — confirm against the frame extractor's naming before changing.
    priority = ("-mi.jpg", "-firs.jpg", "-las.jpg", "-sample")
    for marker in priority:
        hit = next((frame for frame in frames if marker in frame.name), None)
        if hit is not None:
            return hit
    return frames[0]


def find_all_frames(video_id: str, max_n: int = 5) -> list[Path]:
    """Return up to ``max_n`` frame JPEGs for a video, in chronological order.

    Frames are read from ``FRAMES_DIR/<video_id>`` and ordered by the timestamp
    encoded in the filename (``frame-MM-SS_NN-label.jpg``: minutes, seconds and
    ``NN / 100`` as the fractional second). Files whose name does not match the
    pattern get timestamp 0 and therefore sort first, keeping their
    alphabetical order among themselves.

    When more than ``max_n`` frames exist they are evenly subsampled starting
    at the first frame; the final frame is not guaranteed to be in the sample.

    Returns an empty list when ``max_n`` is not positive, the directory does
    not exist, or it contains no ``.jpg`` files.
    """
    # Guard first: max_n == 0 would otherwise divide by zero while subsampling.
    if max_n <= 0:
        return []
    frame_dir = FRAMES_DIR / video_id
    if not frame_dir.exists():
        return []
    jpgs = sorted(frame_dir.glob("*.jpg"))
    if not jpgs:
        return []

    def timestamp(p: Path) -> float:
        m = re.match(r"frame-(\d+)-(\d+)_(\d+)", p.name)
        if not m:
            return 0
        minutes, seconds, fraction = (int(g) for g in m.groups())
        return minutes * 60 + seconds + fraction / 100

    # sorted() is stable, so equal timestamps keep their name order.
    ordered = sorted(jpgs, key=timestamp)
    if len(ordered) <= max_n:
        return ordered

    # Evenly spaced picks: int(i * step) is always < len(ordered).
    step = len(ordered) / max_n
    return [ordered[int(i * step)] for i in range(max_n)]


def read_md(path: Path) -> tuple[dict, str]:
    """Split a markdown file into ``(frontmatter, body)``.

    The file must start with ``---`` and the frontmatter ends at the next line
    that consists solely of ``---`` (optionally followed by spaces/tabs). A
    ``---`` that merely appears inside a frontmatter value is NOT treated as
    the closing delimiter, so free-text fields cannot truncate the YAML.

    Returns:
        ``({}, full_text)`` when there is no opening or no closing delimiter.
        ``({}, body)`` when the YAML is invalid or is not a mapping.
        ``(mapping, body)`` otherwise. Leading newlines are stripped from body.
    """
    text = path.read_text(encoding="utf-8")
    if not text.startswith("---"):
        return {}, text

    # With an explicit start position, '^' still only matches at real line
    # starts, so the opening delimiter itself is never matched again.
    closing = re.compile(r"^---[ \t]*$", re.MULTILINE).search(text, 3)
    if closing is None:
        return {}, text

    body = text[closing.end():].lstrip("\n")
    try:
        loaded = yaml.safe_load(text[3:closing.start()].strip())
    except yaml.YAMLError:
        return {}, body
    # Callers use fm.get(...); a scalar or list document would crash them.
    return (loaded if isinstance(loaded, dict) else {}), body

def build_case_prompt(fm: dict, kind: str, has_reference_frame: bool) -> str:
    """Build the image-generation prompt for one entity.

    Args:
        fm: Entity frontmatter. For ``kind == "video"`` the function reads
            ``video_id``, ``executive_summary_en``, ``uap_observation_fields``,
            every ``overview_*`` key and ``sherlock_observations``. For any
            other kind (documents) it reads ``canonical_title``/``doc_id``,
            ``executive_summary``, ``document_class`` and ``war_gov``.
        kind: ``"video"`` or anything else (treated as a document).
        has_reference_frame: When true the prompt tells the model to enhance
            the attached frame; otherwise it asks for a text-only conceptual
            scene.

    Returns:
        The full prompt text. Missing, null or wrongly-typed frontmatter values
        degrade to "unknown"/empty placeholders instead of raising.
    """
    def as_dict(value):
        return value if isinstance(value, dict) else {}

    def as_list(value):
        return value if isinstance(value, list) else []

    if kind == "video":
        title = fm.get("video_id", "unknown")
        summary = str(fm.get("executive_summary_en") or "").strip()
        uap = as_dict(fm.get("uap_observation_fields"))
        overview = {
            k.replace("overview_", ""): v
            for k, v in fm.items()
            if isinstance(k, str) and k.startswith("overview_")
        }
        # `coordinates: null` in YAML yields None, not a missing key.
        location_hint = as_dict(uap.get("coordinates")).get("raw_text") or ""
        sherlock_summary = "; ".join(
            str(o.get("observation") or "")[:120]
            for o in as_list(fm.get("sherlock_observations"))[:3]
            if isinstance(o, dict)
        )
    else:  # document
        title = fm.get("canonical_title") or fm.get("doc_id", "unknown")
        summary = str(fm.get("executive_summary") or "").strip()
        uap = {}
        war_gov = as_dict(fm.get("war_gov"))
        overview = {
            "primary_subject": fm.get("document_class") or "",
            "incident_date": war_gov.get("incident_date_official") or "",
            "incident_location": war_gov.get("incident_location_official") or "",
        }
        location_hint = overview["incident_location"]
        sherlock_summary = ""

    shape = uap.get("shape") or "unknown"
    color = uap.get("color") or "unknown"
    altitude = uap.get("altitude_ft") or "unknown"
    speed = uap.get("speed_kts") or "unknown"

    # A bare string must not be joined character by character.
    descriptors = uap.get("maneuver_descriptors") or []
    if not isinstance(descriptors, (list, tuple)):
        descriptors = [descriptors]
    maneuvers = ", ".join(str(d) for d in descriptors) or "no specific maneuvers reported"

    sensors = as_list(uap.get("sensor_observations"))
    first_sensor = as_dict(sensors[0]) if sensors else {}
    sensor_str = first_sensor.get("sensor") or "unknown sensor"

    if has_reference_frame:
        intro = """USE THE ATTACHED REFERENCE FRAME as your visual starting point. This is an actual frame extracted from the original UAP video at a moment when the UAP is visible. Enhance and re-interpret this exact scene cinematically while keeping ALL the real visual elements: same camera angle, same terrain/sensor view, same UAP position, same scale, same lighting conditions of the IR/FLIR/visible sensor.

The output should look like a CINEMATIC VERSION of the same moment captured in the frame — same scene, same UAP, but rendered with higher production value and atmospheric depth. DO NOT change the location of the UAP. DO NOT invent buildings, terrain, or atmosphere that aren't in the reference frame."""
    else:
        intro = """Create a photorealistic conceptual reproduction of a UAP/UFO incident scene from a U.S. Department of War declassified case."""

    return f"""{intro}

CASE METADATA:
- title: {title}
- narrative: {summary[:600]}
- location: {location_hint or 'unknown'}
- primary subject: {overview.get('primary_subject', '')}
- camera vantage: {overview.get('camera_perspective', 'aerial')}
- sensor depicted: {sensor_str}

UAP CHARACTERISTICS:
- shape: {shape}
- color: {color}
- altitude: {altitude}
- speed: {speed}
- maneuvers: {maneuvers}

KEY OBSERVATIONS: {sherlock_summary[:400]}

ABSOLUTE RULES:
- Do NOT add any HUD telemetry text, altitude readouts, headings, coordinates, callsigns, or date/time stamps. These would be fabricated.
- Do NOT add classification banners with specific levels (SECRET, NOFORN, etc).
- Do NOT add ANY text at all.
- Cinematic photorealism, IMAX documentary aesthetic, somber investigative mood.
- 16:9 aspect ratio.

This is a CONCEPTUAL VISUALIZATION — artistic interpretation, not evidence."""


def build_diagram_prompt(fm: dict) -> str:
    """Build the "investigation board" annotation prompt for a video frame.

    Reads up to four entries of ``sherlock_observations`` (``detective_lens``,
    ``observation``) and up to two entries of ``anomalies_detected`` (``kind``,
    ``description``) from the frontmatter and embeds them as context. The
    prompt text assumes a reference frame is attached by the caller.

    Null values and non-dict list entries are tolerated: null text becomes an
    empty string, a null label becomes ``?``, and malformed entries are skipped.
    """
    def clip(value, limit):
        # YAML nulls arrive as None, which cannot be sliced.
        return str(value or "")[:limit]

    def dict_entries(value, limit):
        seq = value if isinstance(value, list) else []
        return [entry for entry in seq[:limit] if isinstance(entry, dict)]

    sherlock_text = " | ".join(
        f"[{o.get('detective_lens') or '?'}] {clip(o.get('observation'), 100)}"
        for o in dict_entries(fm.get("sherlock_observations"), 4)
    )
    anomaly_text = " | ".join(
        f"{a.get('kind') or '?'}: {clip(a.get('description'), 80)}"
        for a in dict_entries(fm.get("anomalies_detected"), 2)
    )
    return f"""USE THE ATTACHED REFERENCE FRAME from the UAP video. Transform it into a Sherlock Holmes investigative diagram board. Keep the underlying scene (slightly brightened for legibility), and overlay handwritten-style red-pen detective annotations.

CONTEXT FROM ANALYSIS:
- Sherlock observations: {sherlock_text[:500]}
- Anomalies detected: {anomaly_text[:200]}

OVERLAY ANNOTATIONS (hand-drawn in red pen on transparent overlay):
- ◯ "UAP TARGET" circled around the most likely UAP position with arrow pointing to it
- ◯ "TRACKING LOCK" or "CROSSHAIR" if a tracking marker is visible in the frame
- ◯ "OBSERVED FROM" labeling the camera vantage (cockpit, ground, etc.)
- A dashed yellow-highlighter arrow showing the inferred motion direction with label "APPROXIMATE FLIGHT PATH"
- Bottom-left annotation in small red text summarizing ONE key observation (e.g. "IR signature: linear motion, no visible exhaust — anomalous")
- Top-right small annotation: "SOURCE: DOD VIDEO, DECLASSIFIED"

STYLE:
- Annotations look hand-drawn, slightly imperfect, like a real detective pinned the photo on a corkboard and circled clues
- The base scene from the frame stays intact (don't replace it, just annotate)
- Corkboard pins in the corners
- Slight grungy texture overlay
- Forensic investigation board / vintage detective work

ABSOLUTE RULES:
- Do NOT invent specific telemetry numbers (altitude, heading, coords, timestamps)
- All annotations are INTERPRETATIONS of what's visible, not data extracted from HUD
- Do NOT remove or alter the actual scene content"""


# Process-wide Gemini SDK client; stays None until the first request needs it.
_gemini_client = None


def _get_gemini_client():
    """Return the shared ``genai.Client``, creating it on first use.

    The API key is taken from ``GEMINI_API_KEY`` or, failing that,
    ``GOOGLE_API_KEY``.

    Raises:
        RuntimeError: neither environment variable holds a non-empty value.
    """
    global _gemini_client
    if _gemini_client is not None:
        return _gemini_client

    key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
    if not key:
        raise RuntimeError("GEMINI_API_KEY / GOOGLE_API_KEY not set")
    _gemini_client = genai.Client(api_key=key)
    return _gemini_client


def call_nano_banana(prompt: str, out_path: Path, input_images: list[Path] | None = None, resolution: str = "2K") -> bool:
    """Generate one image with Nano Banana Pro and write it to ``out_path``.

    Two execution paths:

    * Zero or one reference image: runs the external skill script through
      ``uv run`` (300 s timeout), passing ``resolution`` through.
    * Two or more reference images: calls the Gemini SDK directly, because the
      skill script accepts a single input image only. ``resolution`` is not
      used on this path. Reference paths that do not exist are skipped.

    Returns:
        True when an image was written, False on any failure. Failures
        (non-zero exit, timeout, launch error, SDK error, response without an
        image) are reported on stderr instead of being raised, so one failed
        generator does not abort the rest of the entity.
    """
    if not input_images or len(input_images) <= 1:
        cmd = [
            "uv", "run", str(NANO_BANANA_SCRIPT),
            "--prompt", prompt,
            "--filename", str(out_path),
            "--resolution", resolution,
        ]
        if input_images:
            cmd.extend(["--input-image", str(input_images[0])])
        try:
            res = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
        except subprocess.TimeoutExpired:
            sys.stderr.write("  ✗ Nano Banana (skill) timed out after 300s\n")
            return False
        except OSError as e:  # e.g. `uv` is not installed / not on PATH
            sys.stderr.write(f"  ✗ Nano Banana (skill) could not be launched: {e}\n")
            return False
        if res.returncode != 0:
            sys.stderr.write(f"  ✗ Nano Banana (skill) failed: {res.stderr[-400:]}\n")
            return False
        return out_path.exists() and out_path.stat().st_size > 0

    # Multi-image path: direct SDK call
    pil_images = []
    try:
        client = _get_gemini_client()
        for p in input_images:
            if p.exists():
                pil_images.append(PILImage.open(p))
        response = client.models.generate_content(
            model=NANO_BANANA_MODEL,
            contents=[*pil_images, prompt],
        )
        # The image bytes, when present, are in the first candidate's parts.
        candidates = getattr(response, "candidates", None) or []
        content = getattr(candidates[0], "content", None) if candidates else None
        for part in (getattr(content, "parts", None) or []):
            inline = getattr(part, "inline_data", None)
            if inline and inline.data:
                out_path.parent.mkdir(parents=True, exist_ok=True)
                with PILImage.open(BytesIO(inline.data)) as img:
                    img.save(out_path, "PNG")
                return True
        sys.stderr.write("  ✗ Nano Banana: no image in response\n")
        return False
    except Exception as e:
        sys.stderr.write(f"  ✗ Nano Banana (SDK multi-image) failed: {e}\n")
        return False
    finally:
        # Release the file handles held by the opened reference frames.
        for im in pil_images:
            im.close()


def call_codex(prompt: str, out_path: Path, input_images: list[Path] | None = None) -> bool:
    """Ask the ``codex`` CLI to generate one image and save it as ``out_path``.

    Existing reference frames are copied into ``out_path.parent`` (the CLI's
    working directory) and listed by filename in the instruction text. The
    copies are left in place after the run.

    Returns:
        True when ``out_path`` exists and is non-empty after the CLI exits
        with status 0; False otherwise. A timeout (600 s) or a failure to
        launch the CLI is reported on stderr and returns False rather than
        raising.
    """
    # The CLI runs with --cd out_path.parent, so the directory must exist.
    out_path.parent.mkdir(parents=True, exist_ok=True)

    ref_section = ""
    if input_images:
        import shutil
        existing_frames = []
        for p in input_images:
            if p.exists():
                local = out_path.parent / p.name
                if not local.exists():
                    shutil.copy(p, local)
                existing_frames.append(p.name)
        if existing_frames:
            file_list = ", ".join(f"'{n}'" for n in existing_frames)
            ref_section = f"""

REFERENCE FRAMES (in order of timeline): {file_list}.
These are real frames extracted from the original UAP video at different timestamps.
USE THEM as visual input for gpt-image-1's image edit/composition endpoint.
The UAP appears in these frames — preserve its position, scale, and the scene composition.
Use the multiple frames to understand UAP motion / trajectory and convey a coherent moment in the cinematic output.
"""

    codex_instruction = f"""Generate ONE high-quality image and save it to '{out_path.name}' in the current directory.{ref_section}

PROMPT:
{prompt}

Use gpt-image-1's image edit (image-to-image) capability with the reference frame(s) above. Combine them as multi-image input if your tool supports it; otherwise pick the most representative one. Output resolution at least 1024x1024. Save only ONE PNG with the exact filename '{out_path.name}'. Confirm the filename after saving."""
    cmd = [
        "codex", "exec",
        "--skip-git-repo-check",
        "--sandbox", "workspace-write",
        "--cd", str(out_path.parent),
        codex_instruction,
    ]
    try:
        res = subprocess.run(cmd, capture_output=True, text=True, timeout=600)
    except subprocess.TimeoutExpired:
        sys.stderr.write("  ✗ Codex timed out after 600s\n")
        return False
    except OSError as e:  # e.g. the `codex` binary is not on PATH
        sys.stderr.write(f"  ✗ Codex could not be launched: {e}\n")
        return False
    if res.returncode != 0:
        sys.stderr.write(f"  ✗ Codex failed: {res.stderr[-400:]}\n")
        return False
    return out_path.exists() and out_path.stat().st_size > 0


def append_case_image_refs(md_path: Path, nano_path: Path | None, codex_path: Path | None, diagram_path: Path | None, ref_frame: Path | None):
    """Record the generated images in the entity's markdown frontmatter.

    For every image path that is given and exists on disk, an entry is added
    under ``case_images`` (keys ``nano_banana``, ``codex``,
    ``investigation_diagram``) with its path relative to ``UFO_ROOT``, the
    model name, ``synthetic: True`` and the reference frame (or ``None``).
    ``case_images_warnings`` and ``case_images_generated_at`` (UTC) are set as
    well, replacing any previous values of these three keys.

    Does nothing when none of the image files exist. Otherwise the file is
    rewritten with the frontmatter re-serialised by ``yaml.dump``, so original
    YAML comments and formatting are not preserved.
    """
    candidates = (
        ("nano_banana", nano_path, "gemini-3-pro-image", {}),
        ("codex", codex_path, "gpt-image-1", {}),
        (
            "investigation_diagram",
            diagram_path,
            "gemini-3-pro-image",
            {"annotation_style": "sherlock-holmes-investigation-board"},
        ),
    )
    present = [c for c in candidates if c[1] and c[1].exists()]
    if not present:
        return

    fm, body = read_md(md_path)
    ref_rel = str(ref_frame.relative_to(UFO_ROOT)) if ref_frame else None
    fm["case_images"] = {
        key: {
            "path": str(image_path.relative_to(UFO_ROOT)),
            "model": model,
            "synthetic": True,
            "factual_data_extraction": "NONE",
            "reference_frame": ref_rel,
            **extra,
        }
        for key, image_path, model, extra in present
    }

    # Only claim a real-frame origin when a reference frame was actually used;
    # documents and frame-less videos are generated from text alone.
    if ref_frame:
        provenance = "AI-enhanced from a real video frame; UAP position and scene composition come from the frame, but rendering and any annotations are interpretive."
    else:
        provenance = "Generated from text metadata only — no real frame was used; the entire scene is speculative."
    fm["case_images_warnings"] = [
        "Conceptual visualizations only — not evidence.",
        "Do NOT extract numerical claims (altitude, coords, timestamps) from these images.",
        provenance,
    ]
    fm["case_images_generated_at"] = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    yaml_str = yaml.dump(fm, allow_unicode=True, sort_keys=False, default_flow_style=False)
    separator = "" if body.startswith("\n") else "\n"
    md_path.write_text(f"---\n{yaml_str}---\n{separator}{body}", encoding="utf-8")


def process_entity(md_path: Path, kind: str, force: bool, skip_codex: bool, skip_nano: bool):
    """Generate the case images for one entity and record them in its markdown.

    Steps: read the frontmatter, resolve the entity id (``video_id`` for
    videos, ``doc_id`` otherwise), create ``CASE_IMAGES_DIR/<id>``, collect up
    to five reference frames (videos only), then run the enabled generators:
    Nano Banana case image, Codex case image and, when frames exist, the Nano
    Banana investigation diagram. Each generator is skipped when its output
    already exists unless ``force`` is set. Finally the frontmatter is updated
    via ``append_case_image_refs``.

    Returns early, after a message on stderr, when the frontmatter carries no
    entity id.
    """
    fm, _ = read_md(md_path)
    id_key = "video_id" if kind == "video" else "doc_id"
    entity_id = fm.get(id_key)
    if not entity_id:
        sys.stderr.write(f"  ✗ no entity id in {md_path.name}\n")
        return

    out_dir = CASE_IMAGES_DIR / entity_id
    out_dir.mkdir(parents=True, exist_ok=True)
    nano_out = out_dir / "case-nanobanana.png"
    codex_out = out_dir / "case-codex.png"
    diagram_out = out_dir / "investigation-diagram.png"

    # Only videos have extracted frames; several frames let the model see
    # motion across the UAP timeline.
    ref_frames = find_all_frames(entity_id, max_n=5) if kind == "video" else []
    frame_count = len(ref_frames)
    # The middle frame is what gets recorded as the reference in the metadata.
    primary_frame = ref_frames[frame_count // 2] if ref_frames else None

    def needs_run(target: Path) -> bool:
        return force or not target.exists()

    def report(target: Path) -> None:
        print(f"  ✓ {target.relative_to(UFO_ROOT)}", flush=True)

    print(f"\n=== {entity_id} ({kind}) ===", flush=True)
    if ref_frames:
        print(f"  reference frames ({frame_count}): {[p.name for p in ref_frames]}", flush=True)
    else:
        print("  (no reference frames — text-only generation)", flush=True)

    case_prompt = build_case_prompt(fm, kind, has_reference_frame=bool(ref_frames))

    if not skip_nano and needs_run(nano_out):
        print(f"  → Nano Banana (case, {frame_count} frames)…", flush=True)
        if call_nano_banana(case_prompt, nano_out, input_images=ref_frames):
            report(nano_out)

    if not skip_codex and needs_run(codex_out):
        print(f"  → Codex (case, {frame_count} frames)…", flush=True)
        if call_codex(case_prompt, codex_out, input_images=ref_frames):
            report(codex_out)

    # The investigation diagram annotates real frames, so it needs them.
    if ref_frames and not skip_nano and needs_run(diagram_out):
        diagram_prompt = build_diagram_prompt(fm)
        print(f"  → Nano Banana (investigation diagram, {frame_count} frames)…", flush=True)
        if call_nano_banana(diagram_prompt, diagram_out, input_images=ref_frames):
            report(diagram_out)

    append_case_image_refs(md_path, nano_out, codex_out, diagram_out, primary_frame)


def collect_entities(kind: str, entity_id: str | None) -> list[tuple[Path, str]]:
    """List the entity markdown files to process as ``(path, entity_kind)``.

    ``kind`` selects the source directories: ``"videos"``, ``"documents"`` or
    ``"both"`` (videos first); any other value yields an empty list. Files are
    returned in sorted order per directory. When ``entity_id`` is given, only
    files whose stem equals it are kept.
    """
    def scan(directory: Path, label: str) -> list[tuple[Path, str]]:
        return [
            (md, label)
            for md in sorted(directory.glob("*.md"))
            if not entity_id or md.stem == entity_id
        ]

    found: list[tuple[Path, str]] = []
    everything = kind == "both"
    if everything or kind == "videos":
        found.extend(scan(WIKI_VIDEOS_DIR, "video"))
    if everything or kind == "documents":
        found.extend(scan(WIKI_DOCS_DIR, "document"))
    return found


def main():
    """Command-line entry point: generate case images for the selected entities.

    Exits with status 2 when Nano Banana is enabled but neither
    ``GEMINI_API_KEY`` nor ``GOOGLE_API_KEY`` is set. Each entity is processed
    independently: an exception in one is reported on stderr and the run
    continues with the next. When at least one entity was selected, a run
    summary is appended to ``LOG_PATH``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument(
        "--kind",
        choices=["videos", "documents", "both"],
        default=None,
        help="entities to process (default: videos, or both when --entity-id is given)",
    )
    ap.add_argument("--entity-id", help="single entity (video_id or doc_id)")
    ap.add_argument("--skip-nano", action="store_true", help="skip Nano Banana")
    ap.add_argument("--skip-codex", action="store_true", help="skip Codex")
    ap.add_argument("--force", action="store_true", help="re-generate even if exists")
    args = ap.parse_args()

    # `--entity-id X` on its own must be able to find a document as well as a
    # video, so it searches both kinds unless --kind narrows it explicitly.
    kind = args.kind or ("both" if args.entity_id else "videos")

    # Same key lookup as _get_gemini_client(), so the two never disagree.
    api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
    if not api_key and not args.skip_nano:
        sys.stderr.write("GEMINI_API_KEY / GOOGLE_API_KEY not set (needed for Nano Banana)\n")
        sys.exit(2)

    entities = collect_entities(kind, args.entity_id)
    print(f"Processing {len(entities)} entit(y/ies)…")
    if args.entity_id and not entities:
        sys.stderr.write(f"No {kind} entity found with id {args.entity_id!r}\n")
    for md_path, entity_kind in entities:
        try:
            process_entity(md_path, entity_kind, args.force, args.skip_codex, args.skip_nano)
        except Exception as e:
            # Top-level boundary: report and keep going with the next entity.
            sys.stderr.write(f"FATAL on {md_path.name}: {e}\n")
            continue

    # Log
    if entities:
        with open(LOG_PATH, "a", encoding="utf-8") as fh:
            fh.write(
                f"\n## {datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')} — CASE IMAGES (Phase 4.6)\n"
                f"- operator: archivist + case-writer\n- script: scripts/11-generate-case-images.py\n"
                f"- kind: {kind}\n- entities: {len(entities)}\n"
                f"- skip_nano: {args.skip_nano}\n- skip_codex: {args.skip_codex}\n"
            )


if __name__ == "__main__":
    main()