disclosure-bureau/scripts/08-video-analysis.py

#!/usr/bin/env python3
"""
08-video-analysis.py — Sherlock Holmes-style deep video analysis with Gemini 3.1 Pro

For each .mp4 in /Users/guto/ufo/raw/videos/:
  1. Upload to Gemini Files API
  2. Wait for ACTIVE state
  3. Call gemini-3.1-pro-preview with structured Sherlock prompt
  4. Receive JSON containing:
       - audio_transcript_verbatim (original language, with timestamps)
       - vision_description (rich English description, frame-by-frame)
       - vision_description_pt_br
       - entities_extracted (people/voices, organizations, locations, equipment, UAP objects)
       - uap_observation_fields (shape, color, motion descriptors, sensor info, kinematics)
       - timeline (events with timestamps in mm:ss)
       - anomalies_detected (sensor artifacts vs candidate phenomena, with Locard-style reasoning)
       - sherlock_observations (what Holmes/Poirot/Dupin would notice — non-obvious details)
       - classification_markings_visible, redactions_visible (visible on screen)
       - confidence_band per major claim
  5. Save raw JSON to processing/video-analysis/<video-id>.json
  6. Write markdown to wiki/videos/<video-id>.md with bilingual frontmatter + body

Idempotent: skips videos whose .md + .json already exist (use --force to redo).

Usage:
  ./08-video-analysis.py                              # process all videos in raw/videos/
  ./08-video-analysis.py --video DOD_111688970.mp4    # single file
  ./08-video-analysis.py --max-files 3                # cap for testing
  ./08-video-analysis.py --model gemini-3.1-flash-lite  # cheaper fallback
  ./08-video-analysis.py --force                      # re-process even if output exists
"""
from __future__ import annotations

import argparse
import hashlib
import json
import os
import re
import sys
import time
import unicodedata
from datetime import datetime, timezone
from pathlib import Path

try:
    from google import genai
    from google.genai import types as genai_types
except ImportError:
    sys.stderr.write("Missing google-genai. Run: pip3 install google-genai\n")
    sys.exit(1)

try:
    import yaml
except ImportError:
    sys.stderr.write("Missing pyyaml. Run: pip3 install pyyaml\n")
    sys.exit(1)


# Root of the project tree; every input and output path below hangs off it.
# NOTE(review): hard-coded absolute path — the script only runs unmodified on a
# machine with this exact layout.
UFO_ROOT = Path("/Users/guto/ufo")
VIDEOS_DIR = UFO_ROOT / "raw" / "videos"  # source .mp4 files
VIDEO_ANALYSIS_DIR = UFO_ROOT / "processing" / "video-analysis"  # raw JSON, one per video
WIKI_VIDEOS_DIR = UFO_ROOT / "wiki" / "videos"  # rendered markdown pages
LOG_PATH = UFO_ROOT / "wiki" / "log.md"  # run log that main() appends to

# Model requested when --model is not given on the command line.
DEFAULT_MODEL = "gemini-3.1-pro-preview"
# Models call_gemini_for_video() may fall back to when a generation call fails.
FALLBACK_MODELS = ["gemini-3-pro-preview", "gemini-3.1-flash-lite"]
# Version strings stamped into the YAML frontmatter of every rendered page.
SCHEMA_VERSION = "0.1.0"
WIKI_VERSION = "0.1.0"


SHERLOCK_VIDEO_PROMPT = """You are an evidence officer in the Investigation Bureau, applying the combined methodologies of Sherlock Holmes (observation + elimination), Hercule Poirot (psychology of behavior), Auguste Dupin (ratiocination, hidden-in-plain-sight), and Edmond Locard (trace evidence, chain of custody) to a UAP/UFO video released by the U.S. Department of War at war.gov/ufo.

Your task: extract EVERYTHING from this video — visual content, audio, transcription, contextual signals, equipment, persons, and any anomalies. Be exhaustive. A great detective notices what others miss.

Output a SINGLE JSON object (no markdown fence, no preamble, no commentary) matching this exact schema:

{
  "video_overview": {
    "duration_seconds": <float>,
    "primary_subject": "what the video is fundamentally about, one sentence",
    "camera_perspective": "cockpit | ground | aerial | satellite | unknown",
    "sensor_type": "visual_eo | infrared_FLIR | radar_screen | mixed | unknown",
    "platform_inferred": "F/A-18 | helicopter | naval ship | satellite | unknown — based on cockpit layout, HUD elements, screen overlays etc.",
    "primary_language_spoken": "en | pt | es | other | none"
  },

  "audio_transcript_verbatim": [
    {
      "t_start": "mm:ss",
      "t_end": "mm:ss",
      "speaker": "pilot_1 | pilot_2 | unknown_male | unknown_female | radio | narrator | n/a",
      "text": "EXACT VERBATIM TRANSCRIPTION in the original language, preserve all 'uh', stutters, military jargon, callsigns. Do NOT translate.",
      "confidence": "high | medium | low"
    }
  ],

  "vision_description": "Comprehensive English narrative of what is visible on screen, with timestamps. Frame-by-frame for key moments. Describe HUD overlays, sensor readouts, on-screen text, classification markings, redactions, target lock indicators, altitude/heading/airspeed values visible, any UAP morphology and motion (track its path with timestamps). 8-15 sentences for a typical 1-3 minute clip.",
  "vision_description_pt_br": "Same content as vision_description, translated to Brazilian Portuguese (pt-br). Brazilian vocabulary and spelling. Preserve UTF-8 accents. Keep verbatim quotes from audio/screen in original language.",

  "classification_markings_visible": [
    {"t_start": "mm:ss", "t_end": "mm:ss", "level": "UNCLASSIFIED | CUI | CONFIDENTIAL | SECRET | TOP SECRET", "caveats": ["NOFORN", "ORCON", ...], "location_on_screen": "header_banner | corner | watermark | stamp"}
  ],

  "redactions_visible": [
    {"t_start": "mm:ss", "t_end": "mm:ss", "code": "(b)(1) 1.4(a) | (b)(3) | (b)(6) | other_or_unknown", "description": "what is being blacked/blurred (face, callsign, location, sensor frequency, etc.)"}
  ],

  "entities_extracted": {
    "people": [{"label": "Pilot 1 | Air Crew | Officer X", "role": "primary observer | radio operator | passenger | narrator | unknown", "voice_only": true, "first_appearance": "mm:ss"}],
    "organizations": [{"name": "USS Nimitz | VFA-41 | AARO | ...", "evidence_for": "patch visible | radio callsign | hull number"}],
    "locations": [{"name": "Pacific Ocean off San Diego | Strait of Hormuz | ...", "evidence_for": "coordinates on HUD | named in audio | identifiable landmark", "coordinates": {"lat": null, "lon": null, "raw_text": "..."}}],
    "events": [{"label": "UAP intercept during routine patrol", "date": "YYYY-MM-DD | YYYY | NA"}],
    "uap_objects": [{"shape": "sphere | disc | triangle | cylinder | cube | elongated-ellipsoid | cigar | irregular | unknown", "color": "...", "size_estimate": "...", "motion_descriptors": ["hover", "instantaneous-direction-change", "descent-from-X-to-Y", "no-visible-exhaust"]}],
    "vehicles": [{"name": "F/A-18 Super Hornet | AH-64 | ...", "class": "aircraft | ship | submarine | spacecraft | satellite | ground | other"}],
    "equipment_visible": [{"name": "ATFLIR pod | AESA radar | binoculars | EO/IR turret", "purpose": "sensor used to observe UAP"}],
    "operations": [{"name": "Range Fouler | Operation X", "type": "military-operation | reporting-protocol | research-program | task-force | other"}],
    "concepts": [{"name": "FLIR | Mark I eyeball | SECRET//NOFORN | ...", "class": "jargon | legal-instrument | sensor-term | scientific-term | other"}]
  },

  "uap_observation_fields": {
    "first_visible_at": "mm:ss",
    "last_visible_at": "mm:ss",
    "duration_visible_seconds": <int>,
    "shape": "sphere | disc | triangle | cylinder | cube | elongated-ellipsoid | cigar | irregular | unknown",
    "color": "metallic | white | dark | luminous | ...",
    "size_estimate": "1-3 m | 10 m | car-sized | etc.",
    "altitude_ft": <int or null>,
    "speed_kts": <int or null or "supersonic">,
    "bearing_deg": <int or null>,
    "distance_nm": <float or null>,
    "coordinates": {"lat": null, "lon": null, "raw_text": "..."},
    "maneuver_descriptors": ["hover", "instantaneous-direction-change", ...],
    "sensor_observations": [{"sensor": "ATFLIR", "type": "infrared", "duration_min": "..."}],
    "kinematic_anomalies": "describe any motion that defies known physics: no inertia, no sonic boom at high speed, instantaneous turns, etc."
  },

  "timeline": [
    {"t": "mm:ss", "event": "Operator says 'There's a whole fleet of them, look on the SA'", "kind": "audio_quote | visual_event | sensor_event | redaction | classification_change"}
  ],

  "anomalies_detected": [
    {
      "kind": "kinematic | sensor_artifact | atmospheric | optical_illusion | hoax_indicator | unredacted_slip | inconsistency",
      "description": "what is anomalous",
      "evidence": "at timestamp mm:ss the object does X while expected Y",
      "candidate_explanations": ["prosaic-advanced-tech", "sensor-glare", "atmospheric-refraction", "extraterrestrial", "hoax", ...],
      "confidence_band": "high | medium | low | speculation"
    }
  ],

  "sherlock_observations": [
    {
      "detective_lens": "holmes | poirot | dupin | locard",
      "observation": "Non-obvious detail a regular viewer would miss. E.g., 'the operator's voice quaver at 1:42 increases at the moment the SA reading changes — emotional reaction precedes the sensor change by 0.5s, suggesting the operator saw the target visually before the radar updated.'",
      "implication": "why it matters investigatively",
      "confidence_band": "high | medium | low | speculation"
    }
  ],

  "executive_summary_en": "3-5 sentence English summary of WHAT happened in this video, suitable for a chat citation in the future Sherlock UI.",
  "executive_summary_pt_br": "Same summary in Brazilian Portuguese (pt-br).",

  "quality_signals": {
    "video_quality_overall": "high | medium | low",
    "audio_quality_overall": "high | medium | low | none",
    "redaction_density": "none | light | heavy | full-blackout",
    "completeness": "complete | truncated | partial",
    "extraction_confidence": "high | medium | low"
  },

  "flags": ["sensitive-content", "audio-only", "redaction-heavy", "low-resolution", "interlaced-artifacts", "thermal-only", "no-audio"]
}

Rules:
- Output ONLY the JSON. No fence, no preamble.
- Empty arrays for not-applicable fields; null for unknown scalars. Never omit keys.
- ALL extracted text (transcript, on-screen text, callsigns) stays in ORIGINAL source language. Do NOT translate.
- ONLY `vision_description_pt_br` and `executive_summary_pt_br` are translations — Brazilian Portuguese (pt-br), NOT European Portuguese. Preserve UTF-8 accents.
- Verbatim quotes from audio INSIDE narrative fields stay in original language; only the surrounding narration is translated.
- Be EXHAUSTIVE in sherlock_observations — aim for 5-15 observations, including subtle audio cues, sensor metadata, behavioral signals.
- For anomalies, list ≥3 candidate explanations including a mundane one (sensor artifact, parallax, atmospheric).
- If duration_seconds = 0 or no content, still return the JSON with empty arrays and flags=["empty-or-corrupt"].
"""


def utc_now_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (one-second resolution)."""
    current = datetime.now(tz=timezone.utc)
    # isoformat() renders the UTC offset as "+00:00"; the file's convention is a trailing "Z".
    return current.isoformat(timespec="seconds").replace("+00:00", "Z")


def filename_to_video_id(filename: str) -> str:
    """Derive a lowercase, hyphen-separated ASCII id from a video filename.

    The last extension is dropped, accents are stripped via NFKD
    decomposition, every run of characters outside ``[a-z0-9]`` becomes a
    single hyphen, and leading/trailing hyphens are removed. Ids that would
    start with a digit are prefixed with ``vid-``.

    If nothing survives the cleaning (e.g. ``"___.mp4"`` or ``".mp4"``), the
    id falls back to ``vid-`` plus the first 12 hex digits of the SHA-256 of
    the original filename. Without this the id would be an empty string and
    the outputs would be written to ``.json`` / ``.md``, with every such
    file colliding on the same path.

    Args:
        filename: Bare filename (not a full path), e.g. ``"DOD_111688970.mp4"``.

    Returns:
        A non-empty id containing only ``a-z``, ``0-9`` and ``-``.
    """
    base = filename.rsplit(".", 1)[0]
    decomposed = unicodedata.normalize("NFKD", base)
    ascii_str = "".join(c for c in decomposed if not unicodedata.combining(c))
    # One pass: any run of non-alphanumerics (hyphens included) collapses to "-".
    slug = re.sub(r"[^a-z0-9]+", "-", ascii_str.lower()).strip("-")
    if not slug:
        # surrogateescape keeps undecodable bytes from os.listdir() hashable.
        digest = hashlib.sha256(filename.encode("utf-8", "surrogateescape")).hexdigest()
        return f"vid-{digest[:12]}"
    if slug[0].isdigit():
        slug = "vid-" + slug
    return slug


def sha256_file(p: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *p*.

    The file is read in 64 KiB pieces so large videos are never held in
    memory as a whole.
    """
    digest = hashlib.sha256()
    with open(p, "rb") as stream:
        while True:
            piece = stream.read(65536)
            if not piece:
                break
            digest.update(piece)
    return digest.hexdigest()


def upload_and_wait(client, video_path: Path, poll_interval: float = 2.0, timeout: int = 600):
    """Upload a video to the Files API and block until it reaches ACTIVE.

    Args:
        client: Object exposing ``files.upload(file=...)``, ``files.get(name=...)``
            and ``files.delete(name=...)``; the returned file objects must carry
            ``.name`` and ``.state.name``.
        video_path: Local path of the video to upload.
        poll_interval: Seconds to sleep between state polls.
        timeout: Maximum seconds to wait, measured from just before the upload call.

    Returns:
        The file object whose state is ``ACTIVE``.

    Raises:
        TimeoutError: The file was still ``PROCESSING`` after *timeout* seconds.
        RuntimeError: The file ended in any state other than ``ACTIVE``.

    If waiting fails for any reason, the remote file is deleted (best effort)
    before the error is re-raised, so an unusable upload does not linger in
    the account's file storage.
    """
    print(f"  uploading {video_path.name} ({video_path.stat().st_size/1024/1024:.1f} MB)…", flush=True)
    # Monotonic clock: the timeout must not be affected by wall-clock adjustments.
    started = time.monotonic()
    f = client.files.upload(file=str(video_path))
    try:
        while f.state.name == "PROCESSING":
            if time.monotonic() - started > timeout:
                raise TimeoutError(f"upload still PROCESSING after {timeout}s")
            time.sleep(poll_interval)
            f = client.files.get(name=f.name)
        if f.state.name != "ACTIVE":
            raise RuntimeError(f"file state is {f.state.name} (not ACTIVE) — cannot use")
    except Exception:
        # Best-effort cleanup; the original error is what the caller needs to see.
        try:
            client.files.delete(name=f.name)
        except Exception:
            pass
        raise
    print(f"  ✓ file ready ({time.monotonic() - started:.1f}s upload+process)", flush=True)
    return f


def _generate_with_timeout(client, video_file, model: str, timeout: int):
    """Run one generate_content call on a daemon thread, giving up after *timeout* seconds."""
    import threading

    outcome = {}

    def _worker():
        try:
            outcome["response"] = client.models.generate_content(
                model=model,
                contents=[video_file, SHERLOCK_VIDEO_PROMPT],
                config=genai_types.GenerateContentConfig(
                    response_mime_type="application/json",
                    temperature=0.2,
                    max_output_tokens=32768,
                ),
            )
        except BaseException as exc:  # handed back to the calling thread below
            outcome["error"] = exc

    # A daemon thread can simply be abandoned if the SDK call never returns.
    # An executor used as a context manager cannot: leaving its `with` block
    # waits for the worker to finish, which defeats the timeout, and
    # non-daemon workers are also joined at interpreter exit.
    worker = threading.Thread(target=_worker, name=f"gemini-{model}", daemon=True)
    worker.start()
    worker.join(timeout)
    if worker.is_alive():
        raise RuntimeError(f"Gemini hung >{timeout}s — treating as failure")
    if "error" in outcome:
        raise outcome["error"]
    return outcome["response"]


def call_gemini_for_video(client, video_file, model: str, attempt: int = 1, timeout: int = 240):
    """Generate the Sherlock analysis for an uploaded video, with model fallback.

    *model* is tried first. On any failure (exception, call exceeding
    *timeout*, or an empty response) the remaining entries of
    ``FALLBACK_MODELS`` are tried in order. Fallback entries equal to a model
    already in the candidate list are skipped so the same model is not
    retried.

    Args:
        client: google-genai client.
        video_file: File handle returned by the Files API upload.
        model: Model name to try first.
        attempt: 1-based attempt number; ``attempt - 1`` fallback models are
            treated as already consumed. Kept for backward compatibility.
        timeout: Seconds to wait for each individual model call.

    Returns:
        ``(text, model_used)`` — the non-empty response text and the model
        that produced it.

    Raises:
        Exception: Whatever the last candidate raised, once every candidate
            has failed.
    """
    candidates = [model]
    for fallback in FALLBACK_MODELS[max(attempt - 1, 0):]:
        if fallback not in candidates:
            candidates.append(fallback)

    for position, candidate in enumerate(candidates):
        try:
            resp = _generate_with_timeout(client, video_file, candidate, timeout)
            text = resp.text
            # The SDK yields no text for blocked or empty responses; treat that
            # as a failure so a fallback model gets a chance.
            if not text or not text.strip():
                raise RuntimeError("Gemini returned an empty response")
            return text, candidate
        except Exception as e:
            if position + 1 >= len(candidates):
                raise
            next_model = candidates[position + 1]
            print(f"  ⚠ {candidate} failed ({e}); falling back to {next_model}", flush=True)


def render_video_md(
    *,
    video_id: str,
    video_path: Path,
    analysis: dict,
    meta: dict,
    now_iso: str,
) -> str:
    """Render the bilingual wiki page for one analysed video.

    The page is a YAML frontmatter block holding the structured analysis,
    followed by a markdown body with the summaries, vision descriptions,
    audio transcript, Sherlock observations and anomalies.

    The body is tolerant of what the model actually returns: narrative
    fields that are ``null`` render as empty text, per-entry fields that are
    missing or ``null`` render as their placeholder, and list entries that
    are not JSON objects are left out of the body. The frontmatter still
    carries the values as received.

    Args:
        video_id: Id used in the page title and frontmatter.
        video_path: Local video file; it is read to compute size and SHA-256.
        analysis: Parsed JSON object returned by the model.
        meta: Run metadata; only ``meta["model"]`` is read here.
        now_iso: Timestamp written as ``analyzed_at`` and ``last_ingest``.

    Returns:
        The complete markdown document as a string.
    """

    def _text(key: str) -> str:
        # The prompt asks for null on unknown scalars, so None must not reach .strip().
        value = analysis.get(key)
        return "" if value is None else str(value).strip()

    def _entries(key: str) -> list:
        return [item for item in (analysis.get(key) or []) if isinstance(item, dict)]

    def _field(entry: dict, key: str, default: str):
        value = entry.get(key)
        return default if value is None else value

    frontmatter = {
        "schema_version": SCHEMA_VERSION,
        "type": "video",
        "video_id": video_id,
        "original_filename": video_path.name,
        "raw_path": f"../../raw/videos/{video_path.name}",
        "sha256": sha256_file(video_path),
        "size_bytes": video_path.stat().st_size,
        "collection": "DOW-UAP-Video",
        "vision_model": meta.get("model"),
        "analyzed_at": now_iso,
        # Promote video_overview fields to top-level overview_* keys.
        **{f"overview_{k}": v for k, v in (analysis.get("video_overview") or {}).items()},
        "uap_observation_fields": analysis.get("uap_observation_fields"),
        "classification_markings_visible": analysis.get("classification_markings_visible") or [],
        "redactions_visible": analysis.get("redactions_visible") or [],
        "entities_extracted": analysis.get("entities_extracted") or {},
        "timeline": analysis.get("timeline") or [],
        "anomalies_detected": analysis.get("anomalies_detected") or [],
        "sherlock_observations": analysis.get("sherlock_observations") or [],
        "audio_transcript_verbatim": analysis.get("audio_transcript_verbatim") or [],
        "vision_description": analysis.get("vision_description", ""),
        "vision_description_pt_br": analysis.get("vision_description_pt_br", ""),
        "executive_summary_en": analysis.get("executive_summary_en", ""),
        "executive_summary_pt_br": analysis.get("executive_summary_pt_br", ""),
        "quality_signals": analysis.get("quality_signals") or {},
        "flags": analysis.get("flags") or [],
        "last_ingest": now_iso,
        "wiki_version": WIKI_VERSION,
    }
    yaml_str = yaml.dump(frontmatter, allow_unicode=True, sort_keys=False, default_flow_style=False)

    parts = [f"""# Video Analysis — {video_id}

> Source: `{video_path.name}` · Gemini model: `{meta.get("model")}` · Analyzed: {now_iso}

## Executive Summary (EN)

{_text("executive_summary_en")}

## Sumário Executivo (PT-BR)

{_text("executive_summary_pt_br")}

## Vision Description (EN)

{_text("vision_description")}

## Descrição Vision (PT-BR)

{_text("vision_description_pt_br")}

## Audio Transcript (verbatim, original language)

"""]

    for seg in _entries("audio_transcript_verbatim"):
        parts.append(
            f"- **[{_field(seg, 't_start', '?')}–{_field(seg, 't_end', '?')}] "
            f"{_field(seg, 'speaker', '?')}**: {_field(seg, 'text', '')} "
            f"_(confidence: {_field(seg, 'confidence', '?')})_\n"
        )

    parts.append("\n## Sherlock Observations\n\n")
    for obs in _entries("sherlock_observations"):
        parts.append(
            f"- **[{_field(obs, 'detective_lens', '?')}]** {_field(obs, 'observation', '')}\n"
            f"  - _Implication:_ {_field(obs, 'implication', '')}\n"
            f"  - _Confidence:_ `{_field(obs, 'confidence_band', '?')}`\n\n"
        )

    parts.append("## Anomalies Detected\n\n")
    for a in _entries("anomalies_detected"):
        explanations = a.get("candidate_explanations") or []
        if isinstance(explanations, str):
            # A bare string would otherwise be joined character by character.
            explanations = [explanations]
        candidates = ", ".join(str(item) for item in explanations)
        parts.append(
            f"- **{_field(a, 'kind', '?')}**: {_field(a, 'description', '')}\n"
            f"  - _Evidence:_ {_field(a, 'evidence', '')}\n"
            f"  - _Candidates:_ {candidates}\n"
            f"  - _Confidence:_ `{_field(a, 'confidence_band', '?')}`\n\n"
        )

    body = "".join(parts)
    return f"---\n{yaml_str}---\n\n{body}"


def process_video(client, video_path: Path, model: str, force: bool = False) -> bool:
    """Run the full pipeline for one video: upload, analyse, parse, write outputs.

    Args:
        client: google-genai client.
        video_path: Local video file to process.
        model: Model name to try first (fallbacks are handled downstream).
        force: Re-process even when both output files already exist.

    Returns:
        ``True`` if the video was processed or skipped because its outputs
        already exist; ``False`` if any step failed. Failures are printed and
        never raised, so one bad video does not abort a batch.

    Side effects:
        Writes ``<video-id>.json`` and ``<video-id>.md``. When the model output
        is not a usable JSON object, the raw text is saved to
        ``<video-id>.raw.txt`` instead. The uploaded remote file is deleted on
        every exit path once the upload has succeeded.
    """
    video_id = filename_to_video_id(video_path.name)
    json_out = VIDEO_ANALYSIS_DIR / f"{video_id}.json"
    md_out = WIKI_VIDEOS_DIR / f"{video_id}.md"

    if not force and json_out.exists() and md_out.exists():
        print(f"  skip {video_id} (already processed)", flush=True)
        return True

    json_out.parent.mkdir(parents=True, exist_ok=True)
    md_out.parent.mkdir(parents=True, exist_ok=True)

    print(f"\n=== {video_path.name} → {video_id} ===", flush=True)
    t0 = time.time()
    try:
        video_file = upload_and_wait(client, video_path)
    except Exception as e:
        print(f"  ✗ upload failed: {e}", flush=True)
        return False

    try:
        print(f"  calling {model} for Sherlock analysis…", flush=True)
        try:
            text, model_used = call_gemini_for_video(client, video_file, model)
        except Exception as e:
            print(f"  ✗ generation failed: {e}", flush=True)
            return False

        # The response text can be None (e.g. a blocked response); treat it as empty.
        text = (text or "").strip()
        # Strip optional markdown fence
        if text.startswith("```"):
            text = re.sub(r"^```(?:json)?\s*", "", text)
            text = re.sub(r"\s*```$", "", text)
        try:
            analysis = json.loads(text)
        except json.JSONDecodeError as e:
            print(f"  ✗ JSON parse failed: {e}", flush=True)
            # Save raw output anyway for inspection
            json_out.with_suffix(".raw.txt").write_text(text, encoding="utf-8")
            return False
        if not isinstance(analysis, dict):
            # Valid JSON but not an object (e.g. a top-level array): the renderer needs a dict.
            print(
                f"  ✗ JSON parse failed: expected a JSON object, got {type(analysis).__name__}",
                flush=True,
            )
            json_out.with_suffix(".raw.txt").write_text(text, encoding="utf-8")
            return False

        meta = {"model": model_used, "duration_seconds": round(time.time() - t0, 1)}
        try:
            json_out.write_text(
                json.dumps({"analysis": analysis, "meta": meta}, indent=2, ensure_ascii=False),
                encoding="utf-8",
            )
            md = render_video_md(
                video_id=video_id,
                video_path=video_path,
                analysis=analysis,
                meta=meta,
                now_iso=utc_now_iso(),
            )
            md_out.write_text(md, encoding="utf-8")
        except Exception as e:
            print(f"  ✗ writing outputs failed: {e}", flush=True)
            return False
    finally:
        # Clean up the uploaded file to free quota — on failures too, not only success.
        try:
            client.files.delete(name=video_file.name)
        except Exception:
            pass

    elapsed = time.time() - t0
    print(f"  ✓ {video_id} done ({elapsed:.1f}s)", flush=True)
    return True


def main():
    """Command-line entry point: select videos, process each, append a run-log entry.

    Exits with status 2 when ``GEMINI_API_KEY`` is unset or an argument is
    invalid, and with status 1 when ``--video`` names a file that does not
    exist. Per-video failures are reported on stdout and do not stop the run.
    """
    ap = argparse.ArgumentParser(description="Sherlock-style video analysis via Gemini 3.1 Pro.")
    g = ap.add_mutually_exclusive_group()
    g.add_argument("--video", help="single video filename in raw/videos/")
    g.add_argument("--all", action="store_true", help="process all videos (default)")
    ap.add_argument("--max-files", type=int, default=None, help="cap number of videos (for testing)")
    ap.add_argument("--model", default=DEFAULT_MODEL, help=f"Gemini model (default {DEFAULT_MODEL})")
    ap.add_argument("--force", action="store_true", help="reprocess existing outputs")
    ap.add_argument("--sort-by-size", action="store_true", help="process smallest videos first (for cheap testing)")
    args = ap.parse_args()

    # A negative cap would silently slice videos off the end of the list.
    if args.max_files is not None and args.max_files < 0:
        ap.error("--max-files must be >= 0")

    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        sys.stderr.write("GEMINI_API_KEY not set\n")
        sys.exit(2)

    client = genai.Client(api_key=api_key)

    if args.video:
        v = VIDEOS_DIR / args.video
        if not v.exists():
            sys.stderr.write(f"Video not found: {v}\n")
            sys.exit(1)
        videos = [v]
    else:
        videos = sorted(VIDEOS_DIR.glob("*.mp4"))
        if args.sort_by_size:
            videos.sort(key=lambda p: p.stat().st_size)

    # `is not None` so that an explicit cap of 0 means "process nothing"
    # rather than being treated as "no cap".
    if args.max_files is not None:
        videos = videos[: args.max_files]

    print(f"Processing {len(videos)} video(s) with model {args.model}")
    ok = 0
    fail = []
    for v in videos:
        if process_video(client, v, args.model, force=args.force):
            ok += 1
        else:
            fail.append(v.name)

    print(f"\nDone. ok={ok}, failed={len(fail)}")
    if fail:
        print("Failed:", fail)

    if ok > 0:
        # The wiki directory may not exist yet on a fresh checkout.
        LOG_PATH.parent.mkdir(parents=True, exist_ok=True)
        with open(LOG_PATH, "a", encoding="utf-8") as fh:
            fh.write(
                f"\n## {utc_now_iso()} — VIDEO ANALYSIS (Gemini Phase 4)\n"
                f"- operator: archivist + evidence-officer\n- script: scripts/08-video-analysis.py\n"
                f"- model: {args.model}\n- videos_ok: {ok}\n- videos_failed: {len(fail)}\n"
            )


if __name__ == "__main__":
    main()