#!/usr/bin/env python3
"""
08-video-analysis.py — Sherlock Holmes-style deep video analysis with Gemini 3.1 Pro

For each .mp4 in /Users/guto/ufo/raw/videos/:
  1. Upload to Gemini Files API
  2. Wait for ACTIVE state
  3. Call gemini-3.1-pro-preview with structured Sherlock prompt
  4. Receive JSON containing:
       - audio_transcript_verbatim (original language, with timestamps)
       - vision_description (rich English description, frame-by-frame)
       - vision_description_pt_br
       - entities_extracted (people/voices, organizations, locations, equipment, UAP objects)
       - uap_observations (shape, color, motion descriptors, sensor info, kinematics)
       - timeline (events with timestamps in mm:ss)
       - anomalies (sensor artifacts vs candidate phenomena, with Locard-style reasoning)
       - sherlock_observations (what Holmes/Poirot/Dupin would notice — non-obvious details)
       - classification_markings, redactions (visible on screen)
       - confidence_band per major claim
  5. Save raw JSON to processing/video-analysis/<video-id>.json
  6. Write markdown to wiki/videos/<video-id>.md with bilingual frontmatter + body

Idempotent: skips videos whose .md + .json already exist (use --force to redo).

Usage:
  ./08-video-analysis.py                                  # process all videos in raw/videos/
  ./08-video-analysis.py --video DOD_111688970.mp4        # single file
  ./08-video-analysis.py --max-files 3                    # cap for testing
  ./08-video-analysis.py --model gemini-3.1-flash-lite    # cheaper fallback
  ./08-video-analysis.py --force                          # re-process even if output exists
"""
from __future__ import annotations

import argparse
import hashlib
import json
import os
import re
import sys
import threading
import time
import unicodedata
from datetime import datetime, timezone
from pathlib import Path

# Third-party dependencies are probed here but only enforced in main(), so the
# module can be imported (e.g. to reuse the pure helpers) without them. When
# run as a script the user still gets the same message and exit code before
# any other work happens.
try:
    from google import genai
    from google.genai import types as genai_types
except ImportError:
    genai = None
    genai_types = None

try:
    import yaml
except ImportError:
    yaml = None

UFO_ROOT = Path("/Users/guto/ufo")
VIDEOS_DIR = UFO_ROOT / "raw" / "videos"
VIDEO_ANALYSIS_DIR = UFO_ROOT / "processing" / "video-analysis"
WIKI_VIDEOS_DIR = UFO_ROOT / "wiki" / "videos"
LOG_PATH = UFO_ROOT / "wiki" / "log.md"

DEFAULT_MODEL = "gemini-3.1-pro-preview"
FALLBACK_MODELS = ["gemini-3-pro-preview", "gemini-3.1-flash-lite"]

SCHEMA_VERSION = "0.1.0"
WIKI_VERSION = "0.1.0"

SHERLOCK_VIDEO_PROMPT = """You are an evidence officer in the Investigation Bureau, applying the combined methodologies of Sherlock Holmes (observation + elimination), Hercule Poirot (psychology of behavior), Auguste Dupin (ratiocination, hidden-in-plain-sight), and Edmond Locard (trace evidence, chain of custody) to a UAP/UFO video released by the U.S. Department of War at war.gov/ufo.

Your task: extract EVERYTHING from this video — visual content, audio, transcription, contextual signals, equipment, persons, and any anomalies. Be exhaustive. A great detective notices what others miss.

Output a SINGLE JSON object (no markdown fence, no preamble, no commentary) matching this exact schema:

{
  "video_overview": {
    "duration_seconds": <number>,
    "primary_subject": "what the video is fundamentally about, one sentence",
    "camera_perspective": "cockpit | ground | aerial | satellite | unknown",
    "sensor_type": "visual_eo | infrared_FLIR | radar_screen | mixed | unknown",
    "platform_inferred": "F/A-18 | helicopter | naval ship | satellite | unknown — based on cockpit layout, HUD elements, screen overlays etc.",
    "primary_language_spoken": "en | pt | es | other | none"
  },
  "audio_transcript_verbatim": [
    {
      "t_start": "mm:ss",
      "t_end": "mm:ss",
      "speaker": "pilot_1 | pilot_2 | unknown_male | unknown_female | radio | narrator | n/a",
      "text": "EXACT VERBATIM TRANSCRIPTION in the original language, preserve all 'uh', stutters, military jargon, callsigns. Do NOT translate.",
      "confidence": "high | medium | low"
    }
  ],
  "vision_description": "Comprehensive English narrative of what is visible on screen, with timestamps. Frame-by-frame for key moments. Describe HUD overlays, sensor readouts, on-screen text, classification markings, redactions, target lock indicators, altitude/heading/airspeed values visible, any UAP morphology and motion (track its path with timestamps). 8-15 sentences for a typical 1-3 minute clip.",
  "vision_description_pt_br": "Same content as vision_description, translated to Brazilian Portuguese (pt-br). Brazilian vocabulary and spelling. Preserve UTF-8 accents. Keep verbatim quotes from audio/screen in original language.",
  "classification_markings_visible": [
    {"t_start": "mm:ss", "t_end": "mm:ss", "level": "UNCLASSIFIED | CUI | CONFIDENTIAL | SECRET | TOP SECRET", "caveats": ["NOFORN", "ORCON", ...], "location_on_screen": "header_banner | corner | watermark | stamp"}
  ],
  "redactions_visible": [
    {"t_start": "mm:ss", "t_end": "mm:ss", "code": "(b)(1) 1.4(a) | (b)(3) | (b)(6) | other_or_unknown", "description": "what is being blacked/blurred (face, callsign, location, sensor frequency, etc.)"}
  ],
  "entities_extracted": {
    "people": [{"label": "Pilot 1 | Air Crew | Officer X", "role": "primary observer | radio operator | passenger | narrator | unknown", "voice_only": true, "first_appearance": "mm:ss"}],
    "organizations": [{"name": "USS Nimitz | VFA-41 | AARO | ...", "evidence_for": "patch visible | radio callsign | hull number"}],
    "locations": [{"name": "Pacific Ocean off San Diego | Strait of Hormuz | ...", "evidence_for": "coordinates on HUD | named in audio | identifiable landmark", "coordinates": {"lat": null, "lon": null, "raw_text": "..."}}],
    "events": [{"label": "UAP intercept during routine patrol", "date": "YYYY-MM-DD | YYYY | NA"}],
    "uap_objects": [{"shape": "sphere | disc | triangle | cylinder | cube | elongated-ellipsoid | cigar | irregular | unknown", "color": "...", "size_estimate": "...", "motion_descriptors": ["hover", "instantaneous-direction-change", "descent-from-X-to-Y", "no-visible-exhaust"]}],
    "vehicles": [{"name": "F/A-18 Super Hornet | AH-64 | ...", "class": "aircraft | ship | submarine | spacecraft | satellite | ground | other"}],
    "equipment_visible": [{"name": "ATFLIR pod | AESA radar | binoculars | EO/IR turret", "purpose": "sensor used to observe UAP"}],
    "operations": [{"name": "Range Fouler | Operation X", "type": "military-operation | reporting-protocol | research-program | task-force | other"}],
    "concepts": [{"name": "FLIR | Mark I eyeball | SECRET//NOFORN | ...", "class": "jargon | legal-instrument | sensor-term | scientific-term | other"}]
  },
  "uap_observation_fields": {
    "first_visible_at": "mm:ss",
    "last_visible_at": "mm:ss",
    "duration_visible_seconds": <number or null>,
    "shape": "sphere | disc | triangle | cylinder | cube | elongated-ellipsoid | cigar | irregular | unknown",
    "color": "metallic | white | dark | luminous | ...",
    "size_estimate": "1-3 m | 10 m | car-sized | etc.",
    "altitude_ft": <number or null>,
    "speed_kts": <number or null>,
    "bearing_deg": <number or null>,
    "distance_nm": <number or null>,
    "coordinates": {"lat": null, "lon": null, "raw_text": "..."},
    "maneuver_descriptors": ["hover", "instantaneous-direction-change", ...],
    "sensor_observations": [{"sensor": "ATFLIR", "type": "infrared", "duration_min": "..."}],
    "kinematic_anomalies": "describe any motion that defies known physics: no inertia, no sonic boom at high speed, instantaneous turns, etc."
  },
  "timeline": [
    {"t": "mm:ss", "event": "Operator says 'There's a whole fleet of them, look on the SA'", "kind": "audio_quote | visual_event | sensor_event | redaction | classification_change"}
  ],
  "anomalies_detected": [
    {
      "kind": "kinematic | sensor_artifact | atmospheric | optical_illusion | hoax_indicator | unredacted_slip | inconsistency",
      "description": "what is anomalous",
      "evidence": "at timestamp mm:ss the object does X while expected Y",
      "candidate_explanations": ["prosaic-advanced-tech", "sensor-glare", "atmospheric-refraction", "extraterrestrial", "hoax", ...],
      "confidence_band": "high | medium | low | speculation"
    }
  ],
  "sherlock_observations": [
    {
      "detective_lens": "holmes | poirot | dupin | locard",
      "observation": "Non-obvious detail a regular viewer would miss. E.g., 'the operator's voice quaver at 1:42 increases at the moment the SA reading changes — emotional reaction precedes the sensor change by 0.5s, suggesting the operator saw the target visually before the radar updated.'",
      "implication": "why it matters investigatively",
      "confidence_band": "high | medium | low | speculation"
    }
  ],
  "executive_summary_en": "3-5 sentence English summary of WHAT happened in this video, suitable for a chat citation in the future Sherlock UI.",
  "executive_summary_pt_br": "Same summary in Brazilian Portuguese (pt-br).",
  "quality_signals": {
    "video_quality_overall": "high | medium | low",
    "audio_quality_overall": "high | medium | low | none",
    "redaction_density": "none | light | heavy | full-blackout",
    "completeness": "complete | truncated | partial",
    "extraction_confidence": "high | medium | low"
  },
  "flags": ["sensitive-content", "audio-only", "redaction-heavy", "low-resolution", "interlaced-artifacts", "thermal-only", "no-audio"]
}

Rules:
- Output ONLY the JSON. No fence, no preamble.
- Empty arrays for not-applicable fields; null for unknown scalars. Never omit keys.
- ALL extracted text (transcript, on-screen text, callsigns) stays in ORIGINAL source language. Do NOT translate.
- ONLY `vision_description_pt_br` and `executive_summary_pt_br` are translations — Brazilian Portuguese (pt-br), NOT European Portuguese. Preserve UTF-8 accents.
- Verbatim quotes from audio INSIDE narrative fields stay in original language; only the surrounding narration is translated.
- Be EXHAUSTIVE in sherlock_observations — aim for 5-15 observations, including subtle audio cues, sensor metadata, behavioral signals.
- For anomalies, list ≥3 candidate explanations including a mundane one (sensor artifact, parallax, atmospheric).
- If duration_seconds = 0 or no content, still return the JSON with empty arrays and flags=["empty-or-corrupt"].
"""


def utc_now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string with a ``Z`` suffix."""
    return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")


def filename_to_video_id(filename: str) -> str:
    """Derive a lowercase, hyphen-separated ASCII slug from a video filename.

    The last extension is dropped, accents are stripped via NFKD
    decomposition, every character outside ``[a-z0-9-]`` becomes a hyphen,
    hyphen runs are collapsed, and ids that would start with a digit are
    prefixed with ``vid-``.
    """
    base = filename.rsplit(".", 1)[0]
    nfkd = unicodedata.normalize("NFKD", base)
    ascii_str = "".join(c for c in nfkd if not unicodedata.combining(c))
    replaced = re.sub(r"[^a-z0-9-]", "-", ascii_str.lower())
    collapsed = re.sub(r"-+", "-", replaced).strip("-")
    if collapsed and collapsed[0].isdigit():
        collapsed = "vid-" + collapsed
    return collapsed


def sha256_file(p: Path) -> str:
    """Return the hex SHA-256 digest of the file at *p*, read in 64 KiB chunks."""
    h = hashlib.sha256()
    with open(p, "rb") as fh:
        for chunk in iter(lambda: fh.read(65536), b""):
            h.update(chunk)
    return h.hexdigest()


def _delete_uploaded_file(client, uploaded_file) -> None:
    """Best-effort removal of a Files API upload; never raises."""
    try:
        client.files.delete(name=uploaded_file.name)
    except Exception as exc:  # cleanup must not mask the real outcome
        print(f"  ⚠ could not delete uploaded file: {exc}", flush=True)


def upload_and_wait(client, video_path: Path, poll_interval: float = 2.0, timeout: int = 600):
    """Upload video to Files API and wait until ACTIVE.

    Args:
        client: Gemini client exposing ``files.upload`` / ``files.get``.
        video_path: Local video file to upload.
        poll_interval: Seconds to sleep between state polls.
        timeout: Maximum seconds (measured from the start of the upload) to
            wait while the file is still ``PROCESSING``.

    Returns:
        The file object returned by the Files API once its state is ``ACTIVE``.

    Raises:
        TimeoutError: The file is still ``PROCESSING`` after *timeout* seconds.
        RuntimeError: The file ended in any state other than ``ACTIVE``.
    """
    size_mb = video_path.stat().st_size / 1024 / 1024
    print(f"  uploading {video_path.name} ({size_mb:.1f} MB)…", flush=True)
    started = time.monotonic()
    uploaded = client.files.upload(file=str(video_path))
    try:
        while uploaded.state.name == "PROCESSING":
            if time.monotonic() - started > timeout:
                raise TimeoutError(f"upload still PROCESSING after {timeout}s")
            time.sleep(poll_interval)
            uploaded = client.files.get(name=uploaded.name)
        if uploaded.state.name != "ACTIVE":
            raise RuntimeError(f"file state is {uploaded.state.name} (not ACTIVE) — cannot use")
    except Exception:
        # The caller never receives a handle on failure, so free the remote
        # file here instead of leaking it.
        _delete_uploaded_file(client, uploaded)
        raise
    print(f"  ✓ file ready ({time.monotonic() - started:.1f}s upload+process)", flush=True)
    return uploaded


def _generate_with_timeout(client, video_file, model: str, timeout: int) -> str:
    """Run one ``generate_content`` call, giving up after *timeout* seconds.

    The call runs in a daemon thread: if the SDK hangs, the thread is
    abandoned and (being a daemon) cannot block either this function or
    interpreter shutdown.
    """
    outcome: dict = {}

    def _worker() -> None:
        try:
            outcome["response"] = client.models.generate_content(
                model=model,
                contents=[video_file, SHERLOCK_VIDEO_PROMPT],
                config=genai_types.GenerateContentConfig(
                    response_mime_type="application/json",
                    temperature=0.2,
                    max_output_tokens=32768,
                ),
            )
        except BaseException as exc:  # re-raised in the calling thread below
            outcome["error"] = exc

    worker = threading.Thread(target=_worker, name=f"gemini-{model}", daemon=True)
    worker.start()
    worker.join(timeout)
    if worker.is_alive():
        raise RuntimeError(f"Gemini hung >{timeout}s — treating as failure")
    if "error" in outcome:
        raise outcome["error"]
    text = outcome["response"].text
    if not text:
        # A response without text cannot be parsed downstream; treat it as a
        # failure so the next fallback model gets a chance.
        raise RuntimeError(f"{model} returned an empty response")
    return text


def call_gemini_for_video(client, video_file, model: str, attempt: int = 1, timeout: int = 240):
    """Generate Sherlock analysis for a video file, with model fallback.

    Tries *model* first, then every entry of ``FALLBACK_MODELS`` starting at
    index ``attempt - 1``, so with the default ``attempt=1`` all fallbacks
    are tried in order. Each call is bounded by *timeout* seconds because
    the SDK has been seen to hang indefinitely on rate limits.

    Args:
        client: Gemini client exposing ``models.generate_content``.
        video_file: ACTIVE Files API object for the uploaded video.
        model: First model name to try.
        attempt: 1-based position in the fallback sequence.
        timeout: Per-call timeout in seconds.

    Returns:
        ``(response_text, model_name_that_succeeded)``.

    Raises:
        Exception: Whatever the last attempted model raised, once every
            candidate has failed.
    """
    candidates = [model, *FALLBACK_MODELS[max(attempt, 1) - 1:]]
    for index, candidate in enumerate(candidates):
        try:
            return _generate_with_timeout(client, video_file, candidate, timeout), candidate
        except Exception as exc:
            if index + 1 >= len(candidates):
                raise
            print(
                f"  ⚠ {candidate} failed ({exc}); falling back to {candidates[index + 1]}",
                flush=True,
            )
    raise RuntimeError("no model candidates")  # unreachable: candidates is never empty


def _as_text(value) -> str:
    """Return *value* as stripped text, mapping ``None`` to an empty string."""
    return "" if value is None else str(value).strip()


def _field(entry: dict, key: str, default: str = "") -> str:
    """Return ``entry[key]`` as text, or *default* when it is missing or null."""
    value = entry.get(key)
    return default if value is None else str(value)


def _dict_entries(value) -> list[dict]:
    """Return only the dict items of *value*; anything else yields an empty list."""
    if not isinstance(value, list):
        return []
    return [item for item in value if isinstance(item, dict)]


def render_video_md(
    *,
    video_id: str,
    video_path: Path,
    analysis: dict,
    meta: dict,
    now_iso: str,
) -> str:
    """Render wiki/videos/<video-id>.md (bilingual).

    Args:
        video_id: Slug used in the page title and frontmatter.
        video_path: Local video file; it is hashed and stat'ed for frontmatter.
        analysis: Parsed model output (the JSON object requested by the prompt).
        meta: Run metadata; only ``meta["model"]`` is read here.
        now_iso: Timestamp recorded as ``analyzed_at`` and ``last_ingest``.

    Returns:
        Markdown text: YAML frontmatter followed by the human-readable body.
        Fields the model returned as null or omitted render as empty text.
    """
    frontmatter = {
        "schema_version": SCHEMA_VERSION,
        "type": "video",
        "video_id": video_id,
        "original_filename": video_path.name,
        "raw_path": f"../../raw/videos/{video_path.name}",
        "sha256": sha256_file(video_path),
        "size_bytes": video_path.stat().st_size,
        "collection": "DOW-UAP-Video",
        "vision_model": meta.get("model"),
        "analyzed_at": now_iso,
        # Promote video_overview
        **{f"overview_{k}": v for k, v in (analysis.get("video_overview") or {}).items()},
        "uap_observation_fields": analysis.get("uap_observation_fields"),
        "classification_markings_visible": analysis.get("classification_markings_visible") or [],
        "redactions_visible": analysis.get("redactions_visible") or [],
        "entities_extracted": analysis.get("entities_extracted") or {},
        "timeline": analysis.get("timeline") or [],
        "anomalies_detected": analysis.get("anomalies_detected") or [],
        "sherlock_observations": analysis.get("sherlock_observations") or [],
        "audio_transcript_verbatim": analysis.get("audio_transcript_verbatim") or [],
        "vision_description": analysis.get("vision_description") or "",
        "vision_description_pt_br": analysis.get("vision_description_pt_br") or "",
        "executive_summary_en": analysis.get("executive_summary_en") or "",
        "executive_summary_pt_br": analysis.get("executive_summary_pt_br") or "",
        "quality_signals": analysis.get("quality_signals") or {},
        "flags": analysis.get("flags") or [],
        "last_ingest": now_iso,
        "wiki_version": WIKI_VERSION,
    }
    yaml_str = yaml.dump(frontmatter, allow_unicode=True, sort_keys=False, default_flow_style=False)

    parts = [
        f"# Video Analysis — {video_id}\n\n",
        f"> Source: `{video_path.name}` · Gemini model: `{meta.get('model')}` · Analyzed: {now_iso}\n\n",
        f"## Executive Summary (EN)\n\n{_as_text(analysis.get('executive_summary_en'))}\n\n",
        f"## Sumário Executivo (PT-BR)\n\n{_as_text(analysis.get('executive_summary_pt_br'))}\n\n",
        f"## Vision Description (EN)\n\n{_as_text(analysis.get('vision_description'))}\n\n",
        f"## Descrição Vision (PT-BR)\n\n{_as_text(analysis.get('vision_description_pt_br'))}\n\n",
        "## Audio Transcript (verbatim, original language)\n\n",
    ]

    for seg in _dict_entries(analysis.get("audio_transcript_verbatim")):
        parts.append(
            f"- **[{_field(seg, 't_start', '?')}–{_field(seg, 't_end', '?')}] "
            f"{_field(seg, 'speaker', '?')}**: {_field(seg, 'text')} "
            f"_(confidence: {_field(seg, 'confidence', '?')})_\n"
        )

    parts.append("\n## Sherlock Observations\n\n")
    for obs in _dict_entries(analysis.get("sherlock_observations")):
        parts.append(
            f"- **[{_field(obs, 'detective_lens', '?')}]** {_field(obs, 'observation')}\n"
            f"  - _Implication:_ {_field(obs, 'implication')}\n"
            f"  - _Confidence:_ `{_field(obs, 'confidence_band', '?')}`\n\n"
        )

    parts.append("## Anomalies Detected\n\n")
    for anomaly in _dict_entries(analysis.get("anomalies_detected")):
        raw_candidates = anomaly.get("candidate_explanations")
        if isinstance(raw_candidates, list):
            candidates = ", ".join(str(c) for c in raw_candidates)
        else:
            candidates = _as_text(raw_candidates)
        parts.append(
            f"- **{_field(anomaly, 'kind', '?')}**: {_field(anomaly, 'description')}\n"
            f"  - _Evidence:_ {_field(anomaly, 'evidence')}\n"
            f"  - _Candidates:_ {candidates}\n"
            f"  - _Confidence:_ `{_field(anomaly, 'confidence_band', '?')}`\n\n"
        )

    body = "".join(parts)
    return f"---\n{yaml_str}---\n\n{body}"


def _strip_json_fence(text: str) -> str:
    """Remove an optional Markdown code fence wrapped around a JSON payload."""
    text = text.strip()
    if text.startswith("```"):
        text = re.sub(r"^```(?:json)?\s*", "", text)
        text = re.sub(r"\s*```$", "", text)
    return text


def _analyze_and_write(
    client,
    video_file,
    video_path: Path,
    video_id: str,
    model: str,
    json_out: Path,
    md_out: Path,
    t0: float,
) -> bool:
    """Run generation for an already-uploaded video and write both outputs."""
    print(f"  calling {model} for Sherlock analysis…", flush=True)
    try:
        text, model_used = call_gemini_for_video(client, video_file, model)
    except Exception as e:
        print(f"  ✗ generation failed: {e}", flush=True)
        return False

    text = _strip_json_fence(text)
    raw_out = json_out.with_suffix(".raw.txt")
    try:
        analysis = json.loads(text)
    except json.JSONDecodeError as e:
        print(f"  ✗ JSON parse failed: {e}", flush=True)
        # Save raw output anyway for inspection
        raw_out.write_text(text, encoding="utf-8")
        return False
    if not isinstance(analysis, dict):
        # The renderer needs a JSON object; a bare list/scalar is unusable.
        print(f"  ✗ JSON parse failed: expected an object, got {type(analysis).__name__}", flush=True)
        raw_out.write_text(text, encoding="utf-8")
        return False

    meta = {"model": model_used, "duration_seconds": round(time.time() - t0, 1)}
    json_out.write_text(
        json.dumps({"analysis": analysis, "meta": meta}, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    md = render_video_md(
        video_id=video_id,
        video_path=video_path,
        analysis=analysis,
        meta=meta,
        now_iso=utc_now_iso(),
    )
    md_out.write_text(md, encoding="utf-8")
    return True


def process_video(client, video_path: Path, model: str, force: bool = False) -> bool:
    """Analyze one video end to end and write its JSON and Markdown outputs.

    Args:
        client: Gemini client.
        video_path: Local video file.
        model: Preferred model; fallbacks are handled by
            ``call_gemini_for_video``.
        force: Re-process even when both output files already exist.

    Returns:
        True when outputs were written or the video was skipped as already
        processed; False when upload, generation or JSON parsing failed.
    """
    video_id = filename_to_video_id(video_path.name)
    json_out = VIDEO_ANALYSIS_DIR / f"{video_id}.json"
    md_out = WIKI_VIDEOS_DIR / f"{video_id}.md"

    if not force and json_out.exists() and md_out.exists():
        print(f"  skip {video_id} (already processed)", flush=True)
        return True

    json_out.parent.mkdir(parents=True, exist_ok=True)
    md_out.parent.mkdir(parents=True, exist_ok=True)

    print(f"\n=== {video_path.name} → {video_id} ===", flush=True)
    t0 = time.time()
    try:
        video_file = upload_and_wait(client, video_path)
    except Exception as e:
        print(f"  ✗ upload failed: {e}", flush=True)
        return False

    try:
        succeeded = _analyze_and_write(
            client, video_file, video_path, video_id, model, json_out, md_out, t0
        )
    finally:
        # Clean up uploaded file to free quota — on failure paths too.
        _delete_uploaded_file(client, video_file)

    if succeeded:
        print(f"  ✓ {video_id} done ({time.time() - t0:.1f}s)", flush=True)
    return succeeded


def _require_dependencies() -> None:
    """Exit with status 1 and an install hint if a third-party package is missing."""
    if genai is None or genai_types is None:
        sys.stderr.write("Missing google-genai. Run: pip3 install google-genai\n")
        sys.exit(1)
    if yaml is None:
        sys.stderr.write("Missing pyyaml. Run: pip3 install pyyaml\n")
        sys.exit(1)


def main():
    """CLI entry point: select the videos, process each, append a run log entry."""
    _require_dependencies()

    ap = argparse.ArgumentParser(description="Sherlock-style video analysis via Gemini 3.1 Pro.")
    g = ap.add_mutually_exclusive_group()
    g.add_argument("--video", help="single video filename in raw/videos/")
    g.add_argument("--all", action="store_true", help="process all videos (default)")
    ap.add_argument("--max-files", type=int, default=None, help="cap number of videos (for testing)")
    ap.add_argument("--model", default=DEFAULT_MODEL, help=f"Gemini model (default {DEFAULT_MODEL})")
    ap.add_argument("--force", action="store_true", help="reprocess existing outputs")
    ap.add_argument("--sort-by-size", action="store_true",
                    help="process smallest videos first (for cheap testing)")
    args = ap.parse_args()

    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        sys.stderr.write("GEMINI_API_KEY not set\n")
        sys.exit(2)
    client = genai.Client(api_key=api_key)

    if args.video:
        v = VIDEOS_DIR / args.video
        if not v.exists():
            sys.stderr.write(f"Video not found: {v}\n")
            sys.exit(1)
        videos = [v]
    else:
        videos = sorted(VIDEOS_DIR.glob("*.mp4"))

    if args.sort_by_size:
        videos.sort(key=lambda p: p.stat().st_size)
    # `is not None` so that an explicit `--max-files 0` is honoured as a cap
    # instead of being mistaken for "no cap".
    if args.max_files is not None:
        videos = videos[: max(args.max_files, 0)]

    print(f"Processing {len(videos)} video(s) with model {args.model}")
    ok = 0
    fail = []
    for v in videos:
        if process_video(client, v, args.model, force=args.force):
            ok += 1
        else:
            fail.append(v.name)

    print(f"\nDone. ok={ok}, failed={len(fail)}")
    if fail:
        print("Failed:", fail)

    if ok > 0:
        LOG_PATH.parent.mkdir(parents=True, exist_ok=True)
        with open(LOG_PATH, "a", encoding="utf-8") as fh:
            fh.write(
                f"\n## {utc_now_iso()} — VIDEO ANALYSIS (Gemini Phase 4)\n"
                f"- operator: archivist + evidence-officer\n- script: scripts/08-video-analysis.py\n"
                f"- model: {args.model}\n- videos_ok: {ok}\n- videos_failed: {len(fail)}\n"
            )


if __name__ == "__main__":
    main()