# Origin: disclosure-bureau/scripts/_archived-gemini/08-video-analysis.py
# (repository web-view metadata: 456 lines, 21 KiB, Python)

#!/usr/bin/env python3
"""
08-video-analysis.py — Sherlock Holmes-style deep video analysis with Gemini 3.1 Pro.
For each .mp4 in /Users/guto/ufo/raw/videos/:
1. Upload to Gemini Files API
2. Wait for ACTIVE state
3. Call gemini-3.1-pro-preview with structured Sherlock prompt
4. Receive JSON containing:
- audio_transcript_verbatim (original language, with timestamps)
- vision_description (rich English description, frame-by-frame)
- vision_description_pt_br
- entities_extracted (people/voices, organizations, locations, equipment, UAP objects)
- uap_observation_fields (shape, color, motion descriptors, sensor info, kinematics)
- timeline (events with timestamps in mm:ss)
- anomalies_detected (sensor artifacts vs candidate phenomena, with Locard-style reasoning)
- sherlock_observations (non-obvious details that Holmes/Poirot/Dupin would notice)
- classification_markings_visible, redactions_visible (as seen on screen)
- confidence_band per major claim
5. Save raw JSON to processing/video-analysis/<video-id>.json
6. Write markdown to wiki/videos/<video-id>.md with bilingual frontmatter + body
Idempotent: skips videos whose .md + .json already exist (use --force to redo).
Usage:
./08-video-analysis.py # process all videos in raw/videos/
./08-video-analysis.py --video DOD_111688970.mp4 # single file
./08-video-analysis.py --max-files 3 # cap for testing
./08-video-analysis.py --model gemini-3.1-flash-lite # cheaper fallback
./08-video-analysis.py --force # re-process even if output exists
"""
from __future__ import annotations
import argparse
import hashlib
import json
import os
import re
import sys
import time
import unicodedata
from datetime import datetime, timezone
from pathlib import Path
try:
from google import genai
from google.genai import types as genai_types
except ImportError:
sys.stderr.write("Missing google-genai. Run: pip3 install google-genai\n")
sys.exit(1)
try:
import yaml
except ImportError:
sys.stderr.write("Missing pyyaml. Run: pip3 install pyyaml\n")
sys.exit(1)
# Root of the archive tree.  Defaults to the original author's checkout; set
# the UFO_ROOT environment variable to run the script against another copy.
UFO_ROOT = Path(os.environ.get("UFO_ROOT") or "/Users/guto/ufo")

# Input videos, raw model output (JSON) and rendered wiki pages (Markdown).
VIDEOS_DIR = UFO_ROOT / "raw" / "videos"
VIDEO_ANALYSIS_DIR = UFO_ROOT / "processing" / "video-analysis"
WIKI_VIDEOS_DIR = UFO_ROOT / "wiki" / "videos"

# Append-only run log; main() adds one entry per run that had successes.
LOG_PATH = UFO_ROOT / "wiki" / "log.md"

# Model used unless --model is given, and the models tried (in order) when a
# generation call fails.
DEFAULT_MODEL = "gemini-3.1-pro-preview"
FALLBACK_MODELS = ["gemini-3-pro-preview", "gemini-3.1-flash-lite"]

# Version stamps written into the frontmatter of every rendered page.
SCHEMA_VERSION = "0.1.0"
WIKI_VERSION = "0.1.0"
# Instruction text sent to the model together with the uploaded video (see
# call_gemini_for_video).  It asks for a single JSON object; the top-level
# keys named in the schema below are the ones render_video_md() reads back
# out of the parsed response, so renaming a key here requires the matching
# change there.  The text is runtime data: edit it deliberately, not for
# cosmetic reasons.
SHERLOCK_VIDEO_PROMPT = """You are an evidence officer in the Investigation Bureau, applying the combined methodologies of Sherlock Holmes (observation + elimination), Hercule Poirot (psychology of behavior), Auguste Dupin (ratiocination, hidden-in-plain-sight), and Edmond Locard (trace evidence, chain of custody) to a UAP/UFO video released by the U.S. Department of War at war.gov/ufo.
Your task: extract EVERYTHING from this video visual content, audio, transcription, contextual signals, equipment, persons, and any anomalies. Be exhaustive. A great detective notices what others miss.
Output a SINGLE JSON object (no markdown fence, no preamble, no commentary) matching this exact schema:
{
"video_overview": {
"duration_seconds": <float>,
"primary_subject": "what the video is fundamentally about, one sentence",
"camera_perspective": "cockpit | ground | aerial | satellite | unknown",
"sensor_type": "visual_eo | infrared_FLIR | radar_screen | mixed | unknown",
"platform_inferred": "F/A-18 | helicopter | naval ship | satellite | unknown — based on cockpit layout, HUD elements, screen overlays etc.",
"primary_language_spoken": "en | pt | es | other | none"
},
"audio_transcript_verbatim": [
{
"t_start": "mm:ss",
"t_end": "mm:ss",
"speaker": "pilot_1 | pilot_2 | unknown_male | unknown_female | radio | narrator | n/a",
"text": "EXACT VERBATIM TRANSCRIPTION in the original language, preserve all 'uh', stutters, military jargon, callsigns. Do NOT translate.",
"confidence": "high | medium | low"
}
],
"vision_description": "Comprehensive English narrative of what is visible on screen, with timestamps. Frame-by-frame for key moments. Describe HUD overlays, sensor readouts, on-screen text, classification markings, redactions, target lock indicators, altitude/heading/airspeed values visible, any UAP morphology and motion (track its path with timestamps). 8-15 sentences for a typical 1-3 minute clip.",
"vision_description_pt_br": "Same content as vision_description, translated to Brazilian Portuguese (pt-br). Brazilian vocabulary and spelling. Preserve UTF-8 accents. Keep verbatim quotes from audio/screen in original language.",
"classification_markings_visible": [
{"t_start": "mm:ss", "t_end": "mm:ss", "level": "UNCLASSIFIED | CUI | CONFIDENTIAL | SECRET | TOP SECRET", "caveats": ["NOFORN", "ORCON", ...], "location_on_screen": "header_banner | corner | watermark | stamp"}
],
"redactions_visible": [
{"t_start": "mm:ss", "t_end": "mm:ss", "code": "(b)(1) 1.4(a) | (b)(3) | (b)(6) | other_or_unknown", "description": "what is being blacked/blurred (face, callsign, location, sensor frequency, etc.)"}
],
"entities_extracted": {
"people": [{"label": "Pilot 1 | Air Crew | Officer X", "role": "primary observer | radio operator | passenger | narrator | unknown", "voice_only": true, "first_appearance": "mm:ss"}],
"organizations": [{"name": "USS Nimitz | VFA-41 | AARO | ...", "evidence_for": "patch visible | radio callsign | hull number"}],
"locations": [{"name": "Pacific Ocean off San Diego | Strait of Hormuz | ...", "evidence_for": "coordinates on HUD | named in audio | identifiable landmark", "coordinates": {"lat": null, "lon": null, "raw_text": "..."}}],
"events": [{"label": "UAP intercept during routine patrol", "date": "YYYY-MM-DD | YYYY | NA"}],
"uap_objects": [{"shape": "sphere | disc | triangle | cylinder | cube | elongated-ellipsoid | cigar | irregular | unknown", "color": "...", "size_estimate": "...", "motion_descriptors": ["hover", "instantaneous-direction-change", "descent-from-X-to-Y", "no-visible-exhaust"]}],
"vehicles": [{"name": "F/A-18 Super Hornet | AH-64 | ...", "class": "aircraft | ship | submarine | spacecraft | satellite | ground | other"}],
"equipment_visible": [{"name": "ATFLIR pod | AESA radar | binoculars | EO/IR turret", "purpose": "sensor used to observe UAP"}],
"operations": [{"name": "Range Fouler | Operation X", "type": "military-operation | reporting-protocol | research-program | task-force | other"}],
"concepts": [{"name": "FLIR | Mark I eyeball | SECRET//NOFORN | ...", "class": "jargon | legal-instrument | sensor-term | scientific-term | other"}]
},
"uap_observation_fields": {
"first_visible_at": "mm:ss",
"last_visible_at": "mm:ss",
"duration_visible_seconds": <int>,
"shape": "sphere | disc | triangle | cylinder | cube | elongated-ellipsoid | cigar | irregular | unknown",
"color": "metallic | white | dark | luminous | ...",
"size_estimate": "1-3 m | 10 m | car-sized | etc.",
"altitude_ft": <int or null>,
"speed_kts": <int or null or "supersonic">,
"bearing_deg": <int or null>,
"distance_nm": <float or null>,
"coordinates": {"lat": null, "lon": null, "raw_text": "..."},
"maneuver_descriptors": ["hover", "instantaneous-direction-change", ...],
"sensor_observations": [{"sensor": "ATFLIR", "type": "infrared", "duration_min": "..."}],
"kinematic_anomalies": "describe any motion that defies known physics: no inertia, no sonic boom at high speed, instantaneous turns, etc."
},
"timeline": [
{"t": "mm:ss", "event": "Operator says 'There's a whole fleet of them, look on the SA'", "kind": "audio_quote | visual_event | sensor_event | redaction | classification_change"}
],
"anomalies_detected": [
{
"kind": "kinematic | sensor_artifact | atmospheric | optical_illusion | hoax_indicator | unredacted_slip | inconsistency",
"description": "what is anomalous",
"evidence": "at timestamp mm:ss the object does X while expected Y",
"candidate_explanations": ["prosaic-advanced-tech", "sensor-glare", "atmospheric-refraction", "extraterrestrial", "hoax", ...],
"confidence_band": "high | medium | low | speculation"
}
],
"sherlock_observations": [
{
"detective_lens": "holmes | poirot | dupin | locard",
"observation": "Non-obvious detail a regular viewer would miss. E.g., 'the operator's voice quaver at 1:42 increases at the moment the SA reading changes — emotional reaction precedes the sensor change by 0.5s, suggesting the operator saw the target visually before the radar updated.'",
"implication": "why it matters investigatively",
"confidence_band": "high | medium | low | speculation"
}
],
"executive_summary_en": "3-5 sentence English summary of WHAT happened in this video, suitable for a chat citation in the future Sherlock UI.",
"executive_summary_pt_br": "Same summary in Brazilian Portuguese (pt-br).",
"quality_signals": {
"video_quality_overall": "high | medium | low",
"audio_quality_overall": "high | medium | low | none",
"redaction_density": "none | light | heavy | full-blackout",
"completeness": "complete | truncated | partial",
"extraction_confidence": "high | medium | low"
},
"flags": ["sensitive-content", "audio-only", "redaction-heavy", "low-resolution", "interlaced-artifacts", "thermal-only", "no-audio"]
}
Rules:
- Output ONLY the JSON. No fence, no preamble.
- Empty arrays for not-applicable fields; null for unknown scalars. Never omit keys.
- ALL extracted text (transcript, on-screen text, callsigns) stays in ORIGINAL source language. Do NOT translate.
- ONLY `vision_description_pt_br` and `executive_summary_pt_br` are translations Brazilian Portuguese (pt-br), NOT European Portuguese. Preserve UTF-8 accents.
- Verbatim quotes from audio INSIDE narrative fields stay in original language; only the surrounding narration is translated.
- Be EXHAUSTIVE in sherlock_observations aim for 5-15 observations, including subtle audio cues, sensor metadata, behavioral signals.
- For anomalies, list 3 candidate explanations including a mundane one (sensor artifact, parallax, atmospheric).
- If duration_seconds = 0 or no content, still return the JSON with empty arrays and flags=["empty-or-corrupt"].
"""
def utc_now_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (second precision)."""
    now = datetime.now(tz=timezone.utc)
    return f"{now:%Y-%m-%dT%H:%M:%SZ}"
def filename_to_video_id(filename: str) -> str:
    """Derive a lowercase, hyphen-separated ASCII id from a video filename.

    The last extension is dropped, accents are removed via NFKD
    decomposition, every character outside ``[a-z0-9-]`` becomes ``-``, runs
    of ``-`` are collapsed and leading/trailing ``-`` are trimmed.  An id that
    starts with a digit is prefixed with ``vid-``.

    When nothing survives that process (for example a name written entirely
    in a non-Latin script, or made only of punctuation) the id would be the
    empty string, and every such file would map to the same ``.json`` /
    ``.md`` output names.  In that case a stable ``vid-<hash>`` id derived
    from the original filename is returned instead.
    """
    stem = filename.rsplit(".", 1)[0]
    decomposed = unicodedata.normalize("NFKD", stem)
    without_accents = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    slug = re.sub(r"[^a-z0-9-]", "-", without_accents.lower())
    slug = re.sub(r"-+", "-", slug).strip("-")
    if not slug:
        # "surrogatepass" keeps this from raising on names that came from the
        # filesystem with undecodable bytes.
        digest = hashlib.sha256(filename.encode("utf-8", "surrogatepass")).hexdigest()
        return f"vid-{digest[:12]}"
    if slug[0].isdigit():
        slug = "vid-" + slug
    return slug
def sha256_file(p: Path) -> str:
    """Return the hex SHA-256 digest of the file at *p*, read in 64 KiB blocks."""
    digest = hashlib.sha256()
    with open(p, "rb") as stream:
        while True:
            block = stream.read(65536)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()
def upload_and_wait(client, video_path: Path, poll_interval: float = 2.0, timeout: int = 600):
    """Upload a video through ``client.files`` and wait until it is ACTIVE.

    Args:
        client: object exposing ``files.upload``, ``files.get`` and
            ``files.delete`` (the Gemini client built in ``main``).
        video_path: local file to upload.
        poll_interval: seconds to sleep between state checks.
        timeout: maximum seconds, counted from just before the upload starts,
            that the file may stay in the PROCESSING state.

    Returns:
        The file object returned by the API once its state is ACTIVE.

    Raises:
        TimeoutError: the file is still PROCESSING after *timeout* seconds.
        RuntimeError: processing ended in any state other than ACTIVE.

    If the file never becomes usable, it is deleted remotely (best effort)
    before the error propagates, so failed uploads do not keep consuming
    quota.
    """
    size_mb = video_path.stat().st_size / 1024 / 1024
    print(f" uploading {video_path.name} ({size_mb:.1f} MB)…", flush=True)
    # Monotonic clock: the timeout must not be affected by wall-clock changes.
    t0 = time.monotonic()
    f = client.files.upload(file=str(video_path))
    try:
        while f.state.name == "PROCESSING":
            if time.monotonic() - t0 > timeout:
                raise TimeoutError(f"upload still PROCESSING after {timeout}s")
            time.sleep(poll_interval)
            f = client.files.get(name=f.name)
        if f.state.name != "ACTIVE":
            raise RuntimeError(f"file state is {f.state.name} (not ACTIVE) — cannot use")
    except BaseException:
        # The caller never receives the handle on this path, so this is the
        # only place the remote file can be released.
        try:
            client.files.delete(name=f.name)
        except Exception as cleanup_error:
            print(f" ! could not delete unusable upload {f.name}: {cleanup_error}", flush=True)
        raise
    print(f" ✓ file ready ({time.monotonic() - t0:.1f}s upload+process)", flush=True)
    return f
def call_gemini_for_video(client, video_file, model: str, attempt: int = 1, timeout: int = 240):
    """Generate the Sherlock analysis for an uploaded video, with model fallback.

    The request runs in a daemon thread that is abandoned if it has not
    finished after *timeout* seconds.  (A ``ThreadPoolExecutor`` used as a
    context manager would wait for the hung call on exit, which defeats the
    timeout; a daemon thread also cannot keep the interpreter alive at exit.)

    Args:
        client: Gemini client exposing ``models.generate_content``.
        video_file: file handle returned by the upload step.
        model: model name to try on this attempt.
        attempt: 1-based attempt counter; attempt ``n`` falls back to
            ``FALLBACK_MODELS[n - 1]`` when it fails.
        timeout: seconds to wait for one generation call.

    Returns:
        ``(text, model_used)`` — the raw response text and the model that
        produced it.

    Raises:
        Exception: whatever the last attempted model raised, once the
            requested model and every entry of ``FALLBACK_MODELS`` have
            failed.  A hang or an empty response is reported as
            ``RuntimeError``.
    """
    import threading

    outcome = {}

    def _call() -> None:
        try:
            outcome["response"] = client.models.generate_content(
                model=model,
                contents=[video_file, SHERLOCK_VIDEO_PROMPT],
                config=genai_types.GenerateContentConfig(
                    response_mime_type="application/json",
                    temperature=0.2,
                    max_output_tokens=32768,
                ),
            )
        except BaseException as exc:  # handed over to the calling thread below
            outcome["error"] = exc

    try:
        worker = threading.Thread(target=_call, name=f"gemini-{model}", daemon=True)
        worker.start()
        worker.join(timeout)
        if worker.is_alive():
            raise RuntimeError(f"Gemini hung >{timeout}s — treating as failure")
        if "error" in outcome:
            raise outcome["error"]
        text = outcome["response"].text
        if not text:
            # Nothing to parse: treat like any other failure so the fallback
            # models get a chance instead of the caller crashing on None.
            raise RuntimeError("Gemini returned an empty response")
        return text, model
    except Exception as e:
        # `<=` (not `<`): attempt n uses FALLBACK_MODELS[n - 1], so the last
        # entry is reached when attempt == len(FALLBACK_MODELS).
        if 1 <= attempt <= len(FALLBACK_MODELS):
            next_model = FALLBACK_MODELS[attempt - 1]
            print(f"{model} failed ({e}); falling back to {next_model}", flush=True)
            return call_gemini_for_video(client, video_file, next_model, attempt + 1, timeout)
        raise
def _as_text(value) -> str:
    """Return *value* as stripped text; ``None`` becomes the empty string."""
    return "" if value is None else str(value).strip()


def _dict_items(value) -> list:
    """Return the dict elements of *value* if it is a list, else an empty list."""
    if not isinstance(value, list):
        return []
    return [item for item in value if isinstance(item, dict)]


def render_video_md(
    *,
    video_id: str,
    video_path: Path,
    analysis: dict,
    meta: dict,
    now_iso: str,
) -> str:
    """Render the ``wiki/videos/<video-id>.md`` page (bilingual).

    The page is a YAML frontmatter block holding the structured analysis,
    followed by a Markdown body with the summaries, vision descriptions,
    transcript, Sherlock observations and anomalies.

    Args:
        video_id: id used in the page title and frontmatter.
        video_path: local video file; it is hashed and its size recorded.
        analysis: parsed model response.  Missing keys and JSON ``null``
            values are tolerated (the prompt tells the model to use ``null``
            for unknown scalars), as are list entries that are not objects.
        meta: run metadata; ``meta["model"]`` is shown as the vision model.
        now_iso: timestamp written as ``analyzed_at`` / ``last_ingest``.

    Returns:
        The complete page text.
    """
    overview = analysis.get("video_overview")
    if not isinstance(overview, dict):
        overview = {}

    frontmatter = {
        "schema_version": SCHEMA_VERSION,
        "type": "video",
        "video_id": video_id,
        "original_filename": video_path.name,
        "raw_path": f"../../raw/videos/{video_path.name}",
        "sha256": sha256_file(video_path),
        "size_bytes": video_path.stat().st_size,
        "collection": "DOW-UAP-Video",
        "vision_model": meta.get("model"),
        "analyzed_at": now_iso,
        # Promote video_overview fields to top-level `overview_*` keys.
        **{f"overview_{k}": v for k, v in overview.items()},
        "uap_observation_fields": analysis.get("uap_observation_fields"),
        "classification_markings_visible": analysis.get("classification_markings_visible") or [],
        "redactions_visible": analysis.get("redactions_visible") or [],
        "entities_extracted": analysis.get("entities_extracted") or {},
        "timeline": analysis.get("timeline") or [],
        "anomalies_detected": analysis.get("anomalies_detected") or [],
        "sherlock_observations": analysis.get("sherlock_observations") or [],
        "audio_transcript_verbatim": analysis.get("audio_transcript_verbatim") or [],
        "vision_description": analysis.get("vision_description") or "",
        "vision_description_pt_br": analysis.get("vision_description_pt_br") or "",
        "executive_summary_en": analysis.get("executive_summary_en") or "",
        "executive_summary_pt_br": analysis.get("executive_summary_pt_br") or "",
        "quality_signals": analysis.get("quality_signals") or {},
        "flags": analysis.get("flags") or [],
        "last_ingest": now_iso,
        "wiki_version": WIKI_VERSION,
    }
    yaml_str = yaml.dump(frontmatter, allow_unicode=True, sort_keys=False, default_flow_style=False)

    # Collect fragments and join once instead of growing a string in loops.
    parts = [
        f"# Video Analysis — {video_id}\n",
        f"> Source: `{video_path.name}` · Gemini model: `{meta.get('model')}` · Analyzed: {now_iso}\n",
        "## Executive Summary (EN)\n",
        _as_text(analysis.get("executive_summary_en")) + "\n",
        "## Sumário Executivo (PT-BR)\n",
        _as_text(analysis.get("executive_summary_pt_br")) + "\n",
        "## Vision Description (EN)\n",
        _as_text(analysis.get("vision_description")) + "\n",
        "## Descrição Vision (PT-BR)\n",
        _as_text(analysis.get("vision_description_pt_br")) + "\n",
        "## Audio Transcript (verbatim, original language)\n",
    ]

    for seg in _dict_items(analysis.get("audio_transcript_verbatim")):
        # The en dash keeps start and end readable ("00:01–00:05").
        parts.append(
            f"- **[{seg.get('t_start', '?')}–{seg.get('t_end', '?')}] {seg.get('speaker', '?')}**: "
            f"{seg.get('text', '')} _(confidence: {seg.get('confidence', '?')})_\n"
        )

    # Sub-bullets are indented two spaces so Markdown nests them under their
    # parent item instead of rendering them as siblings.
    parts.append("\n## Sherlock Observations\n\n")
    for obs in _dict_items(analysis.get("sherlock_observations")):
        parts.append(
            f"- **[{obs.get('detective_lens', '?')}]** {obs.get('observation', '')}\n"
            f"  - _Implication:_ {obs.get('implication', '')}\n"
            f"  - _Confidence:_ `{obs.get('confidence_band', '?')}`\n\n"
        )

    parts.append("## Anomalies Detected\n\n")
    for anomaly in _dict_items(analysis.get("anomalies_detected")):
        candidates = anomaly.get("candidate_explanations") or []
        if not isinstance(candidates, list):
            candidates = [candidates]
        parts.append(
            f"- **{anomaly.get('kind', '?')}**: {anomaly.get('description', '')}\n"
            f"  - _Evidence:_ {anomaly.get('evidence', '')}\n"
            f"  - _Candidates:_ {', '.join(str(c) for c in candidates)}\n"
            f"  - _Confidence:_ `{anomaly.get('confidence_band', '?')}`\n\n"
        )

    body = "".join(parts)
    return f"---\n{yaml_str}---\n\n{body}"
def process_video(client, video_path: Path, model: str, force: bool = False) -> bool:
    """Analyze one video and write its JSON and Markdown outputs.

    Steps: derive the video id, skip if both outputs already exist (unless
    *force*), upload the file, request the analysis, parse the JSON reply,
    write ``<video-id>.json`` and ``<video-id>.md``, then delete the uploaded
    file.

    Args:
        client: Gemini client used for upload, generation and cleanup.
        video_path: local video file.
        model: model name to try first.
        force: reprocess even when both outputs already exist.

    Returns:
        ``True`` when the video was processed or skipped as already done,
        ``False`` when upload, generation or parsing failed.  Failures are
        printed rather than raised so a batch run can continue.
    """
    video_id = filename_to_video_id(video_path.name)
    json_out = VIDEO_ANALYSIS_DIR / f"{video_id}.json"
    md_out = WIKI_VIDEOS_DIR / f"{video_id}.md"
    if not force and json_out.exists() and md_out.exists():
        print(f" skip {video_id} (already processed)", flush=True)
        return True

    json_out.parent.mkdir(parents=True, exist_ok=True)
    md_out.parent.mkdir(parents=True, exist_ok=True)
    print(f"\n=== {video_path.name} → {video_id} ===", flush=True)
    t0 = time.time()

    try:
        video_file = upload_and_wait(client, video_path)
    except Exception as e:
        print(f" ✗ upload failed: {e}", flush=True)
        return False

    # From here on the remote file exists; the `finally` below releases it on
    # every exit path, not only after a successful run.
    try:
        print(f" calling {model} for Sherlock analysis…", flush=True)
        try:
            text, model_used = call_gemini_for_video(client, video_file, model)
        except Exception as e:
            print(f" ✗ generation failed: {e}", flush=True)
            return False

        # Strip optional fence.  `or ""` guards against a response without text.
        text = (text or "").strip()
        if text.startswith("```"):
            text = re.sub(r"^```(?:json)?\s*", "", text)
            text = re.sub(r"\s*```$", "", text)

        raw_out = json_out.with_suffix(".raw.txt")
        try:
            analysis = json.loads(text)
        except json.JSONDecodeError as e:
            print(f" ✗ JSON parse failed: {e}", flush=True)
            # Save raw output anyway for inspection
            raw_out.write_text(text, encoding="utf-8")
            return False
        if not isinstance(analysis, dict):
            # Valid JSON but not an object: the renderer needs a mapping.
            print(
                f" ✗ JSON parse failed: expected an object, got {type(analysis).__name__}",
                flush=True,
            )
            raw_out.write_text(text, encoding="utf-8")
            return False

        meta = {"model": model_used, "duration_seconds": round(time.time() - t0, 1)}
        json_out.write_text(
            json.dumps({"analysis": analysis, "meta": meta}, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )
        md = render_video_md(
            video_id=video_id,
            video_path=video_path,
            analysis=analysis,
            meta=meta,
            now_iso=utc_now_iso(),
        )
        md_out.write_text(md, encoding="utf-8")
    finally:
        # Clean up uploaded file to free quota.  Best effort: a failed delete
        # is reported but never changes the outcome for this video.
        try:
            client.files.delete(name=video_file.name)
        except Exception as e:
            print(f" ! could not delete uploaded file {video_file.name}: {e}", flush=True)

    elapsed = time.time() - t0
    print(f"{video_id} done ({elapsed:.1f}s)", flush=True)
    return True
def main():
    """Command-line entry point: select videos, process them, log the run.

    Exits with status 2 when ``GEMINI_API_KEY`` is unset or ``--max-files``
    is negative, and with status 1 when the file named by ``--video`` does
    not exist.  After processing, prints a summary and, if at least one video
    succeeded (or was skipped as already done), appends an entry to the wiki
    log.
    """
    ap = argparse.ArgumentParser(description="Sherlock-style video analysis via Gemini 3.1 Pro.")
    g = ap.add_mutually_exclusive_group()
    g.add_argument("--video", help="single video filename in raw/videos/")
    g.add_argument("--all", action="store_true", help="process all videos (default)")
    ap.add_argument("--max-files", type=int, default=None, help="cap number of videos (for testing)")
    ap.add_argument("--model", default=DEFAULT_MODEL, help=f"Gemini model (default {DEFAULT_MODEL})")
    ap.add_argument("--force", action="store_true", help="reprocess existing outputs")
    ap.add_argument("--sort-by-size", action="store_true", help="process smallest videos first (for cheap testing)")
    args = ap.parse_args()

    if args.max_files is not None and args.max_files < 0:
        # A negative slice bound would silently drop videos from the end.
        ap.error("--max-files must be >= 0")

    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        sys.stderr.write("GEMINI_API_KEY not set\n")
        sys.exit(2)
    client = genai.Client(api_key=api_key)

    if args.video:
        v = VIDEOS_DIR / args.video
        if not v.exists():
            sys.stderr.write(f"Video not found: {v}\n")
            sys.exit(1)
        videos = [v]
    else:
        videos = sorted(VIDEOS_DIR.glob("*.mp4"))
    if args.sort_by_size:
        videos.sort(key=lambda p: p.stat().st_size)
    # `is not None`, not truthiness: `--max-files 0` must mean "process
    # nothing", not "no cap".
    if args.max_files is not None:
        videos = videos[: args.max_files]

    print(f"Processing {len(videos)} video(s) with model {args.model}")
    ok = 0
    fail = []
    for v in videos:
        if process_video(client, v, args.model, force=args.force):
            ok += 1
        else:
            fail.append(v.name)

    print(f"\nDone. ok={ok}, failed={len(fail)}")
    if fail:
        print("Failed:", fail)
    if ok > 0:
        with open(LOG_PATH, "a", encoding="utf-8") as fh:
            fh.write(
                f"\n## {utc_now_iso()} — VIDEO ANALYSIS (Gemini Phase 4)\n"
                f"- operator: archivist + evidence-officer\n- script: scripts/08-video-analysis.py\n"
                f"- model: {args.model}\n- videos_ok: {ok}\n- videos_failed: {len(fail)}\n"
            )


if __name__ == "__main__":
    main()