disclosure-bureau/scripts/08-video-analysis.py

455 lines
21 KiB
Python
Executable file
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
08-video-analysis.py — Sherlock Holmes-style deep video analysis with Gemini 3.1 Pro
For each .mp4 in /Users/guto/ufo/raw/videos/:
1. Upload to Gemini Files API
2. Wait for ACTIVE state
3. Call gemini-3.1-pro-preview with structured Sherlock prompt
4. Receive JSON containing:
- audio_transcript_verbatim (original language, with timestamps)
- vision_description (rich English description, frame-by-frame)
- vision_description_pt_br
- entities_extracted (people/voices, organizations, locations, equipment, UAP objects)
- uap_observations (shape, color, motion descriptors, sensor info, kinematics)
- timeline (events with timestamps in mm:ss)
- anomalies (sensor artifacts vs candidate phenomena, with Locard-style reasoning)
- sherlock_observations (what Holmes/Poirot/Dupin would notice — non-obvious details)
- classification_markings, redactions (visible on screen)
- confidence_band per major claim
5. Save raw JSON to processing/video-analysis/<video-id>.json
6. Write markdown to wiki/videos/<video-id>.md with bilingual frontmatter + body
Idempotent: skips videos whose .md + .json already exist (use --force to redo).
Usage:
./08-video-analysis.py # process all videos in raw/videos/
./08-video-analysis.py --video DOD_111688970.mp4 # single file
./08-video-analysis.py --max-files 3 # cap for testing
./08-video-analysis.py --model gemini-3.1-flash-lite # cheaper fallback
./08-video-analysis.py --force # re-process even if output exists
"""
from __future__ import annotations
import argparse
import hashlib
import json
import os
import re
import sys
import time
import unicodedata
from datetime import datetime, timezone
from pathlib import Path
try:
from google import genai
from google.genai import types as genai_types
except ImportError:
sys.stderr.write("Missing google-genai. Run: pip3 install google-genai\n")
sys.exit(1)
try:
import yaml
except ImportError:
sys.stderr.write("Missing pyyaml. Run: pip3 install pyyaml\n")
sys.exit(1)
# Root of the project data tree; every path below is derived from it.
UFO_ROOT = Path("/Users/guto/ufo")
# Directory scanned for source .mp4 files.
VIDEOS_DIR = UFO_ROOT / "raw" / "videos"
# Destination of the per-video JSON analysis files.
VIDEO_ANALYSIS_DIR = UFO_ROOT / "processing" / "video-analysis"
# Destination of the rendered per-video markdown pages.
WIKI_VIDEOS_DIR = UFO_ROOT / "wiki" / "videos"
# Log file that main() appends a run summary to.
LOG_PATH = UFO_ROOT / "wiki" / "log.md"
# Model used unless --model overrides it.
DEFAULT_MODEL = "gemini-3.1-pro-preview"
# Models call_gemini_for_video falls back to when a generation attempt fails.
FALLBACK_MODELS = ["gemini-3-pro-preview", "gemini-3.1-flash-lite"]
# Written into the markdown frontmatter as schema_version / wiki_version.
SCHEMA_VERSION = "0.1.0"
WIKI_VERSION = "0.1.0"
# Instruction text sent to the model together with the uploaded video (see
# call_gemini_for_video). It asks for one JSON object; render_video_md reads
# that object's top-level keys, so key names here and there must stay in sync.
SHERLOCK_VIDEO_PROMPT = """You are an evidence officer in the Investigation Bureau, applying the combined methodologies of Sherlock Holmes (observation + elimination), Hercule Poirot (psychology of behavior), Auguste Dupin (ratiocination, hidden-in-plain-sight), and Edmond Locard (trace evidence, chain of custody) to a UAP/UFO video released by the U.S. Department of War at war.gov/ufo.
Your task: extract EVERYTHING from this video — visual content, audio, transcription, contextual signals, equipment, persons, and any anomalies. Be exhaustive. A great detective notices what others miss.
Output a SINGLE JSON object (no markdown fence, no preamble, no commentary) matching this exact schema:
{
"video_overview": {
"duration_seconds": <float>,
"primary_subject": "what the video is fundamentally about, one sentence",
"camera_perspective": "cockpit | ground | aerial | satellite | unknown",
"sensor_type": "visual_eo | infrared_FLIR | radar_screen | mixed | unknown",
"platform_inferred": "F/A-18 | helicopter | naval ship | satellite | unknown — based on cockpit layout, HUD elements, screen overlays etc.",
"primary_language_spoken": "en | pt | es | other | none"
},
"audio_transcript_verbatim": [
{
"t_start": "mm:ss",
"t_end": "mm:ss",
"speaker": "pilot_1 | pilot_2 | unknown_male | unknown_female | radio | narrator | n/a",
"text": "EXACT VERBATIM TRANSCRIPTION in the original language, preserve all 'uh', stutters, military jargon, callsigns. Do NOT translate.",
"confidence": "high | medium | low"
}
],
"vision_description": "Comprehensive English narrative of what is visible on screen, with timestamps. Frame-by-frame for key moments. Describe HUD overlays, sensor readouts, on-screen text, classification markings, redactions, target lock indicators, altitude/heading/airspeed values visible, any UAP morphology and motion (track its path with timestamps). 8-15 sentences for a typical 1-3 minute clip.",
"vision_description_pt_br": "Same content as vision_description, translated to Brazilian Portuguese (pt-br). Brazilian vocabulary and spelling. Preserve UTF-8 accents. Keep verbatim quotes from audio/screen in original language.",
"classification_markings_visible": [
{"t_start": "mm:ss", "t_end": "mm:ss", "level": "UNCLASSIFIED | CUI | CONFIDENTIAL | SECRET | TOP SECRET", "caveats": ["NOFORN", "ORCON", ...], "location_on_screen": "header_banner | corner | watermark | stamp"}
],
"redactions_visible": [
{"t_start": "mm:ss", "t_end": "mm:ss", "code": "(b)(1) 1.4(a) | (b)(3) | (b)(6) | other_or_unknown", "description": "what is being blacked/blurred (face, callsign, location, sensor frequency, etc.)"}
],
"entities_extracted": {
"people": [{"label": "Pilot 1 | Air Crew | Officer X", "role": "primary observer | radio operator | passenger | narrator | unknown", "voice_only": true, "first_appearance": "mm:ss"}],
"organizations": [{"name": "USS Nimitz | VFA-41 | AARO | ...", "evidence_for": "patch visible | radio callsign | hull number"}],
"locations": [{"name": "Pacific Ocean off San Diego | Strait of Hormuz | ...", "evidence_for": "coordinates on HUD | named in audio | identifiable landmark", "coordinates": {"lat": null, "lon": null, "raw_text": "..."}}],
"events": [{"label": "UAP intercept during routine patrol", "date": "YYYY-MM-DD | YYYY | NA"}],
"uap_objects": [{"shape": "sphere | disc | triangle | cylinder | cube | elongated-ellipsoid | cigar | irregular | unknown", "color": "...", "size_estimate": "...", "motion_descriptors": ["hover", "instantaneous-direction-change", "descent-from-X-to-Y", "no-visible-exhaust"]}],
"vehicles": [{"name": "F/A-18 Super Hornet | AH-64 | ...", "class": "aircraft | ship | submarine | spacecraft | satellite | ground | other"}],
"equipment_visible": [{"name": "ATFLIR pod | AESA radar | binoculars | EO/IR turret", "purpose": "sensor used to observe UAP"}],
"operations": [{"name": "Range Fouler | Operation X", "type": "military-operation | reporting-protocol | research-program | task-force | other"}],
"concepts": [{"name": "FLIR | Mark I eyeball | SECRET//NOFORN | ...", "class": "jargon | legal-instrument | sensor-term | scientific-term | other"}]
},
"uap_observation_fields": {
"first_visible_at": "mm:ss",
"last_visible_at": "mm:ss",
"duration_visible_seconds": <int>,
"shape": "sphere | disc | triangle | cylinder | cube | elongated-ellipsoid | cigar | irregular | unknown",
"color": "metallic | white | dark | luminous | ...",
"size_estimate": "1-3 m | 10 m | car-sized | etc.",
"altitude_ft": <int or null>,
"speed_kts": <int or null or "supersonic">,
"bearing_deg": <int or null>,
"distance_nm": <float or null>,
"coordinates": {"lat": null, "lon": null, "raw_text": "..."},
"maneuver_descriptors": ["hover", "instantaneous-direction-change", ...],
"sensor_observations": [{"sensor": "ATFLIR", "type": "infrared", "duration_min": "..."}],
"kinematic_anomalies": "describe any motion that defies known physics: no inertia, no sonic boom at high speed, instantaneous turns, etc."
},
"timeline": [
{"t": "mm:ss", "event": "Operator says 'There's a whole fleet of them, look on the SA'", "kind": "audio_quote | visual_event | sensor_event | redaction | classification_change"}
],
"anomalies_detected": [
{
"kind": "kinematic | sensor_artifact | atmospheric | optical_illusion | hoax_indicator | unredacted_slip | inconsistency",
"description": "what is anomalous",
"evidence": "at timestamp mm:ss the object does X while expected Y",
"candidate_explanations": ["prosaic-advanced-tech", "sensor-glare", "atmospheric-refraction", "extraterrestrial", "hoax", ...],
"confidence_band": "high | medium | low | speculation"
}
],
"sherlock_observations": [
{
"detective_lens": "holmes | poirot | dupin | locard",
"observation": "Non-obvious detail a regular viewer would miss. E.g., 'the operator's voice quaver at 1:42 increases at the moment the SA reading changes — emotional reaction precedes the sensor change by 0.5s, suggesting the operator saw the target visually before the radar updated.'",
"implication": "why it matters investigatively",
"confidence_band": "high | medium | low | speculation"
}
],
"executive_summary_en": "3-5 sentence English summary of WHAT happened in this video, suitable for a chat citation in the future Sherlock UI.",
"executive_summary_pt_br": "Same summary in Brazilian Portuguese (pt-br).",
"quality_signals": {
"video_quality_overall": "high | medium | low",
"audio_quality_overall": "high | medium | low | none",
"redaction_density": "none | light | heavy | full-blackout",
"completeness": "complete | truncated | partial",
"extraction_confidence": "high | medium | low"
},
"flags": ["sensitive-content", "audio-only", "redaction-heavy", "low-resolution", "interlaced-artifacts", "thermal-only", "no-audio"]
}
Rules:
- Output ONLY the JSON. No fence, no preamble.
- Empty arrays for not-applicable fields; null for unknown scalars. Never omit keys.
- ALL extracted text (transcript, on-screen text, callsigns) stays in ORIGINAL source language. Do NOT translate.
- ONLY `vision_description_pt_br` and `executive_summary_pt_br` are translations — Brazilian Portuguese (pt-br), NOT European Portuguese. Preserve UTF-8 accents.
- Verbatim quotes from audio INSIDE narrative fields stay in original language; only the surrounding narration is translated.
- Be EXHAUSTIVE in sherlock_observations — aim for 5-15 observations, including subtle audio cues, sensor metadata, behavioral signals.
- For anomalies, list ≥3 candidate explanations including a mundane one (sensor artifact, parallax, atmospheric).
- If duration_seconds = 0 or no content, still return the JSON with empty arrays and flags=["empty-or-corrupt"].
"""
def utc_now_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (second precision)."""
    current = datetime.now(tz=timezone.utc)
    return current.strftime("%Y-%m-%dT%H:%M:%SZ")
def filename_to_video_id(filename: str) -> str:
    """Turn a video filename into a lowercase, hyphen-separated ASCII id.

    Everything after the last ``.`` is dropped, accents are removed through
    NFKD decomposition, every character outside ``[a-z0-9-]`` becomes ``-``,
    hyphen runs are collapsed and leading/trailing hyphens are trimmed. An id
    that starts with a digit is prefixed with ``vid-``. The result is ``""``
    when no usable character survives.
    """
    stem, dot, _extension = filename.rpartition(".")
    if not dot:
        # No extension separator at all: the whole name is the stem.
        stem = filename
    decomposed = unicodedata.normalize("NFKD", stem)
    unaccented = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    slug = re.sub(r"[^a-z0-9-]", "-", unaccented.lower())
    slug = re.sub(r"-+", "-", slug).strip("-")
    # slug[:1] is "" for an empty slug, and "".isdigit() is False.
    return "vid-" + slug if slug[:1].isdigit() else slug
def sha256_file(p: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *p*.

    The file is read in 64 KiB pieces so large videos are never held in
    memory at once.
    """
    digest = hashlib.sha256()
    with open(p, "rb") as stream:
        while True:
            piece = stream.read(65536)
            if not piece:
                break
            digest.update(piece)
    return digest.hexdigest()
def upload_and_wait(client, video_path: Path, poll_interval: float = 2.0, timeout: int = 600):
    """Upload a video through the Files API and block until it is ACTIVE.

    Args:
        client: object exposing ``files.upload``, ``files.get`` and
            ``files.delete``.
        video_path: local file to upload.
        poll_interval: seconds slept between two state polls.
        timeout: maximum number of seconds, counted from just before the
            upload starts, that the file may remain in PROCESSING.

    Returns:
        The file handle last returned by the API, whose state is ACTIVE.

    Raises:
        TimeoutError: the file was still PROCESSING after ``timeout`` seconds.
        RuntimeError: the file left PROCESSING in a state other than ACTIVE.
    """
    size_mb = video_path.stat().st_size / 1024 / 1024
    print(f" uploading {video_path.name} ({size_mb:.1f} MB)…", flush=True)
    # Monotonic clock: elapsed-time checks must not be affected by wall-clock jumps.
    started = time.monotonic()
    f = client.files.upload(file=str(video_path))
    try:
        while f.state.name == "PROCESSING":
            if time.monotonic() - started > timeout:
                raise TimeoutError(f"upload still PROCESSING after {timeout}s")
            time.sleep(poll_interval)
            f = client.files.get(name=f.name)
        if f.state.name != "ACTIVE":
            raise RuntimeError(f"file state is {f.state.name} (not ACTIVE) — cannot use")
    except Exception:
        # The caller never receives a handle on failure, so the remote file
        # would be orphaned; remove it here (best effort) before re-raising.
        try:
            client.files.delete(name=f.name)
        except Exception as cleanup_err:
            print(f" (could not delete unusable upload {f.name}: {cleanup_err})", flush=True)
        raise
    print(f" ✓ file ready ({time.monotonic() - started:.1f}s upload+process)", flush=True)
    return f
def call_gemini_for_video(client, video_file, model: str, attempt: int = 1, timeout: int = 240):
    """Generate the Sherlock analysis for an uploaded video, with model fallback.

    ``model`` is tried first; on any failure each remaining entry of
    FALLBACK_MODELS is tried in order, including the last one.

    Args:
        client: object exposing ``models.generate_content``.
        video_file: file handle passed to the model next to the prompt.
        model: name of the first model to try.
        attempt: 1-based position of ``model`` in the retry chain; fallbacks
            before index ``attempt - 1`` are treated as already used.
        timeout: seconds allowed for each individual generation call.

    Returns:
        ``(text, model_used)``: the non-empty response text and the name of
        the model that produced it.

    Raises:
        Exception: whatever the last attempted model raised, or RuntimeError
            when that call timed out or returned no text.
    """
    import threading

    def _generate(model_name: str) -> str:
        """Run one generation call under a hard wall-clock limit; return its text."""
        outcome = {}

        def _worker():
            try:
                outcome["resp"] = client.models.generate_content(
                    model=model_name,
                    contents=[video_file, SHERLOCK_VIDEO_PROMPT],
                    config=genai_types.GenerateContentConfig(
                        response_mime_type="application/json",
                        temperature=0.2,
                        max_output_tokens=32768,
                    ),
                )
            except BaseException as exc:  # handed back to the calling thread
                outcome["error"] = exc

        # A daemon thread is abandoned on timeout. A ThreadPoolExecutor used as
        # a context manager would wait for the hung call when the block exits,
        # which defeats the timeout, and would also be joined at interpreter exit.
        worker = threading.Thread(target=_worker, daemon=True)
        worker.start()
        worker.join(timeout)
        if worker.is_alive():
            raise RuntimeError(f"Gemini hung >{timeout}s — treating as failure")
        if "error" in outcome:
            raise outcome["error"]
        text = outcome["resp"].text
        if not text:
            # No text means nothing to parse; count it as a failed attempt so
            # the next model gets a chance.
            raise RuntimeError(f"{model_name} returned no text")
        return text

    # Attempt N means `model` is the Nth try, so fallbacks from index N-1 remain.
    candidates = [model, *FALLBACK_MODELS[max(attempt, 1) - 1:]]
    last_index = len(candidates) - 1
    for index, candidate in enumerate(candidates):
        try:
            return _generate(candidate), candidate
        except Exception as e:
            if index == last_index:
                raise
            print(f"{candidate} failed ({e}); falling back to {candidates[index + 1]}", flush=True)
def render_video_md(
    *,
    video_id: str,
    video_path: Path,
    analysis: dict,
    meta: dict,
    now_iso: str,
) -> str:
    """Render the bilingual markdown page for one analysed video.

    Args:
        video_id: id used in the page title and frontmatter.
        video_path: local video file; it is hashed and its size is recorded.
        analysis: JSON object returned by the model. Missing or ``null``
            entries are rendered as empty text or empty lists.
        meta: run metadata; only ``meta["model"]`` is read.
        now_iso: timestamp stored as ``analyzed_at`` and ``last_ingest``.

    Returns:
        YAML frontmatter between ``---`` lines followed by the markdown body.
    """

    def _text(key: str) -> str:
        """Return analysis[key] stripped, or "" when it is missing or not a string."""
        value = analysis.get(key)
        return value.strip() if isinstance(value, str) else ""

    def _field(item: dict, key: str, default: str = ""):
        """Return item[key], substituting *default* for a missing or null value."""
        value = item.get(key)
        return default if value is None else value

    model_name = meta.get("model")
    frontmatter = {
        "schema_version": SCHEMA_VERSION,
        "type": "video",
        "video_id": video_id,
        "original_filename": video_path.name,
        "raw_path": f"../../raw/videos/{video_path.name}",
        "sha256": sha256_file(video_path),
        "size_bytes": video_path.stat().st_size,
        "collection": "DOW-UAP-Video",
        "vision_model": model_name,
        "analyzed_at": now_iso,
        # Promote video_overview
        **{f"overview_{k}": v for k, v in (analysis.get("video_overview") or {}).items()},
        "uap_observation_fields": analysis.get("uap_observation_fields"),
        "classification_markings_visible": analysis.get("classification_markings_visible") or [],
        "redactions_visible": analysis.get("redactions_visible") or [],
        "entities_extracted": analysis.get("entities_extracted") or {},
        "timeline": analysis.get("timeline") or [],
        "anomalies_detected": analysis.get("anomalies_detected") or [],
        "sherlock_observations": analysis.get("sherlock_observations") or [],
        "audio_transcript_verbatim": analysis.get("audio_transcript_verbatim") or [],
        # `or ""` so that an explicit JSON null is stored as an empty string too.
        "vision_description": analysis.get("vision_description") or "",
        "vision_description_pt_br": analysis.get("vision_description_pt_br") or "",
        "executive_summary_en": analysis.get("executive_summary_en") or "",
        "executive_summary_pt_br": analysis.get("executive_summary_pt_br") or "",
        "quality_signals": analysis.get("quality_signals") or {},
        "flags": analysis.get("flags") or [],
        "last_ingest": now_iso,
        "wiki_version": WIKI_VERSION,
    }
    yaml_str = yaml.dump(frontmatter, allow_unicode=True, sort_keys=False, default_flow_style=False)

    # Collect fragments and join once instead of growing a string with +=.
    parts = [
        f"# Video Analysis — {video_id}\n",
        f"> Source: `{video_path.name}` · Gemini model: `{model_name}` · Analyzed: {now_iso}\n",
        "## Executive Summary (EN)\n",
        f"{_text('executive_summary_en')}\n",
        "## Sumário Executivo (PT-BR)\n",
        f"{_text('executive_summary_pt_br')}\n",
        "## Vision Description (EN)\n",
        f"{_text('vision_description')}\n",
        "## Descrição Vision (PT-BR)\n",
        f"{_text('vision_description_pt_br')}\n",
        "## Audio Transcript (verbatim, original language)\n",
    ]
    for seg in (analysis.get("audio_transcript_verbatim") or []):
        # An en dash separates start and end so the two timestamps stay readable.
        parts.append(
            f"- **[{_field(seg, 't_start', '?')}–{_field(seg, 't_end', '?')}] "
            f"{_field(seg, 'speaker', '?')}**: {_field(seg, 'text')} "
            f"_(confidence: {_field(seg, 'confidence', '?')})_\n"
        )
    parts.append("\n## Sherlock Observations\n\n")
    for obs in (analysis.get("sherlock_observations") or []):
        parts.append(
            f"- **[{_field(obs, 'detective_lens', '?')}]** {_field(obs, 'observation')}\n"
            f" - _Implication:_ {_field(obs, 'implication')}\n"
            f" - _Confidence:_ `{_field(obs, 'confidence_band', '?')}`\n\n"
        )
    parts.append("## Anomalies Detected\n\n")
    for a in (analysis.get("anomalies_detected") or []):
        # str() per item: the list may be null or hold non-string entries.
        candidates = ", ".join(str(c) for c in (a.get("candidate_explanations") or []))
        parts.append(
            f"- **{_field(a, 'kind', '?')}**: {_field(a, 'description')}\n"
            f" - _Evidence:_ {_field(a, 'evidence')}\n"
            f" - _Candidates:_ {candidates}\n"
            f" - _Confidence:_ `{_field(a, 'confidence_band', '?')}`\n\n"
        )
    body = "".join(parts)
    return f"---\n{yaml_str}---\n\n{body}"
def process_video(client, video_path: Path, model: str, force: bool = False) -> bool:
    """Analyse one video and write its JSON and markdown outputs.

    Steps: upload the file, ask the model for the analysis, parse the JSON,
    write ``<video-id>.json`` and ``<video-id>.md``. The uploaded file is
    deleted afterwards whether or not the later steps succeeded.

    Args:
        client: Gemini client used for upload, generation and deletion.
        video_path: local video file.
        model: first model to try for generation.
        force: when False, a video whose .json and .md both exist is skipped.

    Returns:
        True when the video was processed or skipped, False when upload,
        generation or JSON parsing failed.
    """
    video_id = filename_to_video_id(video_path.name)
    json_out = VIDEO_ANALYSIS_DIR / f"{video_id}.json"
    md_out = WIKI_VIDEOS_DIR / f"{video_id}.md"
    if not force and json_out.exists() and md_out.exists():
        print(f" skip {video_id} (already processed)", flush=True)
        return True
    json_out.parent.mkdir(parents=True, exist_ok=True)
    md_out.parent.mkdir(parents=True, exist_ok=True)
    print(f"\n=== {video_path.name} → {video_id} ===", flush=True)
    t0 = time.time()
    try:
        video_file = upload_and_wait(client, video_path)
    except Exception as e:
        print(f" ✗ upload failed: {e}", flush=True)
        return False
    try:
        print(f" calling {model} for Sherlock analysis…", flush=True)
        try:
            text, model_used = call_gemini_for_video(client, video_file, model)
        except Exception as e:
            print(f" ✗ generation failed: {e}", flush=True)
            return False
        # The response text may be None; treat that like an empty reply.
        text = (text or "").strip()
        # Strip optional fence
        if text.startswith("```"):
            text = re.sub(r"^```(?:json)?\s*", "", text)
            text = re.sub(r"\s*```$", "", text)
        try:
            analysis = json.loads(text)
        except json.JSONDecodeError as e:
            print(f" ✗ JSON parse failed: {e}", flush=True)
            # Save raw output anyway for inspection
            json_out.with_suffix(".raw.txt").write_text(text, encoding="utf-8")
            return False
        if not isinstance(analysis, dict):
            # The renderer needs a JSON object; a list or scalar is unusable.
            print(f" ✗ JSON parse failed: expected an object, got {type(analysis).__name__}", flush=True)
            json_out.with_suffix(".raw.txt").write_text(text, encoding="utf-8")
            return False
        meta = {"model": model_used, "duration_seconds": round(time.time() - t0, 1)}
        json_out.write_text(
            json.dumps({"analysis": analysis, "meta": meta}, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )
        md = render_video_md(
            video_id=video_id,
            video_path=video_path,
            analysis=analysis,
            meta=meta,
            now_iso=utc_now_iso(),
        )
        md_out.write_text(md, encoding="utf-8")
    finally:
        # Clean up uploaded file to free quota. Runs on failure paths too,
        # otherwise every failed video leaves its upload behind.
        try:
            client.files.delete(name=video_file.name)
        except Exception as e:
            print(f" (could not delete uploaded file: {e})", flush=True)
    elapsed = time.time() - t0
    print(f"{video_id} done ({elapsed:.1f}s)", flush=True)
    return True
def main():
    """Command-line entry point: select videos, process them, log the run.

    Exits with status 2 when GEMINI_API_KEY is unset or the arguments are
    invalid, and with status 1 when the file named by --video does not exist.
    A summary entry is appended to LOG_PATH when at least one video succeeded.
    """
    ap = argparse.ArgumentParser(description="Sherlock-style video analysis via Gemini 3.1 Pro.")
    g = ap.add_mutually_exclusive_group()
    g.add_argument("--video", help="single video filename in raw/videos/")
    g.add_argument("--all", action="store_true", help="process all videos (default)")
    ap.add_argument("--max-files", type=int, default=None, help="cap number of videos (for testing)")
    ap.add_argument("--model", default=DEFAULT_MODEL, help=f"Gemini model (default {DEFAULT_MODEL})")
    ap.add_argument("--force", action="store_true", help="reprocess existing outputs")
    ap.add_argument("--sort-by-size", action="store_true", help="process smallest videos first (for cheap testing)")
    args = ap.parse_args()
    if args.max_files is not None and args.max_files < 0:
        # A negative cap would silently slice videos off the end of the list.
        ap.error("--max-files must be >= 0")

    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        sys.stderr.write("GEMINI_API_KEY not set\n")
        sys.exit(2)
    client = genai.Client(api_key=api_key)

    if args.video:
        v = VIDEOS_DIR / args.video
        if not v.exists():
            sys.stderr.write(f"Video not found: {v}\n")
            sys.exit(1)
        videos = [v]
    else:
        videos = sorted(VIDEOS_DIR.glob("*.mp4"))
    if args.sort_by_size:
        videos.sort(key=lambda p: p.stat().st_size)
    # `is not None` so that an explicit `--max-files 0` really means "none".
    if args.max_files is not None:
        videos = videos[: args.max_files]

    print(f"Processing {len(videos)} video(s) with model {args.model}")
    ok = 0
    fail = []
    for v in videos:
        if process_video(client, v, args.model, force=args.force):
            ok += 1
        else:
            fail.append(v.name)
    print(f"\nDone. ok={ok}, failed={len(fail)}")
    if fail:
        print("Failed:", fail)
    if ok > 0:
        # The log directory may not exist yet on a fresh checkout.
        LOG_PATH.parent.mkdir(parents=True, exist_ok=True)
        with open(LOG_PATH, "a", encoding="utf-8") as fh:
            fh.write(
                f"\n## {utc_now_iso()} — VIDEO ANALYSIS (Gemini Phase 4)\n"
                f"- operator: archivist + evidence-officer\n- script: scripts/08-video-analysis.py\n"
                f"- model: {args.model}\n- videos_ok: {ok}\n- videos_failed: {len(fail)}\n"
            )


if __name__ == "__main__":
    main()