456 lines
21 KiB
Python
456 lines
21 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
"""
|
|||
|
|
08-video-analysis.py — Sherlock Holmes-style deep video analysis with Gemini 3.1 Pro
|
|||
|
|
|
|||
|
|
For each .mp4 in /Users/guto/ufo/raw/videos/:
|
|||
|
|
1. Upload to Gemini Files API
|
|||
|
|
2. Wait for ACTIVE state
|
|||
|
|
3. Call gemini-3.1-pro-preview with structured Sherlock prompt
|
|||
|
|
4. Receive JSON containing:
|
|||
|
|
- audio_transcript_verbatim (original language, with timestamps)
|
|||
|
|
- vision_description (rich English description, frame-by-frame)
|
|||
|
|
- vision_description_pt_br
|
|||
|
|
- entities_extracted (people/voices, organizations, locations, equipment, UAP objects)
|
|||
|
|
- uap_observation_fields (shape, color, motion descriptors, sensor info, kinematics)
|
|||
|
|
- timeline (events with timestamps in mm:ss)
|
|||
|
|
- anomalies_detected (sensor artifacts vs candidate phenomena, with Locard-style reasoning)
|
|||
|
|
- sherlock_observations (what Holmes/Poirot/Dupin would notice — non-obvious details)
|
|||
|
|
- classification_markings, redactions (visible on screen)
|
|||
|
|
- confidence_band per major claim
|
|||
|
|
5. Save raw JSON to processing/video-analysis/<video-id>.json
|
|||
|
|
6. Write markdown to wiki/videos/<video-id>.md with bilingual frontmatter + body
|
|||
|
|
|
|||
|
|
Idempotent: skips videos whose .md + .json already exist (use --force to redo).
|
|||
|
|
|
|||
|
|
Usage:
|
|||
|
|
./08-video-analysis.py # process all videos in raw/videos/
|
|||
|
|
./08-video-analysis.py --video DOD_111688970.mp4 # single file
|
|||
|
|
./08-video-analysis.py --max-files 3 # cap for testing
|
|||
|
|
./08-video-analysis.py --model gemini-3.1-flash-lite # cheaper fallback
|
|||
|
|
./08-video-analysis.py --force # re-process even if output exists
|
|||
|
|
"""
|
|||
|
|
from __future__ import annotations
|
|||
|
|
|
|||
|
|
import argparse
|
|||
|
|
import hashlib
|
|||
|
|
import json
|
|||
|
|
import os
|
|||
|
|
import re
|
|||
|
|
import sys
|
|||
|
|
import time
|
|||
|
|
import unicodedata
|
|||
|
|
from datetime import datetime, timezone
|
|||
|
|
from pathlib import Path
|
|||
|
|
|
|||
|
|
try:
|
|||
|
|
from google import genai
|
|||
|
|
from google.genai import types as genai_types
|
|||
|
|
except ImportError:
|
|||
|
|
sys.stderr.write("Missing google-genai. Run: pip3 install google-genai\n")
|
|||
|
|
sys.exit(1)
|
|||
|
|
|
|||
|
|
try:
|
|||
|
|
import yaml
|
|||
|
|
except ImportError:
|
|||
|
|
sys.stderr.write("Missing pyyaml. Run: pip3 install pyyaml\n")
|
|||
|
|
sys.exit(1)
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Root of the local archive; every path below is derived from it.
UFO_ROOT = Path("/Users/guto/ufo")
# Input directory that main() scans for *.mp4 files.
VIDEOS_DIR = UFO_ROOT / "raw" / "videos"
# Output directory for the raw per-video JSON (<video-id>.json).
VIDEO_ANALYSIS_DIR = UFO_ROOT / "processing" / "video-analysis"
# Output directory for the rendered markdown pages (<video-id>.md).
WIKI_VIDEOS_DIR = UFO_ROOT / "wiki" / "videos"
# Markdown log that main() appends a run summary to.
LOG_PATH = UFO_ROOT / "wiki" / "log.md"

# Model used when --model is not given on the command line.
DEFAULT_MODEL = "gemini-3.1-pro-preview"
# Models tried, in order, by call_gemini_for_video() after a failed call.
FALLBACK_MODELS = ["gemini-3-pro-preview", "gemini-3.1-flash-lite"]
# Written into the markdown frontmatter as `schema_version`.
SCHEMA_VERSION = "0.1.0"
# Written into the markdown frontmatter as `wiki_version`.
WIKI_VERSION = "0.1.0"
|
|||
|
|
|
|||
|
|
|
|||
|
|
SHERLOCK_VIDEO_PROMPT = """You are an evidence officer in the Investigation Bureau, applying the combined methodologies of Sherlock Holmes (observation + elimination), Hercule Poirot (psychology of behavior), Auguste Dupin (ratiocination, hidden-in-plain-sight), and Edmond Locard (trace evidence, chain of custody) to a UAP/UFO video released by the U.S. Department of War at war.gov/ufo.
|
|||
|
|
|
|||
|
|
Your task: extract EVERYTHING from this video — visual content, audio, transcription, contextual signals, equipment, persons, and any anomalies. Be exhaustive. A great detective notices what others miss.
|
|||
|
|
|
|||
|
|
Output a SINGLE JSON object (no markdown fence, no preamble, no commentary) matching this exact schema:
|
|||
|
|
|
|||
|
|
{
|
|||
|
|
"video_overview": {
|
|||
|
|
"duration_seconds": <float>,
|
|||
|
|
"primary_subject": "what the video is fundamentally about, one sentence",
|
|||
|
|
"camera_perspective": "cockpit | ground | aerial | satellite | unknown",
|
|||
|
|
"sensor_type": "visual_eo | infrared_FLIR | radar_screen | mixed | unknown",
|
|||
|
|
"platform_inferred": "F/A-18 | helicopter | naval ship | satellite | unknown — based on cockpit layout, HUD elements, screen overlays etc.",
|
|||
|
|
"primary_language_spoken": "en | pt | es | other | none"
|
|||
|
|
},
|
|||
|
|
|
|||
|
|
"audio_transcript_verbatim": [
|
|||
|
|
{
|
|||
|
|
"t_start": "mm:ss",
|
|||
|
|
"t_end": "mm:ss",
|
|||
|
|
"speaker": "pilot_1 | pilot_2 | unknown_male | unknown_female | radio | narrator | n/a",
|
|||
|
|
"text": "EXACT VERBATIM TRANSCRIPTION in the original language, preserve all 'uh', stutters, military jargon, callsigns. Do NOT translate.",
|
|||
|
|
"confidence": "high | medium | low"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
|
|||
|
|
"vision_description": "Comprehensive English narrative of what is visible on screen, with timestamps. Frame-by-frame for key moments. Describe HUD overlays, sensor readouts, on-screen text, classification markings, redactions, target lock indicators, altitude/heading/airspeed values visible, any UAP morphology and motion (track its path with timestamps). 8-15 sentences for a typical 1-3 minute clip.",
|
|||
|
|
"vision_description_pt_br": "Same content as vision_description, translated to Brazilian Portuguese (pt-br). Brazilian vocabulary and spelling. Preserve UTF-8 accents. Keep verbatim quotes from audio/screen in original language.",
|
|||
|
|
|
|||
|
|
"classification_markings_visible": [
|
|||
|
|
{"t_start": "mm:ss", "t_end": "mm:ss", "level": "UNCLASSIFIED | CUI | CONFIDENTIAL | SECRET | TOP SECRET", "caveats": ["NOFORN", "ORCON", ...], "location_on_screen": "header_banner | corner | watermark | stamp"}
|
|||
|
|
],
|
|||
|
|
|
|||
|
|
"redactions_visible": [
|
|||
|
|
{"t_start": "mm:ss", "t_end": "mm:ss", "code": "(b)(1) 1.4(a) | (b)(3) | (b)(6) | other_or_unknown", "description": "what is being blacked/blurred (face, callsign, location, sensor frequency, etc.)"}
|
|||
|
|
],
|
|||
|
|
|
|||
|
|
"entities_extracted": {
|
|||
|
|
"people": [{"label": "Pilot 1 | Air Crew | Officer X", "role": "primary observer | radio operator | passenger | narrator | unknown", "voice_only": true, "first_appearance": "mm:ss"}],
|
|||
|
|
"organizations": [{"name": "USS Nimitz | VFA-41 | AARO | ...", "evidence_for": "patch visible | radio callsign | hull number"}],
|
|||
|
|
"locations": [{"name": "Pacific Ocean off San Diego | Strait of Hormuz | ...", "evidence_for": "coordinates on HUD | named in audio | identifiable landmark", "coordinates": {"lat": null, "lon": null, "raw_text": "..."}}],
|
|||
|
|
"events": [{"label": "UAP intercept during routine patrol", "date": "YYYY-MM-DD | YYYY | NA"}],
|
|||
|
|
"uap_objects": [{"shape": "sphere | disc | triangle | cylinder | cube | elongated-ellipsoid | cigar | irregular | unknown", "color": "...", "size_estimate": "...", "motion_descriptors": ["hover", "instantaneous-direction-change", "descent-from-X-to-Y", "no-visible-exhaust"]}],
|
|||
|
|
"vehicles": [{"name": "F/A-18 Super Hornet | AH-64 | ...", "class": "aircraft | ship | submarine | spacecraft | satellite | ground | other"}],
|
|||
|
|
"equipment_visible": [{"name": "ATFLIR pod | AESA radar | binoculars | EO/IR turret", "purpose": "sensor used to observe UAP"}],
|
|||
|
|
"operations": [{"name": "Range Fouler | Operation X", "type": "military-operation | reporting-protocol | research-program | task-force | other"}],
|
|||
|
|
"concepts": [{"name": "FLIR | Mark I eyeball | SECRET//NOFORN | ...", "class": "jargon | legal-instrument | sensor-term | scientific-term | other"}]
|
|||
|
|
},
|
|||
|
|
|
|||
|
|
"uap_observation_fields": {
|
|||
|
|
"first_visible_at": "mm:ss",
|
|||
|
|
"last_visible_at": "mm:ss",
|
|||
|
|
"duration_visible_seconds": <int>,
|
|||
|
|
"shape": "sphere | disc | triangle | cylinder | cube | elongated-ellipsoid | cigar | irregular | unknown",
|
|||
|
|
"color": "metallic | white | dark | luminous | ...",
|
|||
|
|
"size_estimate": "1-3 m | 10 m | car-sized | etc.",
|
|||
|
|
"altitude_ft": <int or null>,
|
|||
|
|
"speed_kts": <int or null or "supersonic">,
|
|||
|
|
"bearing_deg": <int or null>,
|
|||
|
|
"distance_nm": <float or null>,
|
|||
|
|
"coordinates": {"lat": null, "lon": null, "raw_text": "..."},
|
|||
|
|
"maneuver_descriptors": ["hover", "instantaneous-direction-change", ...],
|
|||
|
|
"sensor_observations": [{"sensor": "ATFLIR", "type": "infrared", "duration_min": "..."}],
|
|||
|
|
"kinematic_anomalies": "describe any motion that defies known physics: no inertia, no sonic boom at high speed, instantaneous turns, etc."
|
|||
|
|
},
|
|||
|
|
|
|||
|
|
"timeline": [
|
|||
|
|
{"t": "mm:ss", "event": "Operator says 'There's a whole fleet of them, look on the SA'", "kind": "audio_quote | visual_event | sensor_event | redaction | classification_change"}
|
|||
|
|
],
|
|||
|
|
|
|||
|
|
"anomalies_detected": [
|
|||
|
|
{
|
|||
|
|
"kind": "kinematic | sensor_artifact | atmospheric | optical_illusion | hoax_indicator | unredacted_slip | inconsistency",
|
|||
|
|
"description": "what is anomalous",
|
|||
|
|
"evidence": "at timestamp mm:ss the object does X while expected Y",
|
|||
|
|
"candidate_explanations": ["prosaic-advanced-tech", "sensor-glare", "atmospheric-refraction", "extraterrestrial", "hoax", ...],
|
|||
|
|
"confidence_band": "high | medium | low | speculation"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
|
|||
|
|
"sherlock_observations": [
|
|||
|
|
{
|
|||
|
|
"detective_lens": "holmes | poirot | dupin | locard",
|
|||
|
|
"observation": "Non-obvious detail a regular viewer would miss. E.g., 'the operator's voice quaver at 1:42 increases at the moment the SA reading changes — emotional reaction precedes the sensor change by 0.5s, suggesting the operator saw the target visually before the radar updated.'",
|
|||
|
|
"implication": "why it matters investigatively",
|
|||
|
|
"confidence_band": "high | medium | low | speculation"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
|
|||
|
|
"executive_summary_en": "3-5 sentence English summary of WHAT happened in this video, suitable for a chat citation in the future Sherlock UI.",
|
|||
|
|
"executive_summary_pt_br": "Same summary in Brazilian Portuguese (pt-br).",
|
|||
|
|
|
|||
|
|
"quality_signals": {
|
|||
|
|
"video_quality_overall": "high | medium | low",
|
|||
|
|
"audio_quality_overall": "high | medium | low | none",
|
|||
|
|
"redaction_density": "none | light | heavy | full-blackout",
|
|||
|
|
"completeness": "complete | truncated | partial",
|
|||
|
|
"extraction_confidence": "high | medium | low"
|
|||
|
|
},
|
|||
|
|
|
|||
|
|
"flags": ["sensitive-content", "audio-only", "redaction-heavy", "low-resolution", "interlaced-artifacts", "thermal-only", "no-audio"]
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
Rules:
|
|||
|
|
- Output ONLY the JSON. No fence, no preamble.
|
|||
|
|
- Empty arrays for not-applicable fields; null for unknown scalars. Never omit keys.
|
|||
|
|
- ALL extracted text (transcript, on-screen text, callsigns) stays in ORIGINAL source language. Do NOT translate.
|
|||
|
|
- ONLY `vision_description_pt_br` and `executive_summary_pt_br` are translations — Brazilian Portuguese (pt-br), NOT European Portuguese. Preserve UTF-8 accents.
|
|||
|
|
- Verbatim quotes from audio INSIDE narrative fields stay in original language; only the surrounding narration is translated.
|
|||
|
|
- Be EXHAUSTIVE in sherlock_observations — aim for 5-15 observations, including subtle audio cues, sensor metadata, behavioral signals.
|
|||
|
|
- For anomalies, list ≥3 candidate explanations including a mundane one (sensor artifact, parallax, atmospheric).
|
|||
|
|
- If duration_seconds = 0 or no content, still return the JSON with empty arrays and flags=["empty-or-corrupt"].
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
|
|||
|
|
def utc_now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string with second precision and a ``Z`` suffix."""
    current = datetime.now(timezone.utc)
    return current.strftime("%Y-%m-%dT%H:%M:%SZ")
|
|||
|
|
|
|||
|
|
|
|||
|
|
def filename_to_video_id(filename: str) -> str:
    """Turn a video filename into a lowercase, hyphen-separated slug.

    The last extension is dropped, accents are removed via NFKD
    decomposition, every character outside ``[a-z0-9-]`` becomes a hyphen,
    hyphen runs are collapsed and trimmed, and a ``vid-`` prefix is added
    when the slug would otherwise start with a digit.
    """
    head, dot, _ext = filename.rpartition(".")
    # Without a dot the whole name is the stem (rpartition leaves head empty).
    stem = head if dot else filename

    decomposed = unicodedata.normalize("NFKD", stem)
    unaccented = "".join(
        filter(lambda ch: not unicodedata.combining(ch), decomposed)
    )

    slug = re.sub(r"[^a-z0-9-]", "-", unaccented.lower())
    slug = re.sub(r"-+", "-", slug).strip("-")

    if slug[:1].isdigit():
        return "vid-" + slug
    return slug
|
|||
|
|
|
|||
|
|
|
|||
|
|
def sha256_file(p: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *p*.

    The file is read in 64 KiB chunks so large videos are never loaded
    into memory at once.
    """
    digest = hashlib.sha256()
    with open(p, "rb") as stream:
        while True:
            block = stream.read(65536)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()
|
|||
|
|
|
|||
|
|
|
|||
|
|
def upload_and_wait(client, video_path: Path, poll_interval: float = 2.0, timeout: int = 600):
    """Upload a video to the Files API and wait until it is ACTIVE.

    Args:
        client: Gemini client exposing ``files.upload``, ``files.get`` and
            ``files.delete``.
        video_path: Local path of the video to upload.
        poll_interval: Seconds to sleep between state polls.
        timeout: Maximum seconds (measured from the start of the upload)
            the file may remain in the PROCESSING state.

    Returns:
        The file object returned by the Files API, in state ACTIVE.

    Raises:
        TimeoutError: The file is still PROCESSING after *timeout* seconds.
        RuntimeError: Processing ended in any state other than ACTIVE.
    """
    size_mb = video_path.stat().st_size / 1024 / 1024
    print(f" uploading {video_path.name} ({size_mb:.1f} MB)…", flush=True)
    t0 = time.time()
    f = client.files.upload(file=str(video_path))
    try:
        while f.state.name == "PROCESSING":
            if time.time() - t0 > timeout:
                raise TimeoutError(f"upload still PROCESSING after {timeout}s")
            time.sleep(poll_interval)
            f = client.files.get(name=f.name)
        if f.state.name != "ACTIVE":
            raise RuntimeError(f"file state is {f.state.name} (not ACTIVE) — cannot use")
    except Exception:
        # The caller never receives a handle for an unusable upload, so this
        # is the only place it can be removed; otherwise it would linger
        # remotely and count against quota. Cleanup is best-effort and must
        # not mask the original error.
        try:
            client.files.delete(name=f.name)
        except Exception as cleanup_err:
            print(f" ⚠ could not delete unusable upload {f.name}: {cleanup_err}", flush=True)
        raise
    print(f" ✓ file ready ({time.time() - t0:.1f}s upload+process)", flush=True)
    return f
|
|||
|
|
|
|||
|
|
|
|||
|
|
def call_gemini_for_video(client, video_file, model: str, attempt: int = 1, timeout: int = 240):
    """Generate the Sherlock analysis for an uploaded video file.

    The request runs in a worker thread so that a call which never returns
    can be abandoned after *timeout* seconds. On any failure the next entry
    of ``FALLBACK_MODELS`` is tried, until every fallback has been used.

    Args:
        client: Gemini client exposing ``models.generate_content``.
        video_file: File handle returned by the Files API upload.
        model: Model name to use for this attempt.
        attempt: 1 for the initial call; ``attempt - 1`` is the number of
            fallback models already consumed.
        timeout: Seconds to wait for the model before treating the call as
            failed.

    Returns:
        A ``(response_text, model_used)`` tuple.

    Raises:
        Exception: The error of the last attempt, once the requested model
            and every fallback model have failed.
    """
    import concurrent.futures

    def _call():
        return client.models.generate_content(
            model=model,
            contents=[video_file, SHERLOCK_VIDEO_PROMPT],
            config=genai_types.GenerateContentConfig(
                response_mime_type="application/json",
                temperature=0.2,
                max_output_tokens=32768,
            ),
        )

    try:
        # Deliberately not a `with` block: leaving the context manager calls
        # shutdown(wait=True), which would block on the hung worker and make
        # the timeout below ineffective.
        ex = concurrent.futures.ThreadPoolExecutor(max_workers=1)
        try:
            future = ex.submit(_call)
            try:
                resp = future.result(timeout=timeout)
            except concurrent.futures.TimeoutError as exc:
                raise RuntimeError(f"Gemini hung >{timeout}s — treating as failure") from exc
        finally:
            ex.shutdown(wait=False)

        text = resp.text
        if not text:
            # A response without any text cannot be parsed downstream; treat
            # it like any other failure so a fallback model gets a chance.
            raise RuntimeError("Gemini returned an empty response")
        return text, model
    except Exception as e:
        # `<=` so that the LAST fallback model is tried too: attempt N uses
        # FALLBACK_MODELS[N - 1] as its successor.
        if attempt <= len(FALLBACK_MODELS):
            next_model = FALLBACK_MODELS[attempt - 1]
            print(f" ⚠ {model} failed ({e}); falling back to {next_model}", flush=True)
            return call_gemini_for_video(client, video_file, next_model, attempt + 1, timeout)
        raise
|
|||
|
|
|
|||
|
|
|
|||
|
|
def render_video_md(
    *,
    video_id: str,
    video_path: Path,
    analysis: dict,
    meta: dict,
    now_iso: str,
) -> str:
    """Render wiki/videos/<video-id>.md (bilingual).

    Args:
        video_id: Slug used in the page title and frontmatter.
        video_path: Local video file; its name, size and SHA-256 go into
            the frontmatter.
        analysis: Parsed model output. Keys may be missing or explicitly
            null (the prompt asks for null on unknown scalars), so every
            field is read defensively.
        meta: Run metadata; only ``meta["model"]`` is read here.
        now_iso: Timestamp recorded as ``analyzed_at`` and ``last_ingest``.

    Returns:
        The full markdown document: YAML frontmatter followed by the body.
    """

    def _text(key: str) -> str:
        # A key can be present with value None; `.get(key, "")` would then
        # return None and crash on `.strip()`.
        value = analysis.get(key)
        if value is None:
            return ""
        return value if isinstance(value, str) else str(value)

    def _records(key: str) -> list:
        # Only dict items can be rendered with `.get`; anything else in the
        # model output is skipped in the body (the frontmatter keeps it).
        value = analysis.get(key)
        if not isinstance(value, list):
            return []
        return [item for item in value if isinstance(item, dict)]

    overview = analysis.get("video_overview")
    if not isinstance(overview, dict):
        overview = {}
    model_name = meta.get("model")

    frontmatter = {
        "schema_version": SCHEMA_VERSION,
        "type": "video",
        "video_id": video_id,
        "original_filename": video_path.name,
        "raw_path": f"../../raw/videos/{video_path.name}",
        "sha256": sha256_file(video_path),
        "size_bytes": video_path.stat().st_size,
        "collection": "DOW-UAP-Video",
        "vision_model": model_name,
        "analyzed_at": now_iso,
        # Promote video_overview
        **{f"overview_{k}": v for k, v in overview.items()},
        "uap_observation_fields": analysis.get("uap_observation_fields"),
        "classification_markings_visible": analysis.get("classification_markings_visible") or [],
        "redactions_visible": analysis.get("redactions_visible") or [],
        "entities_extracted": analysis.get("entities_extracted") or {},
        "timeline": analysis.get("timeline") or [],
        "anomalies_detected": analysis.get("anomalies_detected") or [],
        "sherlock_observations": analysis.get("sherlock_observations") or [],
        "audio_transcript_verbatim": analysis.get("audio_transcript_verbatim") or [],
        "vision_description": _text("vision_description"),
        "vision_description_pt_br": _text("vision_description_pt_br"),
        "executive_summary_en": _text("executive_summary_en"),
        "executive_summary_pt_br": _text("executive_summary_pt_br"),
        "quality_signals": analysis.get("quality_signals") or {},
        "flags": analysis.get("flags") or [],
        "last_ingest": now_iso,
        "wiki_version": WIKI_VERSION,
    }
    yaml_str = yaml.dump(frontmatter, allow_unicode=True, sort_keys=False, default_flow_style=False)

    summary_en = _text("executive_summary_en").strip()
    summary_pt = _text("executive_summary_pt_br").strip()
    vision_en = _text("vision_description").strip()
    vision_pt = _text("vision_description_pt_br").strip()

    # Collect fragments and join once instead of growing a string with +=.
    parts = [f"""# Video Analysis — {video_id}

> Source: `{video_path.name}` · Gemini model: `{model_name}` · Analyzed: {now_iso}

## Executive Summary (EN)

{summary_en}

## Sumário Executivo (PT-BR)

{summary_pt}

## Vision Description (EN)

{vision_en}

## Descrição Vision (PT-BR)

{vision_pt}

## Audio Transcript (verbatim, original language)

"""]

    for seg in _records("audio_transcript_verbatim"):
        parts.append(
            f"- **[{seg.get('t_start','?')}–{seg.get('t_end','?')}] {seg.get('speaker','?')}**: {seg.get('text','')} _(confidence: {seg.get('confidence','?')})_\n"
        )

    parts.append("\n## Sherlock Observations\n\n")
    for obs in _records("sherlock_observations"):
        parts.append(
            f"- **[{obs.get('detective_lens','?')}]** {obs.get('observation','')}\n - _Implication:_ {obs.get('implication','')}\n - _Confidence:_ `{obs.get('confidence_band','?')}`\n\n"
        )

    parts.append("## Anomalies Detected\n\n")
    for a in _records("anomalies_detected"):
        # candidate_explanations may be null or hold non-string items.
        raw_candidates = a.get("candidate_explanations")
        if not isinstance(raw_candidates, list):
            raw_candidates = []
        candidates = ", ".join(str(c) for c in raw_candidates)
        parts.append(
            f"- **{a.get('kind','?')}**: {a.get('description','')}\n - _Evidence:_ {a.get('evidence','')}\n - _Candidates:_ {candidates}\n - _Confidence:_ `{a.get('confidence_band','?')}`\n\n"
        )

    body = "".join(parts)
    return f"---\n{yaml_str}---\n\n{body}"
|
|||
|
|
|
|||
|
|
|
|||
|
|
def process_video(client, video_path: Path, model: str, force: bool = False) -> bool:
    """Analyze one video and write its JSON and markdown outputs.

    Steps: skip if both outputs already exist (unless *force*), upload the
    video, request the analysis, parse the JSON reply, write
    ``<video-id>.json`` and ``<video-id>.md``, and finally delete the
    uploaded remote file.

    Args:
        client: Gemini client used for upload, generation and deletion.
        video_path: Local video file to process.
        model: Model name for the first generation attempt.
        force: Reprocess even when both output files already exist.

    Returns:
        True when the video was processed (or skipped as already done),
        False when upload, generation, parsing or rendering failed.
    """
    video_id = filename_to_video_id(video_path.name)
    json_out = VIDEO_ANALYSIS_DIR / f"{video_id}.json"
    md_out = WIKI_VIDEOS_DIR / f"{video_id}.md"

    if not force and json_out.exists() and md_out.exists():
        print(f" skip {video_id} (already processed)", flush=True)
        return True

    json_out.parent.mkdir(parents=True, exist_ok=True)
    md_out.parent.mkdir(parents=True, exist_ok=True)

    print(f"\n=== {video_path.name} → {video_id} ===", flush=True)
    t0 = time.time()
    try:
        video_file = upload_and_wait(client, video_path)
    except Exception as e:
        print(f" ✗ upload failed: {e}", flush=True)
        return False

    # From here on the remote upload exists; the `finally` below removes it
    # on every exit path, not only on success.
    try:
        print(f" calling {model} for Sherlock analysis…", flush=True)
        try:
            text, model_used = call_gemini_for_video(client, video_file, model)
        except Exception as e:
            print(f" ✗ generation failed: {e}", flush=True)
            return False

        # Strip optional fence. `text or ""` guards against a response that
        # carried no text at all.
        text = (text or "").strip()
        if text.startswith("```"):
            text = re.sub(r"^```(?:json)?\s*", "", text)
            text = re.sub(r"\s*```$", "", text)

        raw_out = json_out.with_suffix(".raw.txt")
        try:
            analysis = json.loads(text)
        except json.JSONDecodeError as e:
            print(f" ✗ JSON parse failed: {e}", flush=True)
            # Save raw output anyway for inspection
            raw_out.write_text(text, encoding="utf-8")
            return False
        if not isinstance(analysis, dict):
            # The renderer needs a JSON object; a list or scalar is unusable.
            print(
                f" ✗ JSON parse failed: expected an object, got {type(analysis).__name__}",
                flush=True,
            )
            raw_out.write_text(text, encoding="utf-8")
            return False

        meta = {"model": model_used, "duration_seconds": round(time.time() - t0, 1)}
        # The JSON is written first so the model output survives even if
        # rendering fails afterwards.
        json_out.write_text(
            json.dumps({"analysis": analysis, "meta": meta}, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )
        try:
            md = render_video_md(
                video_id=video_id,
                video_path=video_path,
                analysis=analysis,
                meta=meta,
                now_iso=utc_now_iso(),
            )
            md_out.write_text(md, encoding="utf-8")
        except Exception as e:
            # One malformed analysis must not abort the whole batch; without
            # the .md file the video is retried on the next run.
            print(f" ✗ markdown render failed: {e}", flush=True)
            return False
    finally:
        # Clean up uploaded file to free quota (best-effort).
        try:
            client.files.delete(name=video_file.name)
        except Exception as e:
            print(f" ⚠ could not delete uploaded file {video_file.name}: {e}", flush=True)

    elapsed = time.time() - t0
    print(f" ✓ {video_id} done ({elapsed:.1f}s)", flush=True)
    return True
|
|||
|
|
|
|||
|
|
|
|||
|
|
def main():
    """Command-line entry point: select videos, process them, log the run.

    Reads the API key from the ``GEMINI_API_KEY`` environment variable,
    processes either the single ``--video`` or every ``*.mp4`` in
    ``VIDEOS_DIR``, prints a summary, and appends a run entry to
    ``LOG_PATH`` when at least one video succeeded.

    Exits with status 2 when the API key is missing or an argument is
    invalid, and with status 1 when the requested ``--video`` does not
    exist.
    """
    ap = argparse.ArgumentParser(description="Sherlock-style video analysis via Gemini 3.1 Pro.")
    g = ap.add_mutually_exclusive_group()
    g.add_argument("--video", help="single video filename in raw/videos/")
    g.add_argument("--all", action="store_true", help="process all videos (default)")
    ap.add_argument("--max-files", type=int, default=None, help="cap number of videos (for testing)")
    ap.add_argument("--model", default=DEFAULT_MODEL, help=f"Gemini model (default {DEFAULT_MODEL})")
    ap.add_argument("--force", action="store_true", help="reprocess existing outputs")
    ap.add_argument("--sort-by-size", action="store_true", help="process smallest videos first (for cheap testing)")
    args = ap.parse_args()

    # A negative cap would silently drop videos from the END of the list
    # via slicing; reject it up front.
    if args.max_files is not None and args.max_files < 0:
        ap.error("--max-files must be >= 0")

    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        sys.stderr.write("GEMINI_API_KEY not set\n")
        sys.exit(2)

    client = genai.Client(api_key=api_key)

    if args.video:
        v = VIDEOS_DIR / args.video
        if not v.exists():
            sys.stderr.write(f"Video not found: {v}\n")
            sys.exit(1)
        videos = [v]
    else:
        videos = sorted(VIDEOS_DIR.glob("*.mp4"))
        if args.sort_by_size:
            videos.sort(key=lambda p: p.stat().st_size)

    # `is not None` rather than truthiness, so `--max-files 0` really caps
    # the run at zero videos instead of meaning "no cap".
    if args.max_files is not None:
        videos = videos[: args.max_files]

    print(f"Processing {len(videos)} video(s) with model {args.model}")
    ok = 0
    fail = []
    for v in videos:
        if process_video(client, v, args.model, force=args.force):
            ok += 1
        else:
            fail.append(v.name)

    print(f"\nDone. ok={ok}, failed={len(fail)}")
    if fail:
        print("Failed:", fail)

    if ok > 0:
        # Opening in append mode creates the file but not its directory.
        LOG_PATH.parent.mkdir(parents=True, exist_ok=True)
        with open(LOG_PATH, "a", encoding="utf-8") as fh:
            fh.write(
                f"\n## {utc_now_iso()} — VIDEO ANALYSIS (Gemini Phase 4)\n"
                f"- operator: archivist + evidence-officer\n- script: scripts/08-video-analysis.py\n"
                f"- model: {args.model}\n- videos_ok: {ok}\n- videos_failed: {len(fail)}\n"
            )
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|