disclosure-bureau/scripts/synthesize/01_anchor_events.py

274 lines
13 KiB
Python
Raw Permalink Normal View History

#!/usr/bin/env python3
"""
01_anchor_events.py — Seed the 20 anchor UAP events with curated bilingual
narrative summaries via a Claude Code OAuth subprocess (Sonnet).
Anchor list comes from ADR-003 (Phase 0). Each event:
- Gets/creates wiki/entities/events/EV-<date>-<slug>.md
- Frontmatter: summary_status=curated, summary_confidence=high,
narrative_summary=<EN>, narrative_summary_pt_br=<PT-BR>
- Body untouched if file already exists with manual edits.
Idempotent: re-run skips events where summary_status == 'curated'.
Usage:
./01_anchor_events.py # all anchors
./01_anchor_events.py --only roswell # one event (substring match)
./01_anchor_events.py --dry-run # print prompt + would-write, no LLM call
"""
from __future__ import annotations
import argparse
import json
import re
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path
try:
import yaml
except ImportError:
sys.stderr.write("pip3 install pyyaml\n")
sys.exit(1)
# Project root: the directory two levels above this script's own directory
# (the script lives at <root>/scripts/synthesize/01_anchor_events.py).
UFO_ROOT = Path(__file__).resolve().parents[2]
# Directory that holds one Markdown file per event entity (<event_id>.md).
EVENTS_DIR = UFO_ROOT / "wiki" / "entities" / "events"
# Run log that main() appends to after a non-dry run that curated at least one event.
LOG_PATH = UFO_ROOT / "wiki" / "log.md"
# Each tuple = (event_id, canonical_name_en, date_start, primary_location, event_class, observers_hint)
# date_start uses YYYY-MM-DD when the day is known, YYYY-MM when only the month
# is known and YYYY when only the year is known. The event_id always has three
# date parts and uses XX for the unknown ones (e.g. EV-1952-09-XX-..., EV-1944-XX-XX-...).
# The tuple order matches the positional parameters of upsert_anchor().
ANCHOR_EVENTS = [
    ("EV-1897-04-17-aurora-airship-crash", "Aurora Airship Crash", "1897-04-17",
     "Aurora, Texas, USA", "uap-encounter",
     "Local townspeople and newspaper reporters (Dallas Morning News, 17 abr 1897)"),
    ("EV-1944-XX-XX-foo-fighters-european-theater", "Foo Fighters (European Theater)", "1944",
     "Western Front, Europe", "uap-encounter",
     "Allied bomber crews of the 415th Night Fighter Squadron and others"),
    ("EV-1947-06-21-maury-island-incident", "Maury Island Incident", "1947-06-21",
     "Puget Sound, Washington, USA", "uap-encounter",
     "Harold Dahl, Fred Crisman, Kenneth Arnold (later investigator)"),
    ("EV-1947-06-24-kenneth-arnold-mount-rainier", "Kenneth Arnold Mount Rainier Sighting", "1947-06-24",
     "Mount Rainier, Washington, USA", "uap-encounter",
     "Kenneth Arnold, civilian pilot"),
    ("EV-1947-07-08-roswell-incident", "Roswell Incident", "1947-07-08",
     "Roswell, New Mexico, USA", "uap-encounter",
     "USAAF 509th Bombardment Group, William Brazel, Major Jesse Marcel"),
    ("EV-1948-01-07-mantell-crash", "Mantell UFO Incident", "1948-01-07",
     "Franklin, Kentucky, USA", "uap-related-fatality",
     "Captain Thomas Mantell, Kentucky Air National Guard"),
    ("EV-1948-07-24-chiles-whitted-encounter", "Chiles-Whitted UFO Encounter", "1948-07-24",
     "Montgomery, Alabama, USA", "uap-encounter",
     "Eastern Airlines pilots Clarence Chiles and John Whitted"),
    ("EV-1952-09-XX-operation-mainbrace-sightings", "Operation Mainbrace UAP Sightings", "1952-09",
     "North Atlantic / Scandinavia", "uap-encounter",
     "NATO naval forces, RAF and USAF crews"),
    ("EV-1959-06-26-father-gill-papua-encounter", "Father Gill Papua Encounter", "1959-06-26",
     "Boianai, Papua New Guinea", "uap-encounter",
     "Reverend William Gill and 37 mission staff and locals"),
    ("EV-1964-04-24-lonnie-zamora-socorro", "Lonnie Zamora Socorro Landing", "1964-04-24",
     "Socorro, New Mexico, USA", "uap-encounter",
     "Police Sergeant Lonnie Zamora"),
    ("EV-1966-04-06-westall-school-encounter", "Westall School Encounter", "1966-04-06",
     "Clayton South, Victoria, Australia", "uap-encounter",
     "Over 200 students and teachers of Westall High School"),
    # The location previously read "ApacheSitgreaves" (the dash had been lost).
    ("EV-1975-11-05-travis-walton-abduction", "Travis Walton Abduction", "1975-11-05",
     "Apache-Sitgreaves National Forest, Arizona, USA", "uap-abduction-claim",
     "Travis Walton and logging crew of six"),
    ("EV-1977-09-XX-operacao-prato", "Operação Prato", "1977-09",
     "Ilha de Colares, Pará, Brasil", "uap-encounter",
     "Força Aérea Brasileira (FAB), Captain Uyrangê Hollanda and local residents"),
    ("EV-1980-12-27-rendlesham-forest-incident", "Rendlesham Forest Incident", "1980-12-27",
     "Rendlesham Forest, Suffolk, UK (RAF Woodbridge / RAF Bentwaters)", "uap-encounter",
     "USAF personnel including Lt Col Charles Halt, Sgt Jim Penniston, John Burroughs"),
    ("EV-1980-12-29-cash-landrum-incident", "Cash-Landrum Incident", "1980-12-29",
     "Dayton, Texas, USA", "uap-related-injury",
     "Betty Cash, Vickie Landrum, Colby Landrum"),
    # NOTE(review): this id contains a non-ASCII character ("ã") and becomes a
    # file name; it is kept as-is because renaming would orphan an existing file.
    ("EV-1986-05-19-são-paulo-night-of-the-ufos", "São Paulo Noite Oficial dos OVNIs", "1986-05-19",
     "Costa do Brasil / São José dos Campos, SP", "uap-encounter",
     "FAB pilots flying Mirage III and F-5E intercepts, Brigadeiro Octavio Moreira Lima briefing"),
    ("EV-1989-11-XX-belgian-wave", "Belgian UFO Wave", "1989-11",
     "Belgium (multiple sites)", "uap-encounter",
     "Belgian Air Force, gendarmerie and over 13,500 civilian witnesses"),
    ("EV-1997-03-13-phoenix-lights", "Phoenix Lights", "1997-03-13",
     "Phoenix, Arizona, USA (and southern Arizona)", "uap-encounter",
     "Thousands of civilians, Governor Fife Symington (later)"),
    ("EV-2004-11-14-nimitz-tic-tac", "Nimitz Tic Tac Incident", "2004-11-14",
     "USS Nimitz Carrier Strike Group, off San Diego coast", "uap-encounter",
     "USN F/A-18F crews Cdr David Fravor, Lt Cdr Jim Slaight, Lt Cdr Chad Underwood; USS Princeton radar (Senior Chief Kevin Day)"),
    ("EV-2017-12-16-aatip-disclosure", "AATIP Public Disclosure", "2017-12-16",
     "New York / Washington, D.C., USA", "uap-disclosure-event",
     "New York Times reporting; Luis Elizondo, Harry Reid, Robert Bigelow"),
]
# Voice rules (Holmes-Watson, fact-dense, no hype).
# The template is rendered with str.format(); the placeholders are event_id,
# name, date, location, cls and observers. Literal braces of the JSON skeleton
# are doubled so that format() leaves them in place.
# The sentence count in the voice rules must agree with the "3-6 sentences"
# stated in the OUTPUT skeleton below.
PROMPT_TEMPLATE = """You are writing a curated encyclopedic event card for an investigative UAP/UFO wiki ("The Disclosure Bureau"). Voice rules:
- Holmes-Watson narrator: precise, fact-dense, no hype, no breathless language.
- Open with what happened, where, when. Then who observed it. Then what made it remarkable. Optionally, what the official record / later investigations concluded.
- 3-6 sentences. No editorial speculation beyond what is well-documented.
- Use the *original language for verbatim quotes*; otherwise English for the EN summary and Brazilian Portuguese (pt-br with full UTF-8 accents) for the PT-BR summary. Do NOT translate already-Portuguese proper names ("Operação Prato" stays as-is in EN too).
- Avoid the words "alegadamente"/"allegedly" unless it's genuinely contested. Be honest about uncertainty when warranted.
- Never include sentences like "Will be enriched in Phase N" or any placeholder - this is the final text.
EVENT TO DOCUMENT:
- ID: {event_id}
- Canonical name: {name}
- Date: {date}
- Primary location: {location}
- Class: {cls}
- Known observers / parties: {observers}
OUTPUT (STRICT JSON, no markdown fences, no commentary):
{{
"narrative_summary": "<EN, 3-6 sentences>",
"narrative_summary_pt_br": "<PT-BR brasileiro, 3-6 sentences>"
}}"""
def utc_iso() -> str:
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``."""
    now = datetime.now(timezone.utc)
    # The format spec is handed to strftime; the trailing "Z" is a literal.
    return f"{now:%Y-%m-%dT%H:%M:%SZ}"
def _extract_summary_json(txt: str) -> dict:
    """Return the JSON object carried by the model's reply text.

    Raises:
        RuntimeError: if no suitable JSON object can be found in ``txt``.
    """
    # Strip code fences if any.
    txt = re.sub(r"^```(?:json)?\s*|\s*```$", "", txt.strip(), flags=re.MULTILINE).strip()
    # Fast path: the whole reply is the JSON object.
    try:
        parsed = json.loads(txt)
    except json.JSONDecodeError:
        parsed = None
    if isinstance(parsed, dict):
        return parsed
    # Slow path: the object is embedded in commentary. Decode from every "{"
    # with a real JSON parser so that braces inside string values cannot
    # truncate the match (a regex that stops at the first "}" would).
    decoder = json.JSONDecoder()
    pos = txt.find("{")
    while pos != -1:
        try:
            candidate, _ = decoder.raw_decode(txt, pos)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict) and "narrative_summary" in candidate:
            return candidate
        pos = txt.find("{", pos + 1)
    raise RuntimeError(f"no JSON object in claude output: {txt[:300]}")


def call_sonnet(prompt: str, dry_run: bool = False) -> dict:
    """Spawn `claude -p` subprocess (uses CLAUDE_CODE_OAUTH_TOKEN env) and return parsed JSON.

    Args:
        prompt: Fully rendered prompt; it is passed to the subprocess on stdin.
        dry_run: When true, no subprocess is started and a placeholder dict
            with both summary keys is returned.

    Returns:
        The JSON object decoded from the model's reply. The caller reads the
        keys ``narrative_summary`` and ``narrative_summary_pt_br``.

    Raises:
        RuntimeError: if the subprocess cannot be started, times out, exits
            with a non-zero status, prints something that is not the expected
            JSON envelope, or the reply contains no JSON object.
    """
    if dry_run:
        return {"narrative_summary": "[dry-run placeholder]", "narrative_summary_pt_br": "[dry-run placeholder]"}
    try:
        res = subprocess.run(
            ["claude", "-p", "--model", "sonnet", "--output-format", "json"],
            input=prompt,
            capture_output=True,
            text=True,
            timeout=180,
            check=False,
        )
    except subprocess.TimeoutExpired as exc:
        raise RuntimeError("claude subprocess timed out after 180s") from exc
    except OSError as exc:
        # E.g. the `claude` executable is not installed or not on PATH.
        raise RuntimeError(f"could not start claude subprocess: {exc}") from exc
    if res.returncode != 0:
        raise RuntimeError(f"claude exit {res.returncode}: {res.stderr[:300]}")
    # claude --output-format json returns wrapped envelope; extract `result`.
    try:
        env = json.loads(res.stdout)
    except json.JSONDecodeError as e:
        raise RuntimeError(f"unparseable claude stdout: {e} :: {res.stdout[:300]}") from e
    if not isinstance(env, dict):
        raise RuntimeError(f"unexpected claude envelope: {res.stdout[:300]}")
    txt = env.get("result") or env.get("response") or env.get("content") or ""
    if not isinstance(txt, str):
        raise RuntimeError(f"unexpected claude reply payload: {str(txt)[:300]}")
    return _extract_summary_json(txt)
def load_yaml_body(path: Path) -> tuple[dict, str]:
    """Split a Markdown file into its YAML frontmatter and its body.

    The frontmatter is the text between a first line that is exactly ``---``
    and the next line that is exactly ``---`` (trailing whitespace ignored).

    Args:
        path: File to read (decoded as UTF-8).

    Returns:
        ``(frontmatter, body)``. When the file has no frontmatter block, or the
        block is never closed, the result is ``({}, <whole file text>)`` so
        that no content is lost. Leading newlines of the body are removed.

    Raises:
        ValueError: if the frontmatter parses to something other than a mapping.
    """
    raw = path.read_text(encoding="utf-8")
    if not raw.startswith("---"):
        return {}, raw
    lines = raw.splitlines(keepends=True)
    # "----" (a horizontal rule) or "---foo" is not a frontmatter delimiter.
    if lines[0].rstrip() != "---":
        return {}, raw
    for idx in range(1, len(lines)):
        # Only a delimiter on its own line closes the block; a "---" inside a
        # value (or indented inside a block scalar) must not.
        if lines[idx].rstrip() != "---":
            continue
        fm = yaml.safe_load("".join(lines[1:idx])) or {}
        if not isinstance(fm, dict):
            raise ValueError(f"frontmatter of {path} is not a YAML mapping")
        body = "".join(lines[idx + 1:]).lstrip("\n")
        return fm, body
    # Unterminated frontmatter: treat the whole file as body.
    return {}, raw
def write_yaml_body(path: Path, fm: dict, body: str) -> None:
    """Write ``fm`` as a ``---``-delimited YAML frontmatter block followed by ``body``.

    Keys are emitted in insertion order, in block style, with non-ASCII
    characters left unescaped. The file is written as UTF-8.
    """
    frontmatter = yaml.dump(fm, allow_unicode=True, sort_keys=False, default_flow_style=False)
    # Keep an empty line between the closing delimiter and the body.
    if not body.startswith("\n"):
        body = "\n" + body
    path.write_text("---\n" + frontmatter + "---\n" + body, encoding="utf-8")
def upsert_anchor(event_id: str, name: str, date: str, location: str, cls: str, observers: str,
                  dry_run: bool, only: str | None) -> tuple[str, bool]:
    """Create or refresh the curated event card for one anchor event.

    Args:
        event_id: Event identifier; also the stem of the Markdown file in EVENTS_DIR.
        name: Canonical English name.
        date: Start date string as listed in ANCHOR_EVENTS.
        location: Primary location.
        cls: Event class.
        observers: Free-text hint about observers; only used in the prompt.
        dry_run: When true, print the prompt and the target path, make no LLM
            call and write nothing.
        only: Optional case-insensitive substring; the event is skipped unless
            it occurs in ``"<event_id> <name>"``.

    Returns:
        ``(message, ok)``. ``ok`` is true only when the card was written (or
        would have been written in a dry run); skipped events and empty model
        output give ``ok == False``.
    """
    if only and only.lower() not in (event_id.lower() + " " + name.lower()):
        return ("skipped (not matched by --only)", False)
    path = EVENTS_DIR / f"{event_id}.md"
    existing_fm: dict = {}
    existing_body: str = ""
    if path.exists():
        existing_fm, existing_body = load_yaml_body(path)
        # Idempotence: a card that is already curated is never regenerated.
        if existing_fm.get("summary_status") == "curated":
            return ("skipped (already curated)", False)
    prompt = PROMPT_TEMPLATE.format(
        event_id=event_id, name=name, date=date, location=location, cls=cls, observers=observers,
    )
    if dry_run:
        # The usage text promises that --dry-run shows the prompt; no call is made.
        print(f"  → [dry-run] prompt for {event_id}:\n{prompt}\n", flush=True)
    else:
        print(f"  → calling sonnet for {event_id} ...", flush=True)
    out = call_sonnet(prompt, dry_run=dry_run)
    narr_en = (out.get("narrative_summary") or "").strip()
    narr_pt = (out.get("narrative_summary_pt_br") or "").strip()
    if not narr_en or not narr_pt:
        return (f"empty output (en={len(narr_en)}, pt={len(narr_pt)})", False)
    # Build/refresh frontmatter. Fields that may have been filled in by hand or
    # by other tooling fall back to the existing value before a default is used.
    fm = {
        "schema_version": "0.1.0",
        "type": "entity",
        "entity_class": "event",
        "event_id": event_id,
        "canonical_name": name,
        "aliases": existing_fm.get("aliases") or [name],
        "event_class": cls,
        "date_start": date,
        "date_end": existing_fm.get("date_end") or date,
        "date_confidence": "high",
        "primary_location": location,
        "observers": existing_fm.get("observers") or [],
        "uap_objects": existing_fm.get("uap_objects") or [],
        "documented_in": existing_fm.get("documented_in") or [],
        "total_mentions": existing_fm.get("total_mentions") or 0,
        "documents_count": existing_fm.get("documents_count") or 0,
        "narrative_summary": narr_en,
        "narrative_summary_pt_br": narr_pt,
        "summary_status": "curated",
        "summary_confidence": "high",
        "enrichment_status": existing_fm.get("enrichment_status") or "none",
        "external_sources": existing_fm.get("external_sources") or [],
        "last_ingest": existing_fm.get("last_ingest") or utc_iso(),
        "last_lint": utc_iso(),
        "wiki_version": "0.1.0",
    }
    # Carry over frontmatter keys this script does not manage instead of
    # silently dropping them; managed keys above keep their new values.
    for key, value in existing_fm.items():
        fm.setdefault(key, value)
    # An existing non-empty body is kept untouched; otherwise a default body is generated.
    body = existing_body if existing_body.strip() else (
        f"# {name}\n\n## Description (EN)\n\n{narr_en}\n\n## Descrição (PT-BR)\n\n{narr_pt}\n"
    )
    if dry_run:
        print(f"  → [dry-run] would write {path}", flush=True)
        return ("ok (dry)", True)
    EVENTS_DIR.mkdir(parents=True, exist_ok=True)
    write_yaml_body(path, fm, body)
    return ("ok", True)
def main() -> int:
    """Command-line entry point: curate every anchor event and log the run.

    Options:
        --only SUBSTR  Process only events whose id or name contains SUBSTR.
        --dry-run      Make no LLM call and write nothing.

    Returns:
        Process exit status: 0 when no event failed, 1 when at least one event
        raised an error or produced empty output. Skipped events are not failures.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--only", default=None, help="filter anchor events by substring match")
    p.add_argument("--dry-run", action="store_true")
    args = p.parse_args()
    print(f"Anchor events: {len(ANCHOR_EVENTS)}")
    done = 0
    failed = 0
    for ev in ANCHOR_EVENTS:
        # One failing event must not abort the run: cards written earlier still
        # have to be counted and logged below.
        try:
            msg, ok = upsert_anchor(*ev, dry_run=args.dry_run, only=args.only)
        except (RuntimeError, OSError, ValueError, yaml.YAMLError) as exc:
            msg, ok = f"failed: {exc}", False
        if ok:
            sign = "✓"
            done += 1
        elif msg.startswith("skipped"):
            sign = "·"
        else:
            sign = "✗"
            failed += 1
        print(f"  {sign} {ev[0]} — {msg}")
    if not args.dry_run and done > 0:
        with LOG_PATH.open("a", encoding="utf-8") as f:
            f.write(
                f"\n## {utc_iso()} · CURATE_ANCHOR_EVENTS\n"
                f"- script: scripts/synthesize/01_anchor_events.py\n"
                f"- curated: {done}/{len(ANCHOR_EVENTS)}\n"
                f"- model: claude-sonnet (via CLAUDE_CODE_OAUTH_TOKEN)\n"
            )
    print(f"\nCurated: {done}/{len(ANCHOR_EVENTS)}")
    if failed:
        print(f"Failed: {failed}/{len(ANCHOR_EVENTS)}")
    return 1 if failed else 0
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())