disclosure-bureau/scripts/02b-enrich-with-web-metadata.py
Luiz Gustavo 55cac8a395
Some checks failed
CI / Web — typecheck + lint + build (push) Failing after 1m30s
CI / Scripts — Python smoke (push) Failing after 32s
CI / Web — npm audit (push) Failing after 37s
W0+W1+W1.2: security hardening, observability, autocomplete, glitchtip, forgejo CI
W0 — security hardening (5 fixes verified live on disclosure.top)
- middleware: gate /api/admin/* same as /admin/* (F1)
- imgproxy: tighten LOCAL_FILESYSTEM_ROOT from / to /var/lib/storage (F2)
- studio: real basic-auth label (bcrypt hash, middleware reference) (F3)
- relations: ENABLE ROW LEVEL SECURITY + public SELECT policy (F4)
- migration 0003: fold is_searchable + hybrid_search update into canonical (TD#2)

W1 — observability + resilience + autocomplete
- studio: HOSTNAME=0.0.0.0 so Next.js binds on loopback for healthcheck
- compose: PG_POOL_MAX=20, CLAUDE_CODE_OAUTH_TOKEN gated by separate env
- claude-code.ts: subprocess timeout configurable (CLAUDE_CODE_TIMEOUT_MS)
- openrouter.ts: retry with exponential backoff + Retry-After + in-memory
  circuit breaker (promotes FALLBACK after CB_THRESHOLD failures)
- lib/logger.ts: pino logger (NDJSON prod / pretty dev) + withRequest helper
- middleware: mints correlation_id, stamps x-correlation-id response header,
  emits structured http_request log per /api/* call
- messages/route.ts: switch to structured logger
- 60_meili_index.py: push documents + chunks into Meilisearch
- /api/search/autocomplete: parallel meili search (docs + chunks), 5-8ms p50
- search-autocomplete.tsx: debounced dropdown wired into search-panel

W1.2 — Glitchtip + Forgejo self-hosted
- compose: glitchtip-redis + glitchtip-web + glitchtip-worker (v4.2)
- compose: forgejo + forgejo-runner (server v9, runner v6) with group_add=988
- @sentry/nextjs SDK wired (instrumentation.ts + sentry.{client,server}.config.ts)
- /api/admin/throw smoke endpoint (gated by W0-F1 middleware)
- Synthetic event ingestion verified at glitchtip.disclosure.top
- forgejo.disclosure.top up, repo discadmin/disclosure-bureau created,
  runner registered (labels: ubuntu-latest, docker)
- .forgejo/workflows/ci.yml: typecheck + lint + build + npm audit + python
  syntax + compose validation

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-23 18:18:42 -03:00

383 lines
15 KiB
Python
Executable file

#!/usr/bin/env python3
"""
02b-enrich-with-web-metadata.py — Phase 0.5
Injects the war.gov-extracted metadata (record_id, incident_date,
incident_location, agency, etc.) into each wiki/documents/<doc-id>.md
frontmatter. Also marks the 4 placeholder records as `availability:
pending-upstream`.
For each document.md we already created from a local PDF, we find the
matching war.gov record using the same 3-tier matcher as 00b-coverage:
1. exact-norm
2. primary-id (DOW-UAP-D74, DOS-UAP-D1, etc.)
3. Jaccard ≥0.5 on signature tokens
The matched record's fields are added under a `war_gov` block in the
frontmatter (non-destructive — never overwrites existing manual data).
If `--rename-events` is passed, event files named `EV-XXXX-XX-XX-…` are
renamed to `EV-YYYY-MM-DD-…` based on the matched document's incident_date.
The script also updates all wiki-link references to the renamed event ids.
Usage:
./02b-enrich-with-web-metadata.py [--dry-run] [--rename-events] [--metadata-json PATH ...]
"""
from __future__ import annotations
import argparse
import json
import re
import sys
import unicodedata
from datetime import datetime, timezone
from pathlib import Path
try:
import yaml
except ImportError:
sys.stderr.write("Missing pyyaml. Run: pip3 install pyyaml\n")
sys.exit(1)
# Base directory; every other path constant below is derived from it.
UFO_ROOT = Path("/Users/guto/ufo")
# Document pages (*.md) whose frontmatter main() enriches.
DOCS_DIR = UFO_ROOT / "wiki" / "documents"
# Event entity pages that --rename-events may rename.
EVENTS_DIR = UFO_ROOT / "wiki" / "entities" / "events"
# Markdown log that main() appends a run summary to.
LOG_PATH = UFO_ROOT / "wiki" / "log.md"
# Fallback metadata file, used by main() only when its default glob for
# all-documents-release-*-basic.json finds nothing.
METADATA_JSON = UFO_ROOT / "processing" / "war-gov-metadata" / "all-documents-release-01-basic.json"
# Records whose Download serves a placeholder file (verified 2026-05-13)
PLACEHOLDER_RECORDS = {"record-140", "record-154", "record-155", "record-156"}
# Tokens that signature_tokens() discards before the Jaccard comparison:
# stop words, agency names, document-type words, month names and place names.
COMMON = {
"mission", "report", "uap", "the", "of", "and", "a", "in", "on", "for",
"with", "to", "from", "department", "war", "fbi", "nasa", "state",
"unresolved", "debrief", "summary", "transcript", "crew", "general",
"vol", "incident", "summaries", "photo", "video", "cable", "email",
"correspondence", "correspondance", "launch", "range", "fouler",
"force", "air", "navy", "between", "or", "year", "month",
"january", "february", "march", "april", "may", "june", "july",
"august", "september", "october", "november", "december", "redacted",
"sub", "sighting", "about", "kuwait", "kazakhstan", "papua", "guinea",
"syria", "iraq", "iran", "yemen", "djibouti", "japan", "greece",
"mexico", "germany", "turkey", "turkmenistan", "georgia", "tbilisi",
"indopacom", "middle", "east", "africa", "europe", "western", "united",
"states", "north", "south", "america",
}
def normalize(s: str) -> str:
    """Reduce a title or filename to a lowercase, dash-separated slug.

    Accents are dropped (NFKD decomposition with combining marks removed),
    apostrophes and square brackets are deleted, and every remaining run of
    characters outside ``[a-z0-9]`` collapses to a single dash. Zero padding
    on a number that directly follows a letter or a dash is stripped, so
    "D074" and "D74" produce the same slug. Falsy input yields "".
    """
    if not s:
        return ""
    decomposed = unicodedata.normalize("NFKD", s)
    unaccented = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    # One pass instead of four chained replace() calls; commas become dashes,
    # the other three characters are deleted outright.
    punctuation = str.maketrans({"'": None, ",": "-", "[": None, "]": None})
    cleaned = unaccented.lower().translate(punctuation)
    slug = re.sub(r"[^a-z0-9]+", "-", cleaned)
    slug = re.sub(r"-+", "-", slug).strip("-")
    # Strip zero padding until nothing changes any more.
    while True:
        unpadded = re.sub(r"(?<=[a-z-])0+(\d)", r"\1", slug)
        if unpadded == slug:
            return slug
        slug = unpadded
def signature_tokens(s: str) -> set[str]:
    """Return the distinctive tokens of *s* for fuzzy title matching.

    The string is normalized, split on dashes, and stripped of empty tokens
    and of every word listed in COMMON.
    """
    tokens = set(normalize(s).split("-"))
    tokens.discard("")
    return tokens - COMMON
def jaccard(a: set, b: set) -> float:
    """Return the Jaccard similarity |a ∩ b| / |a ∪ b| of two sets.

    If either set is empty the result is 0.0, which also avoids dividing
    by zero.
    """
    if not a or not b:
        return 0.0
    shared = len(a & b)
    combined = len(a | b)
    return shared / combined
def primary_id(s: str) -> str | None:
    """Extract the leading catalogue id from a title, or None if there is none.

    The title is normalized first, so ids come back lowercase and without
    zero padding. Patterns are tried in order against the start of the slug:
    the generic "<agency>-uap-<letters><digits>" form (e.g. "cia-uap-d1",
    "odni-uap-d1", "dow-uap-d17"), then the dedicated "fbi-photo-<letter><digits>"
    form.
    """
    slug = normalize(s)
    patterns = (
        r"^((?:cia|doe|dod|dow|dos|odni|nasa|fbi)-uap-[a-z]{1,4}\d+[a-z]?)",
        r"^(fbi-photo-[a-z]\d+)",
    )
    for pattern in patterns:
        found = re.match(pattern, slug)
        if found:
            return found.group(1)
    return None
def parse_us_date(s: str) -> tuple[str, str]:
    """Parse a US-format date such as '12/30/47' or '11/9/23'.

    Returns ``(iso_date, confidence)``, e.g. ``('1947-12-30', 'high')``.

    Handled forms:
      * empty / None / 'N/A' / 'NA' / 'NULL'  → ('NA', 'none')
      * 'M/D/YY' or 'M/D/YYYY'                → full ISO date, 'high'.
        Two-digit years <= 30 map to 20xx, all others to 19xx.
      * year only, '1969'                     → ('1969-XX-XX', 'medium')
      * 'LATE 2025'                           → ('2025-12-XX', 'low')
      * range '4/10/2025-4/11/2025'           → first date, 'medium'.
        En and em dashes are accepted as range separators too.
      * anything else                         → ('NA', 'speculation')

    A numeric date that is not a real calendar day (month 13, February 30,
    a three-digit year, ...) is reported as ('NA', 'speculation') instead of
    being passed through as a "high"-confidence ISO string.
    """
    if not s:
        return ("NA", "none")
    # Metadata comes from JSON, so tolerate a bare number such as 1969.
    s = str(s).strip()
    if not s or s.upper() in ("N/A", "NA", "NULL"):
        return ("NA", "none")
    # Treat typographic dashes like the ASCII hyphen used for ranges.
    s = s.replace("\u2013", "-").replace("\u2014", "-")

    # Range: keep only the first half, at reduced confidence.
    head, dash, _tail = s.partition("-")
    if dash and any(ch.isdigit() for ch in head):
        iso, _ = parse_us_date(head.strip())
        if iso != "NA":
            return (iso, "medium")

    # Fuzzy patterns
    if re.match(r"^late\s+\d{4}$", s, re.I):
        year_text = re.search(r"\d{4}", s).group(0)
        return (f"{year_text}-12-XX", "low")
    if re.match(r"^\d{4}$", s):
        return (f"{s}-XX-XX", "medium")

    # M/D/YY or M/D/YYYY (exactly two or four year digits)
    m = re.match(r"^(\d{1,2})/(\d{1,2})/(\d{2}|\d{4})$", s)
    if m:
        month, day, year = (int(part) for part in m.groups())
        if len(m.group(3)) == 2:
            year = 2000 + year if year <= 30 else 1900 + year
        try:
            # Constructing the datetime is only a calendar validity check.
            datetime(year, month, day)
        except ValueError:
            return ("NA", "speculation")
        return (f"{year:04d}-{month:02d}-{day:02d}", "high")
    return ("NA", "speculation")
def event_id_from_date_and_slug(iso_date: str, slug_seed: str) -> str:
    """Build an ``EV-YYYY-MM-DD-<slug>`` event id.

    *iso_date* is a dash-separated date; the sentinel "NA" and any missing
    month/day component fall back to "XXXX" / "XX" placeholders. The slug is
    the normalized *slug_seed* cut to 50 characters, or "unlabeled" when
    nothing is left after normalization.
    """
    placeholders = ["XXXX", "XX", "XX"]
    if iso_date == "NA":
        fields = placeholders
    else:
        given = iso_date.split("-")[:3]
        # Pad whatever components are missing with their placeholders.
        fields = given + placeholders[len(given):]
    label = normalize(slug_seed)[:50].strip("-") or "unlabeled"
    return "EV-{}-{}-{}-{}".format(*fields, label)
def read_md(path: Path) -> tuple[dict, str]:
    """Split a markdown file into ``(frontmatter, body)``.

    The frontmatter is the YAML between an opening ``---`` at the very start
    of the file and the next line that consists solely of ``---``. Requiring
    the closing delimiter to sit on its own line keeps a ``---`` that occurs
    inside a value (for example a filename such as ``A---B.pdf``) from
    cutting the YAML short.

    Returns ``({}, full_text)`` when the file has no frontmatter or no
    closing delimiter. When the YAML is malformed or is not a mapping,
    returns ``({}, body)`` with the frontmatter section removed from the
    body; callers should treat an empty dict as "nothing usable".
    """
    text = path.read_text(encoding="utf-8")
    if not text.startswith("---"):
        return {}, text
    # Start after the opening delimiter; tolerate trailing spaces/tabs.
    closing = re.compile(r"\n---[ \t]*(?=\n|\Z)").search(text, 3)
    if closing is None:
        return {}, text
    body = text[closing.end():].lstrip("\n")
    try:
        loaded = yaml.safe_load(text[3:closing.start()].strip())
    except yaml.YAMLError:
        return {}, body
    # safe_load returns None for empty YAML and may return a scalar or list;
    # callers rely on dict methods, so anything else becomes an empty dict.
    return (loaded if isinstance(loaded, dict) else {}), body
def write_md(path: Path, fm: dict, body: str, dry_run: bool = False) -> bool:
    """Serialize *fm* as YAML frontmatter plus *body* and write it to *path*.

    Returns False when the file already holds exactly this content, True when
    it differs. With ``dry_run`` the comparison is still made but nothing is
    written.
    """
    front = yaml.dump(fm, allow_unicode=True, sort_keys=False, default_flow_style=False)
    # Keep exactly one blank line between the closing delimiter and the body.
    spacer = "" if body.startswith("\n") else "\n"
    rendered = f"---\n{front}---\n{spacer}{body}"
    already_current = path.exists() and path.read_text(encoding="utf-8") == rendered
    if already_current:
        return False
    if not dry_run:
        path.write_text(rendered, encoding="utf-8")
    return True
# ---------------------------------------------------------------------- main
def build_war_index(records: list[dict]) -> list[tuple[dict, str, set[str], str | None]]:
    """Precompute the match keys for every war.gov record.

    Each entry is ``(record, norm_title, sig_tokens, primary_id)``, all
    derived from the record's "title" (a missing title counts as "").
    """
    titled = ((record, record.get("title", "")) for record in records)
    return [
        (record, normalize(title), signature_tokens(title), primary_id(title))
        for record, title in titled
    ]
def match_doc_to_war(doc_norm: str, doc_sig: set[str], doc_pid: str | None, war_index: list) -> tuple[dict | None, str]:
    """Find the war.gov record that corresponds to a local document.

    *war_index* holds ``(record, norm_title, sig_tokens, primary_id)``
    tuples. Tiers are tried in order and the first hit wins:

    1. exact-norm   — normalized titles are equal
    2. primary-id   — both sides carry the same catalogue id
    3. containment  — one normalized title contains the other
                      (both at least 12 characters long)
    4. jaccard      — best signature-token overlap, accepted at >= 0.50

    Returns ``(record, reason)``, or ``(None, "no-match")``.
    """
    absent = object()

    # Tier 1
    exact = next((rec for rec, title, _sig, _pid in war_index if title == doc_norm), absent)
    if exact is not absent:
        return exact, "exact-norm"

    # Tier 2
    if doc_pid:
        same_id = next((rec for rec, _title, _sig, pid in war_index if pid and pid == doc_pid), absent)
        if same_id is not absent:
            return same_id, f"primary-id={doc_pid}"

    # Tier 3 containment — pointless when the document title itself is short.
    if len(doc_norm) >= 12:
        for rec, title, _sig, _pid in war_index:
            if len(title) < 12:
                continue
            if doc_norm in title or title in doc_norm:
                return rec, "containment"

    # Tier 4 Jaccard — keep the first record with the strictly highest score.
    top_record, top_score = None, 0.0
    for rec, _title, tokens, _pid in war_index:
        score = jaccard(doc_sig, tokens)
        if score > top_score:
            top_record, top_score = rec, score
    if top_record and top_score >= 0.50:
        return top_record, f"jaccard={top_score:.2f}"
    return None, "no-match"
def main():
    """Enrich document frontmatter with war.gov metadata; optionally rename events.

    Steps:
      1. Load one or more war.gov metadata JSON files (``--metadata-json``,
         or every ``all-documents-release-*-basic.json`` found by default).
      2. For each ``type: document`` page in DOCS_DIR, find the matching
         record and store its fields under a ``war_gov`` frontmatter block;
         fill ``document_date`` from the incident date when it is unset.
      3. With ``--rename-events``, rename ``EV-XXXX-XX-XX-…`` event pages
         referenced by a matched document to ``EV-YYYY-MM-DD-…`` and rewrite
         the wiki-links that point at them.
      4. Print a summary and append it to LOG_PATH.

    ``--dry-run`` reports what would change without writing anything.
    Exits with status 1 when a metadata JSON path does not exist.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--dry-run", action="store_true")
    ap.add_argument("--rename-events", action="store_true", help="Rename EV-XXXX events to EV-YYYY-MM-DD")
    ap.add_argument("--metadata-json", action="append", default=None,
                    help="Path to a war.gov metadata JSON. Pass multiple times to merge releases. "
                         "Defaults to release-01 + release-02 if present.")
    args = ap.parse_args()

    if args.metadata_json:
        json_paths = [Path(p) for p in args.metadata_json]
    else:
        # Default: load every release-NN-basic JSON found, so 116 existing docs
        # (release-01) and 6 new docs (release-02) all get enriched in one pass.
        json_paths = sorted((UFO_ROOT / "processing" / "war-gov-metadata").glob("all-documents-release-*-basic.json"))
        if not json_paths:
            json_paths = [METADATA_JSON]

    records: list[dict] = []
    for p in json_paths:
        if not p.exists():
            sys.stderr.write(f"Metadata JSON not found: {p}\n")
            sys.exit(1)
        d = json.loads(p.read_text(encoding="utf-8"))
        recs = d.get("documents", [])
        extracted_at = d.get("extracted_at")
        # Remember where each record came from so it can be stamped later.
        for r in recs:
            r.setdefault("_extracted_at", extracted_at)
            r.setdefault("_source_json", p.name)
        print(f"war.gov records from {p.name}: {len(recs)}")
        records.extend(recs)
    print(f"war.gov records total: {len(records)}")
    war_index = build_war_index(records)

    docs = sorted(DOCS_DIR.glob("*.md"))
    print(f"local document.md files: {len(docs)}")

    enriched = 0
    unchanged = 0
    unmatched = []
    event_renames: list[tuple[str, str]] = []  # (old_event_id, new_event_id)

    for doc_path in docs:
        fm, body = read_md(doc_path)
        if fm.get("type") != "document":
            continue
        title_candidates = [
            fm.get("canonical_title", ""),
            fm.get("original_filename", ""),
            doc_path.stem,
        ]
        doc_norm = normalize(title_candidates[0]) or normalize(title_candidates[1]) or normalize(title_candidates[2])
        doc_sig = signature_tokens(title_candidates[0]) | signature_tokens(title_candidates[1])
        doc_pid = primary_id(title_candidates[0]) or primary_id(title_candidates[1]) or primary_id(doc_path.stem)
        match, reason = match_doc_to_war(doc_norm, doc_sig, doc_pid, war_index)
        if not match:
            unmatched.append(doc_path.name)
            continue

        # Build war_gov block
        incident_iso, date_conf = parse_us_date(match.get("incident_date") or "")
        release_iso, _ = parse_us_date(match.get("release_date") or "")
        war_block = {
            "record_id": match["record_id"],
            "title_official": match.get("title"),
            "agency_official": match.get("agency"),
            "release_date_official": release_iso,
            "release_date_raw": match.get("release_date"),
            "incident_date_official": incident_iso,
            "incident_date_raw": match.get("incident_date"),
            "incident_date_confidence": date_conf,
            "incident_location_official": match.get("incident_location"),
            "document_type_official": match.get("document_type"),
            "match_reason": reason,
            "availability": "pending-upstream" if match["record_id"] in PLACEHOLDER_RECORDS else "downloaded",
            "extracted_from_war_gov_at": match.get("_extracted_at"),
        }
        new_fm = dict(fm)
        new_fm["war_gov"] = war_block
        # Promote some fields to top-level if they were "NA" or empty
        if (new_fm.get("document_date") in (None, "", "NA")) and incident_iso != "NA":
            new_fm["document_date"] = incident_iso

        if write_md(doc_path, new_fm, body, dry_run=args.dry_run):
            enriched += 1
            print(f"{doc_path.name} → {match['record_id']} ({reason})")
        else:
            unchanged += 1

        # Collect event renames whether or not this document's file changed;
        # otherwise --rename-events would do nothing on a second run, once the
        # documents are already enriched.
        if args.rename_events and incident_iso != "NA":
            # Look for events referenced in this document that start with EV-XXXX-
            key_events = (new_fm.get("key_entities") or {}).get("events") or []
            for ref in key_events:
                if not isinstance(ref, str):
                    continue
                m = re.search(r"\[\[event/(EV-XXXX-XX-XX-[a-z0-9-]+)\]\]", ref)
                if not m:
                    continue
                old = m.group(1)
                slug = old.replace("EV-XXXX-XX-XX-", "", 1)
                new_id = event_id_from_date_and_slug(incident_iso, slug)
                if new_id != old:
                    event_renames.append((old, new_id))

    # Apply event renames. dict.fromkeys de-duplicates while keeping discovery
    # order, so when one event maps to several targets the winner is the same
    # on every run (a set would iterate in arbitrary order).
    rename_count = 0
    for old, new in dict.fromkeys(event_renames):
        old_path = EVENTS_DIR / f"{old}.md"
        new_path = EVENTS_DIR / f"{new}.md"
        if not old_path.exists():
            continue
        if new_path.exists() and new_path != old_path:
            print(f" ⚠ skip rename {old} → {new} (target exists)")
            continue
        fm, body = read_md(old_path)
        if not fm:
            # read_md returns {} for missing or unparseable frontmatter;
            # rewriting such a file would replace its metadata with a lone
            # event_id, so leave it for manual repair.
            print(f" ⚠ skip rename {old} → {new} (no parseable frontmatter)")
            continue
        if args.dry_run:
            print(f" [dry] rename {old} → {new}")
            rename_count += 1
            continue

        # Update event_id field, write to new path, delete old
        fm["event_id"] = new
        # Update date_start/date_end if currently NA
        parts = new.split("-")
        if len(parts) >= 4:
            y, mo, d = parts[1], parts[2], parts[3]
            if y != "XXXX" and (fm.get("date_start") in (None, "NA")):
                if mo != "XX" and d != "XX":
                    fm["date_start"] = f"{y}-{mo}-{d}"
                    fm["date_end"] = fm.get("date_end") or f"{y}-{mo}-{d}"
                    fm["date_confidence"] = "high"
                elif mo != "XX":
                    fm["date_start"] = f"{y}-{mo}"
        write_md(new_path, fm, body)
        old_path.unlink()
        rename_count += 1

        # Update all wiki-links pointing to the old event_id everywhere.
        # The listing is materialized first because files are rewritten
        # while the tree is being walked.
        for f in list(UFO_ROOT.rglob("*.md")):
            if "/processing/" in str(f) or f == new_path:
                continue
            c = f.read_text(encoding="utf-8")
            if old not in c:
                continue
            c2 = c.replace(f"[[event/{old}]]", f"[[event/{new}]]")
            if c2 != c:
                f.write_text(c2, encoding="utf-8")
        print(f" ↺ renamed {old} → {new}")

    # Log
    print(f"\nEnriched: {enriched}, unchanged: {unchanged}, unmatched: {len(unmatched)}, event renames: {rename_count}")
    if unmatched:
        print("Unmatched docs (no war.gov record found):")
        for n in unmatched[:20]:
            print(f" - {n}")
        if len(unmatched) > 20:
            print(f" … and {len(unmatched) - 20} more")

    # Record the run whenever it changed something on disk, including runs
    # that only renamed events.
    if not args.dry_run and (enriched > 0 or rename_count > 0):
        with open(LOG_PATH, "a", encoding="utf-8") as fh:
            fh.write(
                f"\n## {datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')} — ENRICH WAR.GOV (Phase 0.5)\n"
                f"- operator: archivist\n- script: scripts/02b-enrich-with-web-metadata.py\n"
                f"- json_source: {', '.join(p.name for p in json_paths)}\n"
                f"- enriched: {enriched}\n- unchanged: {unchanged}\n- unmatched: {len(unmatched)}\n"
                f"- event_renames: {rename_count}\n"
            )
# Run the enrichment only when executed as a script, not when imported.
if __name__ == "__main__":
    main()