W0 — security hardening (5 fixes verified live on disclosure.top)
- middleware: gate /api/admin/* same as /admin/* (F1)
- imgproxy: tighten LOCAL_FILESYSTEM_ROOT from / to /var/lib/storage (F2)
- studio: real basic-auth label (bcrypt hash, middleware reference) (F3)
- relations: ENABLE ROW LEVEL SECURITY + public SELECT policy (F4)
- migration 0003: fold is_searchable + hybrid_search update into canonical (TD#2)
W1 — observability + resilience + autocomplete
- studio: HOSTNAME=0.0.0.0 so Next.js listens on all interfaces (incl. loopback) and the healthcheck can reach it
- compose: PG_POOL_MAX=20, CLAUDE_CODE_OAUTH_TOKEN gated by separate env
- claude-code.ts: subprocess timeout configurable (CLAUDE_CODE_TIMEOUT_MS)
- openrouter.ts: retry with exponential backoff + Retry-After + in-memory
circuit breaker (promotes FALLBACK after CB_THRESHOLD failures)
- lib/logger.ts: pino logger (NDJSON prod / pretty dev) + withRequest helper
- middleware: mints correlation_id, stamps x-correlation-id response header,
emits structured http_request log per /api/* call
- messages/route.ts: switch to structured logger
- 60_meili_index.py: push documents + chunks into Meilisearch
- /api/search/autocomplete: parallel meili search (docs + chunks), 5-8ms p50
- search-autocomplete.tsx: debounced dropdown wired into search-panel
W1.2 — Glitchtip + Forgejo self-hosted
- compose: glitchtip-redis + glitchtip-web + glitchtip-worker (v4.2)
- compose: forgejo + forgejo-runner (server v9, runner v6) with group_add=988
- @sentry/nextjs SDK wired (instrumentation.ts + sentry.{client,server}.config.ts)
- /api/admin/throw smoke endpoint (gated by W0-F1 middleware)
- Synthetic event ingestion verified at glitchtip.disclosure.top
- forgejo.disclosure.top up, repo discadmin/disclosure-bureau created,
runner registered (labels: ubuntu-latest, docker)
- .forgejo/workflows/ci.yml: typecheck + lint + build + npm audit + python
syntax + compose validation
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
383 lines
15 KiB
Python
Executable file
383 lines
15 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
02b-enrich-with-web-metadata.py — Phase 0.5
|
|
|
|
Injects the war.gov-extracted metadata (record_id, incident_date,
|
|
incident_location, agency, etc.) into each wiki/documents/<doc-id>.md
|
|
frontmatter. Also marks the 4 placeholder records as `availability:
|
|
pending-upstream`.
|
|
|
|
For each document.md we already created from a local PDF, we find the
matching war.gov record using a tiered matcher in the style of
00b-coverage; the first tier that hits wins:
1. exact-norm
2. primary-id (DOW-UAP-D74, DOS-UAP-D1, etc.)
3. containment (one normalized title contains the other, both ≥12 chars)
4. Jaccard ≥0.5 on signature tokens
|
|
|
|
The matched record's fields are added under a `war_gov` block in the
|
|
frontmatter (non-destructive — never overwrites existing manual data).
|
|
|
|
If `--rename-events` is passed, events file `EV-XXXX-XX-XX-…` are renamed
|
|
to `EV-YYYY-MM-DD-…` based on the matched document's incident_date.
|
|
The script updates all wiki-link references to the renamed event ids.
|
|
|
|
Usage:
|
|
./02b-enrich-with-web-metadata.py [--dry-run] [--rename-events] [--metadata-json PATH ...]
|
|
"""
|
|
from __future__ import annotations

import argparse
import json
import os
import re
import sys
import unicodedata
from datetime import datetime, timezone
from pathlib import Path
|
|
|
|
try:
|
|
import yaml
|
|
except ImportError:
|
|
sys.stderr.write("Missing pyyaml. Run: pip3 install pyyaml\n")
|
|
sys.exit(1)
|
|
|
|
# Root of the wiki checkout. Defaults to the original hard-coded location; set
# the UFO_ROOT environment variable to run the script against another checkout
# (an empty value falls back to the default).
UFO_ROOT = Path(os.environ.get("UFO_ROOT") or "/Users/guto/ufo")
DOCS_DIR = UFO_ROOT / "wiki" / "documents"
EVENTS_DIR = UFO_ROOT / "wiki" / "entities" / "events"
LOG_PATH = UFO_ROOT / "wiki" / "log.md"
METADATA_JSON = UFO_ROOT / "processing" / "war-gov-metadata" / "all-documents-release-01-basic.json"

# Records whose Download serves a placeholder file (verified 2026-05-13)
PLACEHOLDER_RECORDS = {"record-140", "record-154", "record-155", "record-156"}

# Tokens dropped from a normalized title before computing signature tokens:
# generic words, agency names, month names and place names that appear in this
# list are treated as non-distinctive.
COMMON = {
    "mission", "report", "uap", "the", "of", "and", "a", "in", "on", "for",
    "with", "to", "from", "department", "war", "fbi", "nasa", "state",
    "unresolved", "debrief", "summary", "transcript", "crew", "general",
    "vol", "incident", "summaries", "photo", "video", "cable", "email",
    "correspondence", "correspondance", "launch", "range", "fouler",
    "force", "air", "navy", "between", "or", "year", "month",
    "january", "february", "march", "april", "may", "june", "july",
    "august", "september", "october", "november", "december", "redacted",
    "sub", "sighting", "about", "kuwait", "kazakhstan", "papua", "guinea",
    "syria", "iraq", "iran", "yemen", "djibouti", "japan", "greece",
    "mexico", "germany", "turkey", "turkmenistan", "georgia", "tbilisi",
    "indopacom", "middle", "east", "africa", "europe", "western", "united",
    "states", "north", "south", "america",
}
|
|
|
|
|
|
def normalize(s: str) -> str:
    """Return a lowercase, dash-separated slug of *s* for title comparison.

    Steps: strip combining marks after NFKD decomposition, lowercase, drop
    apostrophes and square brackets, turn every run of characters outside
    ``[a-z0-9]`` into a single dash, trim dashes at both ends, then remove
    leading zeros from digit runs that directly follow a letter or a dash
    (``d001`` -> ``d1``). A falsy input yields ``""``.
    """
    if not s:
        return ""

    decomposed = unicodedata.normalize("NFKD", s)
    without_marks = "".join(ch for ch in decomposed if not unicodedata.combining(ch))

    # Apostrophes and brackets vanish (so "o'neil" -> "oneil"); commas become
    # separators like any other punctuation.
    cleaned = without_marks.lower().translate(
        str.maketrans({"'": None, ",": "-", "[": None, "]": None})
    )
    slug = re.sub(r"-+", "-", re.sub(r"[^a-z0-9]+", "-", cleaned)).strip("-")

    # Strip leading zeros until nothing changes any more.
    while True:
        stripped = re.sub(r"(?<=[a-z-])0+(\d)", r"\1", slug)
        if stripped == slug:
            return slug
        slug = stripped
|
|
|
|
|
|
def signature_tokens(s: str) -> set[str]:
    """Return the non-empty tokens of ``normalize(s)`` that are not in COMMON."""
    tokens = set(normalize(s).split("-"))
    tokens.discard("")  # normalize("") splits into a single empty token
    return tokens - COMMON
|
|
|
|
|
|
def jaccard(a: set, b: set) -> float:
    """Return the Jaccard similarity |a ∩ b| / |a ∪ b| of two sets.

    Returns 0.0 when either set is empty, which also avoids dividing by zero.
    """
    if not a or not b:
        return 0.0
    return len(a & b) / len(a | b)
|
|
|
|
|
|
def primary_id(s: str) -> str | None:
    """Extract the leading record identifier from a title, or ``None``.

    The title is normalized first, so the returned id is lowercase and
    dash-separated, with leading zeros removed by ``normalize``
    (e.g. "CIA-UAP-D001 ..." -> "cia-uap-d1"). Only an identifier at the very
    start of the normalized title is recognised.
    """
    norm = normalize(s)
    patterns = (
        # <agency>-uap-<1-4 letters><digits>[<letter>], e.g. dow-uap-d17
        r"^((?:cia|doe|dod|dow|dos|odni|nasa|fbi)-uap-[a-z]{1,4}\d+[a-z]?)",
        # dedicated FBI photo series, e.g. fbi-photo-a1
        r"^(fbi-photo-[a-z]\d+)",
    )
    for pattern in patterns:
        hit = re.match(pattern, norm)
        if hit:
            return hit.group(1)
    return None
|
|
|
|
|
|
def parse_us_date(s: str) -> tuple[str, str]:
    """Parse a US-format date string into ``(iso_date, confidence)``.

    Recognised inputs:

    * ``M/D/YY`` or ``M/D/YYYY`` -> ``('YYYY-MM-DD', 'high')``. Two-digit
      years <= 30 map to 20xx, all others to 19xx ('12/30/47' -> 1947-12-30,
      '11/9/23' -> 2023-11-09).
    * A range such as '4/10/2025-4/11/2025' -> the first date, confidence
      ``'medium'``.
    * A bare year '1969' -> ``('1969-XX-XX', 'medium')``.
    * 'LATE 2025' (case-insensitive) -> ``('2025-12-XX', 'low')``.
    * Empty / ``None`` / 'N/A' / 'NA' / 'NULL' -> ``('NA', 'none')``.

    Anything else returns ``('NA', 'speculation')``. That includes dates that
    look like M/D/Y but do not exist on the calendar ('13/45/2023',
    '2/30/2024') and three-digit years, so an impossible date is never
    reported with 'high' confidence.
    """
    if not s or not s.strip() or s.strip().upper() in ("N/A", "NA", "NULL"):
        return ("NA", "none")
    s = s.strip()

    # Range: parse only the part before the first dash. The recursion ends
    # because that part no longer contains a dash.
    head, dash, _rest = s.partition("-")
    if dash and any(c.isdigit() for c in head):
        iso, _conf = parse_us_date(head.strip())
        if iso != "NA":
            return (iso, "medium")

    # Fuzzy patterns
    late = re.match(r"^late\s+(\d{4})$", s, re.I)
    if late:
        return (f"{late.group(1)}-12-XX", "low")
    if re.match(r"^\d{4}$", s):
        return (f"{s}-XX-XX", "medium")

    # M/D/YY or M/D/YYYY (exactly two or four year digits)
    m = re.match(r"^(\d{1,2})/(\d{1,2})/(\d{2}|\d{4})$", s)
    if m:
        month, day, year = (int(g) for g in m.groups())
        if len(m.group(3)) == 2:
            year += 2000 if year <= 30 else 1900
        try:
            # Constructing a datetime validates month/day/year ranges,
            # including month lengths and leap years.
            datetime(year, month, day)
        except ValueError:
            return ("NA", "speculation")
        return (f"{year:04d}-{month:02d}-{day:02d}", "high")

    return ("NA", "speculation")
|
|
|
|
|
|
def event_id_from_date_and_slug(iso_date: str, slug_seed: str) -> str:
    """Build an ``EV-<year>-<month>-<day>-<slug>`` event id.

    ``iso_date`` is split on dashes; a missing month or day becomes ``XX``,
    and the sentinel ``"NA"`` yields ``XXXX-XX-XX``. The slug is
    ``normalize(slug_seed)`` cut to 50 characters with edge dashes trimmed,
    or ``"unlabeled"`` when that is empty.
    """
    if iso_date == "NA":
        year, month, day = "XXXX", "XX", "XX"
    else:
        # Pad so a year-only or year-month value still unpacks into three fields.
        year, month, day = (iso_date.split("-") + ["XX", "XX"])[:3]

    slug = normalize(slug_seed)[:50].strip("-") or "unlabeled"
    return "-".join(("EV", year, month, day, slug))
|
|
|
|
|
|
def read_md(path: Path) -> tuple[dict, str]:
    """Read a markdown file and split it into ``(frontmatter, body)``.

    The file must start with ``---``; the frontmatter ends at the next line
    that consists solely of ``---`` (trailing spaces/tabs allowed). Requiring
    the closing delimiter to be a line of its own keeps a ``---`` that appears
    inside a YAML value (for example in a title) from truncating the
    frontmatter.

    Returns:
        ``({}, full_text)`` when the file has no frontmatter or no closing
        delimiter. ``({}, body)`` when the YAML fails to parse or is not a
        mapping. Otherwise ``(mapping, body)``. ``body`` has its leading
        newlines stripped.
    """
    text = path.read_text(encoding="utf-8")
    if not text.startswith("---"):
        return {}, text

    # Start at offset 3 so the opening delimiter itself is never matched; in
    # MULTILINE mode '^' still only matches right after a newline.
    closing = re.compile(r"^---[ \t]*$", re.MULTILINE).search(text, 3)
    if closing is None:
        return {}, text

    body = text[closing.end():].lstrip("\n")
    try:
        loaded = yaml.safe_load(text[3:closing.start()].strip())
    except yaml.YAMLError:
        return {}, body
    # Callers use dict methods on the result, so a scalar or list document
    # (and an empty one, which loads as None) is treated as "no frontmatter".
    return (loaded if isinstance(loaded, dict) else {}), body
|
|
|
|
|
|
def write_md(path: Path, fm: dict, body: str, dry_run: bool = False) -> bool:
    """Render frontmatter + body and write the file if its content would change.

    The frontmatter is dumped as block-style YAML in insertion order and
    separated from the body by one blank line (no extra newline is added when
    ``body`` already starts with one).

    Returns:
        ``False`` when the file already holds exactly the rendered text,
        ``True`` otherwise. With ``dry_run`` the file is left untouched but the
        return value still reports whether it would have been written.
    """
    front = yaml.dump(fm, allow_unicode=True, sort_keys=False, default_flow_style=False)
    separator = "" if body.startswith("\n") else "\n"
    rendered = f"---\n{front}---\n{separator}{body}"

    already_current = path.exists() and path.read_text(encoding="utf-8") == rendered
    if already_current:
        return False
    if not dry_run:
        path.write_text(rendered, encoding="utf-8")
    return True
|
|
|
|
|
|
# ---------------------------------------------------------------------- main
|
|
|
|
|
|
def build_war_index(records: list[dict]) -> list[tuple[dict, str, set[str], str | None]]:
    """Precompute the matching keys of every war.gov record.

    Returns one ``(record, norm_title, sig_tokens, primary_id)`` tuple per
    input record, in input order, all derived from the record's ``title``
    (a missing title is treated as an empty string).
    """

    def _entry(record: dict) -> tuple[dict, str, set[str], str | None]:
        title = record.get("title", "")
        return (record, normalize(title), signature_tokens(title), primary_id(title))

    return [_entry(record) for record in records]
|
|
|
|
|
|
def match_doc_to_war(doc_norm: str, doc_sig: set[str], doc_pid: str | None, war_index: list) -> tuple[dict | None, str]:
    """Find the war.gov record that corresponds to a local document.

    Args:
        doc_norm: normalized document title.
        doc_sig: signature tokens of the document title(s).
        doc_pid: primary id extracted from the document, if any.
        war_index: ``(record, norm_title, sig_tokens, primary_id)`` tuples.

    Returns:
        ``(record, reason)`` for the first tier that hits, else
        ``(None, "no-match")``. Tiers, in order:

        1. ``exact-norm``  - identical normalized titles.
        2. ``primary-id=`` - identical primary ids.
        3. ``containment`` - one normalized title contains the other and both
           are at least 12 characters long.
        4. ``jaccard=``    - best Jaccard score over signature tokens, if it
           reaches 0.50 (the first record wins a tie).
    """
    # Tier 1: exact normalized title. An empty title carries no information,
    # so it must not "exactly match" a record whose title is also empty.
    if doc_norm:
        for record, war_norm, _war_sig, _war_pid in war_index:
            if war_norm == doc_norm:
                return record, "exact-norm"

    # Tier 2: primary id
    if doc_pid:
        for record, _war_norm, _war_sig, war_pid in war_index:
            if war_pid == doc_pid:
                return record, f"primary-id={doc_pid}"

    # Tier 3: containment
    if len(doc_norm) >= 12:
        for record, war_norm, _war_sig, _war_pid in war_index:
            if len(war_norm) >= 12 and (doc_norm in war_norm or war_norm in doc_norm):
                return record, "containment"

    # Tier 4: Jaccard. With no signature tokens every score is 0.0, so the
    # scan is skipped.
    best, best_j = None, 0.0
    if doc_sig:
        for record, _war_norm, war_sig, _war_pid in war_index:
            score = jaccard(doc_sig, war_sig)
            if score > best_j:
                best_j, best = score, record
    if best is not None and best_j >= 0.50:
        return best, f"jaccard={best_j:.2f}"
    return None, "no-match"
|
|
|
|
|
|
def _load_war_records(json_paths: list[Path]) -> list[dict]:
    """Load and merge the ``documents`` arrays of the given metadata JSONs.

    Each record is tagged with ``_extracted_at`` and ``_source_json`` unless
    it already carries those keys. Exits with status 1 if a path is missing.
    """
    records: list[dict] = []
    for p in json_paths:
        if not p.exists():
            sys.stderr.write(f"Metadata JSON not found: {p}\n")
            sys.exit(1)
        d = json.loads(p.read_text(encoding="utf-8"))
        recs = d.get("documents", [])
        extracted_at = d.get("extracted_at")
        for r in recs:
            r.setdefault("_extracted_at", extracted_at)
            r.setdefault("_source_json", p.name)
        print(f"war.gov records from {p.name}: {len(recs)}")
        records.extend(recs)
    print(f"war.gov records total: {len(records)}")
    return records


def _build_war_block(match: dict, reason: str, incident_iso: str, date_conf: str) -> dict:
    """Build the ``war_gov`` frontmatter block for a matched record."""
    release_iso, _ = parse_us_date(str(match.get("release_date") or ""))
    return {
        "record_id": match["record_id"],
        "title_official": match.get("title"),
        "agency_official": match.get("agency"),
        "release_date_official": release_iso,
        "release_date_raw": match.get("release_date"),
        "incident_date_official": incident_iso,
        "incident_date_raw": match.get("incident_date"),
        "incident_date_confidence": date_conf,
        "incident_location_official": match.get("incident_location"),
        "document_type_official": match.get("document_type"),
        "match_reason": reason,
        "availability": "pending-upstream" if match["record_id"] in PLACEHOLDER_RECORDS else "downloaded",
        "extracted_from_war_gov_at": match.get("_extracted_at"),
    }


def _collect_event_renames(fm: dict, incident_iso: str) -> list[tuple[str, str]]:
    """Return ``(old_id, new_id)`` for every undated event linked from *fm*.

    Only ``[[event/EV-XXXX-XX-XX-...]]`` links under ``key_entities.events``
    are considered; the new id is built from *incident_iso* and the old slug.
    """
    renames = []
    key_events = (fm.get("key_entities") or {}).get("events") or []
    for ref in key_events:
        if not isinstance(ref, str):
            continue
        m = re.search(r"\[\[event/(EV-XXXX-XX-XX-[a-z0-9-]+)\]\]", ref)
        if not m:
            continue
        old = m.group(1)
        slug = old.replace("EV-XXXX-XX-XX-", "", 1)
        new_id = event_id_from_date_and_slug(incident_iso, slug)
        if new_id != old:
            renames.append((old, new_id))
    return renames


def _rewrite_event_links(old: str, new: str, skip: Path) -> None:
    """Point every ``[[event/<old>]]`` wiki-link under UFO_ROOT at *new*.

    Files below a ``processing`` directory and the file *skip* are left alone.
    """
    old_link, new_link = f"[[event/{old}]]", f"[[event/{new}]]"
    for f in UFO_ROOT.rglob("*.md"):
        if "/processing/" in str(f) or f == skip:
            continue
        c = f.read_text(encoding="utf-8")
        if old_link in c:
            f.write_text(c.replace(old_link, new_link), encoding="utf-8")


def _apply_event_rename(old: str, new: str, date_conf: str, dry_run: bool) -> bool:
    """Rename one event file and update its frontmatter and inbound links.

    Returns ``True`` when the rename was performed (or, in dry-run mode, would
    be), ``False`` when it was skipped.
    """
    old_path = EVENTS_DIR / f"{old}.md"
    new_path = EVENTS_DIR / f"{new}.md"
    if not old_path.exists():
        return False
    if new_path.exists() and new_path != old_path:
        print(f" ⚠ skip rename {old} → {new} (target exists)")
        return False

    fm, body = read_md(old_path)
    if not fm:
        # read_md yields {} for missing or unparseable frontmatter. Rewriting
        # the file from that would drop all of its metadata, and the original
        # is deleted afterwards, so the rename is refused instead.
        print(f" ⚠ skip rename {old} → {new} (unreadable frontmatter)")
        return False
    if dry_run:
        print(f" [dry] rename {old} → {new}")
        return True

    fm["event_id"] = new
    # Fill date_start/date_end from the new id if they are currently unset/NA.
    parts = new.split("-")
    if len(parts) >= 4:
        y, mo, d = parts[1], parts[2], parts[3]
        if y != "XXXX" and fm.get("date_start") in (None, "NA"):
            if mo != "XX" and d != "XX":
                fm["date_start"] = f"{y}-{mo}-{d}"
                if fm.get("date_end") in (None, "", "NA"):
                    fm["date_end"] = f"{y}-{mo}-{d}"
                # Carry over the confidence of the parsed incident date: a
                # date taken from a range is only "medium", not "high".
                fm["date_confidence"] = date_conf
            elif mo != "XX":
                fm["date_start"] = f"{y}-{mo}"

    write_md(new_path, fm, body)
    old_path.unlink()
    _rewrite_event_links(old, new, skip=new_path)
    print(f" ↺ renamed {old} → {new}")
    return True


def main():
    """Enrich document frontmatter with war.gov metadata; optionally rename events.

    For every ``type: document`` file in DOCS_DIR the best matching war.gov
    record is written to a ``war_gov`` frontmatter block, and an empty or "NA"
    ``document_date`` is filled from the record's incident date. With
    ``--rename-events``, undated events linked from matched documents are
    renamed after the incident date. A summary is printed, and appended to
    LOG_PATH unless this is a dry run or nothing changed.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--dry-run", action="store_true")
    ap.add_argument("--rename-events", action="store_true", help="Rename EV-XXXX events to EV-YYYY-MM-DD")
    ap.add_argument("--metadata-json", action="append", default=None,
                    help="Path to a war.gov metadata JSON. Pass multiple times to merge releases. "
                         "Defaults to every all-documents-release-*-basic.json found.")
    args = ap.parse_args()

    if args.metadata_json:
        json_paths = [Path(p) for p in args.metadata_json]
    else:
        # Default: load every release-NN-basic JSON found, so documents from
        # all releases get enriched in one pass.
        json_paths = sorted(METADATA_JSON.parent.glob("all-documents-release-*-basic.json"))
        if not json_paths:
            json_paths = [METADATA_JSON]

    war_index = build_war_index(_load_war_records(json_paths))
    docs = sorted(DOCS_DIR.glob("*.md"))
    print(f"local document.md files: {len(docs)}")

    enriched = 0
    unchanged = 0
    unmatched = []
    # (old_event_id, new_event_id) -> confidence of the incident date. A dict
    # keeps insertion order, so renames run in a deterministic order.
    event_renames: dict[tuple[str, str], str] = {}

    for doc_path in docs:
        fm, body = read_md(doc_path)
        if fm.get("type") != "document":
            continue
        canonical = fm.get("canonical_title", "")
        original_name = fm.get("original_filename", "")
        doc_norm = normalize(canonical) or normalize(original_name) or normalize(doc_path.stem)
        doc_sig = signature_tokens(canonical) | signature_tokens(original_name)
        doc_pid = primary_id(canonical) or primary_id(original_name) or primary_id(doc_path.stem)

        match, reason = match_doc_to_war(doc_norm, doc_sig, doc_pid, war_index)
        if not match:
            unmatched.append(doc_path.name)
            continue

        incident_iso, date_conf = parse_us_date(str(match.get("incident_date") or ""))
        new_fm = dict(fm)
        new_fm["war_gov"] = _build_war_block(match, reason, incident_iso, date_conf)
        # Promote the incident date to top level only if no date was set.
        if new_fm.get("document_date") in (None, "", "NA") and incident_iso != "NA":
            new_fm["document_date"] = incident_iso

        if write_md(doc_path, new_fm, body, dry_run=args.dry_run):
            enriched += 1
            print(f" ✓ {doc_path.name} ← {match['record_id']} ({reason})")
        else:
            unchanged += 1

        # Collected for unchanged documents too, so --rename-events still
        # works on a wiki that was already enriched by an earlier run.
        if args.rename_events and incident_iso != "NA":
            for pair in _collect_event_renames(new_fm, incident_iso):
                event_renames.setdefault(pair, date_conf)

    # Apply event renames
    rename_count = 0
    for (old, new), date_conf in event_renames.items():
        if _apply_event_rename(old, new, date_conf, args.dry_run):
            rename_count += 1

    # Log
    print(f"\nEnriched: {enriched}, unchanged: {unchanged}, unmatched: {len(unmatched)}, event renames: {rename_count}")
    if unmatched:
        print("Unmatched docs (no war.gov record found):")
        for n in unmatched[:20]:
            print(f" - {n}")
        if len(unmatched) > 20:
            print(f" … and {len(unmatched) - 20} more")
    if not args.dry_run and (enriched > 0 or rename_count > 0):
        with open(LOG_PATH, "a", encoding="utf-8") as fh:
            fh.write(
                f"\n## {datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')} — ENRICH WAR.GOV (Phase 0.5)\n"
                f"- operator: archivist\n- script: scripts/02b-enrich-with-web-metadata.py\n"
                f"- json_source: {', '.join(p.name for p in json_paths)}\n"
                f"- enriched: {enriched}\n- unchanged: {unchanged}\n- unmatched: {len(unmatched)}\n"
                f"- event_renames: {rename_count}\n"
            )


if __name__ == "__main__":
    main()
|