#!/usr/bin/env python3 """ 02b-enrich-with-web-metadata.py — Phase 0.5 Injects the war.gov-extracted metadata (record_id, incident_date, incident_location, agency, etc.) into each wiki/documents/.md frontmatter. Also marks the 4 placeholder records as `availability: pending-upstream`. For each document.md we already created from a local PDF, we find the matching war.gov record using the same 3-tier matcher as 00b-coverage: 1. exact-norm 2. primary-id (DOW-UAP-D74, DOS-UAP-D1, etc.) 3. Jaccard ≥0.5 on signature tokens The matched record's fields are added under a `war_gov` block in the frontmatter (non-destructive — never overwrites existing manual data). If `--rename-events` is passed, events file `EV-XXXX-XX-XX-…` are renamed to `EV-YYYY-MM-DD-…` based on the matched document's incident_date. The script updates all wiki-link references to the renamed event ids. Usage: ./02b-enrich-with-web-metadata.py [--dry-run] [--rename-events] """ from __future__ import annotations import argparse import json import re import sys import unicodedata from datetime import datetime, timezone from pathlib import Path try: import yaml except ImportError: sys.stderr.write("Missing pyyaml. Run: pip3 install pyyaml\n") sys.exit(1) UFO_ROOT = Path("/Users/guto/ufo") DOCS_DIR = UFO_ROOT / "wiki" / "documents" EVENTS_DIR = UFO_ROOT / "wiki" / "entities" / "events" LOG_PATH = UFO_ROOT / "wiki" / "log.md" METADATA_JSON = UFO_ROOT / "processing" / "war-gov-metadata" / "all-documents-release-01-basic.json" # Records whose Download serves a placeholder file (verified 2026-05-13) PLACEHOLDER_RECORDS = {"record-140", "record-154", "record-155", "record-156"} COMMON = { "mission", "report", "uap", "the", "of", "and", "a", "in", "on", "for", "with", "to", "from", "department", "war", "fbi", "nasa", "state", "unresolved", "debrief", "summary", "transcript", "crew", "general", "vol", "incident", "summaries", "photo", "video", "cable", "email", "correspondence", "correspondance", "launch", "range", "fouler", "force", "air", "navy", "between", "or", "year", "month", "january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december", "redacted", "sub", "sighting", "about", "kuwait", "kazakhstan", "papua", "guinea", "syria", "iraq", "iran", "yemen", "djibouti", "japan", "greece", "mexico", "germany", "turkey", "turkmenistan", "georgia", "tbilisi", "indopacom", "middle", "east", "africa", "europe", "western", "united", "states", "north", "south", "america", } def normalize(s: str) -> str: if not s: return "" nfkd = unicodedata.normalize("NFKD", s) ascii_s = "".join(c for c in nfkd if not unicodedata.combining(c)) lower = ascii_s.lower().replace("'", "").replace(",", "-").replace("[", "").replace("]", "") replaced = re.sub(r"[^a-z0-9]+", "-", lower) norm = re.sub(r"-+", "-", replaced).strip("-") prev = None while prev != norm: prev = norm norm = re.sub(r"(?<=[a-z-])0+(\d)", r"\1", norm) return norm def signature_tokens(s: str) -> set[str]: return {t for t in normalize(s).split("-") if t and t not in COMMON} def jaccard(a: set, b: set) -> float: return len(a & b) / len(a | b) if a and b else 0.0 def primary_id(s: str) -> str | None: n = normalize(s) for p in ( r"^(dow-uap-[a-z]{1,4}\d+)", r"^(dos-uap-d\d+)", r"^(nasa-uap-[a-z]{1,3}\d+[a-z]?)", r"^(fbi-photo-[a-z]\d+)", ): m = re.match(p, n) if m: return m.group(1) return None def parse_us_date(s: str) -> tuple[str, str]: """Parse a US-format date like '12/30/47' or '11/9/23' into (iso_date, confidence). Year handling: 2-digit years <=30 → 20xx, else 19xx. Returns (iso, confidence_band) e.g. ('1947-12-30','high'). Special cases: 'N/A' → ('NA','none'), 'LATE 2025' → ('2025-12-XX','low'). Year-only '1969' → ('1969-XX-XX','medium'). Range '4/10/2025-4/11/2025' → first date with confidence medium. """ if not s or s.strip() == "" or s.strip().upper() in ("N/A", "NA", "NULL"): return ("NA", "none") s = s.strip() # Take first half of range if "-" in s and any(c.isdigit() for c in s.split("-")[0]): first = s.split("-")[0].strip() # Try parsing the first half iso, conf = parse_us_date(first) if iso != "NA": return (iso, "medium") # Fuzzy patterns if re.match(r"^late\s+\d{4}$", s, re.I): y = re.search(r"\d{4}", s).group(0) return (f"{y}-12-XX", "low") if re.match(r"^\d{4}$", s): return (f"{s}-XX-XX", "medium") # M/D/YY or M/D/YYYY m = re.match(r"^(\d{1,2})/(\d{1,2})/(\d{2,4})$", s) if m: mo, d, y = m.groups() y_int = int(y) if len(y) == 2: y_int = 2000 + y_int if y_int <= 30 else 1900 + y_int iso = f"{y_int:04d}-{int(mo):02d}-{int(d):02d}" return (iso, "high") return ("NA", "speculation") def event_id_from_date_and_slug(iso_date: str, slug_seed: str) -> str: """Build EV-YYYY-MM-DD- id.""" if iso_date == "NA": y, mo, d = "XXXX", "XX", "XX" else: parts = iso_date.split("-") y = parts[0] if len(parts) > 0 else "XXXX" mo = parts[1] if len(parts) > 1 else "XX" d = parts[2] if len(parts) > 2 else "XX" slug = normalize(slug_seed)[:50].strip("-") or "unlabeled" return f"EV-{y}-{mo}-{d}-{slug}" def read_md(path: Path) -> tuple[dict, str]: c = path.read_text(encoding="utf-8") if not c.startswith("---"): return {}, c end = c.find("---", 4) if end == -1: return {}, c try: return (yaml.safe_load(c[3:end].strip()) or {}), c[end + 3 :].lstrip("\n") except yaml.YAMLError: return {}, c[end + 3 :].lstrip("\n") def write_md(path: Path, fm: dict, body: str, dry_run: bool = False) -> bool: yaml_str = yaml.dump(fm, allow_unicode=True, sort_keys=False, default_flow_style=False) new = f"---\n{yaml_str}---\n\n{body}" if not body.startswith("\n") else f"---\n{yaml_str}---\n{body}" if path.exists() and path.read_text(encoding="utf-8") == new: return False if dry_run: return True path.write_text(new, encoding="utf-8") return True # ---------------------------------------------------------------------- main def build_war_index(records: list[dict]) -> list[tuple[dict, str, set[str], str | None]]: """Return list of (record, norm_title, sig_tokens, primary_id).""" out = [] for r in records: t = r.get("title", "") out.append((r, normalize(t), signature_tokens(t), primary_id(t))) return out def match_doc_to_war(doc_norm: str, doc_sig: set[str], doc_pid: str | None, war_index: list) -> tuple[dict | None, str]: # Tier 1 for r, wnorm, _wsig, _wpid in war_index: if wnorm == doc_norm: return r, "exact-norm" # Tier 2 if doc_pid: for r, _wnorm, _wsig, wpid in war_index: if wpid and wpid == doc_pid: return r, f"primary-id={doc_pid}" # Tier 3 containment for r, wnorm, _wsig, _wpid in war_index: if len(doc_norm) >= 12 and len(wnorm) >= 12 and (doc_norm in wnorm or wnorm in doc_norm): return r, "containment" # Tier 4 Jaccard best, best_j = None, 0.0 for r, _wnorm, wsig, _wpid in war_index: j = jaccard(doc_sig, wsig) if j > best_j: best_j = j; best = r if best and best_j >= 0.50: return best, f"jaccard={best_j:.2f}" return None, "no-match" def main(): ap = argparse.ArgumentParser() ap.add_argument("--dry-run", action="store_true") ap.add_argument("--rename-events", action="store_true", help="Rename EV-XXXX events to EV-YYYY-MM-DD") args = ap.parse_args() if not METADATA_JSON.exists(): sys.stderr.write(f"Metadata JSON not found: {METADATA_JSON}\n") sys.exit(1) data = json.loads(METADATA_JSON.read_text(encoding="utf-8")) records = data.get("documents", []) print(f"war.gov records: {len(records)}") war_index = build_war_index(records) docs = sorted(DOCS_DIR.glob("*.md")) print(f"local document.md files: {len(docs)}") enriched = 0 unchanged = 0 unmatched = [] event_renames: list[tuple[str, str]] = [] # (old_event_id, new_event_id) for doc_path in docs: fm, body = read_md(doc_path) if fm.get("type") != "document": continue title_candidates = [ fm.get("canonical_title", ""), fm.get("original_filename", ""), doc_path.stem, ] doc_norm = normalize(title_candidates[0]) or normalize(title_candidates[1]) or normalize(title_candidates[2]) doc_sig = signature_tokens(title_candidates[0]) | signature_tokens(title_candidates[1]) doc_pid = primary_id(title_candidates[0]) or primary_id(title_candidates[1]) or primary_id(doc_path.stem) match, reason = match_doc_to_war(doc_norm, doc_sig, doc_pid, war_index) if not match: unmatched.append(doc_path.name) continue # Build war_gov block incident_iso, date_conf = parse_us_date(match.get("incident_date") or "") release_iso, _ = parse_us_date(match.get("release_date") or "") war_block = { "record_id": match["record_id"], "title_official": match.get("title"), "agency_official": match.get("agency"), "release_date_official": release_iso, "release_date_raw": match.get("release_date"), "incident_date_official": incident_iso, "incident_date_raw": match.get("incident_date"), "incident_date_confidence": date_conf, "incident_location_official": match.get("incident_location"), "document_type_official": match.get("document_type"), "match_reason": reason, "availability": "pending-upstream" if match["record_id"] in PLACEHOLDER_RECORDS else "downloaded", "extracted_from_war_gov_at": data.get("extracted_at"), } new_fm = dict(fm) new_fm["war_gov"] = war_block # Promote some fields to top-level if they were "NA" or empty if (new_fm.get("document_date") in (None, "", "NA")) and incident_iso != "NA": new_fm["document_date"] = incident_iso if write_md(doc_path, new_fm, body, dry_run=args.dry_run): enriched += 1 print(f" ✓ {doc_path.name} ← {match['record_id']} ({reason})") # Compute potential event rename if applicable if args.rename_events and incident_iso != "NA": # Look for events referenced in this document that start with EV-XXXX- key_events = (new_fm.get("key_entities") or {}).get("events") or [] for ref in key_events: if isinstance(ref, str): m = re.search(r"\[\[event/(EV-XXXX-XX-XX-[a-z0-9-]+)\]\]", ref) if m: old = m.group(1) slug = old.replace("EV-XXXX-XX-XX-", "", 1) new_id = event_id_from_date_and_slug(incident_iso, slug) if new_id != old: event_renames.append((old, new_id)) else: unchanged += 1 # Apply event renames rename_count = 0 for old, new in set(event_renames): old_path = EVENTS_DIR / f"{old}.md" new_path = EVENTS_DIR / f"{new}.md" if not old_path.exists(): continue if new_path.exists() and new_path != old_path: print(f" ⚠ skip rename {old} → {new} (target exists)") continue if args.dry_run: print(f" [dry] rename {old} → {new}") rename_count += 1 continue # Read, update event_id field, write to new path, delete old fm, body = read_md(old_path) fm["event_id"] = new # Update date_start/date_end if currently NA parts = new.split("-") if len(parts) >= 4: y, mo, d = parts[1], parts[2], parts[3] if y != "XXXX" and (fm.get("date_start") in (None, "NA")): if mo != "XX" and d != "XX": fm["date_start"] = f"{y}-{mo}-{d}" fm["date_end"] = fm.get("date_end") or f"{y}-{mo}-{d}" fm["date_confidence"] = "high" elif mo != "XX": fm["date_start"] = f"{y}-{mo}" write_md(new_path, fm, body) old_path.unlink() rename_count += 1 # Update all wiki-links pointing to the old event_id everywhere for f in list(UFO_ROOT.rglob("*.md")): if "/processing/" in str(f) or f == new_path: continue c = f.read_text(encoding="utf-8") if old not in c: continue c2 = c.replace(f"[[event/{old}]]", f"[[event/{new}]]") if c2 != c: f.write_text(c2, encoding="utf-8") print(f" ↺ renamed {old} → {new}") # Log print(f"\nEnriched: {enriched}, unchanged: {unchanged}, unmatched: {len(unmatched)}, event renames: {rename_count}") if unmatched: print("Unmatched docs (no war.gov record found):") for n in unmatched[:20]: print(f" - {n}") if len(unmatched) > 20: print(f" … and {len(unmatched) - 20} more") if not args.dry_run and enriched > 0: with open(LOG_PATH, "a", encoding="utf-8") as fh: fh.write( f"\n## {datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')} — ENRICH WAR.GOV (Phase 0.5)\n" f"- operator: archivist\n- script: scripts/02b-enrich-with-web-metadata.py\n" f"- json_source: {METADATA_JSON.name}\n" f"- enriched: {enriched}\n- unchanged: {unchanged}\n- unmatched: {len(unmatched)}\n" f"- event_renames: {rename_count}\n" ) if __name__ == "__main__": main()