disclosure-bureau/scripts/02b-enrich-with-web-metadata.py

#!/usr/bin/env python3
"""
02b-enrich-with-web-metadata.py — Phase 0.5

Injects the war.gov-extracted metadata (record_id, incident_date,
incident_location, agency, etc.) into each wiki/documents/<doc-id>.md
frontmatter. Also marks the 4 placeholder records as `availability:
pending-upstream`.

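For each document.md we already created from a local PDF, we find the
matching war.gov record using a tiered matcher (based on the one in
00b-coverage). The tiers are tried in order and the first hit wins:
  1. exact-norm
  2. primary-id (DOW-UAP-D74, DOS-UAP-D1, etc.)
  3. containment (one normalized title contains the other, both ≥12 chars)
  4. Jaccard ≥0.5 on signature tokens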

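The matched record's fields are written under a `war_gov` block in the
frontmatter. The `war_gov` block itself is regenerated on every run; other
keys are left untouched, except that `document_date` is filled in from the
incident date when it is missing, empty, or "NA".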

If `--rename-events` is passed, event files named `EV-XXXX-XX-XX-…` are
renamed to `EV-YYYY-MM-DD-…` based on the matched document's incident_date.
The script also updates all wiki-link references to the renamed event ids.

Usage:
  ./02b-enrich-with-web-metadata.py [--dry-run] [--rename-events]
"""
from __future__ import annotations

import argparse
import json
import re
import sys
import unicodedata
from datetime import datetime, timezone
from pathlib import Path

try:
    import yaml
except ImportError:
    sys.stderr.write("Missing pyyaml. Run: pip3 install pyyaml\n")
    sys.exit(1)

# Filesystem layout. Every other path below is derived from UFO_ROOT.
# NOTE(review): UFO_ROOT is hard-coded to one user's home directory — confirm
# whether it should be configurable before running this on another machine.
UFO_ROOT = Path("/Users/guto/ufo")
# Directory globbed for the *.md document pages that get enriched.
DOCS_DIR = UFO_ROOT / "wiki" / "documents"
# Directory holding the event pages that --rename-events renames.
EVENTS_DIR = UFO_ROOT / "wiki" / "entities" / "events"
# Markdown log that receives one appended entry per non-dry run.
LOG_PATH = UFO_ROOT / "wiki" / "log.md"
# Fallback metadata file, used when no --metadata-json is given and the
# release-* glob finds nothing.
METADATA_JSON = UFO_ROOT / "processing" / "war-gov-metadata" / "all-documents-release-01-basic.json"

# Records whose Download serves a placeholder file (verified 2026-05-13).
# Documents matched to one of these get `availability: pending-upstream`
# instead of `downloaded`.
PLACEHOLDER_RECORDS = {"record-140", "record-154", "record-155", "record-156"}

# Stop-word list for title matching: signature_tokens() drops every normalized
# token found here, so only the remaining (distinctive) tokens take part in
# the Jaccard comparison. Entries are lowercase because they are compared
# against the output of normalize().
# NOTE(review): "correspondance" and "fouler" look like misspellings kept on
# purpose to match upstream titles — confirm before "fixing" them.
COMMON = {
    "mission", "report", "uap", "the", "of", "and", "a", "in", "on", "for",
    "with", "to", "from", "department", "war", "fbi", "nasa", "state",
    "unresolved", "debrief", "summary", "transcript", "crew", "general",
    "vol", "incident", "summaries", "photo", "video", "cable", "email",
    "correspondence", "correspondance", "launch", "range", "fouler",
    "force", "air", "navy", "between", "or", "year", "month",
    "january", "february", "march", "april", "may", "june", "july",
    "august", "september", "october", "november", "december", "redacted",
    "sub", "sighting", "about", "kuwait", "kazakhstan", "papua", "guinea",
    "syria", "iraq", "iran", "yemen", "djibouti", "japan", "greece",
    "mexico", "germany", "turkey", "turkmenistan", "georgia", "tbilisi",
    "indopacom", "middle", "east", "africa", "europe", "western", "united",
    "states", "north", "south", "america",
}


def normalize(s: str) -> str:
    """Reduce a title or filename to a lowercase, dash-separated ASCII key.

    Accents are stripped, apostrophes and square brackets are dropped, every
    other run of non-alphanumeric characters collapses to a single ``-``, and
    zero-padding that directly follows a letter or dash is removed
    (``d074`` → ``d74``). Falsy input yields ``""``.
    """
    if not s:
        return ""

    decomposed = unicodedata.normalize("NFKD", s)
    text = "".join(ch for ch in decomposed if not unicodedata.combining(ch)).lower()
    for old, new in (("'", ""), (",", "-"), ("[", ""), ("]", "")):
        text = text.replace(old, new)

    key = re.sub(r"-+", "-", re.sub(r"[^a-z0-9]+", "-", text)).strip("-")

    # Re-apply the zero-stripping substitution until it stops changing the key.
    while True:
        stripped = re.sub(r"(?<=[a-z-])0+(\d)", r"\1", key)
        if stripped == key:
            return key
        key = stripped


def signature_tokens(s: str) -> set[str]:
    """Return the distinctive tokens of *s*.

    The string is normalized, split on ``-``, and every empty token or token
    listed in ``COMMON`` is discarded.
    """
    tokens = set(normalize(s).split("-"))
    tokens.discard("")
    return tokens - COMMON


def jaccard(a: set, b: set) -> float:
    """Return the Jaccard similarity |a ∩ b| / |a ∪ b|, or 0.0 if either set is empty."""
    if not a or not b:
        return 0.0
    shared = len(a & b)
    total = len(a | b)
    return shared / total


def primary_id(s: str) -> str | None:
    """Extract the leading catalogue id from a title, or None if there is none.

    The title is normalized first, then tried against each pattern in order:
    agency UAP ids such as ``cia-uap-d1``, ``doe-uap-d2``, ``odni-uap-d1``,
    ``dow-uap-d17``, followed by FBI photo ids such as ``fbi-photo-a12``.
    Only a match anchored at the start of the normalized title counts.
    """
    key = normalize(s)
    patterns = (
        r"^((?:cia|doe|dod|dow|dos|odni|nasa|fbi)-uap-[a-z]{1,4}\d+[a-z]?)",
        r"^(fbi-photo-[a-z]\d+)",
    )
    for pattern in patterns:
        hit = re.match(pattern, key)
        if hit:
            return hit.group(1)
    return None


def parse_us_date(s: str) -> tuple[str, str]:
    """Parse a US-format date string into ``(iso_date, confidence)``.

    Recognised inputs:

    * ``M/D/YY`` or ``M/D/YYYY`` → ``('YYYY-MM-DD', 'high')``. Two-digit
      years <= 30 map to 20xx, all others to 19xx. The date must exist on the
      calendar; ``13/45/99`` or ``2/30/2024`` are rejected as unparseable
      rather than emitted as a bogus "high confidence" ISO string.
    * Year only, ``'1969'`` → ``('1969-XX-XX', 'medium')``.
    * ``'LATE 2025'`` (case-insensitive) → ``('2025-12-XX', 'low')``.
    * A range such as ``'4/10/2025-4/11/2025'`` → the first date, with
      confidence capped at ``'medium'`` (a range never raises the confidence
      of a fuzzy first half).
    * Empty / ``None`` / ``'N/A'`` / ``'NA'`` / ``'NULL'`` → ``('NA', 'none')``.

    Anything else returns ``('NA', 'speculation')``.
    """
    if not s:
        return ("NA", "none")
    s = s.strip()
    if s == "" or s.upper() in ("N/A", "NA", "NULL"):
        return ("NA", "none")

    # Range: parse only the part before the first dash.
    if "-" in s:
        head = s.split("-")[0].strip()
        if any(ch.isdigit() for ch in head):
            iso, conf = parse_us_date(head)
            if iso != "NA":
                return (iso, "medium" if conf == "high" else conf)

    # Fuzzy "late <year>" → December of that year, day unknown.
    late = re.fullmatch(r"late\s+(\d{4})", s, re.I)
    if late:
        return (f"{late.group(1)}-12-XX", "low")

    # Bare four-digit year.
    if re.fullmatch(r"\d{4}", s):
        return (f"{s}-XX-XX", "medium")

    # M/D/YY or M/D/YYYY — the year is exactly two or exactly four digits.
    mdy = re.fullmatch(r"(\d{1,2})/(\d{1,2})/(\d{2}|\d{4})", s)
    if mdy:
        month, day, year_text = mdy.groups()
        year = int(year_text)
        if len(year_text) == 2:
            year = 2000 + year if year <= 30 else 1900 + year
        try:
            # Constructing a datetime validates month/day (incl. leap years).
            datetime(year, int(month), int(day))
        except ValueError:
            return ("NA", "speculation")
        return (f"{year:04d}-{int(month):02d}-{int(day):02d}", "high")

    return ("NA", "speculation")


def event_id_from_date_and_slug(iso_date: str, slug_seed: str) -> str:
    """Build an ``EV-YYYY-MM-DD-<slug>`` event id.

    ``iso_date`` is split on ``-``; missing month/day components become
    ``XX`` and the literal ``"NA"`` becomes ``XXXX-XX-XX``. The slug is the
    normalized seed cut to 50 characters (edge dashes trimmed), or
    ``unlabeled`` when nothing is left.
    """
    if iso_date == "NA":
        date_fields = ["XXXX", "XX", "XX"]
    else:
        # split() always yields at least one piece; pad the rest with XX.
        date_fields = (iso_date.split("-") + ["XX", "XX"])[:3]

    slug = normalize(slug_seed)[:50].strip("-")
    if not slug:
        slug = "unlabeled"
    return "EV-{}-{}-{}-{}".format(*date_fields, slug)


def read_md(path: Path) -> tuple[dict, str]:
    """Read a markdown file and split it into ``(front_matter, body)``.

    The front matter is the YAML between an opening ``---`` at the very start
    of the file and the next line consisting solely of ``---``. Requiring the
    closing fence to sit on its own line prevents a ``---`` that merely occurs
    inside a value (e.g. ``title: a --- b``) from truncating the metadata.

    Returns:
        ``({}, full_text)`` when the file has no opening fence or no closing
        fence; ``({}, body)`` when the YAML is invalid or is not a mapping;
        otherwise ``(mapping, body)``. Leading newlines are stripped from the
        body.
    """
    text = path.read_text(encoding="utf-8")
    if not text.startswith("---"):
        return {}, text

    # Search past the opening fence; with MULTILINE, ^ only matches at real
    # line starts, so a mid-line "---" can never be taken for the fence.
    fence = re.compile(r"^---[ \t]*\r?$", re.MULTILINE).search(text, 3)
    if fence is None:
        return {}, text

    body = text[fence.end():].lstrip("\n")
    try:
        meta = yaml.safe_load(text[3:fence.start()].strip())
    except yaml.YAMLError:
        return {}, body
    # Callers use dict methods on the result, so a scalar/list front matter
    # (or an empty one, which loads as None) is reported as "no metadata".
    return (meta if isinstance(meta, dict) else {}), body


def write_md(path: Path, fm: dict, body: str, dry_run: bool = False) -> bool:
    """Render front matter + body to *path*; return True if the content differs.

    The file is laid out as ``---``, the YAML dump of *fm* (key order kept,
    unicode allowed, block style), ``---``, a blank line, then *body*. When
    *body* already begins with a newline no extra blank line is inserted.
    Nothing is written if the file already holds exactly this text (returns
    False) or if *dry_run* is set (returns True without touching the file).
    """
    front = yaml.dump(fm, allow_unicode=True, sort_keys=False, default_flow_style=False)
    separator = "" if body.startswith("\n") else "\n"
    rendered = f"---\n{front}---\n{separator}{body}"

    already_current = path.exists() and path.read_text(encoding="utf-8") == rendered
    if already_current:
        return False
    if not dry_run:
        path.write_text(rendered, encoding="utf-8")
    return True


# ---------------------------------------------------------------------- main


def build_war_index(records: list[dict]) -> list[tuple[dict, str, set[str], str | None]]:
    """Precompute the match keys for every war.gov record.

    Returns one ``(record, norm_title, sig_tokens, primary_id)`` tuple per
    input record, in input order, all derived from the record's ``title``
    (treated as ``""`` when the key is absent).
    """

    def entry(record: dict) -> tuple[dict, str, set[str], str | None]:
        title = record.get("title", "")
        return (record, normalize(title), signature_tokens(title), primary_id(title))

    return [entry(record) for record in records]


def match_doc_to_war(doc_norm: str, doc_sig: set[str], doc_pid: str | None, war_index: list) -> tuple[dict | None, str]:
    """Find the war.gov record for a local document.

    Tiers are tried in order and the first hit wins:

    1. identical normalized title → reason ``exact-norm``
    2. identical primary id → reason ``primary-id=<id>``
    3. one normalized title contains the other (both at least 12 chars)
       → reason ``containment``
    4. best Jaccard score over signature tokens, if >= 0.50
       → reason ``jaccard=<score>``

    Returns ``(record, reason)``, or ``(None, "no-match")``.
    """
    # Tier 1: exact normalized title.
    exact = [rec for rec, title_norm, _sig, _pid in war_index if title_norm == doc_norm]
    if exact:
        return exact[0], "exact-norm"

    # Tier 2: same primary id.
    if doc_pid:
        same_id = [rec for rec, _norm, _sig, rec_pid in war_index if rec_pid and rec_pid == doc_pid]
        if same_id:
            return same_id[0], f"primary-id={doc_pid}"

    # Tier 3: containment in either direction, ignoring very short titles.
    if len(doc_norm) >= 12:
        contained = [
            rec
            for rec, title_norm, _sig, _pid in war_index
            if len(title_norm) >= 12 and (doc_norm in title_norm or title_norm in doc_norm)
        ]
        if contained:
            return contained[0], "containment"

    # Tier 4: highest Jaccard score; max() keeps the earliest record on ties.
    scored = [(jaccard(doc_sig, rec_sig), rec) for rec, _norm, rec_sig, _pid in war_index]
    top_score, top_record = max(scored, key=lambda pair: pair[0], default=(0.0, None))
    if top_record and top_score >= 0.50:
        return top_record, f"jaccard={top_score:.2f}"
    return None, "no-match"


def _load_war_records(json_paths: list[Path]) -> list[dict]:
    """Load and merge the ``documents`` arrays of every metadata JSON.

    Each record is tagged (without overwriting existing keys) with
    ``_extracted_at`` and ``_source_json`` so its provenance survives the
    merge. Exits with status 1 if any path does not exist.
    """
    records: list[dict] = []
    for p in json_paths:
        if not p.exists():
            sys.stderr.write(f"Metadata JSON not found: {p}\n")
            sys.exit(1)
        d = json.loads(p.read_text(encoding="utf-8"))
        recs = d.get("documents", [])
        extracted_at = d.get("extracted_at")
        for r in recs:
            r.setdefault("_extracted_at", extracted_at)
            r.setdefault("_source_json", p.name)
        print(f"war.gov records from {p.name}: {len(recs)}")
        records.extend(recs)
    return records


def _placeholder_event_renames(fm: dict, incident_iso: str) -> list[tuple[str, str]]:
    """Return ``(old_id, new_id)`` for each undated event a document links to.

    Only ``[[event/EV-XXXX-XX-XX-…]]`` links listed under
    ``key_entities.events`` are considered.
    """
    entities = fm.get("key_entities")
    if not isinstance(entities, dict):
        # Malformed or missing key_entities: nothing to rename, don't crash.
        return []
    renames: list[tuple[str, str]] = []
    for ref in entities.get("events") or []:
        if not isinstance(ref, str):
            continue
        m = re.search(r"\[\[event/(EV-XXXX-XX-XX-[a-z0-9-]+)\]\]", ref)
        if not m:
            continue
        old = m.group(1)
        slug = old.replace("EV-XXXX-XX-XX-", "", 1)
        new_id = event_id_from_date_and_slug(incident_iso, slug)
        if new_id != old:
            renames.append((old, new_id))
    return renames


def _rename_event(old: str, new: str, dry_run: bool) -> bool:
    """Rename one event page and rewrite wiki-links to it; True if (to be) renamed."""
    old_path = EVENTS_DIR / f"{old}.md"
    new_path = EVENTS_DIR / f"{new}.md"
    if not old_path.exists():
        return False
    if new_path.exists() and new_path != old_path:
        print(f"  ⚠ skip rename {old} → {new} (target exists)")
        return False

    fm, body = read_md(old_path)
    if not fm:
        # Without readable front matter the rewrite below would replace the
        # page's metadata with a bare event_id and then delete the original.
        print(f"  ⚠ skip rename {old} → {new} (unreadable front matter)")
        return False
    if dry_run:
        print(f"  [dry] rename {old} → {new}")
        return True

    fm["event_id"] = new
    # Fill date_start/date_end from the new id if they are currently unset.
    parts = new.split("-")
    if len(parts) >= 4:
        y, mo, d = parts[1], parts[2], parts[3]
        if y != "XXXX" and (fm.get("date_start") in (None, "NA")):
            if mo != "XX" and d != "XX":
                fm["date_start"] = f"{y}-{mo}-{d}"
                fm["date_end"] = fm.get("date_end") or f"{y}-{mo}-{d}"
                fm["date_confidence"] = "high"
            elif mo != "XX":
                fm["date_start"] = f"{y}-{mo}"
    write_md(new_path, fm, body)
    old_path.unlink()

    # Update all wiki-links pointing to the old event id. The file list is
    # snapshotted first because files are rewritten while we go through it.
    for f in list(UFO_ROOT.rglob("*.md")):
        if "/processing/" in str(f) or f == new_path:
            continue
        c = f.read_text(encoding="utf-8")
        if old not in c:
            continue
        c2 = c.replace(f"[[event/{old}]]", f"[[event/{new}]]")
        if c2 != c:
            f.write_text(c2, encoding="utf-8")
    print(f"  ↺ renamed {old} → {new}")
    return True


def main():
    """Enrich every document page with its matched war.gov record.

    Steps: parse CLI flags, load the metadata JSON(s), match each
    ``type: document`` page to a record, write the ``war_gov`` front-matter
    block, optionally rename undated events (``--rename-events``), print a
    summary and append an entry to the wiki log (skipped under ``--dry-run``).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--dry-run", action="store_true")
    ap.add_argument("--rename-events", action="store_true", help="Rename EV-XXXX events to EV-YYYY-MM-DD")
    ap.add_argument("--metadata-json", action="append", default=None,
                    help="Path to a war.gov metadata JSON. Pass multiple times to merge releases. "
                         "Defaults to release-01 + release-02 if present.")
    args = ap.parse_args()

    if args.metadata_json:
        json_paths = [Path(p) for p in args.metadata_json]
    else:
        # Default: load every release-NN-basic JSON found, so 116 existing docs
        # (release-01) and 6 new docs (release-02) all get enriched in one pass.
        json_paths = sorted((UFO_ROOT / "processing" / "war-gov-metadata").glob("all-documents-release-*-basic.json"))
        if not json_paths:
            json_paths = [METADATA_JSON]

    records = _load_war_records(json_paths)
    print(f"war.gov records total: {len(records)}")

    war_index = build_war_index(records)
    docs = sorted(DOCS_DIR.glob("*.md"))
    print(f"local document.md files: {len(docs)}")

    enriched = 0
    unchanged = 0
    unmatched = []
    # old_event_id -> new_event_id. Insertion-ordered and first mapping wins,
    # so conflicting renames resolve the same way on every run.
    event_renames: dict[str, str] = {}

    for doc_path in docs:
        fm, body = read_md(doc_path)
        if fm.get("type") != "document":
            continue
        title_candidates = [
            fm.get("canonical_title", ""),
            fm.get("original_filename", ""),
            doc_path.stem,
        ]
        doc_norm = normalize(title_candidates[0]) or normalize(title_candidates[1]) or normalize(title_candidates[2])
        doc_sig = signature_tokens(title_candidates[0]) | signature_tokens(title_candidates[1])
        doc_pid = primary_id(title_candidates[0]) or primary_id(title_candidates[1]) or primary_id(doc_path.stem)

        match, reason = match_doc_to_war(doc_norm, doc_sig, doc_pid, war_index)
        if not match:
            unmatched.append(doc_path.name)
            continue

        # Build war_gov block
        incident_iso, date_conf = parse_us_date(match.get("incident_date") or "")
        release_iso, _ = parse_us_date(match.get("release_date") or "")
        war_block = {
            "record_id": match["record_id"],
            "title_official": match.get("title"),
            "agency_official": match.get("agency"),
            "release_date_official": release_iso,
            "release_date_raw": match.get("release_date"),
            "incident_date_official": incident_iso,
            "incident_date_raw": match.get("incident_date"),
            "incident_date_confidence": date_conf,
            "incident_location_official": match.get("incident_location"),
            "document_type_official": match.get("document_type"),
            "match_reason": reason,
            "availability": "pending-upstream" if match["record_id"] in PLACEHOLDER_RECORDS else "downloaded",
            "extracted_from_war_gov_at": match.get("_extracted_at"),
        }

        new_fm = dict(fm)
        new_fm["war_gov"] = war_block
        # Promote the incident date to top level only if no date is set yet.
        if (new_fm.get("document_date") in (None, "", "NA")) and incident_iso != "NA":
            new_fm["document_date"] = incident_iso

        if write_md(doc_path, new_fm, body, dry_run=args.dry_run):
            enriched += 1
            print(f"  ✓ {doc_path.name} ← {match['record_id']} ({reason})")
        else:
            unchanged += 1

        # Collect renames whether or not the document file changed; otherwise
        # --rename-events is a no-op on documents enriched by an earlier run.
        if args.rename_events and incident_iso != "NA":
            for old, new_id in _placeholder_event_renames(new_fm, incident_iso):
                event_renames.setdefault(old, new_id)

    # Apply event renames
    rename_count = 0
    for old, new in event_renames.items():
        if _rename_event(old, new, args.dry_run):
            rename_count += 1

    # Log
    print(f"\nEnriched: {enriched}, unchanged: {unchanged}, unmatched: {len(unmatched)}, event renames: {rename_count}")
    if unmatched:
        print("Unmatched docs (no war.gov record found):")
        for n in unmatched[:20]:
            print(f"  - {n}")
        if len(unmatched) > 20:
            print(f"  … and {len(unmatched) - 20} more")
    # Record the run whenever it changed anything on disk, including runs
    # that only renamed events.
    if not args.dry_run and (enriched > 0 or rename_count > 0):
        with open(LOG_PATH, "a", encoding="utf-8") as fh:
            fh.write(
                f"\n## {datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')} — ENRICH WAR.GOV (Phase 0.5)\n"
                f"- operator: archivist\n- script: scripts/02b-enrich-with-web-metadata.py\n"
                f"- json_source: {', '.join(p.name for p in json_paths)}\n"
                f"- enriched: {enriched}\n- unchanged: {unchanged}\n- unmatched: {len(unmatched)}\n"
                f"- event_renames: {rename_count}\n"
            )


# Script entry point: run the enrichment only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()