disclosure-bureau/scripts/16-extract-table-csv.py

#!/usr/bin/env python3
"""
16-extract-table-csv.py — Row-by-row extraction of multi-page tables → CSV

For each `wiki/tables/<TBL-id>.md`:
  1. Resolve each span's PNG path (processing/png/<doc-id>/p-NNN.png)
  2. Crop the table region using bbox (Pillow)
  3. Send all crops in order to Haiku with a prompt to extract the full table
     preserving multi-page row continuity
  4. Receive JSON: { headers: [...], rows: [[...], ...] }
  5. Save:
       - processing/tables/<TBL-id>.csv          (extracted CSV)
       - processing/tables/<TBL-id>.json         (raw extraction + metadata)
       - processing/table-crops/<TBL-id>/        (the crop JPGs for inspection)
       - Update wiki/tables/<TBL-id>.md frontmatter:
            csv_path, extraction_quality, headers, row_count_extracted,
            extracted_at, extraction_model

Idempotent: a table is skipped when both its CSV and JSON already exist, unless --force is given.

Usage:
  ./16-extract-table-csv.py                       # all multi-page tables
  ./16-extract-table-csv.py --table-id <id>       # single
  ./16-extract-table-csv.py --force               # re-extract
  # note: there is no --model option yet; the model is fixed by DEFAULT_MODEL (haiku)
"""
from __future__ import annotations

import argparse
import csv
import json
import re
import subprocess
import sys
import time
from datetime import datetime, timezone
from pathlib import Path

try:
    import yaml
except ImportError:
    sys.stderr.write("Missing pyyaml. pip3 install pyyaml\n")
    sys.exit(1)

try:
    from PIL import Image
except ImportError:
    sys.stderr.write("Missing pillow. pip3 install pillow\n")
    sys.exit(1)


# Root of the working tree; every other path below is derived from it.
# NOTE(review): this is an absolute path into one user's home directory, so
# the script only runs unmodified on that machine — confirm this is intended.
UFO_ROOT = Path("/Users/guto/ufo")
# Directory globbed for the `<table-id>.md` descriptors that drive extraction.
TABLES_BASE = UFO_ROOT / "wiki" / "tables"
# Page renders, looked up as `<doc-id>/p-NNN.png`.
PNG_BASE = UFO_ROOT / "processing" / "png"
# Output directory for the `<table-id>.csv` and `<table-id>.json` files.
CSV_BASE = UFO_ROOT / "processing" / "tables"
# Output directory for per-table crop JPEGs (`<table-id>/span-NN.jpg`).
CROPS_BASE = UFO_ROOT / "processing" / "table-crops"
# Markdown log that receives one appended entry per run that extracted something.
LOG_PATH = UFO_ROOT / "wiki" / "log.md"

# Value passed to the `claude` CLI as `--model`.
DEFAULT_MODEL = "haiku"
# Value passed to the `claude` CLI as `--max-turns`.
MAX_TURNS = 4
# Timeout, in seconds, for a single `claude` CLI subprocess call.
DEFAULT_TIMEOUT = 240


# Prompt template sent to the model for one table. It is rendered with
# `str.format`, which fills three placeholders:
#   {n_crops}   - number of crop images listed
#   {n_pages}   - number of pages the table spans
#   {crop_list} - numbered list of crop image paths, one per line
# Literal braces in the JSON schema are doubled (`{{` / `}}`) so that
# `str.format` emits them as single braces instead of treating them as fields.
EXTRACT_PROMPT = """You are extracting a multi-page table from a US Department of War declassified UAP document.

You will see {n_crops} image crops in order. They represent ONE logical table split across {n_pages} consecutive pages. The first crop is the start, the last is the end, and any middle ones continue the rows.

STEPS:
1. Use the Read tool on EACH of these crop image paths, IN ORDER:
{crop_list}

2. Identify the column headers (typically only on the first page; subsequent pages may repeat headers — skip those repeats).

3. Concatenate all rows from all pages into a single ordered list. A row that visually appears to span a page break (e.g. a cell continues onto the next page) should be merged into ONE row when possible.

4. Output ONE JSON object (no fence, no preamble) with this exact schema:

{{
  "headers": ["col1", "col2", ...],
  "rows": [
    ["row1_col1_value", "row1_col2_value", ...],
    ["row2_col1_value", "row2_col2_value", ...]
  ],
  "row_count": <int — total rows extracted, excluding header repeats>,
  "column_count": <int — number of columns>,
  "headers_repeat_on_each_page": true|false,
  "merged_cross_page_rows": <int — how many rows you merged across page breaks>,
  "extraction_quality": <float 0..1 — your confidence the extraction is complete and accurate>,
  "notes": "Any caveats: illegible cells, redactions inside cells, merged headers, ambiguous values, etc. Use 'REDACTED' for cell values that are blacked out, and '???' for illegible content."
}}

RULES:
- Preserve ORIGINAL LANGUAGE of all cell text. Do NOT translate.
- For redacted cells: "REDACTED" or "REDACTED (1.4(a))" if the code is visible.
- For illegible cells: "???".
- For empty cells: empty string "".
- If a cell contains a list (multiple values), preserve as comma-separated.
- Numbers stay as strings (preserve formatting like "24,989" or "1319Z").
- Headers should be short, snake_case-friendly (e.g. "incident_date", "shape", "altitude_ft").
- Output ONLY the JSON. No fence, no commentary."""


def utc_now_iso() -> str:
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``."""
    now = datetime.now(tz=timezone.utc)
    return f"{now:%Y-%m-%dT%H:%M:%SZ}"


def read_md(path: Path) -> tuple[dict, str]:
    """Split a Markdown file into its YAML frontmatter and its body.

    The file is treated as having frontmatter when it starts with ``---`` and
    a later line consists solely of ``---`` (optionally followed by spaces or
    tabs). Matching only a whole line keeps a ``---`` that appears inside a
    YAML value (for example in a title) from cutting the frontmatter short.

    Args:
        path: Markdown file to read (UTF-8).

    Returns:
        ``(frontmatter, body)``. ``frontmatter`` is ``{}`` when there is no
        frontmatter, when the YAML is invalid, or when the YAML is not a
        mapping. ``body`` is the text after the closing fence with leading
        newlines removed; when no frontmatter is detected it is the whole
        file content unchanged.
    """
    content = path.read_text(encoding="utf-8")
    if not content.startswith("---"):
        return {}, content

    # Start at index 3 so the opening fence is skipped. With a `pos` argument,
    # `^` still only matches just after a real newline, never at `pos` itself.
    closing = re.compile(r"^---[ \t]*\r?$", re.MULTILINE).search(content, 3)
    if closing is None:
        return {}, content

    body = content[closing.end():].lstrip("\n")
    try:
        loaded = yaml.safe_load(content[3:closing.start()].strip())
    except yaml.YAMLError:
        return {}, body
    # Callers use `fm.get(...)`, so a list or scalar document counts as "no frontmatter".
    return (loaded if isinstance(loaded, dict) else {}), body


def write_md(path: Path, fm: dict, body: str) -> bool:
    """Write frontmatter plus body to ``path``, skipping the write when nothing changed.

    Returns:
        ``True`` when the file was written, ``False`` when the existing file
        already holds exactly the rendered text.
    """
    front = yaml.dump(fm, allow_unicode=True, sort_keys=False, default_flow_style=False)
    # One blank line separates the closing fence from the body; a body that
    # already begins with a newline supplies that blank line itself.
    separator = "\n" if body.startswith("\n") else "\n\n"
    rendered = f"---\n{front}---{separator}{body}"

    already_current = path.exists() and path.read_text(encoding="utf-8") == rendered
    if already_current:
        return False
    path.write_text(rendered, encoding="utf-8")
    return True


def resolve_page_png(page_link: str) -> Path | None:
    """Map a ``[[doc-id/pNNN]]`` wiki link to its page PNG under ``PNG_BASE``.

    The page number is re-rendered zero-padded to three digits, giving
    ``<PNG_BASE>/<doc-id>/p-NNN.png``. Returns ``None`` when the link does not
    have the expected form or when that PNG does not exist on disk.
    """
    match = re.match(r"\[\[([a-z0-9-]+)/p(\d+)\]\]", page_link)
    if match is None:
        return None
    doc_id, digits = match.groups()
    candidate = PNG_BASE / doc_id / f"p-{int(digits):03d}.png"
    if candidate.exists():
        return candidate
    return None


def crop_table_region(png_path: Path, bbox: dict, out_path: Path, padding: float = 0.005) -> bool:
    """Crop a bounding box out of a page PNG and save it as an RGB JPEG.

    Args:
        png_path: Page image to crop from.
        bbox: Mapping with ``x``, ``y``, ``w``, ``h`` keys (missing keys count
            as 0). The values are multiplied by the image width/height, i.e.
            they are used as fractions of the page size.
        out_path: Destination JPEG; parent directories are created as needed.
        padding: Margin added around the box, in the same fractional units.

    Returns:
        ``True`` when the crop was written. ``False`` when the box is empty or
        lies outside the image, or when opening, cropping or saving fails (the
        error is reported on stderr rather than raised).
    """
    try:
        with Image.open(png_path) as im:
            W, H = im.size
            x = max(0.0, float(bbox.get("x", 0)) - padding)
            y = max(0.0, float(bbox.get("y", 0)) - padding)
            w = min(1.0 - x, float(bbox.get("w", 0)) + 2 * padding)
            h = min(1.0 - y, float(bbox.get("h", 0)) + 2 * padding)
            if w <= 0 or h <= 0:
                return False
            left = int(round(x * W))
            top = int(round(y * H))
            # Independent rounding and the 1-pixel minimum size can push the far
            # edge past the image, so clamp the box to the image bounds rather
            # than let the crop extend beyond the page.
            right = min(W, left + max(1, int(round(w * W))))
            bottom = min(H, top + max(1, int(round(h * H))))
            if right <= left or bottom <= top:
                return False
            crop = im.crop((left, top, right, bottom))
            out_path.parent.mkdir(parents=True, exist_ok=True)
            # JPEG cannot store alpha or palette modes.
            if crop.mode != "RGB":
                crop = crop.convert("RGB")
            crop.save(out_path, "JPEG", quality=92)
        return True
    except Exception as e:
        # Deliberately best-effort: one bad page must not abort the whole run.
        sys.stderr.write(f"  ✗ crop failed: {e}\n")
        return False


def call_haiku_extract(
    crops: list[Path], n_pages: int, model: str | None = None
) -> tuple[dict | None, str]:
    """Run the ``claude`` CLI over the crop images and parse its JSON answer.

    The prompt lists every crop path and asks the model to read them with the
    ``Read`` tool; the directory of the first crop is made accessible through
    ``--add-dir``.

    Args:
        crops: Crop image paths, in page order.
        n_pages: Number of pages the table spans (interpolated into the prompt).
        model: Value for ``--model``; ``None`` means ``DEFAULT_MODEL``.

    Returns:
        ``(parsed, "")`` on success, where ``parsed`` is the decoded JSON
        object, or ``(None, reason)`` with a short failure description.
    """
    if not crops:
        # `crops[0]` below would otherwise raise IndexError.
        return None, "no-crops"

    crop_list = "\n".join(f"   {i}. {p}" for i, p in enumerate(crops, start=1))
    prompt = EXTRACT_PROMPT.format(n_crops=len(crops), n_pages=n_pages, crop_list=crop_list)
    cmd = [
        "claude", "-p",
        "--model", model or DEFAULT_MODEL,
        "--output-format", "json",
        "--max-turns", str(MAX_TURNS),
        "--allowedTools", "Read",
        "--add-dir", str(crops[0].parent),
        "--",  # end of options: the prompt is passed as a positional argument
        prompt,
    ]
    try:
        res = subprocess.run(cmd, capture_output=True, text=True, timeout=DEFAULT_TIMEOUT, check=False)
    except subprocess.TimeoutExpired:
        return None, "timeout"
    if res.returncode != 0:
        return None, f"rc={res.returncode}: {res.stderr[-300:]}"

    try:
        cli = json.loads(res.stdout)
    except json.JSONDecodeError:
        return None, "cli-stdout-not-json"
    if not isinstance(cli, dict):
        # The code below reads `is_error` and `result` from an object envelope.
        return None, "cli-stdout-not-object"
    if cli.get("is_error"):
        detail = str(cli.get("result") or "").strip()
        return None, f"is_error: {detail[:300]}" if detail else "is_error"

    text = (cli.get("result") or "").strip()
    parsed, err = robust_json_parse(text)
    if isinstance(parsed, dict):
        return parsed, ""
    if parsed is not None:
        # Valid JSON but not an object; callers rely on `parsed.get(...)`.
        return None, "result-not-json: top-level value is not an object"
    return None, f"result-not-json: {err}"


def robust_json_parse(text: str) -> tuple[dict | None, str]:
    """Extract one JSON object from model output that may not be clean JSON.

    Strategy:
      1. Strip a leading/trailing ``` fence.
      2. Try a plain ``json.loads`` of what is left.
      3. Decode the object that starts at the first ``{`` and ignore anything
         after it. The JSON decoder does the scanning, so braces inside string
         values cannot end the object early.
      4. Repeat step 3 with trailing commas before ``}`` / ``]`` removed.
      Steps 3 and 4 decode with ``strict=False`` so raw newlines and tabs
      inside strings are accepted.

    Args:
        text: Raw text returned by the model.

    Returns:
        ``(obj, "")`` when an object was decoded, otherwise ``(None, error)``.
        ``error`` is the message of the first decoding failure, or a note that
        the text was valid JSON but not an object.
    """
    stripped = text.strip()
    stripped = re.sub(r"^```(?:json)?\s*", "", stripped)
    stripped = re.sub(r"\s*```$", "", stripped)

    try:
        whole = json.loads(stripped)
    except json.JSONDecodeError as exc:
        first_err = str(exc)
    else:
        if isinstance(whole, dict):
            return whole, ""
        return None, f"top-level JSON is {type(whole).__name__}, expected an object"

    lenient = json.JSONDecoder(strict=False)
    without_trailing_commas = re.sub(r",\s*([}\]])", r"\1", stripped)
    for candidate in (stripped, without_trailing_commas):
        start = candidate.find("{")
        if start == -1:
            break  # no object anywhere; the cleanup pass cannot add one
        try:
            # raw_decode stops at the end of the first value, so trailing
            # commentary after the object is ignored.
            value, _ = lenient.raw_decode(candidate[start:])
        except json.JSONDecodeError:
            continue
        return value, ""
    return None, first_err


def save_csv(out_csv: Path, headers: list[str], rows: list[list]) -> None:
    """Write ``headers`` and ``rows`` to ``out_csv`` as UTF-8 CSV.

    Parent directories are created as needed. Every data row is forced to the
    header width: short rows are padded with empty strings and long rows are
    cut off.
    """
    width = len(headers)

    def fit(row: list) -> list:
        missing = width - len(row)
        cells = list(row)
        if missing > 0:
            cells.extend([""] * missing)
        return cells[:width]

    out_csv.parent.mkdir(parents=True, exist_ok=True)
    with out_csv.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.writer(handle)
        writer.writerow(headers)
        writer.writerows(fit(row) for row in rows)


def render_table_md_body(table_id: str, fm: dict, parsed: dict | None) -> str:
    """Render the Markdown body of a table page.

    Args:
        table_id: Used as the title when ``fm`` has no ``canonical_title``.
        fm: Table frontmatter; ``spans_pages``, ``source_doc`` and ``csv_path``
            are read from it.
        parsed: Extraction result (``headers``, ``rows``, ``row_count``,
            ``extraction_quality``, ``merged_cross_page_rows``, ``notes``), or
            a falsy value when no extraction is available.

    Returns:
        The body text: title, page list, then either a summary with a preview
        of at most 50 rows or a "not yet run" placeholder, then a notes
        section. Header and cell text is stringified, ``|`` is escaped and
        newlines become spaces so a value cannot break the Markdown table.
    """
    preview_limit = 50

    def md_cell(value: object) -> str:
        return str(value).replace("|", "\\|").replace("\n", " ")

    spans = fm.get("spans_pages") or []
    parts = [
        f"# {fm.get('canonical_title', table_id)}\n\n",
        f"> Multi-page table spanning {len(spans)} pages of {fm.get('source_doc','')}\n\n",
        "## Pages\n\n",
    ]
    parts.extend(
        f"- {sp.get('role','?')}: {sp.get('page','')} · bbox {sp.get('bbox')}\n" for sp in spans
    )
    parts.append("\n")

    if parsed:
        headers = parsed.get("headers") or []
        rows = parsed.get("rows") or []
        width = len(headers)
        parts.append(f"## Extracted data ({parsed.get('row_count', len(rows))} rows × {width} cols)\n\n")
        parts.append(f"_Extraction quality: `{parsed.get('extraction_quality')}` · ")
        parts.append(f"merged cross-page rows: {parsed.get('merged_cross_page_rows', 0)} · ")
        parts.append(f"CSV: `{fm.get('csv_path')}`_\n\n")
        if parsed.get("notes"):
            parts.append(f"> **Notes from extraction:** {parsed['notes']}\n\n")
        if headers and rows:
            # Headers get the same escaping as cells: they also come from the
            # model and may contain `|` or be non-string values.
            parts.append("| " + " | ".join(md_cell(h) for h in headers) + " |\n")
            parts.append("|" + "|".join(["---"] * width) + "|\n")
            for row in rows[:preview_limit]:
                cells = [md_cell(c) for c in row]
                cells += [""] * (width - len(cells))  # pad short rows to the header width
                parts.append("| " + " | ".join(cells[:width]) + " |\n")
            if len(rows) > preview_limit:
                parts.append(
                    f"\n_(showing first {preview_limit} of {len(rows)} rows — full CSV in `{fm.get('csv_path')}`)_\n"
                )
    else:
        parts.append(
            "## Extracted data\n\n_Extraction not yet run or failed. Run `scripts/16-extract-table-csv.py`._\n"
        )
    parts.append(
        "\n## Notes\n\nPer-page table snippets live in each page.md's `tables_detected[]`. Full row-by-row data is in the CSV at `csv_path`.\n"
    )
    return "".join(parts)


# Number of model calls made for one table before giving up.
_MAX_EXTRACT_ATTEMPTS = 3


def _crop_spans(spans: list, crops_dir: Path, force: bool) -> list[Path] | None:
    """Crop each span's table region; return the crop paths, or None on the first failure."""
    crops: list[Path] = []
    for index, span in enumerate(spans, start=1):
        page_link = span.get("page", "")
        bbox = span.get("bbox") or {}
        png = resolve_page_png(page_link)
        if not png:
            sys.stderr.write(f"  ✗ no PNG for {page_link}\n")
            return None
        crop_out = crops_dir / f"span-{index:02d}.jpg"
        # An existing crop is reused unless --force was given.
        if (force or not crop_out.exists()) and not crop_table_region(png, bbox, crop_out):
            return None
        crops.append(crop_out)
        print(f"  ✓ crop {index}: {crop_out.name}", flush=True)
    return crops


def _extract_with_retries(crops: list[Path], n_pages: int) -> tuple[dict | None, str]:
    """Call the model up to ``_MAX_EXTRACT_ATTEMPTS`` times with a growing pause between tries."""
    err = ""
    for attempt in range(1, _MAX_EXTRACT_ATTEMPTS + 1):
        print(
            f"  → calling Haiku (attempt {attempt}/{_MAX_EXTRACT_ATTEMPTS}) "
            f"to extract CSV from {len(crops)} crops…",
            flush=True,
        )
        parsed, err = call_haiku_extract(crops, n_pages=n_pages)
        if parsed and isinstance(parsed, dict):
            return parsed, ""
        if parsed:
            err = "result is not a JSON object"
        elif not err:
            err = "empty result"
        print(f"  · attempt {attempt} failed: {err[:120]}", flush=True)
        # Only pause when another attempt follows; sleeping after the last
        # failure would just delay the next table.
        if attempt < _MAX_EXTRACT_ATTEMPTS:
            time.sleep(4 * attempt)
    return None, err


def process_table(md_path: Path, force: bool) -> bool:
    """Extract one multi-page table to CSV/JSON and refresh its Markdown page.

    Args:
        md_path: Table descriptor; its frontmatter must have ``type: table``,
            a truthy ``multi_page`` and at least two ``spans_pages`` entries.
        force: Re-crop and re-extract even when the outputs already exist.

    Returns:
        ``True`` when the CSV, JSON and Markdown page were written. ``False``
        when the table is not eligible, is already extracted (and ``force`` is
        not set), a page image or crop is unavailable, or extraction failed or
        returned no headers/rows.
    """
    fm, _ = read_md(md_path)
    if fm.get("type") != "table":
        return False
    if not fm.get("multi_page"):
        return False  # single-page tables stay inline
    table_id = fm.get("table_id") or md_path.stem
    csv_path = CSV_BASE / f"{table_id}.csv"
    json_path = CSV_BASE / f"{table_id}.json"

    if csv_path.exists() and json_path.exists() and not force:
        return False

    spans = fm.get("spans_pages") or []
    if len(spans) < 2:
        return False

    print(f"\n=== {table_id} — {len(spans)} pages ===", flush=True)
    crops = _crop_spans(spans, CROPS_BASE / table_id, force)
    if crops is None:
        return False

    t0 = time.time()
    parsed, err = _extract_with_retries(crops, n_pages=len(spans))
    elapsed = time.time() - t0
    if parsed is None:
        print(
            f"  ✗ extraction failed after {_MAX_EXTRACT_ATTEMPTS} attempts ({elapsed:.1f}s): {err}",
            flush=True,
        )
        return False

    headers = parsed.get("headers") or []
    rows = parsed.get("rows") or []
    if not headers or not rows:
        print("  ⚠ extraction returned empty headers/rows", flush=True)
        return False

    save_csv(csv_path, headers, rows)
    json_path.write_text(json.dumps(parsed, indent=2, ensure_ascii=False), encoding="utf-8")
    print(f"  ✓ {csv_path.relative_to(UFO_ROOT)} ({len(rows)} rows × {len(headers)} cols, {elapsed:.1f}s)", flush=True)

    # Update table.md frontmatter. One timestamp is shared so `extracted_at`
    # and `last_ingest` cannot differ by a second.
    stamp = utc_now_iso()
    fm["csv_path"] = str(csv_path.relative_to(UFO_ROOT))
    fm["json_path"] = str(json_path.relative_to(UFO_ROOT))
    fm["headers"] = headers
    fm["row_count_extracted"] = parsed.get("row_count", len(rows))
    fm["column_count_extracted"] = parsed.get("column_count", len(headers))
    fm["extraction_quality"] = parsed.get("extraction_quality")
    fm["extraction_notes"] = parsed.get("notes", "")
    fm["extraction_model"] = "claude-haiku-4-5"
    fm["extracted_at"] = stamp
    fm["last_ingest"] = stamp
    body = render_table_md_body(table_id, fm, parsed)
    write_md(md_path, fm, body)
    return True


def main():
    """Command-line entry point: extract the selected tables and log the run.

    With ``--table-id`` only that table's ``.md`` file is processed; otherwise
    every ``*.md`` under ``TABLES_BASE`` is, in sorted order. A log entry is
    appended to ``LOG_PATH`` only when at least one table was extracted.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--table-id", help="single table")
    parser.add_argument("--force", action="store_true")
    opts = parser.parse_args()

    for out_dir in (CSV_BASE, CROPS_BASE):
        out_dir.mkdir(parents=True, exist_ok=True)

    targets = (
        [TABLES_BASE / f"{opts.table_id}.md"]
        if opts.table_id
        else sorted(TABLES_BASE.glob("*.md"))
    )
    print(f"Processing {len(targets)} table(s)…")

    extracted = 0
    for md_path in targets:
        if md_path.exists():
            if process_table(md_path, opts.force):
                extracted += 1
        else:
            sys.stderr.write(f"  ✗ no table.md: {md_path}\n")
    print(f"\nExtracted: {extracted} table(s)")

    if extracted <= 0:
        return
    with open(LOG_PATH, "a", encoding="utf-8") as log_file:
        log_file.write(
            f"\n## {utc_now_iso()} — EXTRACT TABLE CSV\n"
            f"- operator: archivist + evidence-officer\n- script: scripts/16-extract-table-csv.py\n"
            f"- tables_extracted: {extracted}\n"
        )


if __name__ == "__main__":
    main()