#!/usr/bin/env python3 """ 16-extract-table-csv.py — Row-by-row extraction of multi-page tables → CSV For each `wiki/tables/.md`: 1. Resolve each span's PNG path (processing/png//p-NNN.png) 2. Crop the table region using bbox (Pillow) 3. Send all crops in order to Haiku with a prompt to extract the full table preserving multi-page row continuity 4. Receive JSON: { headers: [...], rows: [[...], ...] } 5. Save: - processing/tables/.csv (extracted CSV) - processing/tables/.json (raw extraction + metadata) - processing/table-crops// (the crop JPGs for inspection) - Update wiki/tables/.md frontmatter: csv_path, extraction_quality, headers, row_count_extracted, extracted_at, extraction_model Idempotent: skip if CSV exists and not --force. Usage: ./16-extract-table-csv.py # all multi-page tables ./16-extract-table-csv.py --table-id # single ./16-extract-table-csv.py --force # re-extract ./16-extract-table-csv.py --model haiku # default; or sonnet """ from __future__ import annotations import argparse import csv import json import re import subprocess import sys import time from datetime import datetime, timezone from pathlib import Path try: import yaml except ImportError: sys.stderr.write("Missing pyyaml. pip3 install pyyaml\n") sys.exit(1) try: from PIL import Image except ImportError: sys.stderr.write("Missing pillow. pip3 install pillow\n") sys.exit(1) UFO_ROOT = Path("/Users/guto/ufo") TABLES_BASE = UFO_ROOT / "wiki" / "tables" PNG_BASE = UFO_ROOT / "processing" / "png" CSV_BASE = UFO_ROOT / "processing" / "tables" CROPS_BASE = UFO_ROOT / "processing" / "table-crops" LOG_PATH = UFO_ROOT / "wiki" / "log.md" DEFAULT_MODEL = "haiku" MAX_TURNS = 4 DEFAULT_TIMEOUT = 240 EXTRACT_PROMPT = """You are extracting a multi-page table from a US Department of War declassified UAP document. You will see {n_crops} image crops in order. They represent ONE logical table split across {n_pages} consecutive pages. The first crop is the start, the last is the end, and any middle ones continue the rows. STEPS: 1. Use the Read tool on EACH of these crop image paths, IN ORDER: {crop_list} 2. Identify the column headers (typically only on the first page; subsequent pages may repeat headers — skip those repeats). 3. Concatenate all rows from all pages into a single ordered list. A row that visually appears to span a page break (e.g. a cell continues onto the next page) should be merged into ONE row when possible. 4. Output ONE JSON object (no fence, no preamble) with this exact schema: {{ "headers": ["col1", "col2", ...], "rows": [ ["row1_col1_value", "row1_col2_value", ...], ["row2_col1_value", "row2_col2_value", ...] ], "row_count": , "column_count": , "headers_repeat_on_each_page": true|false, "merged_cross_page_rows": , "extraction_quality": , "notes": "Any caveats: illegible cells, redactions inside cells, merged headers, ambiguous values, etc. Use 'REDACTED' for cell values that are blacked out, and '???' for illegible content." }} RULES: - Preserve ORIGINAL LANGUAGE of all cell text. Do NOT translate. - For redacted cells: "REDACTED" or "REDACTED (1.4(a))" if the code is visible. - For illegible cells: "???". - For empty cells: empty string "". - If a cell contains a list (multiple values), preserve as comma-separated. - Numbers stay as strings (preserve formatting like "24,989" or "1319Z"). - Headers should be short, snake_case-friendly (e.g. "incident_date", "shape", "altitude_ft"). - Output ONLY the JSON. No fence, no commentary.""" def utc_now_iso() -> str: return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") def read_md(path: Path) -> tuple[dict, str]: c = path.read_text(encoding="utf-8") if not c.startswith("---"): return {}, c end = c.find("---", 4) if end == -1: return {}, c try: return (yaml.safe_load(c[3:end].strip()) or {}), c[end + 3 :].lstrip("\n") except yaml.YAMLError: return {}, c[end + 3 :].lstrip("\n") def write_md(path: Path, fm: dict, body: str) -> bool: yaml_str = yaml.dump(fm, allow_unicode=True, sort_keys=False, default_flow_style=False) new = f"---\n{yaml_str}---\n\n{body}" if not body.startswith("\n") else f"---\n{yaml_str}---\n{body}" if path.exists() and path.read_text(encoding="utf-8") == new: return False path.write_text(new, encoding="utf-8") return True def resolve_page_png(page_link: str) -> Path | None: """[[doc-id/p059]] → /Users/guto/ufo/processing/png/doc-id/p-059.png""" m = re.match(r"\[\[([a-z0-9-]+)/p(\d+)\]\]", page_link) if not m: return None doc_id = m.group(1) page_num = int(m.group(2)) png = PNG_BASE / doc_id / f"p-{page_num:03d}.png" return png if png.exists() else None def crop_table_region(png_path: Path, bbox: dict, out_path: Path, padding: float = 0.005) -> bool: """Crop bbox from page PNG (Pillow) and save as JPEG.""" try: with Image.open(png_path) as im: W, H = im.size x = max(0.0, float(bbox.get("x", 0)) - padding) y = max(0.0, float(bbox.get("y", 0)) - padding) w = min(1.0 - x, float(bbox.get("w", 0)) + 2 * padding) h = min(1.0 - y, float(bbox.get("h", 0)) + 2 * padding) if w <= 0 or h <= 0: return False px = int(round(x * W)) py = int(round(y * H)) pw = max(1, int(round(w * W))) ph = max(1, int(round(h * H))) crop = im.crop((px, py, px + pw, py + ph)) out_path.parent.mkdir(parents=True, exist_ok=True) if crop.mode != "RGB": crop = crop.convert("RGB") crop.save(out_path, "JPEG", quality=92) return True except Exception as e: sys.stderr.write(f" ✗ crop failed: {e}\n") return False def call_haiku_extract(crops: list[Path], n_pages: int) -> tuple[dict | None, str]: """Call Haiku via claude CLI with the crops and structured-output prompt.""" crop_list = "\n".join(f" {i+1}. {str(p)}" for i, p in enumerate(crops)) prompt = EXTRACT_PROMPT.format(n_crops=len(crops), n_pages=n_pages, crop_list=crop_list) cmd = [ "claude", "-p", "--model", DEFAULT_MODEL, "--output-format", "json", "--max-turns", str(MAX_TURNS), "--allowedTools", "Read", "--add-dir", str(crops[0].parent), "--", prompt, ] try: res = subprocess.run(cmd, capture_output=True, text=True, timeout=DEFAULT_TIMEOUT, check=False) except subprocess.TimeoutExpired: return None, "timeout" if res.returncode != 0: return None, f"rc={res.returncode}: {res.stderr[-300:]}" try: cli = json.loads(res.stdout) except json.JSONDecodeError: return None, "cli-stdout-not-json" if cli.get("is_error"): return None, "is_error" text = (cli.get("result") or "").strip() parsed, err = robust_json_parse(text) if parsed is not None: return parsed, "" return None, f"result-not-json: {err}" def robust_json_parse(text: str) -> tuple[dict | None, str]: """Parse JSON tolerant of fences, trailing commentary, unbalanced edges. Strategy: 1. Strip ``` fences. 2. Try direct json.loads. 3. Find first balanced { ... } block and parse it. 4. As a last resort: rewrite typical Haiku gotchas (smart quotes, trailing comma before }, unescaped newlines inside strings). """ t = text.strip() t = re.sub(r"^```(?:json)?\s*", "", t) t = re.sub(r"\s*```$", "", t) try: return json.loads(t), "" except json.JSONDecodeError as e: first_err = str(e) # Find balanced { ... } start = t.find("{") if start >= 0: depth = 0 for i in range(start, len(t)): if t[i] == "{": depth += 1 elif t[i] == "}": depth -= 1 if depth == 0: cand = t[start:i + 1] try: return json.loads(cand), "" except json.JSONDecodeError: break # Final pass: remove trailing commas before } or ] cleaned = re.sub(r",\s*([}\]])", r"\1", t) try: return json.loads(cleaned), "" except json.JSONDecodeError: return None, first_err def save_csv(out_csv: Path, headers: list[str], rows: list[list]) -> None: out_csv.parent.mkdir(parents=True, exist_ok=True) with out_csv.open("w", newline="", encoding="utf-8") as fh: w = csv.writer(fh) w.writerow(headers) for row in rows: # Normalize row length to header length padded = list(row) + [""] * (len(headers) - len(row)) w.writerow(padded[: len(headers)]) def render_table_md_body(table_id: str, fm: dict, parsed: dict | None) -> str: spans = fm.get("spans_pages") or [] body = f"# {fm.get('canonical_title', table_id)}\n\n" body += f"> Multi-page table spanning {len(spans)} pages of {fm.get('source_doc','')}\n\n" body += "## Pages\n\n" for sp in spans: body += f"- {sp.get('role','?')}: {sp.get('page','')} · bbox {sp.get('bbox')}\n" body += "\n" if parsed: headers = parsed.get("headers") or [] rows = parsed.get("rows") or [] body += f"## Extracted data ({parsed.get('row_count', len(rows))} rows × {len(headers)} cols)\n\n" body += f"_Extraction quality: `{parsed.get('extraction_quality')}` · " body += f"merged cross-page rows: {parsed.get('merged_cross_page_rows', 0)} · " body += f"CSV: `{fm.get('csv_path')}`_\n\n" if parsed.get("notes"): body += f"> **Notes from extraction:** {parsed['notes']}\n\n" if headers and rows: body += "| " + " | ".join(headers) + " |\n" body += "|" + "|".join(["---"] * len(headers)) + "|\n" for row in rows[:50]: cells = [str(c).replace("|", "\\|").replace("\n", " ") for c in row] # pad cells = cells + [""] * (len(headers) - len(cells)) body += "| " + " | ".join(cells[: len(headers)]) + " |\n" if len(rows) > 50: body += f"\n_(showing first 50 of {len(rows)} rows — full CSV in `{fm.get('csv_path')}`)_\n" else: body += "## Extracted data\n\n_Extraction not yet run or failed. Run `scripts/16-extract-table-csv.py`._\n" body += "\n## Notes\n\nPer-page table snippets live in each page.md's `tables_detected[]`. Full row-by-row data is in the CSV at `csv_path`.\n" return body def process_table(md_path: Path, force: bool) -> bool: fm, _ = read_md(md_path) if fm.get("type") != "table": return False if not fm.get("multi_page"): return False # single-page tables stay inline table_id = fm.get("table_id") or md_path.stem csv_path = CSV_BASE / f"{table_id}.csv" json_path = CSV_BASE / f"{table_id}.json" crops_dir = CROPS_BASE / table_id if csv_path.exists() and json_path.exists() and not force: return False spans = fm.get("spans_pages") or [] if len(spans) < 2: return False print(f"\n=== {table_id} — {len(spans)} pages ===", flush=True) crops: list[Path] = [] for i, sp in enumerate(spans): page_link = sp.get("page", "") bbox = sp.get("bbox") or {} png = resolve_page_png(page_link) if not png: sys.stderr.write(f" ✗ no PNG for {page_link}\n") return False crop_out = crops_dir / f"span-{i+1:02d}.jpg" if not crop_out.exists() or force: if not crop_table_region(png, bbox, crop_out): return False crops.append(crop_out) print(f" ✓ crop {i+1}: {crop_out.name}", flush=True) t0 = time.time() parsed = None err = "" for attempt in range(1, 4): print(f" → calling Haiku (attempt {attempt}/3) to extract CSV from {len(crops)} crops…", flush=True) parsed, err = call_haiku_extract(crops, n_pages=len(spans)) if parsed: break print(f" · attempt {attempt} failed: {err[:120]}", flush=True) time.sleep(4 * attempt) elapsed = time.time() - t0 if not parsed: print(f" ✗ extraction failed after 3 attempts ({elapsed:.1f}s): {err}", flush=True) return False headers = parsed.get("headers") or [] rows = parsed.get("rows") or [] if not headers or not rows: print(f" ⚠ extraction returned empty headers/rows", flush=True) return False save_csv(csv_path, headers, rows) json_path.write_text(json.dumps(parsed, indent=2, ensure_ascii=False), encoding="utf-8") print(f" ✓ {csv_path.relative_to(UFO_ROOT)} ({len(rows)} rows × {len(headers)} cols, {elapsed:.1f}s)", flush=True) # Update table.md frontmatter fm["csv_path"] = str(csv_path.relative_to(UFO_ROOT)) fm["json_path"] = str(json_path.relative_to(UFO_ROOT)) fm["headers"] = headers fm["row_count_extracted"] = parsed.get("row_count", len(rows)) fm["column_count_extracted"] = parsed.get("column_count", len(headers)) fm["extraction_quality"] = parsed.get("extraction_quality") fm["extraction_notes"] = parsed.get("notes", "") fm["extraction_model"] = "claude-haiku-4-5" fm["extracted_at"] = utc_now_iso() fm["last_ingest"] = utc_now_iso() body = render_table_md_body(table_id, fm, parsed) write_md(md_path, fm, body) return True def main(): ap = argparse.ArgumentParser() ap.add_argument("--table-id", help="single table") ap.add_argument("--force", action="store_true") args = ap.parse_args() CSV_BASE.mkdir(parents=True, exist_ok=True) CROPS_BASE.mkdir(parents=True, exist_ok=True) if args.table_id: targets = [TABLES_BASE / f"{args.table_id}.md"] else: targets = sorted(TABLES_BASE.glob("*.md")) print(f"Processing {len(targets)} table(s)…") extracted = 0 for t in targets: if not t.exists(): sys.stderr.write(f" ✗ no table.md: {t}\n") continue if process_table(t, args.force): extracted += 1 print(f"\nExtracted: {extracted} table(s)") if extracted > 0: with open(LOG_PATH, "a", encoding="utf-8") as fh: fh.write( f"\n## {utc_now_iso()} — EXTRACT TABLE CSV\n" f"- operator: archivist + evidence-officer\n- script: scripts/16-extract-table-csv.py\n" f"- tables_extracted: {extracted}\n" ) if __name__ == "__main__": main()