disclosure-bureau/scripts/16-extract-table-csv.py

390 lines
15 KiB
Python
Executable file
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
16-extract-table-csv.py — Row-by-row extraction of multi-page tables → CSV
For each `wiki/tables/<TBL-id>.md`:
1. Resolve each span's PNG path (processing/png/<doc-id>/p-NNN.png)
2. Crop the table region using bbox (Pillow)
3. Send all crops in order to Haiku with a prompt to extract the full table
preserving multi-page row continuity
4. Receive JSON: { headers: [...], rows: [[...], ...] }
5. Save:
- processing/tables/<TBL-id>.csv (extracted CSV)
- processing/tables/<TBL-id>.json (raw extraction + metadata)
- processing/table-crops/<TBL-id>/ (the crop JPGs for inspection)
- Update wiki/tables/<TBL-id>.md frontmatter:
csv_path, extraction_quality, headers, row_count_extracted,
extracted_at, extraction_model
Idempotent: a table is skipped when both its CSV and JSON outputs already exist, unless --force is given.
Usage:
./16-extract-table-csv.py # all multi-page tables
./16-extract-table-csv.py --table-id <id> # single
./16-extract-table-csv.py --force # re-extract
(No --model flag is implemented: main() accepts only --table-id and --force, and the model is always DEFAULT_MODEL, i.e. haiku.)
"""
from __future__ import annotations
import argparse
import csv
import json
import re
import subprocess
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
try:
import yaml
except ImportError:
sys.stderr.write("Missing pyyaml. pip3 install pyyaml\n")
sys.exit(1)
try:
from PIL import Image
except ImportError:
sys.stderr.write("Missing pillow. pip3 install pillow\n")
sys.exit(1)
# Absolute project root; every other path constant below is derived from it.
UFO_ROOT = Path("/Users/guto/ufo")
# Directory globbed by main() for per-table markdown files (<table-id>.md).
TABLES_BASE = UFO_ROOT / "wiki" / "tables"
# Page images, looked up by resolve_page_png() as <doc-id>/p-NNN.png.
PNG_BASE = UFO_ROOT / "processing" / "png"
# Output directory for <table-id>.csv and <table-id>.json.
CSV_BASE = UFO_ROOT / "processing" / "tables"
# Output directory for the per-table crop JPEGs (<table-id>/span-NN.jpg).
CROPS_BASE = UFO_ROOT / "processing" / "table-crops"
# Markdown log that main() appends a summary entry to after a successful run.
LOG_PATH = UFO_ROOT / "wiki" / "log.md"
# Value passed to the claude CLI's --model option.
DEFAULT_MODEL = "haiku"
# Value passed to the claude CLI's --max-turns option.
MAX_TURNS = 4
# Timeout, in seconds, for one claude CLI subprocess call.
DEFAULT_TIMEOUT = 240
# Prompt template filled in by call_haiku_extract() via str.format() with the
# fields n_crops, n_pages and crop_list. The braces of the JSON schema are
# doubled ({{ and }}) so that str.format() emits them literally.
EXTRACT_PROMPT = """You are extracting a multi-page table from a US Department of War declassified UAP document.
You will see {n_crops} image crops in order. They represent ONE logical table split across {n_pages} consecutive pages. The first crop is the start, the last is the end, and any middle ones continue the rows.
STEPS:
1. Use the Read tool on EACH of these crop image paths, IN ORDER:
{crop_list}
2. Identify the column headers (typically only on the first page; subsequent pages may repeat headers — skip those repeats).
3. Concatenate all rows from all pages into a single ordered list. A row that visually appears to span a page break (e.g. a cell continues onto the next page) should be merged into ONE row when possible.
4. Output ONE JSON object (no fence, no preamble) with this exact schema:
{{
"headers": ["col1", "col2", ...],
"rows": [
["row1_col1_value", "row1_col2_value", ...],
["row2_col1_value", "row2_col2_value", ...]
],
"row_count": <int — total rows extracted, excluding header repeats>,
"column_count": <int — number of columns>,
"headers_repeat_on_each_page": true|false,
"merged_cross_page_rows": <int — how many rows you merged across page breaks>,
"extraction_quality": <float 0..1 — your confidence the extraction is complete and accurate>,
"notes": "Any caveats: illegible cells, redactions inside cells, merged headers, ambiguous values, etc. Use 'REDACTED' for cell values that are blacked out, and '???' for illegible content."
}}
RULES:
- Preserve ORIGINAL LANGUAGE of all cell text. Do NOT translate.
- For redacted cells: "REDACTED" or "REDACTED (1.4(a))" if the code is visible.
- For illegible cells: "???".
- For empty cells: empty string "".
- If a cell contains a list (multiple values), preserve as comma-separated.
- Numbers stay as strings (preserve formatting like "24,989" or "1319Z").
- Headers should be short, snake_case-friendly (e.g. "incident_date", "shape", "altitude_ft").
- Output ONLY the JSON. No fence, no commentary."""
def utc_now_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (second resolution)."""
    now = datetime.now(timezone.utc)
    # A non-empty format spec on a datetime is forwarded to strftime().
    return f"{now:%Y-%m-%dT%H:%M:%SZ}"
def read_md(path: Path) -> tuple[dict, str]:
    """Split a markdown file into its YAML front matter and its body.

    Args:
        path: Markdown file to read (decoded as UTF-8).

    Returns:
        ``(front_matter, body)``. ``front_matter`` is ``{}`` when the file has
        no front matter, when the block is unterminated, when the YAML is
        invalid, or when it does not decode to a mapping. ``body`` is the text
        after the closing delimiter with leading newlines removed, or the whole
        file when no complete front-matter block is present.
    """
    text = path.read_text(encoding="utf-8")
    if not text.startswith("---"):
        return {}, text
    # The closing delimiter must sit on a line of its own. A plain substring
    # search would also stop at a "---" that merely appears inside a value
    # (for example free-text extraction notes) and truncate the front matter.
    # Searching from offset 3 skips the opening delimiter; with a start
    # position, "^" still only matches at real line beginnings.
    closing = re.compile(r"^---[ \t\r]*$", re.MULTILINE).search(text, 3)
    if closing is None:
        return {}, text
    body = text[closing.end():].lstrip("\n")
    try:
        meta = yaml.safe_load(text[3:closing.start()].strip())
    except yaml.YAMLError:
        return {}, body
    # Callers use dict methods on the result, so a scalar or list document is
    # treated the same as missing front matter.
    return (meta if isinstance(meta, dict) else {}), body
def write_md(path: Path, fm: dict, body: str) -> bool:
    """Write front matter plus body to ``path`` unless the file already matches.

    Args:
        path: Destination markdown file (UTF-8).
        fm: Front-matter mapping, dumped as block-style YAML in insertion order.
        body: Markdown body placed after the closing ``---``.

    Returns:
        ``True`` if the file was written, ``False`` if it already held exactly
        the same text.
    """
    front = yaml.dump(fm, allow_unicode=True, sort_keys=False, default_flow_style=False)
    # A body that already starts with a newline gets one separator newline,
    # otherwise a blank line is inserted after the closing delimiter.
    gap = "\n" if body.startswith("\n") else "\n\n"
    rendered = "".join(("---\n", front, "---", gap, body))
    already_current = path.exists() and path.read_text(encoding="utf-8") == rendered
    if already_current:
        return False
    path.write_text(rendered, encoding="utf-8")
    return True
def resolve_page_png(page_link: str) -> Path | None:
    """Map a ``[[doc-id/pNNN]]`` wiki link to its page PNG under ``PNG_BASE``.

    The page number is re-rendered zero-padded to three digits, so
    ``[[doc-id/p059]]`` resolves to ``PNG_BASE/doc-id/p-059.png``.

    Returns:
        The PNG path, or ``None`` when the link does not start with the
        expected pattern or the file does not exist.
    """
    found = re.match(r"\[\[([a-z0-9-]+)/p(\d+)\]\]", page_link)
    if found is None:
        return None
    doc_id, digits = found.groups()
    candidate = PNG_BASE / doc_id / f"p-{int(digits):03d}.png"
    if candidate.exists():
        return candidate
    return None
def crop_table_region(png_path: Path, bbox: dict, out_path: Path, padding: float = 0.005) -> bool:
    """Crop ``bbox`` out of a page image and save the result as a JPEG.

    Args:
        png_path: Source page image.
        bbox: Mapping with ``x``, ``y``, ``w``, ``h`` given as fractions of the
            page width/height; missing keys count as 0.
        out_path: Destination JPEG; parent directories are created.
        padding: Margin, as a fraction of the page, added on every side and
            clamped to the page edges.

    Returns:
        ``True`` when the crop was written. ``False`` when the bbox is empty or
        lies outside the page, or when reading, cropping or saving fails (the
        error is reported on stderr).
    """
    try:
        raw_w = float(bbox.get("w", 0))
        raw_h = float(bbox.get("h", 0))
        # Reject degenerate boxes BEFORE padding. Otherwise a missing or
        # zero-area bbox is inflated to a padding-sized sliver that passes the
        # size check and yields a meaningless crop.
        if raw_w <= 0 or raw_h <= 0:
            sys.stderr.write(f" ✗ crop failed: empty bbox {bbox}\n")
            return False
        x = max(0.0, float(bbox.get("x", 0)) - padding)
        y = max(0.0, float(bbox.get("y", 0)) - padding)
        w = min(1.0 - x, raw_w + 2 * padding)
        h = min(1.0 - y, raw_h + 2 * padding)
        if w <= 0 or h <= 0:
            # The box starts at or beyond the right/bottom page edge.
            return False
        with Image.open(png_path) as im:
            W, H = im.size
            left = int(round(x * W))
            top = int(round(y * H))
            # Clamp to the image: rounding can push the far edge one pixel past
            # the page, which Pillow would fill with blank pixels.
            right = min(W, left + max(1, int(round(w * W))))
            bottom = min(H, top + max(1, int(round(h * H))))
            if right <= left or bottom <= top:
                return False
            crop = im.crop((left, top, right, bottom))
            out_path.parent.mkdir(parents=True, exist_ok=True)
            if crop.mode != "RGB":
                # JPEG cannot store alpha or palette modes.
                crop = crop.convert("RGB")
            crop.save(out_path, "JPEG", quality=92)
            return True
    except Exception as e:
        # Deliberate best-effort boundary: any failure here means "no crop".
        sys.stderr.write(f" ✗ crop failed: {e}\n")
        return False
def call_haiku_extract(crops: list[Path], n_pages: int) -> tuple[dict | None, str]:
    """Ask the model, through the ``claude`` CLI, to extract the table from the crops.

    Args:
        crops: Crop image paths in page order.
        n_pages: Number of pages the table spans (interpolated into the prompt).

    Returns:
        ``(parsed, "")`` on success, otherwise ``(None, reason)`` where
        ``reason`` is a short description of what went wrong.
    """
    if not crops:
        # Nothing to send; also avoids an IndexError on crops[0] below.
        return None, "no-crops"
    crop_list = "\n".join(f" {i}. {p}" for i, p in enumerate(crops, start=1))
    prompt = EXTRACT_PROMPT.format(n_crops=len(crops), n_pages=n_pages, crop_list=crop_list)
    cmd = [
        "claude", "-p",
        "--model", DEFAULT_MODEL,
        "--output-format", "json",
        "--max-turns", str(MAX_TURNS),
        "--allowedTools", "Read",
        # Only the first crop's directory is added.
        # NOTE(review): assumes all crops share one directory — confirm.
        "--add-dir", str(crops[0].parent),
        "--",
        prompt,
    ]
    try:
        res = subprocess.run(cmd, capture_output=True, text=True, timeout=DEFAULT_TIMEOUT, check=False)
    except subprocess.TimeoutExpired:
        return None, "timeout"
    except OSError as exc:
        # E.g. the claude executable is missing or not runnable. Report it as
        # an ordinary failure reason instead of letting the script crash.
        return None, f"cli-not-runnable: {exc}"
    if res.returncode != 0:
        return None, f"rc={res.returncode}: {res.stderr[-300:]}"
    try:
        cli = json.loads(res.stdout)
    except json.JSONDecodeError:
        return None, "cli-stdout-not-json"
    if not isinstance(cli, dict):
        # The fields read below only exist on a JSON object.
        return None, "cli-stdout-not-json"
    if cli.get("is_error"):
        return None, "is_error"
    text = (cli.get("result") or "").strip()
    parsed, err = robust_json_parse(text)
    if parsed is not None:
        return parsed, ""
    return None, f"result-not-json: {err}"
def robust_json_parse(text: str) -> tuple[dict | None, str]:
    """Parse a JSON object out of model output that may be wrapped in noise.

    Strategy, applied first to the text as given and then to a copy with
    trailing commas before ``}`` / ``]`` removed:
      1. Strip a leading/trailing ``` fence.
      2. Try ``json.loads`` on the whole text.
      3. Decode the JSON value that starts at the first ``{``, ignoring any
         commentary before or after it.

    Only a JSON object counts as success; arrays and scalars are rejected
    because callers rely on dict access.

    Args:
        text: Raw text returned by the model.

    Returns:
        ``(obj, "")`` on success, otherwise ``(None, message)`` where
        ``message`` describes the first failure encountered.
    """
    stripped = text.strip()
    stripped = re.sub(r"^```(?:json)?\s*", "", stripped)
    stripped = re.sub(r"\s*```$", "", stripped)

    candidates = [stripped]
    # Last-resort repair. It is only tried after the unmodified text failed,
    # because the regex cannot tell a real trailing comma from one in a string.
    without_trailing_commas = re.sub(r",\s*([}\]])", r"\1", stripped)
    if without_trailing_commas != stripped:
        candidates.append(without_trailing_commas)

    decoder = json.JSONDecoder()
    first_err = ""
    for candidate in candidates:
        try:
            value = json.loads(candidate)
        except json.JSONDecodeError as exc:
            first_err = first_err or str(exc)
        else:
            if isinstance(value, dict):
                return value, ""
            first_err = first_err or "top-level JSON value is not an object"
        start = candidate.find("{")
        if start < 0:
            continue
        try:
            # raw_decode stops at the end of the first complete value and
            # understands string literals, so braces inside strings and
            # trailing commentary do not confuse it (a manual brace counter
            # gets both wrong).
            value, _ = decoder.raw_decode(candidate, start)
        except json.JSONDecodeError:
            continue
        if isinstance(value, dict):
            return value, ""
    return None, first_err or "no JSON object found"
def save_csv(out_csv: Path, headers: list[str], rows: list[list]) -> None:
    """Write ``headers`` and ``rows`` to ``out_csv`` as UTF-8 CSV.

    Parent directories are created as needed. Every data row is forced to the
    header width: short rows are padded with empty strings, long rows are
    truncated.
    """
    width = len(headers)
    out_csv.parent.mkdir(parents=True, exist_ok=True)
    with out_csv.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.writer(handle)
        writer.writerow(headers)
        writer.writerows(
            (list(row) + [""] * (width - len(row)))[:width]
            for row in rows
        )
def render_table_md_body(table_id: str, fm: dict, parsed: dict | None) -> str:
    """Render the markdown body of a table page.

    Args:
        table_id: Fallback title when ``fm`` has no ``canonical_title``.
        fm: Front matter; reads ``canonical_title``, ``source_doc``,
            ``spans_pages`` (entries with ``role``, ``page``, ``bbox``) and
            ``csv_path``.
        parsed: Extraction result (``headers``, ``rows``, ``row_count``,
            ``extraction_quality``, ``merged_cross_page_rows``, ``notes``), or
            ``None``/empty when no extraction is available.

    Returns:
        The markdown body: a page list, then either a preview of at most the
        first 50 extracted rows or a "not yet run" placeholder, then a fixed
        notes section.
    """

    def _cell(value) -> str:
        # Keep one table row per line and stop "|" from splitting a cell.
        # None becomes an empty cell, matching what the csv module writes.
        if value is None:
            return ""
        return str(value).replace("|", "\\|").replace("\n", " ")

    spans = fm.get("spans_pages") or []
    parts = [
        f"# {fm.get('canonical_title', table_id)}\n\n",
        f"> Multi-page table spanning {len(spans)} pages of {fm.get('source_doc','')}\n\n",
        "## Pages\n\n",
    ]
    parts.extend(
        f"- {sp.get('role','?')}: {sp.get('page','')} · bbox {sp.get('bbox')}\n"
        for sp in spans
    )
    parts.append("\n")
    if parsed:
        # Headers get the same escaping as body cells; unescaped, a "|" in a
        # header breaks the table and a non-string header makes join() raise.
        headers = [_cell(h) for h in (parsed.get("headers") or [])]
        rows = parsed.get("rows") or []
        width = len(headers)
        parts.append(f"## Extracted data ({parsed.get('row_count', len(rows))} rows × {width} cols)\n\n")
        parts.append(f"_Extraction quality: `{parsed.get('extraction_quality')}` · ")
        parts.append(f"merged cross-page rows: {parsed.get('merged_cross_page_rows', 0)} · ")
        parts.append(f"CSV: `{fm.get('csv_path')}`_\n\n")
        if parsed.get("notes"):
            parts.append(f"> **Notes from extraction:** {parsed['notes']}\n\n")
        if headers and rows:
            parts.append("| " + " | ".join(headers) + " |\n")
            parts.append("|" + "|".join(["---"] * width) + "|\n")
            for row in rows[:50]:
                cells = [_cell(c) for c in row]
                # Pad short rows and trim long ones to the header width.
                cells += [""] * (width - len(cells))
                parts.append("| " + " | ".join(cells[:width]) + " |\n")
            if len(rows) > 50:
                parts.append(f"\n_(showing first 50 of {len(rows)} rows — full CSV in `{fm.get('csv_path')}`)_\n")
    else:
        parts.append("## Extracted data\n\n_Extraction not yet run or failed. Run `scripts/16-extract-table-csv.py`._\n")
    parts.append("\n## Notes\n\nPer-page table snippets live in each page.md's `tables_detected[]`. Full row-by-row data is in the CSV at `csv_path`.\n")
    return "".join(parts)
def process_table(md_path: Path, force: bool) -> bool:
    """Extract one multi-page table to CSV/JSON and refresh its markdown page.

    Steps: read the front matter, crop every span from its page PNG, ask the
    model for the full table (up to three attempts), write the CSV and JSON,
    then rewrite the markdown file with updated front matter and a re-rendered
    body.

    Args:
        md_path: The table's markdown file.
        force: Re-crop and re-extract even when outputs already exist.

    Returns:
        ``True`` when a new extraction was written. ``False`` when the table
        was skipped (not a multi-page table, fewer than two spans, outputs
        already present without ``force``) or a step failed.
    """
    fm, _ = read_md(md_path)
    if fm.get("type") != "table":
        return False
    if not fm.get("multi_page"):
        return False  # single-page tables stay inline
    table_id = fm.get("table_id") or md_path.stem
    csv_path = CSV_BASE / f"{table_id}.csv"
    json_path = CSV_BASE / f"{table_id}.json"
    crops_dir = CROPS_BASE / table_id
    if csv_path.exists() and json_path.exists() and not force:
        return False
    spans = fm.get("spans_pages") or []
    if len(spans) < 2:
        return False
    # A separator is needed here: without one the id and the page count run
    # together (e.g. an id ending in a digit followed by "2 pages").
    print(f"\n=== {table_id} · {len(spans)} pages ===", flush=True)

    crops: list[Path] = []
    for i, sp in enumerate(spans, start=1):
        page_link = sp.get("page", "")
        bbox = sp.get("bbox") or {}
        png = resolve_page_png(page_link)
        if not png:
            sys.stderr.write(f" ✗ no PNG for {page_link}\n")
            return False
        crop_out = crops_dir / f"span-{i:02d}.jpg"
        # Existing crops are reused unless --force was given.
        if not crop_out.exists() or force:
            if not crop_table_region(png, bbox, crop_out):
                return False
        crops.append(crop_out)
        print(f" ✓ crop {i}: {crop_out.name}", flush=True)

    max_attempts = 3
    t0 = time.time()
    parsed = None
    err = ""
    for attempt in range(1, max_attempts + 1):
        print(f" → calling Haiku (attempt {attempt}/{max_attempts}) to extract CSV from {len(crops)} crops…", flush=True)
        parsed, err = call_haiku_extract(crops, n_pages=len(spans))
        if parsed and not isinstance(parsed, dict):
            # Everything below uses dict access; treat other JSON as a failure.
            parsed, err = None, "result-not-a-json-object"
        if parsed:
            break
        print(f" · attempt {attempt} failed: {err[:120]}", flush=True)
        if attempt < max_attempts:
            # Linear back-off between attempts; no point waiting after the last.
            time.sleep(4 * attempt)
    elapsed = time.time() - t0
    if not parsed:
        print(f" ✗ extraction failed after {max_attempts} attempts ({elapsed:.1f}s): {err}", flush=True)
        return False

    headers = parsed.get("headers") or []
    rows = parsed.get("rows") or []
    if not headers or not rows:
        print(" ⚠ extraction returned empty headers/rows", flush=True)
        return False
    save_csv(csv_path, headers, rows)
    json_path.write_text(json.dumps(parsed, indent=2, ensure_ascii=False), encoding="utf-8")
    print(f"{csv_path.relative_to(UFO_ROOT)} ({len(rows)} rows × {len(headers)} cols, {elapsed:.1f}s)", flush=True)

    # Update table.md frontmatter
    stamp = utc_now_iso()  # one timestamp so both fields always agree
    fm["csv_path"] = str(csv_path.relative_to(UFO_ROOT))
    fm["json_path"] = str(json_path.relative_to(UFO_ROOT))
    fm["headers"] = headers
    fm["row_count_extracted"] = parsed.get("row_count", len(rows))
    fm["column_count_extracted"] = parsed.get("column_count", len(headers))
    fm["extraction_quality"] = parsed.get("extraction_quality")
    fm["extraction_notes"] = parsed.get("notes", "")
    fm["extraction_model"] = "claude-haiku-4-5"
    fm["extracted_at"] = stamp
    fm["last_ingest"] = stamp
    # The previous body is discarded and re-rendered from the new data.
    body = render_table_md_body(table_id, fm, parsed)
    write_md(md_path, fm, body)
    return True
def main():
    """Command-line entry point.

    Processes either the single table named by ``--table-id`` or every
    ``*.md`` file under ``TABLES_BASE`` (sorted), passing ``--force`` through
    to ``process_table``. Prints how many tables were extracted and, when at
    least one was, appends a summary entry to ``LOG_PATH``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--table-id", help="single table")
    parser.add_argument("--force", action="store_true")
    opts = parser.parse_args()

    for out_dir in (CSV_BASE, CROPS_BASE):
        out_dir.mkdir(parents=True, exist_ok=True)

    targets = (
        [TABLES_BASE / f"{opts.table_id}.md"]
        if opts.table_id
        else sorted(TABLES_BASE.glob("*.md"))
    )
    print(f"Processing {len(targets)} table(s)…")

    extracted = 0
    for md_file in targets:
        if md_file.exists():
            if process_table(md_file, opts.force):
                extracted += 1
        else:
            sys.stderr.write(f" ✗ no table.md: {md_file}\n")
    print(f"\nExtracted: {extracted} table(s)")

    # Runs in which nothing was extracted leave the log untouched.
    if extracted <= 0:
        return
    entry = (
        f"\n## {utc_now_iso()} — EXTRACT TABLE CSV\n"
        f"- operator: archivist + evidence-officer\n- script: scripts/16-extract-table-csv.py\n"
        f"- tables_extracted: {extracted}\n"
    )
    with open(LOG_PATH, "a", encoding="utf-8") as log:
        log.write(entry)


if __name__ == "__main__":
    main()