disclosure-bureau/scripts/15-consolidate-tables.py

#!/usr/bin/env python3
"""
15-consolidate-tables.py — Multi-page table consolidation

For each document, walks pages in order and stitches together tables that
span multiple pages (detected by Haiku's `tables_detected[]` flags
`spans_multi_page`, `continues_from_prev_page`, `likely_continues_next_page`).

Output:
  - wiki/tables/<TBL-<DOCSHORT>-<NNNN>>.md per consolidated multi-page table
  - Updates each page.md's tables_detected[].table_id to reference the new TBL id
    (so pages can cross-link to the master table.md)

Single-page tables stay inline in page.md (no separate table.md). Only spanning
tables get promoted.

Algorithm:
  1. For each doc-id directory under wiki/pages/, sort pages by page_number.
  2. Maintain a list of "open" tables (started, not yet ended).
  3. For each page's tables in order:
     a. If table has `continues_from_prev_page=true` and there's an open table,
        append this page as a span of the oldest open table (simple FIFO; no
        bbox/column-count compatibility check is performed).
        If `likely_continues_next_page=false`, finalize.
     b. Otherwise start a new table. If `likely_continues_next_page=false` and
        `spans_multi_page=false`, single-page → skip (don't promote).
        Otherwise add to open tables.
  4. Finalize each open table at end-of-doc.
  5. Write wiki/tables/<id>.md and inject `table_id` back into each page's
     tables_detected entry.

Idempotent: writes only when content changes.

Usage:
  ./15-consolidate-tables.py
  ./15-consolidate-tables.py --doc-id <id>
  ./15-consolidate-tables.py --force
"""
from __future__ import annotations

import argparse
import re
import sys
from datetime import datetime, timezone
from pathlib import Path

try:
    import yaml
except ImportError:
    sys.stderr.write("Missing pyyaml. pip3 install pyyaml\n")
    sys.exit(1)


# Root of the wiki checkout; every other path in this script derives from it.
# NOTE(review): this is an absolute path that looks machine-specific — confirm
# whether it should be configurable before running the script elsewhere.
UFO_ROOT = Path("/Users/guto/ufo")
# Directory scanned by list_doc_dirs(): one sub-directory of page files per doc-id.
PAGES_BASE = UFO_ROOT / "wiki" / "pages"
# Output directory for consolidated table records (<table_id>.md).
TABLES_BASE = UFO_ROOT / "wiki" / "tables"
# Run log; main() appends an entry to it when at least one table was written.
LOG_PATH = UFO_ROOT / "wiki" / "log.md"

# Written into each table record's front matter as `schema_version`.
SCHEMA_VERSION = "0.1.0"
# Written into each table record's front matter as `wiki_version`.
WIKI_VERSION = "0.1.0"


def utc_now_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (second precision)."""
    stamp_format = "%Y-%m-%dT%H:%M:%SZ"
    moment = datetime.now(tz=timezone.utc)
    return moment.strftime(stamp_format)


def read_md(path: Path) -> tuple[dict, str]:
    """Split a markdown file into ``(front_matter, body)``.

    The file is expected to open with a ``---`` fence, followed by YAML and a
    closing ``---`` fence that sits on a line of its own. Leading newlines are
    stripped from the returned body.

    Returns:
        ``({}, full_text)`` when the file has no opening fence or no closing
        fence line; ``({}, body)`` when the YAML is invalid or is not a
        mapping; otherwise ``(mapping, body)``.
    """
    c = path.read_text(encoding="utf-8")
    if not c.startswith("---"):
        return {}, c

    # The closing fence must be a whole line. Searching for a bare "---"
    # substring would cut the front matter short whenever a YAML value contains
    # three dashes (e.g. "a---b" or a markdown table rule inside a string).
    first_nl = c.find("\n")
    closing = None
    if first_nl != -1:
        closing = re.compile(r"^---[ \t]*$", re.MULTILINE).search(c, first_nl + 1)
    if closing is None:
        return {}, c

    body = c[closing.end():].lstrip("\n")
    try:
        loaded = yaml.safe_load(c[3:closing.start()].strip())
    except yaml.YAMLError:
        return {}, body
    # Callers use `.get(...)` on the result, so a scalar or list document is
    # treated the same as "no usable front matter".
    if not isinstance(loaded, dict):
        return {}, body
    return loaded, body


def write_md(path: Path, fm: dict, body: str) -> bool:
    """Render front matter plus body and write it only if the file would change.

    Returns ``True`` when the file was (re)written and ``False`` when the
    existing content already matched the rendered text.
    """
    front = yaml.dump(fm, allow_unicode=True, sort_keys=False, default_flow_style=False)
    # Ensure one blank line between the closing fence and the body, unless the
    # body already begins with a newline of its own.
    gap = "" if body.startswith("\n") else "\n"
    rendered = f"---\n{front}---\n{gap}{body}"

    if path.exists():
        current = path.read_text(encoding="utf-8")
        if current == rendered:
            return False

    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(rendered, encoding="utf-8")
    return True


def doc_short_id(doc_id: str) -> str:
    """Derive the compact uppercase DOCSHORT part of ``TBL-<DOCSHORT>-NNNN``.

    The id is upper-cased, reduced to ``A-Z0-9``, stripped of at most one
    well-known leading prefix, and cut to eight characters. ``"X"`` is returned
    when nothing is left.
    """
    compact = "".join(re.findall(r"[A-Z0-9]", doc_id.upper()))
    known_prefixes = ("DOWUAP", "DOSUAP", "NASAUAP", "FBIPHOTO", "DOC")
    # Only the first matching prefix (in declaration order) is removed.
    leading = next((p for p in known_prefixes if compact.startswith(p)), "")
    shortened = compact[len(leading):][:8]
    return shortened if shortened else "X"


def list_doc_dirs() -> list[Path]:
    """Return the sub-directories of PAGES_BASE in sorted order (empty if it is missing)."""
    if PAGES_BASE.exists():
        subdirs = [entry for entry in PAGES_BASE.iterdir() if entry.is_dir()]
        subdirs.sort()
        return subdirs
    return []


def _load_doc_pages(doc_dir: Path) -> list[dict]:
    """Load every usable ``p<N>.md`` page of *doc_dir*, ordered by page number.

    Files whose name does not start with ``p<digits>`` and files without a
    usable front-matter mapping are skipped.
    """
    doc_id = doc_dir.name
    pages_data = []
    for path in doc_dir.glob("p*.md"):
        m = re.match(r"p(\d+)", path.stem)
        if not m:
            continue
        fm, body = read_md(path)
        if not isinstance(fm, dict) or not fm:
            continue
        page_num = int(m.group(1))
        tables = fm.get("tables_detected")
        pages_data.append({
            "path": path,
            "page_num": page_num,
            "page_id": fm.get("page_id", f"{doc_id}/p{page_num:03d}"),
            # Keep the very list object stored in `fm`, so edits made through
            # page["tables"] are what gets written back to the page file.
            "tables": tables if isinstance(tables, list) else [],
            "fm": fm,
            "body": body,
        })
    # Sort numerically: a filename sort would put p1000.md before p999.md.
    pages_data.sort(key=lambda page: (page["page_num"], page["path"].name))
    return pages_data


def _stitch_spanning_tables(pages_data: list[dict]) -> list[dict]:
    """Walk pages in order and return the tables that cover two or more pages.

    Each returned table has ``spans`` (one entry per page, recording the page
    path and the table's index within that page's ``tables_detected``) plus the
    collected header summaries and row/column estimates.
    """
    open_tables: list[dict] = []  # started on an earlier page, awaiting continuation
    finalized: list[dict] = []

    def close(tables: list[dict]) -> None:
        # A table that was expected to continue but did not ends on its last span.
        for tbl in tables:
            if tbl["spans"]:
                tbl["spans"][-1]["role"] = "end"
                finalized.append(tbl)

    for page in pages_data:
        carried_over = []
        for table_index, t in enumerate(page["tables"]):
            if not isinstance(t, dict):
                continue
            continues_from = bool(t.get("continues_from_prev_page"))
            likely_continues = bool(t.get("likely_continues_next_page"))
            spans_multi = bool(t.get("spans_multi_page"))

            if continues_from and open_tables:
                # Continue the oldest open table (simple FIFO).
                tbl = open_tables.pop(0)
            elif spans_multi or likely_continues:
                tbl = {
                    "spans": [],
                    "headers_summaries": [],
                    "row_count_estimates": [],
                    "col_count_estimates": [],
                }
            else:
                # Single-page table: stays inline in page.md.
                continue

            if not tbl["spans"]:
                role = "start"
            elif likely_continues:
                role = "middle"
            else:
                role = "end"
            tbl["spans"].append({
                "page_path": page["path"],
                "table_index": table_index,
                "page_num": page["page_num"],
                "page_id": page["page_id"],
                "bbox": t.get("bbox") or {},
                "role": role,
            })
            for source_key, bucket in (
                ("headers_summary", "headers_summaries"),
                ("row_count_estimate", "row_count_estimates"),
                ("col_count_estimate", "col_count_estimates"),
            ):
                if t.get(source_key):
                    tbl[bucket].append(t[source_key])

            (carried_over if likely_continues else finalized).append(tbl)

        # Open tables that no table on this page continued are stranded.
        close(open_tables)
        open_tables = carried_over

    close(open_tables)  # end of document
    # Tables flagged as multi-page that never gained a second page are dropped.
    return [tbl for tbl in finalized if len(tbl["spans"]) >= 2]


def _render_table(tbl_id: str, idx: int, doc_id: str, tbl: dict) -> tuple[dict, str]:
    """Build the front matter and markdown body of one consolidated table record."""
    # Canonical headers: the first non-empty summary seen across the spans.
    headers = next((h for h in tbl["headers_summaries"] if h), "")
    fm = {
        "schema_version": SCHEMA_VERSION,
        "type": "table",
        "table_id": tbl_id,
        "canonical_title": (headers or f"Multi-page table {idx} of {doc_id}")[:200],
        "source_doc": f"[[{doc_id}]]",
        "multi_page": True,
        "page_count": len(tbl["spans"]),
        "spans_pages": [
            {"page": f"[[{sp['page_id']}]]", "bbox": sp["bbox"], "role": sp["role"]}
            for sp in tbl["spans"]
        ],
        "headers_summary": headers,
        "total_rows_estimate": max(tbl["row_count_estimates"], default=0),
        "total_columns_estimate": max(tbl["col_count_estimates"], default=0),
        "extraction_quality": None,  # to be set when actually extracted to CSV
        "last_ingest": utc_now_iso(),
        "wiki_version": WIKI_VERSION,
    }
    page_lines = "".join(
        f"- {sp['role']}: [[{sp['page_id']}]] · bbox {sp['bbox']}\n" for sp in tbl["spans"]
    )
    body = (
        f"# {fm['canonical_title']}\n\n"
        f"> Multi-page table spanning {len(tbl['spans'])} pages of [[{doc_id}]]\n\n"
        f"## Pages\n\n"
        f"{page_lines}"
        "\n## Headers\n\n"
        f"{headers or '_(not extracted)_'}\n\n"
        "## Notes\n\n"
        "Per-page table snippets live in each page.md's `tables_detected[]`. This consolidated record stitches them together. Full data extraction (row-by-row CSV) is deferred to a future enrichment pass.\n"
    )
    return fm, body


def _apply_back_refs(pages_data: list[dict], refs_by_page: dict, id_prefix: str) -> None:
    """Write ``table_id`` back-references into the page files that need a change.

    ``refs_by_page`` maps page path -> {index in tables_detected: table id}.
    Entries not in that mapping lose any ``table_id`` that starts with
    *id_prefix*, so a table that is no longer part of a consolidated table does
    not keep pointing at one.
    """
    for page in pages_data:
        refs = refs_by_page.get(page["path"], {})
        modified = False
        for table_index, t in enumerate(page["tables"]):
            if not isinstance(t, dict):
                continue
            wanted = refs.get(table_index)
            current = t.get("table_id")
            if wanted is not None:
                if current != wanted:
                    t["table_id"] = wanted
                    modified = True
            elif isinstance(current, str) and current.startswith(id_prefix):
                del t["table_id"]
                modified = True
        if modified:
            write_md(page["path"], page["fm"], page["body"])


def consolidate_doc(doc_dir: Path, force: bool) -> int:
    """Consolidate the multi-page tables of one document.

    Walks the document's pages in page-number order, stitches tables that span
    several pages, writes one ``<table_id>.md`` per stitched table under
    TABLES_BASE and records each ``table_id`` on the matching entry of the
    page's ``tables_detected``.

    Args:
        doc_dir: Directory holding the document's ``p<N>.md`` page files.
        force: When true, every table record gets a fresh ``last_ingest`` stamp
            and is therefore rewritten even if nothing else changed.

    Returns:
        Number of table files created or updated.
    """
    doc_id = doc_dir.name
    pages_data = _load_doc_pages(doc_dir)
    if not pages_data:
        return 0

    short = doc_short_id(doc_id)
    multi_page = _stitch_spanning_tables(pages_data)

    refs_by_page: dict[Path, dict[int, str]] = {}
    n_written = 0
    for idx, tbl in enumerate(multi_page, start=1):
        tbl_id = f"TBL-{short}-{idx:04d}"
        for sp in tbl["spans"]:
            # Remember exactly which tables_detected entry belongs to this table.
            refs_by_page.setdefault(sp["page_path"], {})[sp["table_index"]] = tbl_id

        fm, body = _render_table(tbl_id, idx, doc_id, tbl)
        out = TABLES_BASE / f"{tbl_id}.md"
        if not force and out.exists():
            # Keep the previous timestamp when nothing else changed; otherwise
            # the fresh stamp alone would force a rewrite on every run.
            prev_fm, _ = read_md(out)
            prev_stamp = prev_fm.get("last_ingest") if isinstance(prev_fm, dict) else None
            if isinstance(prev_stamp, str):
                unchanged = (
                    {k: v for k, v in prev_fm.items() if k != "last_ingest"}
                    == {k: v for k, v in fm.items() if k != "last_ingest"}
                )
                if unchanged:
                    fm["last_ingest"] = prev_stamp
        if write_md(out, fm, body):
            n_written += 1

    _apply_back_refs(pages_data, refs_by_page, f"TBL-{short}-")
    return n_written


def main():
    """CLI entry point: consolidate one document (``--doc-id``) or all of them.

    Prints one line per document for which ``consolidate_doc`` reported written
    tables, then a grand total, and appends an entry to LOG_PATH when that
    total is non-zero. A ``--doc-id`` without a page directory is reported on
    stderr instead of being skipped silently.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--doc-id", help="single doc")
    ap.add_argument("--force", action="store_true")
    args = ap.parse_args()

    TABLES_BASE.mkdir(parents=True, exist_ok=True)
    if args.doc_id:
        requested = PAGES_BASE / args.doc_id
        if not requested.is_dir():
            # A typo here would otherwise just produce "Total: 0" with no hint.
            sys.stderr.write(f"No page directory for doc-id {args.doc_id!r}: {requested}\n")
        targets = [requested]
    else:
        targets = list_doc_dirs()

    total_tables = 0
    docs_with_tables = 0
    for d in targets:
        if not d.is_dir():
            continue
        n = consolidate_doc(d, args.force)
        if n > 0:
            print(f"  ✓ {d.name}: {n} multi-page table(s)")
            total_tables += n
            docs_with_tables += 1

    print(f"\nTotal: {total_tables} multi-page tables across {docs_with_tables} doc(s)")
    if total_tables > 0:
        with open(LOG_PATH, "a", encoding="utf-8") as fh:
            fh.write(
                f"\n## {utc_now_iso()} — CONSOLIDATE TABLES\n"
                f"- operator: archivist\n- script: scripts/15-consolidate-tables.py\n"
                f"- tables_written: {total_tables}\n- docs_with_tables: {docs_with_tables}\n"
            )


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()