disclosure-bureau/scripts/reextract/build_doc_text.py

#!/usr/bin/env python3
"""
build_doc_text.py — Reconstruct the FULL document text from already-extracted
chunks, with chunk-id markers so Sonnet can cite back via evidence_chunks.

Input:  raw/<doc-id>--subagent/_index.json + chunks/c*.md
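Output: stdout — a DOCUMENT_ID / TOTAL_PAGES / TOTAL_CHUNKS header, then the
        concatenated EN text of the document, with markers:
        [chunk c0042 · page 7 · type:<chunk type>]
        <content_en verbatim>

        [chunk c0043 · page 7 · type:<chunk type>]
        <content_en verbatim>
        ...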

Run:
  python3 scripts/reextract/build_doc_text.py <doc-id>
"""
from __future__ import annotations
import json
import re
import sys
from pathlib import Path

# Root directory of the extraction output. main() looks for each document
# under RAW/<doc-id>--subagent/ (its _index.json plus a chunks/ folder).
# NOTE: hard-coded absolute path.
RAW = Path("/Users/guto/ufo/raw")


def split_frontmatter(text: str) -> tuple[dict, str]:
    """Split a chunk file into its frontmatter mapping and its body.

    The frontmatter is the block between an opening ``---`` line at the very
    start of *text* and the next line consisting solely of ``---``. Fences
    are matched as whole lines so that a ``---`` appearing inside a free-text
    value does not terminate the frontmatter early.

    Args:
        text: Full contents of a chunk markdown file.

    Returns:
        ``(fm, body)`` where *fm* maps each top-level ``key: value`` line to
        its stripped string value, and *body* is everything after the closing
        fence. If no complete frontmatter block is found, ``({}, text)``.
    """
    opener = re.match(r"---[ \t]*\r?\n", text)
    if opener is None:
        return {}, text
    # With re.M, ^ only matches at a real line start even when the search
    # begins mid-string, so an inline "---" inside a value is ignored.
    closer = re.compile(r"^---[ \t]*\r?$", re.M).search(text, opener.end())
    if closer is None:
        return {}, text
    fm_raw = text[opener.end():closer.start()]
    body = text[closer.end():]
    # Tolerant key:value extraction (chunks have free-text fields that break
    # strict YAML — we only need a handful of keys). Indented or otherwise
    # non-matching lines are skipped; a repeated key keeps its last value.
    fm: dict = {}
    for line in fm_raw.splitlines():
        m = re.match(r"^([a-zA-Z_]+):\s*(.*)$", line)
        if not m:
            continue
        fm[m.group(1)] = m.group(2).strip()
    return fm, body


def extract_en_section(body: str) -> str:
    """Pull the EN: paragraph from a bilingual chunk body.

    Bodies look like::

        **EN:** <english text>
        **PT-BR:** <portuguese text>

    Args:
        body: Chunk body (the text after the frontmatter).

    Returns:
        The text between the first ``**EN:**`` marker and the next line that
        starts (after optional whitespace) with ``**PT-BR:**``, or the end of
        the body, stripped. This is ``""`` when the EN section is empty. When
        there is no ``**EN:**`` marker at all, the whole body stripped.
    """
    # No greedy \s* after the marker: it would swallow the newline that the
    # PT-BR terminator needs, so an empty EN section immediately followed by
    # **PT-BR:** would return the Portuguese text. Leading whitespace is
    # removed by .strip() instead.
    m = re.search(r"\*\*EN:\*\*(.*?)(?:\n\s*\*\*PT-BR:\*\*|\Z)", body, re.S)
    if m:
        return m.group(1).strip()
    # Some chunks store the text in `extracted_text:` field only (e.g. images)
    return body.strip()


def main() -> int:
    """Print the reconstructed English text of one document to stdout.

    Reads the doc-id from ``sys.argv[1]``, loads
    ``RAW/<doc-id>--subagent/_index.json``, orders its ``chunks`` entries by
    ``order_global`` and, for every entry whose ``chunks/<chunk_id>.md`` file
    exists and yields text, emits a
    ``[chunk <id> · page <n> · type:<t>]`` marker line, the English text and
    a blank line. A three-line header (DOCUMENT_ID, TOTAL_PAGES,
    TOTAL_CHUNKS) precedes the chunks; TOTAL_CHUNKS counts index entries,
    including any that end up skipped.

    Returns:
        0 on success (used as the process exit status).

    Raises:
        SystemExit: with a message when the doc-id argument is missing or
            the index file does not exist.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: build_doc_text.py <doc-id>")
    doc_id = sys.argv[1]
    doc_dir = RAW / f"{doc_id}--subagent"
    chunks_dir = doc_dir / "chunks"
    idx_path = doc_dir / "_index.json"
    if not idx_path.is_file():
        sys.exit(f"_index.json not found for {doc_id}")

    idx = json.loads(idx_path.read_text(encoding="utf-8"))
    # `or 0` also covers an explicit null order_global, which would otherwise
    # make the sort raise TypeError when compared against integers.
    entries = sorted(idx.get("chunks", []),
                     key=lambda x: x.get("order_global") or 0)

    out_lines: list[str] = [f"DOCUMENT_ID: {doc_id}",
                            f"TOTAL_PAGES: {idx.get('total_pages')}",
                            f"TOTAL_CHUNKS: {len(entries)}", ""]
    for entry in entries:
        cid = entry.get("chunk_id")
        page = entry.get("page")
        ctype = entry.get("type", "?")
        chunk_path = chunks_dir / f"{cid}.md"
        if not chunk_path.is_file():
            # Still skipped, but reported on stderr so gaps in the
            # reconstructed text are visible; stdout stays clean.
            print(f"warning: chunk file missing, skipped: {chunk_path}",
                  file=sys.stderr)
            continue
        fm, body = split_frontmatter(chunk_path.read_text(encoding="utf-8"))
        en = extract_en_section(body)

        # For pure-image chunks the EN body itself describes the image.
        # When the body text is empty or very short, prefer the description
        # stored in the frontmatter.
        if len(en) < 5:
            fallback = fm.get("image_description_en") or fm.get("extracted_text") or ""
            fallback = fallback.strip().strip('"\'')
            # Keep the short body text when no fallback exists, so short but
            # real content is not dropped from the document.
            en = fallback or en

        if not en:
            continue

        out_lines.append(f"[chunk {cid} · page {page} · type:{ctype}]")
        out_lines.append(en.strip())
        out_lines.append("")

    print("\n".join(out_lines))
    return 0


if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())