#!/usr/bin/env python3
"""
build_doc_text.py — Reconstruct the FULL document text from already-extracted
chunks, with chunk-id markers so Sonnet can cite back via evidence_chunks.

Input:  raw/<doc_id>--subagent/_index.json + chunks/c*.md
Output: stdout — concatenated EN text of the document, with markers:

    [chunk c0042 · page 7 · type:<type>]
    <text>

    [chunk c0043 · page 7 · type:<type>]
    ...

Run: python3 scripts/reextract/build_doc_text.py <doc_id>
"""
from __future__ import annotations

import json
import re
import sys
from pathlib import Path

RAW = Path("/Users/guto/ufo/raw")

# One `key: value` pair per front-matter line; anything else is ignored.
_FM_KEY_VALUE = re.compile(r"^([a-zA-Z_]+):\s*(.*)$")
# The closing front-matter delimiter must start a line, so a `---` inside a
# free-text value does not end the front matter early.
_FM_CLOSING = re.compile(r"^---", re.M)
# No `\s*` right after `**EN:**`: a greedy `\s*` would consume the newline
# that the `**PT-BR:**` terminator needs, making an empty EN section capture
# the PT-BR text instead. Leading whitespace is removed with .strip().
_EN_SECTION = re.compile(r"\*\*EN:\*\*(.*?)(?:\n\s*\*\*PT-BR:\*\*|\Z)", re.S)


def split_frontmatter(text: str) -> tuple[dict, str]:
    """Split a chunk file into (front-matter dict, body).

    The front matter is the span between a leading ``---`` and the next
    ``---`` that begins a line. Parsing is a tolerant ``key: value``
    extraction rather than strict YAML, because chunks carry free-text
    fields that break strict YAML and only a handful of keys are needed.
    All values are returned as stripped strings.

    Returns ``({}, text)`` when *text* does not start with ``---`` or has no
    closing delimiter.
    """
    if not text.startswith("---"):
        return {}, text
    rest = text[3:]
    closing = _FM_CLOSING.search(rest)
    if closing is None:
        return {}, text
    fm_raw = rest[:closing.start()]
    body = rest[closing.end():]

    fm: dict = {}
    for line in fm_raw.splitlines():
        m = _FM_KEY_VALUE.match(line)
        if m:
            fm[m.group(1)] = m.group(2).strip()
    return fm, body


def extract_en_section(body: str) -> str:
    """Pull the EN: paragraph from a bilingual chunk body.

    Bodies look like::

        **EN:** <text>
        **PT-BR:** <text>

    Returns the stripped text between ``**EN:**`` and the following
    ``**PT-BR:**`` line (or the end of the body). The result is an empty
    string when the EN section is empty. When the body has no ``**EN:**``
    marker at all, the whole stripped body is returned.
    """
    m = _EN_SECTION.search(body)
    if m:
        return m.group(1).strip()
    # Some chunks store the text in the `extracted_text:` field only
    # (e.g. images), so there is no EN marker to look for.
    return body.strip()


def build_document_text(doc_id: str, raw_dir: Path = RAW) -> str:
    """Return the concatenated EN text of document *doc_id*.

    Reads ``<raw_dir>/<doc_id>--subagent/_index.json``, orders its ``chunks``
    entries by ``order_global`` and, for each chunk file found under
    ``chunks/``, appends a ``[chunk … · page … · type:…]`` marker line, the
    chunk's EN text and a blank line. Chunks with no usable text are omitted.

    Raises:
        FileNotFoundError: if the document's ``_index.json`` does not exist.
    """
    doc_dir = raw_dir / f"{doc_id}--subagent"
    chunks_dir = doc_dir / "chunks"
    idx_path = doc_dir / "_index.json"
    if not idx_path.is_file():
        raise FileNotFoundError(f"_index.json not found for {doc_id}")

    idx = json.loads(idx_path.read_text(encoding="utf-8"))
    # `or` guards against explicit nulls in the index, which would otherwise
    # raise TypeError in sorted().
    entries = sorted(
        idx.get("chunks") or [],
        key=lambda entry: entry.get("order_global") or 0,
    )

    out_lines: list[str] = [
        f"DOCUMENT_ID: {doc_id}",
        f"TOTAL_PAGES: {idx.get('total_pages')}",
        f"TOTAL_CHUNKS: {len(entries)}",
        "",
    ]
    for entry in entries:
        cid = entry.get("chunk_id")
        page = entry.get("page")
        ctype = entry.get("type", "?")
        chunk_path = chunks_dir / f"{cid}.md"
        if not chunk_path.is_file():
            # Still best-effort, but make the gap visible; stdout is untouched.
            print(f"warning: chunk file not found: {chunk_path}", file=sys.stderr)
            continue

        fm, body = split_frontmatter(chunk_path.read_text(encoding="utf-8"))
        en = extract_en_section(body)
        # For pure-image chunks the EN body itself describes the image.
        # When the extracted text is empty (or under 5 characters), use the
        # description stored in the front matter instead.
        if len(en) < 5:
            fallback = fm.get("image_description_en") or fm.get("extracted_text") or ""
            en = fallback.strip().strip("\"'")
        if not en:
            continue

        out_lines.append(f"[chunk {cid} · page {page} · type:{ctype}]")
        out_lines.append(en.strip())
        out_lines.append("")
    return "\n".join(out_lines)


def main() -> int:
    """CLI entry point: print the document text for ``sys.argv[1]``.

    Exits with a message on stderr when the document id argument is missing
    or the document's ``_index.json`` cannot be found; returns 0 otherwise.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: build_doc_text.py <doc_id>")
    doc_id = sys.argv[1]
    try:
        text = build_document_text(doc_id)
    except FileNotFoundError as err:
        sys.exit(str(err))
    print(text)
    return 0


if __name__ == "__main__":
    sys.exit(main())