disclosure-bureau/scripts/reextract/build_doc_text.py
Luiz Gustavo a7e9dce6d2 rebuild entity layer from Sonnet-vision reextract pipeline
Add reextract pipeline (scripts/reextract/) that rebuilds doc-level entity
JSON from Sonnet-vision chunks via Opus, replacing the noisy per-page
extraction. Add synthesize scripts to regenerate wiki/entities from the 116
_reextract.json (30), aggregate missing page.md from chunks (31), and reprocess
805 pages the doc-rebuilder agent dropped on context overflow (32). Add
maintain scripts 43-56 for chunk-page sync, dedup, generic-entity marking, and
typed relation extraction.

Web: wire relations API + entity-relations component; entity/timeline/doc
pages consume the rebuilt layer.

Note: raw/, processing/, wiki/ remain gitignored (bulk data managed
separately); the 116 reextract JSONs and 7,798 rebuilt entity files live on
disk only. The 27 curated anchor events under wiki/entities/events/ are
preserved.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-21 12:20:24 -03:00

99 lines
3.2 KiB
Python

#!/usr/bin/env python3
"""
build_doc_text.py — Reconstruct the FULL document text from already-extracted
chunks, with chunk-id markers so Sonnet can cite back via evidence_chunks.
Input: raw/<doc-id>--subagent/_index.json + chunks/c*.md
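Output: stdout — a DOCUMENT_ID / TOTAL_PAGES / TOTAL_CHUNKS header, then the
concatenated EN text of the document, with markers:
[chunk c0042 · page 7 · type:<chunk type>]
<content_en verbatim>
[chunk c0043 · page 7 · type:<chunk type>]
<content_en verbatim>
...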
Run:
python3 scripts/reextract/build_doc_text.py <doc-id>
"""
from __future__ import annotations
import json
import re
import sys
from pathlib import Path
# Root directory under which main() looks up "<doc-id>--subagent" folders.
# NOTE(review): absolute path to one user's machine — confirm before running
# this script anywhere else.
RAW = Path("/Users/guto/ufo/raw")
# A closing delimiter only counts when "---" stands alone on its own line, so
# a "---" embedded in a free-text value cannot end the frontmatter early.
_FRONTMATTER_CLOSE = re.compile(r"^---(?=[ \t\r]*$)", re.M)


def split_frontmatter(text: str) -> tuple[dict, str]:
    """Split a chunk file into its frontmatter mapping and its body.

    Args:
        text: Full contents of a chunk ``.md`` file.

    Returns:
        ``(fm, body)``. ``fm`` maps each ``key: value`` line of the
        frontmatter to its stripped value (values are kept as raw strings,
        quotes included). ``body`` is everything after the closing ``---``.
        When ``text`` does not start with ``---`` or the frontmatter is never
        closed, ``({}, text)`` is returned.
    """
    if not text.startswith("---"):
        return {}, text

    # Search past the opening delimiter (pos=3) for a line that is just "---".
    closing = _FRONTMATTER_CLOSE.search(text, 3)
    if closing:
        fm_raw = text[3:closing.start()]
        body = text[closing.end():]
    else:
        # No line-anchored delimiter: keep the historical behaviour of
        # splitting on the next "---" wherever it occurs.
        parts = text.split("---", 2)
        if len(parts) < 3:
            return {}, text
        fm_raw = parts[1]
        body = parts[2]

    # Tolerant key:value extraction (chunks have free-text fields that break
    # strict YAML — we only need a handful of keys).
    fm: dict = {}
    for line in fm_raw.splitlines():
        m = re.match(r"^([a-zA-Z_]+):\s*(.*)$", line)
        if not m:
            continue
        fm[m.group(1)] = m.group(2).strip()
    return fm, body
def extract_en_section(body: str) -> str:
    """Return the English portion of a bilingual chunk body.

    Bodies normally look like::

        **EN:** <english text>
        **PT-BR:** <portuguese text>

    The text after the ``**EN:**`` marker is returned, trimmed, up to a
    ``**PT-BR:**`` marker that begins a later line (or up to the end of the
    body when there is none). A body without an ``**EN:**`` marker is
    returned whole, trimmed.
    """
    en_pattern = re.compile(
        r"\*\*EN:\*\*\s*(.*?)(?:\n\s*\*\*PT-BR:\*\*|\Z)", re.DOTALL
    )
    found = en_pattern.search(body)
    # No marker at all: the body itself is the best text available.
    return body.strip() if found is None else found.group(1).strip()
def main() -> int:
    """Print the reconstructed English text of one document to stdout.

    The document id is read from ``sys.argv[1]``. The function loads
    ``RAW/<doc-id>--subagent/_index.json``, orders its ``chunks`` entries by
    ``order_global`` and prints a three-line header followed by, for every
    chunk that has text, a ``[chunk <id> · page <n> · type:<t>]`` marker and
    the chunk's EN text. Index entries whose chunk file is missing, or whose
    chunk has no usable text, are left out of the output; the number of
    missing files is reported once on stderr.

    Returns:
        0 on success.

    Raises:
        SystemExit: with a message when the doc-id argument is absent or the
            document's ``_index.json`` does not exist.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: build_doc_text.py <doc-id>")
    doc_id = sys.argv[1]
    doc_dir = RAW / f"{doc_id}--subagent"
    chunks_dir = doc_dir / "chunks"
    idx_path = doc_dir / "_index.json"
    if not idx_path.is_file():
        sys.exit(f"_index.json not found for {doc_id}")
    idx = json.loads(idx_path.read_text(encoding="utf-8"))
    # `or 0` also covers an explicit null order, which would otherwise make
    # the sort compare None with int and raise TypeError.
    entries = sorted(idx.get("chunks", []),
                     key=lambda x: x.get("order_global") or 0)
    out_lines: list[str] = [f"DOCUMENT_ID: {doc_id}",
                            f"TOTAL_PAGES: {idx.get('total_pages')}",
                            f"TOTAL_CHUNKS: {len(entries)}", ""]
    missing = 0
    for entry in entries:
        cid = entry.get("chunk_id")
        page = entry.get("page")
        ctype = entry.get("type", "?")
        chunk_path = chunks_dir / f"{cid}.md"
        if not chunk_path.is_file():
            missing += 1
            continue
        text = chunk_path.read_text(encoding="utf-8")
        fm, body = split_frontmatter(text)
        en = extract_en_section(body)
        # For pure-image chunks the EN body is often empty or a stub; prefer
        # the frontmatter description when one exists, but keep a short body
        # rather than dropping the chunk when the frontmatter has nothing.
        if len(en) < 5:
            en = _frontmatter_fallback(fm) or en
        if not en:
            continue
        out_lines.append(f"[chunk {cid} · page {page} · type:{ctype}]")
        out_lines.append(en.strip())
        out_lines.append("")
    if missing:
        # stdout carries the document text, so diagnostics go to stderr.
        print(f"warning: {missing} chunk file(s) listed in _index.json "
              f"not found under {chunks_dir}", file=sys.stderr)
    print("\n".join(out_lines))
    return 0


def _frontmatter_fallback(fm: dict) -> str:
    """Return the first non-empty, unquoted description held in ``fm``."""
    for key in ("image_description_en", "extracted_text"):
        # Values are raw strings that may still carry their YAML quotes; a
        # quoted-empty value ('""') must not block the next candidate.
        candidate = (fm.get(key) or "").strip().strip('"\'').strip()
        if candidate:
            return candidate
    return ""
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())