Add reextract pipeline (scripts/reextract/) that rebuilds doc-level entity JSON from Sonnet-vision chunks via Opus, replacing the noisy per-page extraction. Add synthesize scripts to regenerate wiki/entities from the 116 _reextract.json (30), aggregate missing page.md from chunks (31), and reprocess 805 pages the doc-rebuilder agent dropped on context overflow (32). Add maintain scripts 43-56 for chunk-page sync, dedup, generic-entity marking, and typed relation extraction. Web: wire relations API + entity-relations component; entity/timeline/doc pages consume the rebuilt layer. Note: raw/, processing/, wiki/ remain gitignored (bulk data managed separately); the 116 reextract JSONs and 7,798 rebuilt entity files live on disk only. The 27 curated anchor events under wiki/entities/events/ are preserved. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
99 lines
3.2 KiB
Python
99 lines
3.2 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
build_doc_text.py — Reconstruct the FULL document text from already-extracted
|
|
chunks, with chunk-id markers so Sonnet can cite back via evidence_chunks.
|
|
|
|
Input: raw/<doc-id>--subagent/_index.json + chunks/c*.md
|
|
Output: stdout — concatenated EN text of the document, with markers:
|
|
[chunk c0042 · page 7 · type:<chunk type>]
|
|
<content_en verbatim>
|
|
|
|
[chunk c0043 · page 7 · type:<chunk type>]
|
|
<content_en verbatim>
|
|
...
|
|
|
|
Run:
|
|
python3 scripts/reextract/build_doc_text.py <doc-id>
|
|
"""
|
|
from __future__ import annotations
|
|
import json
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Root directory under which main() looks for `<doc-id>--subagent/` folders
# (each holding `_index.json` and a `chunks/` directory).
# NOTE(review): this is a hard-coded absolute path, presumably specific to one
# developer's machine — confirm before running the script anywhere else.
RAW = Path("/Users/guto/ufo/raw")
|
|
|
|
|
|
def split_frontmatter(text: str) -> tuple[dict, str]:
    """Split a chunk file into (frontmatter dict, body text).

    The frontmatter is the region between a leading ``---`` and the closing
    ``---``. It is parsed tolerantly, line by line, as ``key: value`` pairs
    rather than as strict YAML, because chunk files carry free-text fields
    that break a strict parser and only a handful of keys are needed.

    Args:
        text: Full contents of a chunk ``.md`` file.

    Returns:
        A tuple ``(fm, body)``. ``fm`` maps each key (letters/underscores
        only) to its raw, whitespace-stripped string value; surrounding
        quotes are NOT removed. ``body`` is everything after the closing
        delimiter. If ``text`` does not start with ``---`` or has no closing
        delimiter, ``({}, text)`` is returned unchanged.
    """
    if not text.startswith("---"):
        return {}, text

    # Prefer a closing delimiter that sits at the start of a line. A plain
    # ``text.split("---", 2)`` would stop at the first "---" anywhere, so a
    # free-text value such as ``extracted_text: "A --- B"`` would truncate
    # the frontmatter and leak the rest of it into the body.
    # Searching from index 3 skips the opening delimiter; "^" only matches
    # right after a newline there, never at the search start itself.
    closing = re.compile(r"^---", re.M).search(text, 3)
    if closing is not None:
        fm_raw = text[3:closing.start()]
        body = text[closing.end():]
    else:
        # No line-anchored delimiter: keep the historical behaviour so
        # inputs that used to parse still parse the same way.
        parts = text.split("---", 2)
        if len(parts) < 3:
            return {}, text
        fm_raw = parts[1]
        body = parts[2]

    fm: dict = {}
    for line in fm_raw.splitlines():
        m = re.match(r"^([a-zA-Z_]+):\s*(.*)$", line)
        if not m:
            # Continuation lines, list items, blanks etc. are ignored.
            continue
        fm[m.group(1)] = m.group(2).strip()
    return fm, body
|
|
|
|
|
|
def extract_en_section(body: str) -> str:
    """Return the English portion of a bilingual chunk body.

    Bodies are expected to look like::

        **EN:** <english text>
        **PT-BR:** <portuguese text>

    The text after ``**EN:**`` is captured up to a following line that
    begins (after optional whitespace) with ``**PT-BR:**``, or to the end of
    the string when no such line exists. When there is no ``**EN:**`` marker
    at all, the whole body is returned. The result is always stripped of
    leading/trailing whitespace.
    """
    english = re.search(
        r"\*\*EN:\*\*\s*(.*?)(?:\n\s*\*\*PT-BR:\*\*|\Z)",
        body,
        flags=re.DOTALL,
    )
    if english is None:
        # No EN marker: the body as a whole is the best text available.
        return body.strip()
    return english.group(1).strip()
|
|
|
|
|
|
def main() -> int:
    """Print the reconstructed English text of one document to stdout.

    Reads ``RAW/<doc-id>--subagent/_index.json``, orders its ``chunks``
    entries by ``order_global`` and, for each entry whose chunk file exists
    under ``chunks/<chunk_id>.md``, emits::

        [chunk <chunk_id> · page <page> · type:<type>]
        <english text>
        <blank line>

    preceded by a short header (``DOCUMENT_ID``, ``TOTAL_PAGES``,
    ``TOTAL_CHUNKS``). ``TOTAL_CHUNKS`` counts index entries, not the chunks
    actually emitted.

    Index entries whose chunk file is missing are skipped, and a summary of
    them is written to stderr so the omission is visible; stdout is not
    affected. Chunks that yield no text are skipped silently.

    Returns:
        0 on success.

    Raises:
        SystemExit: with a message when the doc-id argument is missing or
            ``_index.json`` does not exist.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: build_doc_text.py <doc-id>")
    doc_id = sys.argv[1]
    doc_dir = RAW / f"{doc_id}--subagent"
    chunks_dir = doc_dir / "chunks"
    idx_path = doc_dir / "_index.json"
    if not idx_path.is_file():
        sys.exit(f"_index.json not found for {doc_id}")

    idx = json.loads(idx_path.read_text(encoding="utf-8"))
    entries = sorted(idx.get("chunks", []), key=lambda x: x.get("order_global", 0))

    out_lines: list[str] = [
        f"DOCUMENT_ID: {doc_id}",
        f"TOTAL_PAGES: {idx.get('total_pages')}",
        f"TOTAL_CHUNKS: {len(entries)}",
        "",
    ]
    missing: list[str] = []
    for entry in entries:
        cid = entry.get("chunk_id")
        page = entry.get("page")
        ctype = entry.get("type", "?")
        chunk_path = chunks_dir / f"{cid}.md"
        if not chunk_path.is_file():
            # Remember the gap instead of dropping it silently; it is
            # reported on stderr once the document has been printed.
            missing.append(str(cid))
            continue
        text = chunk_path.read_text(encoding="utf-8")
        fm, body = split_frontmatter(text)
        en = extract_en_section(body)

        # Bodies shorter than 5 characters are treated as empty; fall back
        # to the frontmatter description / extracted text instead.
        if len(en) < 5:
            en = fm.get("image_description_en") or fm.get("extracted_text") or ""
            # Frontmatter values are raw strings and may still be quoted.
            en = en.strip().strip('"\'')

        if not en:
            continue

        out_lines.append(f"[chunk {cid} · page {page} · type:{ctype}]")
        out_lines.append(en.strip())
        out_lines.append("")

    print("\n".join(out_lines))

    if missing:
        # Cap the list so a badly broken document does not flood stderr.
        shown = ", ".join(missing[:10])
        if len(missing) > 10:
            shown += ", ..."
        print(
            f"warning: {len(missing)} chunk file(s) listed in _index.json "
            f"not found under {chunks_dir}: {shown}",
            file=sys.stderr,
        )
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Run only when executed as a script; main()'s integer return value
    # becomes the process exit status.
    raise SystemExit(main())
|