disclosure-bureau/scripts/synthesize/32_reprocess_missing_pages.py
Luiz Gustavo a7e9dce6d2 rebuild entity layer from Sonnet-vision reextract pipeline
Add reextract pipeline (scripts/reextract/) that rebuilds doc-level entity
JSON from Sonnet-vision chunks via Opus, replacing the noisy per-page
extraction. Add synthesize scripts to regenerate wiki/entities from the 116
_reextract.json (30), aggregate missing page.md from chunks (31), and reprocess
805 pages the doc-rebuilder agent dropped on context overflow (32). Add
maintain scripts 43-56 for chunk-page sync, dedup, generic-entity marking, and
typed relation extraction.

Web: wire relations API + entity-relations component; entity/timeline/doc
pages consume the rebuilt layer.

Note: raw/, processing/, wiki/ remain gitignored (bulk data managed
separately); the 116 reextract JSONs and 7,798 rebuilt entity files live on
disk only. The 27 curated anchor events under wiki/entities/events/ are
preserved.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-21 12:20:24 -03:00

375 lines
16 KiB
Python

#!/usr/bin/env python3
"""
32_reprocess_missing_pages.py — Reprocess pages that the doc-rebuilder agent
silently dropped due to context-window overflow.
For each doc:
1. Read raw/<doc>--subagent/_index.json (current chunk inventory)
2. Find missing pages: PNGs that exist but have no chunks
3. For each missing page, call `claude -p --model sonnet` with the page PNG
and ask for a chunks JSON (matching the page-rebuilder schema)
4. Append new chunks to _index.json with continued global IDs (chunk_id, order_global)
5. Write new chunks/c<NNNN>.md files
Idempotent — re-running skips pages already processed.
Defaults to WORKERS=2 (override via the WORKERS environment variable) to avoid hammering OAuth rate limits.
Usage:
python3 scripts/synthesize/32_reprocess_missing_pages.py --dry-run
python3 scripts/synthesize/32_reprocess_missing_pages.py --doc-id <id>
WORKERS=2 python3 scripts/synthesize/32_reprocess_missing_pages.py
"""
from __future__ import annotations
import argparse
import json
import os
import re
import subprocess
import sys
import tempfile
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timezone
from pathlib import Path
# Root of the local data checkout. Defaults to the original hard-coded path;
# set the UFO_ROOT environment variable to run against a checkout elsewhere.
UFO = Path(os.environ.get("UFO_ROOT") or "/Users/guto/ufo")
# Holds the source PDFs and the per-document "<doc_id>--subagent" directories.
RAW = UFO / "raw"
# Holds the rendered page images: <doc_id>/p-NNN.png.
PNG_BASE = UFO / "processing" / "png"
# UTC timestamp captured once, at import time.
NOW = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
# Model alias passed to `claude --model`.
SONNET_MODEL = "sonnet"
# Thread-pool size. Clamped to >= 1 because ThreadPoolExecutor raises
# ValueError for max_workers <= 0.
WORKERS = max(1, int(os.environ.get("WORKERS", "2")))
# Prompt sent to the model for a single page. It is rendered with str.format():
# {doc_id}, {page_num} and {png_path} are substituted, and every literal JSON
# brace is doubled ({{ }}) so that format() leaves it in place. The text is
# runtime data — editing it changes what the model is asked to produce.
PROMPT_TEMPLATE = """You are a page-rebuilder for a declassified UAP/UFO government document.
You will receive the path of ONE page image. Read it with the Read tool, then analyze it carefully and extract ALL content as structured chunks.
DOCUMENT_ID: {doc_id}
PAGE_NUMBER: {page_num}
PNG_PATH: {png_path}
Return ONE JSON object with this exact structure (no markdown fence, no preamble, no postscript):
{{
"page_number": {page_num},
"chunks": [
{{
"order_in_page": 1,
"type": "<chunk_type>",
"content_en": "<English verbatim text or visual description>",
"content_pt_br": "<Brazilian Portuguese translation>",
"bbox": {{"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}},
"classification": null,
"formatting": [],
"cross_page_hint": "self_contained",
"ocr_confidence": 0.85,
"redaction_code": null,
"redaction_inferred_content_type": null,
"image_type": null,
"ufo_anomaly_detected": false,
"ufo_anomaly_type": null,
"ufo_anomaly_rationale": null,
"cryptid_anomaly_detected": false,
"cryptid_anomaly_type": null,
"cryptid_anomaly_rationale": null,
"image_description_en": null,
"image_description_pt_br": null,
"extracted_text": null
}}
]
}}
CHUNK TYPES (use only these): letterhead, classification_banner, header, subheader, paragraph, list_item, caption, footnote, page_number, signature_block, stamp, redaction_block, image, table_marker, form_field, watermark, separator, blank, annotation, transcript_block.
RULES:
1. Extract EVERY element on the page — nothing is skipped.
2. bbox is normalized coords (0.0..1.0) relative to the page image.
3. content_en is verbatim OCR text for text chunks; for images, describe what you see.
4. content_pt_br is Brazilian Portuguese (NOT European Portuguese). Preserve UTF-8 accents.
5. For redactions: type="redaction_block", content_en="[REDACTED]", content_pt_br="[REDACTADO]".
6. For images/photos/diagrams/sketches/maps: type="image", describe in image_description_en/pt_br.
7. For stamps: type="stamp".
8. classification: extract markings if visible ("SECRET", "CONFIDENTIAL", "UNCLASSIFIED", etc.).
9. formatting: any of ["bold", "italic", "underline", "all_caps", "handwritten", "typewritten"].
10. cross_page_hint: "self_contained" | "continues_to_next" | "continues_from_prev".
11. ufo_anomaly_detected: true if chunk contains UAP/UFO sighting data, coordinates, witness accounts, anomalous phenomena.
12. If page is truly blank: return one chunk with type="blank".
13. Order chunks top-to-bottom, left-to-right.
Return ONLY the JSON. No markdown. No commentary.
"""
# Comma-separated tool names handed to `claude --disallowed-tools`.
# NOTE: Read is deliberately absent — the model needs it to open the page PNG.
DISALLOWED = ",".join((
    "AskUserQuestion", "Bash", "Edit", "Write", "Task", "Glob", "Grep",
    "TaskCreate", "TaskUpdate", "TaskList", "TaskGet", "TaskStop", "TaskOutput",
    "Skill", "ScheduleWakeup", "Monitor", "WebSearch", "WebFetch", "NotebookEdit",
    "EnterPlanMode", "ExitPlanMode", "EnterWorktree", "ExitWorktree",
    "CronCreate", "CronDelete", "CronList", "RemoteTrigger", "ToolSearch",
    "PushNotification", "ListMcpResourcesTool", "ReadMcpResourceTool",
    "ShareOnboardingGuide",
))
def extract_json_block(s: str) -> str:
    """Return the outermost ``{...}`` span found in a model reply.

    Surrounding whitespace is stripped first. If the reply opens with a
    Markdown fence, every line beginning with a fence marker is dropped.
    The text from the first ``{`` through the last ``}`` is then returned;
    when no such span exists the cleaned text is returned as-is.
    """
    text = s.strip()
    if text.startswith("```"):
        kept = [ln for ln in text.splitlines() if not ln.startswith("```")]
        text = "\n".join(kept).strip()

    first = text.find("{")
    last = text.rfind("}")
    if first < 0 or last <= first:
        return text
    return text[first:last + 1]
def call_sonnet_vision(doc_id: str, page_num: int) -> dict | None:
    """Ask the model to rebuild one page and return the parsed JSON object.

    The prompt (PROMPT_TEMPLATE filled with the document id, page number and
    PNG path) is piped to ``claude -p`` on stdin; stdout is captured in a
    temporary file that is always removed afterwards.

    Returns:
        The decoded JSON object, or ``None`` when the page PNG does not exist,
        the CLI times out or exits non-zero, the reply is not valid JSON, or
        the reply is valid JSON but not an object. Every failure except the
        missing PNG is reported on stderr.
    """
    png_path = PNG_BASE / doc_id / f"p-{page_num:03d}.png"
    if not png_path.is_file():
        return None

    prompt = PROMPT_TEMPLATE.format(doc_id=doc_id, page_num=page_num, png_path=str(png_path))
    env = {**os.environ, "CLAUDE_CODE_MAX_OUTPUT_TOKENS": "32000"}
    timeout_s = 300

    # Only the name is needed here; the file is reopened in binary mode below
    # so the subprocess can write straight into it.
    with tempfile.NamedTemporaryFile(mode="w+", suffix=".txt", delete=False, encoding="utf-8") as tmp:
        tmp_path = tmp.name
    try:
        try:
            with open(tmp_path, "wb") as out_f:
                r = subprocess.run(
                    ["claude", "-p", "--model", SONNET_MODEL, "--output-format", "text",
                     "--disallowed-tools", DISALLOWED],
                    input=prompt.encode("utf-8"),
                    stdout=out_f, stderr=subprocess.PIPE, env=env,
                    timeout=timeout_s,
                )
        except subprocess.TimeoutExpired:
            # A hung page is an ordinary per-page failure, not a crash: report
            # it like the other failure modes and honour the None contract.
            print(f" [FAIL] {doc_id} p{page_num:03d} — claude timed out after {timeout_s}s", file=sys.stderr)
            return None

        if r.returncode != 0:
            print(f" [FAIL] {doc_id} p{page_num:03d} — claude rc={r.returncode}: {r.stderr.decode('utf-8', errors='replace')[:300]}", file=sys.stderr)
            return None

        with open(tmp_path, "r", encoding="utf-8") as f:
            raw = f.read()
        try:
            parsed = json.loads(extract_json_block(raw))
        except json.JSONDecodeError as e:
            # Separator added: the page number and the error text used to run together.
            print(f" [JSON] {doc_id} p{page_num:03d} — {e} | raw_len={len(raw)}", file=sys.stderr)
            return None

        if not isinstance(parsed, dict):
            # Callers use .get() on the result, so a bare list/string/number is unusable.
            print(f" [JSON] {doc_id} p{page_num:03d} — expected a JSON object, got {type(parsed).__name__}", file=sys.stderr)
            return None
        return parsed
    finally:
        # Best-effort cleanup; a leftover temp file must not mask the real outcome.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
def _doc_id_from_pdf_stem(stem: str) -> str:
    """Slugify a PDF file stem into a doc_id.

    Accents are stripped, the text is lower-cased, every character outside
    ``[a-z0-9-]`` becomes ``-``, runs of ``-`` collapse, leading/trailing
    ``-`` are removed, and a slug that starts with a digit gets a ``doc-``
    prefix. (The original comment states this mirrors the page-rebuilder's
    algorithm; that code is not visible from this file.)
    """
    import unicodedata

    nfd = unicodedata.normalize("NFD", stem)
    ascii_str = "".join(c for c in nfd if not unicodedata.combining(c))
    slug = re.sub(r"-+", "-", re.sub(r"[^a-z0-9-]", "-", ascii_str.lower())).strip("-")
    if slug and slug[0].isdigit():
        slug = "doc-" + slug
    return slug


def _pdf_page_counts() -> dict[str, int]:
    """Map doc_id -> page count reported by ``pdfinfo`` for each PDF in RAW.

    Best effort: a PDF whose ``pdfinfo`` call fails (tool missing, timeout,
    undecodable output) or prints no ``Pages:`` line is simply left out.
    """
    pages_re = re.compile(r"Pages:\s+(\d+)")
    counts: dict[str, int] = {}
    for pdf in RAW.glob("*.pdf"):
        try:
            out = subprocess.run(
                ["pdfinfo", str(pdf)], capture_output=True, text=True, timeout=30
            ).stdout
        except (OSError, subprocess.SubprocessError, ValueError):
            # OSError: pdfinfo not installed / not executable.
            # SubprocessError: timeout. ValueError: output not decodable as text.
            continue
        m = pages_re.search(out)
        if m:
            counts[_doc_id_from_pdf_stem(pdf.stem)] = int(m.group(1))
    return counts


def find_missing_pages_per_doc() -> dict[str, list[int]]:
    """Return, per doc_id, the page numbers that have a PNG but no chunk.

    A document is considered only if it has at least one ``p-<n>.png`` under
    PNG_BASE and a readable ``_index.json`` under ``RAW/<doc_id>--subagent``.
    When the PDF page count is known, PNGs numbered above it are ignored
    (this is what drops a trailing Poppler phantom page); otherwise every PNG
    is eligible. Documents with nothing missing are omitted from the result.
    """
    pdf_pages_map = _pdf_page_counts()
    png_name = re.compile(r"p-(\d+)\.png")

    result: dict[str, list[int]] = {}
    for png_dir in PNG_BASE.glob("*/"):
        doc_id = png_dir.name
        matches = (png_name.match(p.name) for p in png_dir.glob("p-*.png"))
        # A set, so that e.g. p-1.png and p-001.png do not queue page 1 twice.
        pngs = sorted({int(m.group(1)) for m in matches if m})
        if not pngs:
            continue

        idx_path = RAW / f"{doc_id}--subagent" / "_index.json"
        if not idx_path.is_file():
            continue
        try:
            idx = json.loads(idx_path.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # Unreadable or malformed index (JSON and UTF-8 decode errors are
            # both ValueError subclasses): skip the document rather than abort.
            continue

        # `or []` also covers an explicit "chunks": null.
        pages_in_chunks = {c.get("page") for c in idx.get("chunks") or [] if c.get("page")}

        # Filter: only pages 1..pdf_pages (avoid Poppler phantom)
        pdf_pages = pdf_pages_map.get(doc_id)
        upper_bound = pdf_pages if pdf_pages else pngs[-1]
        missing = [p for p in pngs if p <= upper_bound and p not in pages_in_chunks]
        if missing:
            result[doc_id] = missing
    return result
def render_chunk_md(chunk: dict) -> str:
    """Render a chunk dict to the chunk.md format.

    The output is a YAML front-matter block holding the known metadata keys
    (in a fixed order, and only those present in ``chunk``), followed by an
    ``**EN:** … / **PT-BR:** …`` body built from the private ``_body_en`` and
    ``_body_pt`` entries. The body is omitted when both texts are empty.

    ``chunk`` is only read: the ``_body_*`` entries are not removed from the
    caller's dict (they never reach the front matter, because only the listed
    keys are copied).
    """
    import yaml  # third-party PyYAML, imported at call time

    body_en = chunk.get("_body_en", "")
    body_pt = chunk.get("_body_pt", "")

    # YAML keys in stable order
    fm_keys = (
        "chunk_id", "type", "page", "order_in_page", "order_global", "bbox",
        "classification", "formatting", "cross_page_hint", "prev_chunk", "next_chunk",
        "related_image", "related_table", "ocr_confidence", "ocr_source_lines",
        "redaction_code", "redaction_inferred_content_type", "image_type",
        "ufo_anomaly_detected", "cryptid_anomaly_detected",
        "ufo_anomaly_type", "ufo_anomaly_rationale",
        "cryptid_anomaly_type", "cryptid_anomaly_rationale",
        "image_description_en", "image_description_pt_br", "extracted_text",
        "source_png",
    )
    fm = {k: chunk[k] for k in fm_keys if k in chunk}
    yaml_block = yaml.safe_dump(
        fm, sort_keys=False, allow_unicode=True, default_flow_style=False, width=10_000
    ).rstrip()

    body = f"**EN:** {body_en}\n\n**PT-BR:** {body_pt}\n" if (body_en or body_pt) else ""
    return f"---\n{yaml_block}\n---\n\n{body}"
def _next_chunk_number(existing: list) -> int:
    """Return the first number free as both an ``order_global`` and a ``cNNNN`` id.

    Taking the maximum over both fields guarantees that a newly written
    ``chunks/cNNNN.md`` can never overwrite an existing chunk file, even if
    ids and global order have drifted apart in the index.
    """
    highest = 0
    for c in existing:
        highest = max(highest, c.get("order_global") or 0)
        m = re.fullmatch(r"c(\d+)", str(c.get("chunk_id") or ""))
        if m:
            highest = max(highest, int(m.group(1)))
    return highest + 1


def integrate_page_chunks(doc_id: str, page_num: int, page_result: dict, idx: dict) -> int:
    """Add new page chunks to idx + write chunk .md files. Returns chunks added.

    Args:
        doc_id: Document whose ``RAW/<doc_id>--subagent`` directory is updated.
        page_num: Page the chunks belong to.
        page_result: Parsed model reply; its ``chunks`` list is consumed.
            Entries that are not JSON objects are ignored.
        idx: In-memory ``_index.json`` content. New entries are appended to
            ``idx["chunks"]`` in place; persisting ``idx`` is the caller's job.

    New chunks are numbered consecutively after everything already in the
    index, with ``chunk_id`` and ``order_global`` sharing the same number.
    ``prev_chunk`` points at the preceding number; ``next_chunk`` links the
    chunks of this page to one another and is ``None`` on the last one.
    The index is only extended after every chunk file has been written, so
    an exception while writing leaves ``idx`` untouched.
    """
    chunks = [c for c in (page_result.get("chunks") or []) if isinstance(c, dict)]
    if not chunks:
        return 0

    sub = RAW / f"{doc_id}--subagent"
    chunks_dir = sub / "chunks"
    chunks_dir.mkdir(exist_ok=True)

    first_num = _next_chunk_number(idx.get("chunks") or [])
    last_num = first_num + len(chunks) - 1
    rel_png = f"../../processing/png/{doc_id}/p-{page_num:03d}.png"

    new_index_entries = []
    for i, c in enumerate(chunks, 1):
        num = first_num + i - 1
        cid = f"c{num:04d}"
        ctype = c.get("type") or "paragraph"
        en = c.get("content_en") or ""
        pt = c.get("content_pt_br") or ""
        # Only a missing value falls back to the default; a reported 0.0 is kept.
        confidence = c.get("ocr_confidence")
        entry = {
            "chunk_id": cid,
            "type": ctype,
            "page": page_num,
            "order_in_page": c.get("order_in_page") or i,
            "order_global": num,
            "file": f"chunks/{cid}.md",
            "bbox": c.get("bbox") or {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
            "preview": (en or pt or "")[:120],
        }
        chunk_dict = {
            "chunk_id": cid,
            "type": ctype,
            "page": page_num,
            "order_in_page": entry["order_in_page"],
            "order_global": num,
            "bbox": entry["bbox"],
            "classification": c.get("classification"),
            "formatting": c.get("formatting") or [],
            "cross_page_hint": c.get("cross_page_hint") or "self_contained",
            "prev_chunk": f"c{num - 1:04d}" if num > 1 else None,
            "next_chunk": f"c{num + 1:04d}" if num < last_num else None,
            "related_image": None,
            "related_table": None,
            "ocr_confidence": 0.85 if confidence is None else confidence,
            "ocr_source_lines": [],
            "redaction_code": c.get("redaction_code"),
            "redaction_inferred_content_type": c.get("redaction_inferred_content_type"),
            "image_type": c.get("image_type"),
            "ufo_anomaly_detected": bool(c.get("ufo_anomaly_detected")),
            "cryptid_anomaly_detected": bool(c.get("cryptid_anomaly_detected")),
            "ufo_anomaly_type": c.get("ufo_anomaly_type"),
            "ufo_anomaly_rationale": c.get("ufo_anomaly_rationale"),
            "cryptid_anomaly_type": c.get("cryptid_anomaly_type"),
            "cryptid_anomaly_rationale": c.get("cryptid_anomaly_rationale"),
            "image_description_en": c.get("image_description_en"),
            "image_description_pt_br": c.get("image_description_pt_br"),
            "extracted_text": c.get("extracted_text"),
            "source_png": rel_png,
            "_body_en": en, "_body_pt": pt,
        }
        (chunks_dir / f"{cid}.md").write_text(render_chunk_md(chunk_dict), encoding="utf-8")
        new_index_entries.append(entry)

    # Tolerate both a missing key and an explicit "chunks": null.
    if idx.get("chunks") is None:
        idx["chunks"] = []
    idx["chunks"].extend(new_index_entries)
    return len(new_index_entries)
import threading
# Registry of per-document locks, keyed by doc_id. Entries are created on
# demand by _doc_lock(); a lock is only contended when 2+ workers process
# pages of the same doc.
_doc_locks: dict[str, threading.Lock] = {}
# Guards lookups and insertions in the _doc_locks registry itself.
_locks_mutex = threading.Lock()
def _doc_lock(doc_id: str) -> threading.Lock:
    """Return the lock dedicated to ``doc_id``, creating it on first request.

    The registry lookup and insertion happen under ``_locks_mutex``, so all
    callers asking for the same ``doc_id`` receive the same lock object.
    """
    with _locks_mutex:
        try:
            return _doc_locks[doc_id]
        except KeyError:
            fresh = _doc_locks[doc_id] = threading.Lock()
            return fresh
def process_one_page(doc_id: str, page_num: int) -> tuple[bool, int]:
    """Process a single page and persist to _index.json under doc lock.

    The model call happens outside the lock; only the read-modify-write of
    the document's ``_index.json`` is serialized per document.

    Returns:
        ``(ok, chunks_added)``. ``ok`` is ``False`` (with 0 chunks) when the
        model returned nothing usable, the page already has chunks in the
        index, integration raised, or the reply contained no chunks.
    """
    result = call_sonnet_vision(doc_id, page_num)
    if not result:
        print(f" [SKIP] {doc_id} p{page_num:03d} — no result", flush=True)
        return (False, 0)

    idx_path = RAW / f"{doc_id}--subagent" / "_index.json"
    with _doc_lock(doc_id):
        idx = json.loads(idx_path.read_text(encoding="utf-8"))
        # Idempotent: if page already integrated meanwhile, skip
        if any(c.get("page") == page_num for c in idx.get("chunks") or []):
            print(f" [SKIP] {doc_id} p{page_num:03d} — already present", flush=True)
            return (False, 0)
        try:
            n = integrate_page_chunks(doc_id, page_num, result, idx)
        except Exception as e:
            # Per-page boundary: report and move on so one bad page cannot
            # abort the whole run. The on-disk index has not been touched.
            print(f" [ERR ] {doc_id} p{page_num:03d} — integrate: {e}", flush=True)
            return (False, 0)
        if n == 0:
            # Nothing was integrated, so the page is still missing: do not
            # rewrite the index and do not report success.
            print(f" [SKIP] {doc_id} p{page_num:03d} — no chunks returned", flush=True)
            return (False, 0)

        # Write to a sibling file and rename it over the index so that an
        # interrupted run cannot leave a truncated _index.json behind.
        tmp_path = idx_path.with_name(idx_path.name + ".tmp")
        tmp_path.write_text(json.dumps(idx, indent=2, ensure_ascii=False), encoding="utf-8")
        os.replace(tmp_path, idx_path)

    # Separator added: "p003" followed directly by "12" used to print as "p00312".
    print(f" [OK ] {doc_id} p{page_num:03d} — {n} chunks", flush=True)
    return (True, n)
def main() -> int:
    """Command-line entry point: find missing pages and reprocess them.

    Options:
        --doc-id   restrict the run to one document.
        --page     process exactly this page of ``--doc-id`` (for testing),
                   whether or not it is detected as missing.
        --dry-run  list what would be processed and exit.

    Returns:
        Process exit status (always 0; argparse itself exits with status 2
        on invalid arguments).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--doc-id", default=None)
    ap.add_argument("--page", type=int, default=None, help="single page for testing")
    ap.add_argument("--dry-run", action="store_true")
    args = ap.parse_args()

    # --page alone used to be ignored silently, which turned an intended
    # single-page test into a full run over every missing page.
    if args.page is not None and not args.doc_id:
        ap.error("--page requires --doc-id")

    missing = find_missing_pages_per_doc()
    if args.doc_id:
        missing = {args.doc_id: missing.get(args.doc_id, [])}
        if args.page is not None:
            missing = {args.doc_id: [args.page]}

    # Flatten (doc, page) job list — page-level parallelism
    jobs: list[tuple[str, int]] = [(d, p) for d, ps in missing.items() for p in ps]
    total = len(jobs)
    print(f"[1/2] {len(missing)} docs · {total} page-jobs")

    if args.dry_run:
        for d, ps in sorted(missing.items(), key=lambda kv: -len(kv[1])):
            if ps:
                print(f" {d}: {len(ps)} pages → {ps[:5]}{'...' if len(ps)>5 else ''}")
        return 0
    if total == 0:
        print("Nothing to do.")
        return 0

    print(f"\n[2/2] Processing with WORKERS={WORKERS} (page-level parallel) ...")
    pages_done = chunks_added = 0
    completed = 0
    with ThreadPoolExecutor(max_workers=WORKERS) as pool:
        futs = {pool.submit(process_one_page, d, p): (d, p) for d, p in jobs}
        for fut in as_completed(futs):
            d, p = futs[fut]
            completed += 1
            try:
                ok, n = fut.result()
            except Exception as e:
                # Top-level boundary: a worker that raised must not stop the
                # remaining jobs; report it and keep draining the pool.
                print(f" [ERR ] {d} p{p:03d}: {e}", flush=True)
            else:
                if ok:
                    pages_done += 1
                    chunks_added += n
            if completed % 25 == 0:
                print(f" ... [progress] {completed}/{total} pages_done={pages_done} chunks={chunks_added}", flush=True)

    print(f"\n{pages_done}/{total} pages processed, {chunks_added} new chunks.")
    return 0


if __name__ == "__main__":
    sys.exit(main())