Add reextract pipeline (scripts/reextract/) that rebuilds doc-level entity JSON from Sonnet-vision chunks via Opus, replacing the noisy per-page extraction. Add synthesize scripts to regenerate wiki/entities from the 116 _reextract.json (30), aggregate missing page.md from chunks (31), and reprocess 805 pages the doc-rebuilder agent dropped on context overflow (32). Add maintain scripts 43-56 for chunk-page sync, dedup, generic-entity marking, and typed relation extraction. Web: wire relations API + entity-relations component; entity/timeline/doc pages consume the rebuilt layer. Note: raw/, processing/, wiki/ remain gitignored (bulk data managed separately); the 116 reextract JSONs and 7,798 rebuilt entity files live on disk only. The 27 curated anchor events under wiki/entities/events/ are preserved. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
375 lines
16 KiB
Python
375 lines
16 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
32_reprocess_missing_pages.py — Reprocess pages that the doc-rebuilder agent
|
|
silently dropped due to context-window overflow.
|
|
|
|
For each doc:
|
|
1. Read raw/<doc>--subagent/_index.json (current chunk inventory)
|
|
2. Find missing pages: PNGs that exist but have no chunks
|
|
3. For each missing page, call `claude -p --model sonnet` with the page PNG
|
|
and ask for a chunks JSON (matching the page-rebuilder schema)
|
|
4. Append new chunks to _index.json with continued global IDs (chunk_id, order_global)
|
|
5. Write new chunks/c<NNNN>.md files
|
|
|
|
Idempotent — re-running skips pages already processed.
|
|
Uses WORKERS=2 to avoid hammering OAuth rate limits.
|
|
|
|
Usage:
|
|
python3 scripts/synthesize/32_reprocess_missing_pages.py --dry-run
|
|
python3 scripts/synthesize/32_reprocess_missing_pages.py --doc-id <id>
|
|
WORKERS=2 python3 scripts/synthesize/32_reprocess_missing_pages.py
|
|
"""
|
|
from __future__ import annotations
|
|
import argparse
|
|
import json
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
import tempfile
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
# Project root. Defaults to the original checkout location; set UFO_ROOT to
# run the script against a checkout that lives somewhere else.
UFO = Path(os.environ.get("UFO_ROOT") or "/Users/guto/ufo")
RAW = UFO / "raw"
PNG_BASE = UFO / "processing" / "png"

# UTC timestamp (ISO-8601, second precision) captured once at import time.
NOW = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
# Model alias passed to `claude --model`.
SONNET_MODEL = "sonnet"

# Number of worker threads, taken from the WORKERS environment variable.
# A non-numeric value falls back to the default of 2, and the result is
# clamped to at least 1 because ThreadPoolExecutor rejects max_workers <= 0.
try:
    WORKERS = max(1, int(os.environ.get("WORKERS", "2")))
except ValueError:
    WORKERS = 2
|
|
|
|
PROMPT_TEMPLATE = """You are a page-rebuilder for a declassified UAP/UFO government document.
|
|
|
|
You will receive the path of ONE page image. Read it with the Read tool, then analyze it carefully and extract ALL content as structured chunks.
|
|
|
|
DOCUMENT_ID: {doc_id}
|
|
PAGE_NUMBER: {page_num}
|
|
PNG_PATH: {png_path}
|
|
|
|
Return ONE JSON object with this exact structure (no markdown fence, no preamble, no postscript):
|
|
|
|
{{
|
|
"page_number": {page_num},
|
|
"chunks": [
|
|
{{
|
|
"order_in_page": 1,
|
|
"type": "<chunk_type>",
|
|
"content_en": "<English verbatim text or visual description>",
|
|
"content_pt_br": "<Brazilian Portuguese translation>",
|
|
"bbox": {{"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}},
|
|
"classification": null,
|
|
"formatting": [],
|
|
"cross_page_hint": "self_contained",
|
|
"ocr_confidence": 0.85,
|
|
"redaction_code": null,
|
|
"redaction_inferred_content_type": null,
|
|
"image_type": null,
|
|
"ufo_anomaly_detected": false,
|
|
"ufo_anomaly_type": null,
|
|
"ufo_anomaly_rationale": null,
|
|
"cryptid_anomaly_detected": false,
|
|
"cryptid_anomaly_type": null,
|
|
"cryptid_anomaly_rationale": null,
|
|
"image_description_en": null,
|
|
"image_description_pt_br": null,
|
|
"extracted_text": null
|
|
}}
|
|
]
|
|
}}
|
|
|
|
CHUNK TYPES (use only these): letterhead, classification_banner, header, subheader, paragraph, list_item, caption, footnote, page_number, signature_block, stamp, redaction_block, image, table_marker, form_field, watermark, separator, blank, annotation, transcript_block.
|
|
|
|
RULES:
|
|
1. Extract EVERY element on the page — nothing is skipped.
|
|
2. bbox is normalized coords (0.0..1.0) relative to the page image.
|
|
3. content_en is verbatim OCR text for text chunks; for images, describe what you see.
|
|
4. content_pt_br is Brazilian Portuguese (NOT European Portuguese). Preserve UTF-8 accents.
|
|
5. For redactions: type="redaction_block", content_en="[REDACTED]", content_pt_br="[REDACTADO]".
|
|
6. For images/photos/diagrams/sketches/maps: type="image", describe in image_description_en/pt_br.
|
|
7. For stamps: type="stamp".
|
|
8. classification: extract markings if visible ("SECRET", "CONFIDENTIAL", "UNCLASSIFIED", etc.).
|
|
9. formatting: any of ["bold", "italic", "underline", "all_caps", "handwritten", "typewritten"].
|
|
10. cross_page_hint: "self_contained" | "continues_to_next" | "continues_from_prev".
|
|
11. ufo_anomaly_detected: true if chunk contains UAP/UFO sighting data, coordinates, witness accounts, anomalous phenomena.
|
|
12. If page is truly blank: return one chunk with type="blank".
|
|
13. Order chunks top-to-bottom, left-to-right.
|
|
|
|
Return ONLY the JSON. No markdown. No commentary.
|
|
"""
|
|
|
|
# Comma-separated tool names handed to `claude --disallowed-tools`.
# The adjacent string literals are concatenated into one value, which is why
# every fragment except the last ends with a comma.
DISALLOWED = (
    "AskUserQuestion,Bash,Edit,Write,Task,Glob,Grep,"
    "TaskCreate,TaskUpdate,TaskList,TaskGet,TaskStop,TaskOutput,"
    "Skill,ScheduleWakeup,Monitor,WebSearch,WebFetch,NotebookEdit,"
    "EnterPlanMode,ExitPlanMode,EnterWorktree,ExitWorktree,"
    "CronCreate,CronDelete,CronList,RemoteTrigger,ToolSearch,"
    "PushNotification,ListMcpResourcesTool,ReadMcpResourceTool,"
    "ShareOnboardingGuide"
)  # NOTE: Read is allowed (we need vision)
|
|
|
|
|
|
def extract_json_block(s: str) -> str:
    """Return the JSON object embedded in a model reply, as text.

    Markdown code fences are dropped when the reply starts with one. The span
    from the first ``{`` to the last ``}`` is returned when it parses as JSON.
    If it does not parse (for example because commentary after the JSON
    contains a stray ``}``), the first position from which a complete JSON
    object can be decoded is used instead.

    When nothing decodable is found the original first-``{``-to-last-``}``
    slice is returned, or the stripped input if there is no such slice, so the
    caller's ``json.loads`` reports the error.
    """
    s = s.strip()
    if s.startswith("```"):
        kept = (line for line in s.splitlines() if not line.startswith("```"))
        s = "\n".join(kept).strip()

    start = s.find("{")
    end = s.rfind("}")
    if start < 0 or end <= start:
        return s

    candidate = s[start:end + 1]
    try:
        json.loads(candidate)
    except ValueError:
        pass
    else:
        return candidate

    # The greedy slice is not valid JSON: scan each "{" and keep the first
    # one that starts a fully decodable object.
    decoder = json.JSONDecoder()
    pos = start
    while pos >= 0:
        try:
            _, stop = decoder.raw_decode(s, pos)
        except ValueError:
            pos = s.find("{", pos + 1)
        else:
            return s[pos:stop]
    return candidate
|
|
|
|
|
|
def call_sonnet_vision(doc_id: str, page_num: int) -> dict | None:
    """Run the `claude` CLI on one page PNG and return its parsed JSON reply.

    The prompt is built from PROMPT_TEMPLATE and sent on stdin; the CLI's
    stdout is collected in a temporary file and parsed as JSON.

    Returns:
        The decoded JSON object, or ``None`` when the PNG does not exist, the
        CLI cannot be started, times out (300 s) or exits non-zero, or the
        reply is not a JSON object. Every failure except a missing PNG is
        reported on stderr.
    """
    png_path = PNG_BASE / doc_id / f"p-{page_num:03d}.png"
    if not png_path.is_file():
        return None
    prompt = PROMPT_TEMPLATE.format(doc_id=doc_id, page_num=page_num, png_path=str(png_path))
    env = {**os.environ, "CLAUDE_CODE_MAX_OUTPUT_TOKENS": "32000"}

    # An anonymous temp file keeps stdout file-backed (as before) but is
    # removed automatically when the `with` block exits.
    with tempfile.TemporaryFile() as out_f:
        try:
            r = subprocess.run(
                ["claude", "-p", "--model", SONNET_MODEL, "--output-format", "text",
                 "--disallowed-tools", DISALLOWED],
                input=prompt.encode("utf-8"),
                stdout=out_f, stderr=subprocess.PIPE, env=env,
                timeout=300,
            )
        except subprocess.TimeoutExpired:
            print(f" [FAIL] {doc_id} p{page_num:03d} — claude timed out after 300s", file=sys.stderr)
            return None
        except OSError as e:
            # Typically FileNotFoundError: the `claude` executable is not on PATH.
            print(f" [FAIL] {doc_id} p{page_num:03d} — cannot run claude: {e}", file=sys.stderr)
            return None
        if r.returncode != 0:
            print(f" [FAIL] {doc_id} p{page_num:03d} — claude rc={r.returncode}: {r.stderr.decode('utf-8', errors='replace')[:300]}", file=sys.stderr)
            return None
        # The child wrote through the shared descriptor, so rewind before reading.
        out_f.seek(0)
        raw = out_f.read().decode("utf-8", errors="replace")

    try:
        parsed = json.loads(extract_json_block(raw))
    except json.JSONDecodeError as e:
        print(f" [JSON] {doc_id} p{page_num:03d} — {e} | raw_len={len(raw)}", file=sys.stderr)
        return None
    if not isinstance(parsed, dict):
        # Callers use .get() on the result; a bare list or scalar is unusable.
        print(f" [JSON] {doc_id} p{page_num:03d} — expected a JSON object, got {type(parsed).__name__} | raw_len={len(raw)}", file=sys.stderr)
        return None
    return parsed
|
|
|
|
|
|
def _doc_id_from_pdf_stem(stem: str) -> str:
    """Slugify a PDF file stem: strip accents, lowercase, dash-separate, prefix leading digits."""
    import unicodedata

    decomposed = unicodedata.normalize("NFD", stem)
    ascii_str = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    slug = re.sub(r"-+", "-", re.sub(r"[^a-z0-9-]", "-", ascii_str.lower())).strip("-")
    if slug and slug[0].isdigit():
        slug = "doc-" + slug
    return slug


def _pdf_page_counts() -> dict[str, int]:
    """Map doc_id -> page count for every RAW/*.pdf that `pdfinfo` can read."""
    counts: dict[str, int] = {}
    for pdf in RAW.glob("*.pdf"):
        try:
            out = subprocess.run(
                ["pdfinfo", str(pdf)],
                capture_output=True, text=True, errors="replace", timeout=30,
            ).stdout
        except FileNotFoundError:
            # pdfinfo itself is missing: no point retrying for every PDF.
            print(" [WARN] pdfinfo not found — PDF page counts unavailable", file=sys.stderr)
            break
        except (OSError, subprocess.SubprocessError) as e:
            print(f" [WARN] pdfinfo failed for {pdf.name}: {e}", file=sys.stderr)
            continue
        m = re.search(r"Pages:\s+(\d+)", out)
        if m:
            counts[_doc_id_from_pdf_stem(pdf.stem)] = int(m.group(1))
    return counts


def find_missing_pages_per_doc() -> dict[str, list[int]]:
    """Find, per document, the pages that have a PNG but no chunk in _index.json.

    A document is a sub-directory of PNG_BASE containing ``p-<N>.png`` files.
    Documents without a readable ``RAW/<doc_id>--subagent/_index.json`` are
    skipped. When the PDF page count is known (via ``pdfinfo``), PNGs numbered
    above it are ignored so that a trailing phantom page is not reprocessed;
    otherwise every PNG is considered.

    Returns:
        ``{doc_id: [page, ...]}`` with pages ascending and documents in
        sorted order; documents with nothing missing are omitted.
    """
    page_png = re.compile(r"p-(\d+)\.png")
    pdf_pages_map = _pdf_page_counts()
    result: dict[str, list[int]] = {}

    for png_dir in sorted(d for d in PNG_BASE.glob("*") if d.is_dir()):
        doc_id = png_dir.name
        pngs = []
        for png in png_dir.glob("p-*.png"):
            m = page_png.fullmatch(png.name)
            if m:
                pngs.append(int(m.group(1)))
        if not pngs:
            continue
        pngs.sort()

        idx_path = RAW / f"{doc_id}--subagent" / "_index.json"
        if not idx_path.is_file():
            continue
        try:
            idx = json.loads(idx_path.read_text(encoding="utf-8"))
        except (OSError, ValueError) as e:
            print(f" [WARN] {doc_id}: unreadable _index.json ({e}) — skipped", file=sys.stderr)
            continue
        if not isinstance(idx, dict):
            print(f" [WARN] {doc_id}: _index.json is not a JSON object — skipped", file=sys.stderr)
            continue
        pages_in_chunks = {
            c.get("page")
            for c in idx.get("chunks") or []
            if isinstance(c, dict) and c.get("page")
        }

        # Only pages 1..pdf_pages count when the PDF length is known.
        pdf_pages = pdf_pages_map.get(doc_id)
        upper_bound = pdf_pages if pdf_pages else pngs[-1]
        missing = [p for p in pngs if p <= upper_bound and p not in pages_in_chunks]
        if missing:
            result[doc_id] = missing
    return result
|
|
|
|
|
|
def render_chunk_md(chunk: dict) -> str:
    """Render a chunk dict as chunk.md text: YAML front matter plus a bilingual body.

    Front-matter keys are emitted in a fixed order and only when present in
    ``chunk``. The body comes from the private ``_body_en`` / ``_body_pt``
    keys and is omitted entirely when both are empty. ``chunk`` is read, not
    modified.
    """
    import yaml

    # .get() rather than .pop(): the caller's dict must not be mutated.
    body_en = chunk.get("_body_en") or ""
    body_pt = chunk.get("_body_pt") or ""
    # YAML keys in stable order
    fm_keys = (
        "chunk_id", "type", "page", "order_in_page", "order_global", "bbox",
        "classification", "formatting", "cross_page_hint", "prev_chunk", "next_chunk",
        "related_image", "related_table", "ocr_confidence", "ocr_source_lines",
        "redaction_code", "redaction_inferred_content_type", "image_type",
        "ufo_anomaly_detected", "cryptid_anomaly_detected",
        "ufo_anomaly_type", "ufo_anomaly_rationale",
        "cryptid_anomaly_type", "cryptid_anomaly_rationale",
        "image_description_en", "image_description_pt_br", "extracted_text",
        "source_png",
    )
    fm = {k: chunk[k] for k in fm_keys if k in chunk}
    yaml_block = yaml.safe_dump(
        fm, sort_keys=False, allow_unicode=True,
        default_flow_style=False, width=10_000,
    ).rstrip()
    body = f"**EN:** {body_en}\n\n**PT-BR:** {body_pt}\n" if (body_en or body_pt) else ""
    return f"---\n{yaml_block}\n---\n\n{body}"
|
|
|
|
|
|
def integrate_page_chunks(doc_id: str, page_num: int, page_result: dict, idx: dict) -> int:
    """Append one page's chunks to ``idx`` and write their chunk .md files.

    New chunks are numbered after everything already in the index: the next
    number is one past the larger of the highest ``order_global`` and the
    highest numeric ``chunk_id``, so an index with missing ``order_global``
    values can never cause an existing ``c<NNNN>.md`` to be overwritten.

    Args:
        doc_id: Document id; files go to ``RAW/<doc_id>--subagent/chunks``.
        page_num: Page the chunks belong to.
        page_result: Model reply; its ``"chunks"`` list is consumed, and
            items that are not dicts are ignored.
        idx: Parsed ``_index.json``; mutated in place (entries appended).

    Returns:
        Number of chunks added (0 when the reply has no usable chunks).
    """
    chunks = [c for c in (page_result.get("chunks") or []) if isinstance(c, dict)]
    if not chunks:
        return 0
    chunks_dir = RAW / f"{doc_id}--subagent" / "chunks"
    chunks_dir.mkdir(exist_ok=True)

    existing = [c for c in (idx.get("chunks") or []) if isinstance(c, dict)]
    tail = max(existing, key=lambda c: c.get("order_global") or 0, default=None)
    max_order = (tail.get("order_global") or 0) if tail else 0
    max_id_num = 0
    for known in existing:
        m = re.fullmatch(r"c(\d+)", str(known.get("chunk_id") or ""))
        if m:
            max_id_num = max(max_id_num, int(m.group(1)))
    next_num = max(max_order, max_id_num) + 1

    # The first new chunk follows whatever is currently last in global order.
    # NOTE: that chunk's own .md file is not rewritten here, so its
    # next_chunk field on disk is left unchanged.
    prev_id = tail.get("chunk_id") if tail else None

    rel_png = f"../../processing/png/{doc_id}/p-{page_num:03d}.png"
    last = len(chunks)
    new_index_entries = []
    for i, c in enumerate(chunks, 1):
        num = next_num + i - 1
        cid = f"c{num:04d}"
        ctype = c.get("type") or "paragraph"
        en = c.get("content_en") or ""
        pt = c.get("content_pt_br") or ""
        confidence = c.get("ocr_confidence")
        entry = {
            "chunk_id": cid,
            "type": ctype,
            "page": page_num,
            "order_in_page": c.get("order_in_page") or i,
            "order_global": num,
            "file": f"chunks/{cid}.md",
            "bbox": c.get("bbox") or {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
            "preview": (en or pt or "")[:120],
        }
        new_index_entries.append(entry)

        chunk_dict = {
            "chunk_id": cid,
            "type": ctype,
            "page": page_num,
            "order_in_page": entry["order_in_page"],
            "order_global": num,
            "bbox": entry["bbox"],
            "classification": c.get("classification"),
            "formatting": c.get("formatting") or [],
            "cross_page_hint": c.get("cross_page_hint") or "self_contained",
            "prev_chunk": prev_id,
            # Ids are sequential, so the successor is known up front; only
            # the last chunk of this batch has none.
            "next_chunk": f"c{num + 1:04d}" if i < last else None,
            "related_image": None,
            "related_table": None,
            # Keep an explicit 0.0; only a missing value gets the default.
            "ocr_confidence": 0.85 if confidence is None else confidence,
            "ocr_source_lines": [],
            "redaction_code": c.get("redaction_code"),
            "redaction_inferred_content_type": c.get("redaction_inferred_content_type"),
            "image_type": c.get("image_type"),
            "ufo_anomaly_detected": bool(c.get("ufo_anomaly_detected")),
            "cryptid_anomaly_detected": bool(c.get("cryptid_anomaly_detected")),
            "ufo_anomaly_type": c.get("ufo_anomaly_type"),
            "ufo_anomaly_rationale": c.get("ufo_anomaly_rationale"),
            "cryptid_anomaly_type": c.get("cryptid_anomaly_type"),
            "cryptid_anomaly_rationale": c.get("cryptid_anomaly_rationale"),
            "image_description_en": c.get("image_description_en"),
            "image_description_pt_br": c.get("image_description_pt_br"),
            "extracted_text": c.get("extracted_text"),
            "source_png": rel_png,
            "_body_en": en, "_body_pt": pt,
        }
        (chunks_dir / f"{cid}.md").write_text(render_chunk_md(chunk_dict), encoding="utf-8")
        prev_id = cid

    if idx.get("chunks") is None:
        idx["chunks"] = []
    idx["chunks"].extend(new_index_entries)
    return len(new_index_entries)
|
|
|
|
|
|
import threading
|
|
|
|
# One lock per doc_id (only contended when 2+ workers process pages of same doc)
|
|
_doc_locks: dict[str, threading.Lock] = {}
|
|
_locks_mutex = threading.Lock()
|
|
def _doc_lock(doc_id: str) -> threading.Lock:
|
|
with _locks_mutex:
|
|
if doc_id not in _doc_locks: _doc_locks[doc_id] = threading.Lock()
|
|
return _doc_locks[doc_id]
|
|
|
|
|
|
def _index_has_page(idx: dict, page_num: int) -> bool:
    """Return True when any chunk entry in the parsed index belongs to ``page_num``."""
    return any(
        isinstance(c, dict) and c.get("page") == page_num
        for c in idx.get("chunks") or []
    )


def process_one_page(doc_id: str, page_num: int) -> tuple[bool, int]:
    """Transcribe one page and persist its chunks to the document's _index.json.

    The index is checked before the model call, so an already-integrated page
    costs nothing, and checked again under the per-document lock in case
    another worker integrated it in the meantime. The index is rewritten
    through a temporary file and ``os.replace`` so an interrupted run cannot
    leave a truncated ``_index.json`` behind.

    Returns:
        ``(ok, chunks_added)``. ``ok`` is False when the page was skipped,
        produced no chunks, or failed; every outcome is printed.
    """
    idx_path = RAW / f"{doc_id}--subagent" / "_index.json"

    def load_index() -> dict | None:
        try:
            loaded = json.loads(idx_path.read_text(encoding="utf-8"))
        except (OSError, ValueError) as e:
            print(f" [ERR ] {doc_id} p{page_num:03d} — index: {e}", flush=True)
            return None
        if not isinstance(loaded, dict):
            print(f" [ERR ] {doc_id} p{page_num:03d} — index: not a JSON object", flush=True)
            return None
        return loaded

    # Cheap pre-check: avoid a model call for a page that is already indexed
    # or for a document whose index cannot be read at all.
    with _doc_lock(doc_id):
        idx = load_index()
    if idx is None:
        return (False, 0)
    if _index_has_page(idx, page_num):
        print(f" [SKIP] {doc_id} p{page_num:03d} — already present", flush=True)
        return (False, 0)

    result = call_sonnet_vision(doc_id, page_num)
    if not result:
        print(f" [SKIP] {doc_id} p{page_num:03d} — no result", flush=True)
        return (False, 0)

    with _doc_lock(doc_id):
        # Reload: another worker may have updated the index while we waited.
        idx = load_index()
        if idx is None:
            return (False, 0)
        # Idempotent: if page already integrated meanwhile, skip
        if _index_has_page(idx, page_num):
            print(f" [SKIP] {doc_id} p{page_num:03d} — already present", flush=True)
            return (False, 0)
        try:
            n = integrate_page_chunks(doc_id, page_num, result, idx)
        except Exception as e:
            print(f" [ERR ] {doc_id} p{page_num:03d} — integrate: {e}", flush=True)
            return (False, 0)
        if n == 0:
            # Nothing was added, so the index is unchanged and the page is
            # still missing; do not report it as done.
            print(f" [SKIP] {doc_id} p{page_num:03d} — no chunks returned", flush=True)
            return (False, 0)
        tmp_path = idx_path.with_name(idx_path.name + ".tmp")
        tmp_path.write_text(json.dumps(idx, indent=2, ensure_ascii=False), encoding="utf-8")
        os.replace(tmp_path, idx_path)
    print(f" [OK ] {doc_id} p{page_num:03d} — {n} chunks", flush=True)
    return (True, n)
|
|
|
|
|
|
def main() -> int:
    """Command-line entry point: list or reprocess pages that have no chunks.

    Options:
        --doc-id ID   restrict the run to one document
        --page N      process exactly page N of --doc-id (for testing)
        --dry-run     print what would be processed and exit

    Returns:
        0 in all cases that reach the end of the run; invalid option
        combinations exit through ``ArgumentParser.error``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--doc-id", default=None)
    ap.add_argument("--page", type=int, default=None, help="single page for testing")
    ap.add_argument("--dry-run", action="store_true")
    args = ap.parse_args()

    if args.page is not None:
        # Without a doc id the page number used to be silently ignored and
        # the whole backlog was processed instead of the one page requested.
        if not args.doc_id:
            ap.error("--page requires --doc-id")
        if args.page < 1:
            ap.error("--page must be >= 1")

    if args.doc_id and args.page is not None:
        # An explicit (doc, page) pair needs no scan of PDFs and PNG folders.
        missing = {args.doc_id: [args.page]}
    else:
        missing = find_missing_pages_per_doc()
        if args.doc_id:
            missing = {args.doc_id: missing.get(args.doc_id, [])}

    # Flatten (doc, page) job list — page-level parallelism
    jobs: list[tuple[str, int]] = [(d, p) for d, ps in missing.items() for p in ps]
    total = len(jobs)
    print(f"[1/2] {len(missing)} docs · {total} page-jobs")
    if args.dry_run:
        for d, ps in sorted(missing.items(), key=lambda kv: -len(kv[1])):
            if ps:
                print(f" {d}: {len(ps)} pages → {ps[:5]}{'...' if len(ps)>5 else ''}")
        return 0
    if total == 0:
        print("Nothing to do.")
        return 0

    # ThreadPoolExecutor raises ValueError for max_workers <= 0.
    workers = max(1, WORKERS)
    print(f"\n[2/2] Processing with WORKERS={workers} (page-level parallel) ...")
    pages_done = chunks_added = 0
    completed = 0
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futs = {pool.submit(process_one_page, d, p): (d, p) for d, p in jobs}
        for fut in as_completed(futs):
            d, p = futs[fut]
            completed += 1
            try:
                ok, n = fut.result()
            except Exception as e:
                # Boundary handler: one failed page must not abort the batch.
                print(f" [ERR ] {d} p{p:03d}: {e}", flush=True)
            else:
                if ok:
                    pages_done += 1
                    chunks_added += n
            if completed % 25 == 0:
                print(f" ... [progress] {completed}/{total} pages_done={pages_done} chunks={chunks_added}", flush=True)

    print(f"\n✓ {pages_done}/{total} pages processed, {chunks_added} new chunks.")
    return 0
|
|
|
|
|
|
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|