Add reextract pipeline (scripts/reextract/) that rebuilds doc-level entity JSON from Sonnet-vision chunks via Opus, replacing the noisy per-page extraction. Add synthesize scripts to regenerate wiki/entities from the 116 _reextract.json (30), aggregate missing page.md from chunks (31), and reprocess 805 pages the doc-rebuilder agent dropped on context overflow (32). Add maintain scripts 43-56 for chunk-page sync, dedup, generic-entity marking, and typed relation extraction. Web: wire relations API + entity-relations component; entity/timeline/doc pages consume the rebuilt layer. Note: raw/, processing/, wiki/ remain gitignored (bulk data managed separately); the 116 reextract JSONs and 7,798 rebuilt entity files live on disk only. The 27 curated anchor events under wiki/entities/events/ are preserved. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
417 lines
17 KiB
Python
417 lines
17 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
run.py — Re-extract a document via Claude Code OAuth (Sonnet), with chunked
|
||
processing for large docs.
|
||
|
||
Strategy:
|
||
- Build doc text from already-extracted chunks (build_doc_text.py).
|
||
- If text fits in one Sonnet window (default 50k tokens input budget),
|
||
run a single call producing the full JSON.
|
||
- Otherwise, split the doc into overlapping segments of ~50k input tokens
|
||
each, run Sonnet on each segment (preserving chunk_id markers), then
|
||
MERGE the JSONs deduping by (name, class) within each entity list and
|
||
by (source_name, type, target_name, evidence_chunks) for relations.
|
||
|
||
The merged JSON faithfully covers the entire document — no entity is dropped
|
||
because the doc was "too big".
|
||
"""
|
||
from __future__ import annotations
|
||
import json
|
||
import os
|
||
import re
|
||
import subprocess
|
||
import sys
|
||
from pathlib import Path
|
||
|
||
import yaml
|
||
|
||
REX_DIR = Path("/Users/guto/ufo/scripts/reextract")
|
||
RAW = Path("/Users/guto/ufo/raw")
|
||
BUILD_DOC = REX_DIR / "build_doc_text.py"
|
||
PROMPT_SYSTEM = REX_DIR / "prompt-system.md"
|
||
ENUMS_YAML = REX_DIR / "enums.yaml"
|
||
VALIDATE = REX_DIR / "validate.py"
|
||
|
||
# Token budget per call.
|
||
# Both Sonnet and Opus cap output at 32k tokens. We partition the extraction
|
||
# into 5 separate calls per segment, each producing a small piece of the JSON.
|
||
# Each piece is well under the ceiling.
|
||
SEGMENT_INPUT_CHARS = 60_000 # ~15k tokens input per segment
|
||
SEGMENT_OVERLAP_CHARS = 3_000
|
||
|
||
# Per-segment extraction is split into 5 passes. Each pass gets the same
|
||
# document text (so the claude CLI reuses its prompt cache) but a different
|
||
# "output mode" instruction asking for ONE category only.
|
||
PASSES = [
|
||
("events",
|
||
"Return a JSON object with ONLY the events array, exhaustively extracted "
|
||
"from THIS segment:\n\n"
|
||
"{\"events\": [{...event objects per the schema...}]}\n\n"
|
||
"Use the event schema rules from the system prompt. Include "
|
||
"uap_objects_observed, observers (with role_at_event), "
|
||
"primary_location_name/geo_class, evidence_chunks, both narratives. "
|
||
"Do NOT include people/organizations/locations/relations/doc-level fields."),
|
||
("people",
|
||
"Return a JSON object with ONLY the people array, exhaustively extracted "
|
||
"from THIS segment:\n\n"
|
||
"{\"people\": [{...person objects per the schema...}]}\n\n"
|
||
"Use the person schema rules. Each entry: name, aliases_in_doc, "
|
||
"person_class, affiliation, role_at_doc_date, evidence_chunks, confidence. "
|
||
"Skip pure routing-list entries (FBI distribution slips, etc.) unless they "
|
||
"are subjects/witnesses/authors of real content."),
|
||
("orgs_locs",
|
||
"Return a JSON object with ONLY organizations[] and locations[], exhaustively "
|
||
"extracted from THIS segment:\n\n"
|
||
"{\"organizations\": [...], \"locations\": [...]}\n\n"
|
||
"Use the schema rules. Include aliases_in_doc, class enums, country, "
|
||
"region_or_state (locations), evidence_chunks."),
|
||
("relations",
|
||
"Return a JSON object with ONLY the relations array, exhaustively extracted "
|
||
"from THIS segment:\n\n"
|
||
"{\"relations\": [{...relation objects...}]}\n\n"
|
||
"Priority: typed investigative relations (witnessed, occurred_at, signed, "
|
||
"involves_uap, investigated, authored, employed_by, commanded). "
|
||
"mentioned_by ONLY for clearly investigative people (authors of memos, "
|
||
"subjects of investigations); skip mentioned_by for pure routing-list "
|
||
"names (Tolson, Ladd routing slips). All other types as found."),
|
||
("doc_meta",
|
||
"Return a JSON object with ONLY the document-level fields:\n\n"
|
||
"{\"doc_classification\": \"...\", \"doc_classification_note\": null|\"...\", "
|
||
"\"doc_period\": \"YYYY\" or \"YYYY-YYYY\", "
|
||
"\"primary_topics\": [\"...\"], "
|
||
"\"noise_emission\": \"...\", \"investigative_value\": \"...\", "
|
||
"\"doc_summary_en\": \"2-3 sentences\", \"doc_summary_pt_br\": \"2-3 frases\"}\n\n"
|
||
"These reflect the FULL document this segment is part of (you may not see "
|
||
"every page in this segment, but classify based on what you do see)."),
|
||
]
|
||
|
||
|
||
def build_doc_text(doc_id: str) -> str:
|
||
r = subprocess.run(
|
||
["python3", str(BUILD_DOC), doc_id],
|
||
capture_output=True, text=True, encoding="utf-8",
|
||
)
|
||
if r.returncode != 0:
|
||
sys.exit(f"build_doc_text failed: {r.stderr}")
|
||
return r.stdout
|
||
|
||
|
||
def segment_doc(text: str) -> list[str]:
|
||
"""Split doc text into overlapping segments at chunk-marker boundaries.
|
||
|
||
A segment never breaks a chunk — we split at the `[chunk c... · ...]`
|
||
line boundaries closest to the char-budget cap.
|
||
"""
|
||
if len(text) <= SEGMENT_INPUT_CHARS:
|
||
return [text]
|
||
|
||
# Find all chunk marker line positions (line starts).
|
||
marker_re = re.compile(r"^\[chunk c\d+ ·.*$", re.MULTILINE)
|
||
starts = [m.start() for m in marker_re.finditer(text)]
|
||
if not starts: return [text]
|
||
|
||
segments: list[str] = []
|
||
seg_start = 0
|
||
while seg_start < len(text):
|
||
cap = seg_start + SEGMENT_INPUT_CHARS
|
||
if cap >= len(text):
|
||
segments.append(text[seg_start:])
|
||
break
|
||
# Pick the LAST chunk marker before cap (so we never break a chunk)
|
||
candidates = [s for s in starts if seg_start < s < cap]
|
||
if not candidates:
|
||
# No chunk marker fits — cut at cap (shouldn't happen with normal data)
|
||
seg_end = cap
|
||
else:
|
||
seg_end = candidates[-1]
|
||
segments.append(text[seg_start:seg_end])
|
||
# Next segment starts at the chunk marker that gives ~OVERLAP_CHARS overlap
|
||
target_overlap_start = seg_end - SEGMENT_OVERLAP_CHARS
|
||
overlap_candidates = [s for s in starts if target_overlap_start <= s < seg_end]
|
||
if overlap_candidates:
|
||
seg_start = overlap_candidates[0]
|
||
else:
|
||
seg_start = seg_end
|
||
return segments
|
||
|
||
|
||
def build_full_prompt(doc_id: str, doc_text: str, segment_meta: str = "",
|
||
pass_instruction: str = "",
|
||
error_feedback: str = "") -> str:
|
||
system = PROMPT_SYSTEM.read_text(encoding="utf-8")
|
||
enums = ENUMS_YAML.read_text(encoding="utf-8")
|
||
feedback_block = ""
|
||
if error_feedback:
|
||
feedback_block = (
|
||
"\n\n## PREVIOUS ATTEMPT FAILED VALIDATION\n\n"
|
||
"Your previous JSON had these errors. Fix them and return a corrected JSON:\n"
|
||
f"```\n{error_feedback}\n```\n\n"
|
||
"Re-emit the FULL corrected JSON.\n"
|
||
)
|
||
pass_block = ""
|
||
if pass_instruction:
|
||
pass_block = (
|
||
"\n\n## OUTPUT MODE (THIS CALL ONLY)\n\n"
|
||
f"{pass_instruction}\n\n"
|
||
"Return ONLY the JSON object described above. No markdown fence, "
|
||
"no preamble, no postscript. JSON only.\n"
|
||
)
|
||
return (
|
||
f"{system}\n\n"
|
||
"## CLOSED ENUMS (use ONLY these values)\n\n"
|
||
f"```yaml\n{enums}\n```\n\n"
|
||
f"## DOCUMENT TO ANALYZE — doc_id: {doc_id}\n"
|
||
f"{segment_meta}\n\n"
|
||
f"```\n{doc_text}\n```\n"
|
||
f"{pass_block}"
|
||
f"{feedback_block}\n"
|
||
"Return the JSON now."
|
||
)
|
||
|
||
|
||
def call_claude(prompt: str) -> tuple[str, dict]:
|
||
"""Returns (response_text, meta_dict).
|
||
|
||
Uses `--output-format text` + `--disallowed-tools` and redirects stdout
|
||
DIRECTLY to a temp file (not via subprocess.PIPE, which silently truncates
|
||
large outputs). Tools are disabled to prevent the model from initiating
|
||
multi-turn calls that fail in `-p` mode.
|
||
"""
|
||
import tempfile
|
||
env = {**os.environ, "CLAUDE_CODE_MAX_OUTPUT_TOKENS": "32000"}
|
||
DISALLOWED = (
|
||
"AskUserQuestion,Bash,Edit,Write,Read,Task,Glob,Grep,"
|
||
"TaskCreate,TaskUpdate,TaskList,TaskGet,TaskStop,TaskOutput,"
|
||
"Skill,ScheduleWakeup,Monitor,WebSearch,WebFetch,NotebookEdit,"
|
||
"EnterPlanMode,ExitPlanMode,EnterWorktree,ExitWorktree,"
|
||
"CronCreate,CronDelete,CronList,RemoteTrigger,ToolSearch,"
|
||
"PushNotification,ListMcpResourcesTool,ReadMcpResourceTool,"
|
||
"ShareOnboardingGuide"
|
||
)
|
||
with tempfile.NamedTemporaryFile(mode="w+", suffix=".txt", delete=False, encoding="utf-8") as tmp:
|
||
tmp_path = tmp.name
|
||
try:
|
||
with open(tmp_path, "wb") as out_f:
|
||
r = subprocess.run(
|
||
["claude", "-p", "--model", "opus", "--output-format", "text",
|
||
"--disallowed-tools", DISALLOWED],
|
||
input=prompt.encode("utf-8"),
|
||
stdout=out_f,
|
||
stderr=subprocess.PIPE,
|
||
env=env,
|
||
)
|
||
if r.returncode != 0:
|
||
sys.exit(
|
||
f"claude CLI failed (rc={r.returncode})\n"
|
||
f"stderr: {r.stderr.decode('utf-8', errors='replace')[:4000]}"
|
||
)
|
||
with open(tmp_path, "r", encoding="utf-8") as f:
|
||
output = f.read()
|
||
return (output, {"stop_reason": "text-format-no-meta", "chars": len(output)})
|
||
finally:
|
||
try: os.unlink(tmp_path)
|
||
except OSError: pass
|
||
|
||
|
||
def extract_json_block(s: str) -> str:
|
||
s = s.strip()
|
||
if s.startswith("```"):
|
||
s = "\n".join(line for line in s.splitlines() if not line.startswith("```"))
|
||
s = s.strip()
|
||
start = s.find("{")
|
||
end = s.rfind("}")
|
||
if start >= 0 and end > start:
|
||
return s[start:end + 1]
|
||
return s
|
||
|
||
|
||
def merge_extractions(segments: list[dict], doc_id: str) -> dict:
|
||
"""Merge per-segment JSONs into a single doc-level JSON.
|
||
|
||
Dedup rules:
|
||
- events: by (label, date_start)
|
||
- people: by canonical name (lowercase, role prefixes stripped)
|
||
- orgs: by canonical name (lowercase)
|
||
- locations: by canonical name (lowercase)
|
||
- relations: by (source_name, type, target_name) — union of evidence_chunks
|
||
Top-level fields (classification, summary) come from the FIRST segment with
|
||
non-null values; primary_topics is union; noise_emission/investigative_value
|
||
is the MAX across segments (worst case takes precedence).
|
||
"""
|
||
out: dict = {
|
||
"doc_id": doc_id,
|
||
"doc_classification": None,
|
||
"doc_classification_note": None,
|
||
"doc_period": None,
|
||
"primary_topics": [],
|
||
"noise_emission": None,
|
||
"investigative_value": None,
|
||
"doc_summary_en": None,
|
||
"doc_summary_pt_br": None,
|
||
"events": [],
|
||
"people": [],
|
||
"organizations": [],
|
||
"locations": [],
|
||
"relations": [],
|
||
}
|
||
|
||
def lower_key(s) -> str:
|
||
return (s or "").strip().lower()
|
||
|
||
# Top-level pickup from first non-null
|
||
for seg in segments:
|
||
for k in ("doc_classification", "doc_classification_note", "doc_period",
|
||
"doc_summary_en", "doc_summary_pt_br"):
|
||
if not out.get(k) and seg.get(k):
|
||
out[k] = seg[k]
|
||
# primary_topics: union, preserve first-seen order
|
||
seen = set()
|
||
for seg in segments:
|
||
for t in seg.get("primary_topics") or []:
|
||
if t and t not in seen:
|
||
seen.add(t); out["primary_topics"].append(t)
|
||
|
||
# noise / investigative — take WORST across segments
|
||
NOISE_ORDER = {"none": 0, "low": 1, "medium": 2, "high": 3}
|
||
INV_ORDER = {"none": 0, "low": 1, "medium": 2, "high": 3, "critical": 4}
|
||
noise_max, inv_max = None, None
|
||
for seg in segments:
|
||
n = seg.get("noise_emission")
|
||
if n in NOISE_ORDER and (noise_max is None or NOISE_ORDER[n] > NOISE_ORDER[noise_max]):
|
||
noise_max = n
|
||
i = seg.get("investigative_value")
|
||
if i in INV_ORDER and (inv_max is None or INV_ORDER[i] > INV_ORDER[inv_max]):
|
||
inv_max = i
|
||
out["noise_emission"] = noise_max
|
||
out["investigative_value"] = inv_max
|
||
|
||
# Entities — dedup by canonical key
|
||
def merge_list(key: str, key_fn):
|
||
seen_keys: dict = {}
|
||
for seg in segments:
|
||
for item in seg.get(key) or []:
|
||
if not isinstance(item, dict): continue
|
||
k = key_fn(item)
|
||
if not k: continue
|
||
if k in seen_keys:
|
||
# merge: union evidence_chunks, aliases_in_doc; keep highest confidence
|
||
existing = seen_keys[k]
|
||
ev = set(existing.get("evidence_chunks") or []) | set(item.get("evidence_chunks") or [])
|
||
existing["evidence_chunks"] = sorted(ev)
|
||
al = set(existing.get("aliases_in_doc") or []) | set(item.get("aliases_in_doc") or [])
|
||
al.discard(existing.get("name", ""))
|
||
existing["aliases_in_doc"] = sorted(al)
|
||
# confidence: keep max
|
||
conf_order = {"low": 0, "medium": 1, "high": 2}
|
||
if conf_order.get(item.get("confidence"), 0) > conf_order.get(existing.get("confidence"), 0):
|
||
existing["confidence"] = item["confidence"]
|
||
else:
|
||
seen_keys[k] = item
|
||
return list(seen_keys.values())
|
||
|
||
out["events"] = merge_list("events",
|
||
lambda e: (lower_key(e.get("label")), e.get("date_start") or ""))
|
||
out["people"] = merge_list("people",
|
||
lambda p: lower_key(p.get("name")))
|
||
out["organizations"] = merge_list("organizations",
|
||
lambda o: lower_key(o.get("name")))
|
||
out["locations"] = merge_list("locations",
|
||
lambda l: lower_key(l.get("name")))
|
||
|
||
# Relations: dedup by (source_class, source_name_lower, type, target_class, target_name_lower)
|
||
rel_seen: dict = {}
|
||
for seg in segments:
|
||
for r in seg.get("relations") or []:
|
||
if not isinstance(r, dict): continue
|
||
key = (
|
||
r.get("source_class"),
|
||
lower_key(r.get("source_name")),
|
||
r.get("type"),
|
||
r.get("target_class"),
|
||
lower_key(r.get("target_name")),
|
||
)
|
||
if key in rel_seen:
|
||
existing = rel_seen[key]
|
||
ev = set(existing.get("evidence_chunks") or []) | set(r.get("evidence_chunks") or [])
|
||
existing["evidence_chunks"] = sorted(ev)
|
||
else:
|
||
rel_seen[key] = r
|
||
out["relations"] = list(rel_seen.values())
|
||
return out
|
||
|
||
|
||
def main() -> int:
|
||
if len(sys.argv) < 2:
|
||
sys.exit("usage: run.py <doc-id>")
|
||
doc_id = sys.argv[1]
|
||
out_path = RAW / f"{doc_id}--subagent" / "_reextract.json"
|
||
|
||
print(f"[1/N] Building doc text ...")
|
||
doc_text = build_doc_text(doc_id)
|
||
print(f" {len(doc_text)} chars (~{len(doc_text) // 4} tokens)")
|
||
|
||
segments = segment_doc(doc_text)
|
||
n_seg = len(segments)
|
||
print(f"[2/N] Splitting into {n_seg} segment(s) of ~{SEGMENT_INPUT_CHARS // 1000}k chars each")
|
||
for i, s in enumerate(segments, 1):
|
||
print(f" segment {i}: {len(s)} chars")
|
||
|
||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||
|
||
def run_pass(seg_idx: int, seg_text: str, pass_name: str, pass_instr: str) -> tuple[int, str, dict | None, str]:
|
||
meta_label = f"\n\n[SEGMENT {seg_idx + 1} OF {n_seg}] — extract everything in THIS segment exhaustively.\n"
|
||
prompt = build_full_prompt(doc_id, seg_text, segment_meta=meta_label,
|
||
pass_instruction=pass_instr)
|
||
raw, _meta = call_claude(prompt)
|
||
json_text = extract_json_block(raw)
|
||
try:
|
||
piece = json.loads(json_text)
|
||
return (seg_idx, pass_name, piece, "")
|
||
except json.JSONDecodeError as e:
|
||
return (seg_idx, pass_name, None, f"{e} | raw_len={len(raw)}")
|
||
|
||
# Fire ALL (segment, pass) jobs in parallel
|
||
extracted: list[dict] = [{"doc_id": doc_id} for _ in range(n_seg)]
|
||
n_jobs = n_seg * len(PASSES)
|
||
print(f"[3/N] Firing {n_jobs} parallel passes ({n_seg} segments × {len(PASSES)} passes)")
|
||
errors: list[str] = []
|
||
completed = 0
|
||
# Cap inner concurrency so outer-WORKERS × inner doesn't fork-bomb the box.
|
||
# 5 = one slot per pass; segments process serially within a doc.
|
||
INNER_MAX = int(os.environ.get("REEXTRACT_INNER_MAX", "5"))
|
||
with ThreadPoolExecutor(max_workers=min(INNER_MAX, n_jobs)) as pool:
|
||
futures = []
|
||
for seg_idx, seg in enumerate(segments):
|
||
for pass_name, pass_instr in PASSES:
|
||
futures.append(pool.submit(run_pass, seg_idx, seg, pass_name, pass_instr))
|
||
for fut in as_completed(futures):
|
||
seg_idx, pass_name, piece, err = fut.result()
|
||
completed += 1
|
||
tag = f"seg{seg_idx+1}/{pass_name}"
|
||
if err:
|
||
errors.append(f"{tag}: {err}")
|
||
debug = out_path.parent / f"_reextract_raw_seg{seg_idx+1}_{pass_name}.txt"
|
||
print(f" [{completed}/{n_jobs}] {tag} FAILED — {err[:120]}")
|
||
else:
|
||
if isinstance(piece, dict):
|
||
for k, v in piece.items():
|
||
extracted[seg_idx][k] = v
|
||
print(f" [{completed}/{n_jobs}] {tag} OK")
|
||
|
||
print(f"[4/N] Merging {n_seg} extraction(s) ...")
|
||
merged = merge_extractions(extracted, doc_id) if n_seg > 1 else {**extracted[0], "doc_id": doc_id}
|
||
out_path.parent.mkdir(parents=True, exist_ok=True)
|
||
out_path.write_text(json.dumps(merged, indent=2, ensure_ascii=False), encoding="utf-8")
|
||
print(f" saved {out_path}")
|
||
|
||
print(f"[5/N] Validating ...")
|
||
v = subprocess.run(
|
||
["python3", str(VALIDATE), doc_id, str(out_path)],
|
||
capture_output=True, text=True, encoding="utf-8",
|
||
)
|
||
print(v.stdout.strip())
|
||
return v.returncode
|
||
|
||
|
||
if __name__ == "__main__":
|
||
sys.exit(main())
|