Add reextract pipeline (scripts/reextract/) that rebuilds doc-level entity JSON from Sonnet-vision chunks via Opus, replacing the noisy per-page extraction. Add synthesize scripts to regenerate wiki/entities from the 116 _reextract.json (30), aggregate missing page.md from chunks (31), and reprocess 805 pages the doc-rebuilder agent dropped on context overflow (32). Add maintain scripts 43-56 for chunk-page sync, dedup, generic-entity marking, and typed relation extraction. Web: wire relations API + entity-relations component; entity/timeline/doc pages consume the rebuilt layer. Note: raw/, processing/, wiki/ remain gitignored (bulk data managed separately); the 116 reextract JSONs and 7,798 rebuilt entity files live on disk only. The 27 curated anchor events under wiki/entities/events/ are preserved. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
97 lines
2.5 KiB
Bash
Executable file
97 lines
2.5 KiB
Bash
Executable file
#!/usr/bin/env bash
# Parallel re-extraction orchestrator.
#
#   - Lists every doc that has raw/<doc>--subagent/_index.json
#   - Skips docs that already have _reextract.json (idempotent)
#   - Uses an mkdir-based per-doc lock to prevent two workers from racing
#   - Runs N workers in parallel (default 4, override via WORKERS=N)
#   - Logs each doc to raw/<doc>--subagent/_reextract.log
#
# Run:
#   ./run_parallel.sh                 # all docs, 4 workers
#   WORKERS=8 ./run_parallel.sh       # 8 workers
#   ./run_parallel.sh DOC1 DOC2       # specific docs only
#
# Environment:
#   UFO      project root (default: /Users/guto/ufo)
#   WORKERS  number of parallel workers (default: 4)

# Note: errexit (-e) is not enabled, so a failing command does not abort
# the script; unset variables and failing pipelines are still flagged.
set -uo pipefail

# Project root; overridable so the script is not tied to one machine.
UFO="${UFO:-/Users/guto/ufo}"
RAW="$UFO/raw"
RUN="$UFO/scripts/reextract/run.py"
WORKERS="${WORKERS:-4}"

# Reject a malformed worker count here instead of letting xargs fail later.
if ! [[ "$WORKERS" =~ ^[0-9]+$ ]]; then
  printf 'WORKERS must be a non-negative integer, got: %s\n' "$WORKERS" >&2
  exit 2
fi

#######################################
# Fill DOCS with the ID of every <doc>--subagent directory that contains
# an _index.json file.
# Globals:
#   DOCS (written) - reset, then filled with the discovered doc IDs
# Arguments:
#   $1 - directory to scan
#######################################
collect_docs() {
  local raw_dir="$1"
  local dir name
  DOCS=()
  for dir in "$raw_dir"/*--subagent; do
    # Also filters out the literal pattern left behind by an unmatched glob.
    [[ -f "$dir/_index.json" ]] || continue
    name="${dir##*/}"
    DOCS+=("${name%--subagent}")
  done
}

# Build list of doc IDs: explicit arguments win over directory discovery.
if (( $# > 0 )); then
  DOCS=("$@")
else
  collect_docs "$RAW"
fi

echo "=== Re-extract orchestrator ==="
echo " docs queued: ${#DOCS[@]}"
echo " workers: $WORKERS"
echo ""
#######################################
# Re-extract a single document, guarded by a per-doc lock directory.
# Globals:
#   RAW (read) - directory holding the <doc>--subagent folders
#   RUN (read) - path of the Python extraction script
# Arguments:
#   $1 - document ID (folder name without the --subagent suffix)
# Outputs:
#   Status lines on stdout: [SKIP], [LOCK], [BEGIN], [OK] or [FAIL].
#   The extractor's stdout/stderr is written to
#   <doc>--subagent/_reextract.log.
# Returns:
#   0 if the doc was extracted, skipped, or is locked by another worker;
#   1 if the doc directory is missing or the extractor failed.
#######################################
process_one() {
  local doc_id="$1"
  local sub="$RAW/$doc_id--subagent"
  local out="$sub/_reextract.json"
  local log="$sub/_reextract.log"
  local lock="$sub/.reextract.lock"

  # Without the directory, mkdir below would fail and the doc would be
  # misreported as locked by another worker.
  if [[ ! -d "$sub" ]]; then
    echo "[FAIL] $doc_id (no such directory: $sub)"
    return 1
  fi

  # Skip if already extracted and the result parses as JSON. The path is
  # passed as an argument, never spliced into the Python source, so quotes
  # in the path cannot break the check.
  if [[ -f "$out" ]] \
    && python3 -c 'import json, sys; json.load(open(sys.argv[1]))' "$out" \
      2>/dev/null; then
    echo "[SKIP] $doc_id (already extracted)"
    return 0
  fi

  # Acquire lock via mkdir (atomic)
  if ! mkdir "$lock" 2>/dev/null; then
    echo "[LOCK] $doc_id (another worker has it)"
    return 0
  fi

  # Release the lock if the shell exits mid-run. The path is expanded now
  # (the local is gone by the time the trap fires) and %q-quoted so that
  # any character in it is safe.
  local release_cmd
  printf -v release_cmd 'rmdir -- %q 2>/dev/null || true' "$lock"
  # shellcheck disable=SC2064 — early expansion is intentional
  trap "$release_cmd" EXIT

  local started elapsed status=0
  started=$(date +%s)
  echo "[BEGIN] $doc_id"
  python3 "$RUN" "$doc_id" >"$log" 2>&1 || status=$?
  elapsed=$(( $(date +%s) - started ))

  if (( status == 0 )); then
    echo "[OK] $doc_id (${elapsed}s)"
  else
    echo "[FAIL] $doc_id (${elapsed}s) — see $log"
    # Normalise: an exit status of 255 would make xargs stop dispatching.
    status=1
  fi

  rmdir -- "$lock" 2>/dev/null || true
  trap - EXIT
  return "$status"
}
# process_one runs in fresh `bash -c` children spawned by xargs, so the
# function and the globals it reads have to be exported.
export -f process_one
export RAW RUN

#######################################
# Fan the queued docs out to parallel process_one workers via xargs.
# Globals:
#   DOCS (read)    - doc IDs to process
#   WORKERS (read) - maximum number of concurrent workers
# Returns:
#   0 when the queue is empty, otherwise the status of the xargs pipeline.
#######################################
run_workers() {
  # An empty queue would otherwise feed xargs a single empty item (and
  # expanding an empty array trips `set -u` on Bash older than 4.4).
  (( ${#DOCS[@]} > 0 )) || return 0

  # NUL-delimited so IDs containing spaces or quotes stay intact; each ID
  # reaches the worker as "$1" rather than being substituted into a
  # command string.
  printf '%s\0' "${DOCS[@]}" \
    | xargs -0 -n 1 -P "$WORKERS" bash -c 'process_one "$1"' _
}

#######################################
# Count how many queued docs now have a _reextract.json that parses as
# JSON, and print the totals.
# Globals:
#   DOCS (read), RAW (read)
# Outputs:
#   " OK: <n>" and " FAIL: <n>" on stdout; a doc whose result file is
#   missing or unparsable counts as FAIL.
#######################################
print_summary() {
  local doc_id out
  local ok=0 fail=0

  if (( ${#DOCS[@]} > 0 )); then
    for doc_id in "${DOCS[@]}"; do
      out="$RAW/$doc_id--subagent/_reextract.json"
      # Path goes in as an argument, not into the Python source text.
      if [[ -f "$out" ]] \
        && python3 -c 'import json, sys; json.load(open(sys.argv[1]))' "$out" \
          2>/dev/null; then
        ok=$((ok + 1))
      else
        fail=$((fail + 1))
      fi
    done
  fi

  echo " OK: $ok"
  echo " FAIL: $fail"
}

# Run in parallel via xargs
run_workers

echo ""
echo "=== Done. Summary: ==="
print_summary