disclosure-bureau/scripts/99-finalize-pipeline.sh

79 lines
3.1 KiB
Bash
Executable file

#!/usr/bin/env bash
#
# 99-finalize-pipeline.sh — Chains phases 3-retry → 4 → 4.8 → 5 → 7 → 8 → 6 → 9
# once Phase 3 (vision Haiku) has finished. Phases 7 and 8 run before Phase 6
# (enrichment), and Phase 9 (lint) runs last.
#
# Each phase is idempotent: re-running is safe.
#
# Single append-only log at /tmp/ufo-finalize.log, with a banner per phase.
# Shell options: -u rejects unset variables; pipefail makes a pipeline report a
# failing stage. Note that -e is NOT enabled, so a failing command does not
# abort the script by itself.
set -uo pipefail

# Locations and interpreter. Each value keeps its previous hard-coded default
# and may be overridden from the environment:
#   UFO_ROOT          — directory the script changes into (phase scripts are
#                       invoked as scripts/… relative to it; stats read wiki/…)
#   UFO_FINALIZE_LOG  — append-only log file
#   UFO_PYTHON        — Python interpreter used to run the phase scripts
ROOT="${UFO_ROOT:-/Users/guto/ufo}"
LOG="${UFO_FINALIZE_LOG:-/tmp/ufo-finalize.log}"
PY="${UFO_PYTHON:-python3}"

# Everything below uses paths relative to ROOT, so refuse to continue without it.
cd "$ROOT" || {
  printf '%s: cannot cd to ROOT (%s)\n' "${0##*/}" "$ROOT" >&2
  exit 1
}
#######################################
# Run one pipeline phase and mirror everything it prints to the log.
# Globals:   LOG (read) — file that all output is appended to
# Arguments: $1  — human-readable phase name shown in the banner
#            $2… — command (and its arguments) to execute
# Outputs:   a banner, the command's combined stdout+stderr and an
#            "=== rc=N" trailer; each goes to stdout and is appended to $LOG
# Returns:   the exit status of the command itself (not of tee)
#######################################
phase() {
  local name="$1"; shift
  local rule="================================================================"
  local stamp
  stamp="$(date -u +%Y-%m-%dT%H:%M:%SZ)"

  # Blank line, rule, "=== <UTC timestamp> <name>", rule.
  # The space between timestamp and name keeps the two from fusing together.
  printf '\n%s\n=== %s %s\n%s\n' "$rule" "$stamp" "$name" "$rule" | tee -a "$LOG"

  "$@" 2>&1 | tee -a "$LOG"
  # Must be read immediately: any further command overwrites PIPESTATUS.
  # Index 0 is the phase command; index 1 would be tee.
  local rc=${PIPESTATUS[0]}

  printf '=== rc=%s\n' "$rc" | tee -a "$LOG"
  return "$rc"
}
# Mark the start of this run in the log file only (nothing goes to the terminal):
# a blank line, a rule, the UTC start timestamp, and a closing rule.
{
  printf '\n'
  printf '%s\n' "================================================================"
  printf '==== FINALIZE PIPELINE STARTED %s ====\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  printf '%s\n' "================================================================"
} >> "$LOG"
# Run one Python phase script through phase(), best-effort.
#   $1  — banner label forwarded to phase()
#   $2… — script path and its arguments, executed with the $PY interpreter
# $PY is expanded unquoted on purpose, matching how the interpreter was invoked
# before. The trailing `|| true` discards the phase's status, so every step
# reports success to the caller and the next step always runs.
run_step() {
  local label="$1"; shift
  phase "$label" $PY "$@" || true
}

# Phase 3 pass3 — last safety net (idempotent, processes only failures)
run_step "Phase 3 pass3 (final retry)" scripts/02-vision-page.py --all --workers 3

# Phase 4 — aggregate pages into document.md
run_step "Phase 4 — build documents" scripts/14-build-document-md.py

# Phase 4.8 retry — table CSV extraction (one had failed JSON parse)
run_step "Phase 4.8 — retry remaining table CSVs" scripts/16-extract-table-csv.py

# Phase 5 — entity dedup / upsert
run_step "Phase 5 — entity dedup" scripts/03-dedup-entities.py

# Phase 7 — crop bboxes (needs page.md but not enrichment)
run_step "Phase 7 — crop bboxes" scripts/05-crop-bboxes.py

# Phase 8 — graph export (after entity stubs exist)
run_step "Phase 8 — graph export" scripts/06-graph-export.py

# Phase 6 — enrichment (heaviest; runs after dedup creates entity stubs)
run_step "Phase 6 — enrichment (deep tier only, 3 workers)" \
  scripts/17-enrich-entities.py --all --tier deep --workers 3

# Phase 9 — lint (LAST: rebuilds mentioned_in[] after enrichment)
run_step "Phase 9 — lint + backlink rebuild" scripts/04-lint.py
# --- Final stats ---
# Closing banner: blank line, rule, UTC finish timestamp, rule — shown on the
# terminal and appended to the log.
{
  printf '\n'
  printf '%s\n' "================================================================"
  printf '==== FINALIZE PIPELINE FINISHED %s ====\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  printf '%s\n' "================================================================"
} | tee -a "$LOG"

# Counts of what exists under $ROOT/wiki. Errors (e.g. a directory that does not
# exist) are suppressed so the corresponding count is simply 0. `tr -d ' '`
# strips the padding some wc implementations put before the number.

# Page files: any p*.md anywhere below wiki/pages.
PAGES=$(find "$ROOT/wiki/pages" -name "p*.md" 2>/dev/null | wc -l | tr -d ' ')

# Documents / tables: non-hidden direct entries of the directory. find replaces
# parsing `ls` output; `! -name '.*'` keeps ls's default of skipping dotfiles.
DOCS=$(find "$ROOT/wiki/documents/" -mindepth 1 -maxdepth 1 ! -name '.*' 2>/dev/null | wc -l | tr -d ' ')

# Entity files: any *.md anywhere below wiki/entities.
ENTITIES=$(find "$ROOT/wiki/entities" -name "*.md" 2>/dev/null | wc -l | tr -d ' ')

# Enriched entities: files at entities/<dir>/<file>.md containing a deep or
# shallow enrichment_status. `-exec … {} +` lets find batch the file names, so a
# large entity set cannot overflow grep's argument list (which, with stderr
# suppressed, would silently report 0). -E gives portable alternation.
ENRICHED=$(find "$ROOT/wiki/entities/" -mindepth 2 -maxdepth 2 -name '*.md' \
  -exec grep -lE 'enrichment_status: (deep|shallow)' {} + 2>/dev/null | wc -l | tr -d ' ')

TABLES=$(find "$ROOT/wiki/tables/" -mindepth 1 -maxdepth 1 ! -name '.*' 2>/dev/null | wc -l | tr -d ' ')

echo "pages: $PAGES · documents: $DOCS · entities: $ENTITIES (enriched: $ENRICHED) · tables: $TABLES" | tee -a "$LOG"