disclosure-bureau/scripts/01-convert-pdfs.sh
guto d5f6e6030a fix png-numbering: re-convert 34 zero-based docs + crop fallback
34 of 116 docs were generated with 0-based PNG numbering (p-000.png …
p-008.png) but the Sonnet chunks reference 1-based page numbers in their
YAML frontmatter (page: 9 means the 9th sheet of paper). The /api/crop
handler built p-009.png and got a 500, the browser's Next/Image surfaced
400, and the chunk rendered as a black box on screen.

Fixes:
- web/app/api/crop/route.ts: try p-NNN.png first, fall back to
  p-(NNN-1).png if the 1-based file is missing. Cheap insurance for any
  doc that comes in with the old convention.
- scripts/01-convert-pdfs.sh: previously printf '%03d' "$num" with a $num
  carrying a leading zero (e.g. "008") raised "invalid number" because
  Bash parsed it as octal. Wrap with $((10#$num)) to force decimal — this
  was silently corrupting page sequences and producing gaps like p-001
  ... p-008, p-011 (missing p-009/p-010).
- All 34 affected docs re-converted from PDFs with the patched script;
  every directory now has continuous 1-based PNGs.
- /processing/png/ rsync'd to VPS, web redeployed.

Smoke: /api/crop?doc=doc-341-…&page=9&… now returns 200 image/webp
instead of 500. Tested in browser: chunk c0026 (diagram, p9) renders
the real engineering diagram.
2026-05-18 11:45:40 -03:00

208 lines
6.1 KiB
Bash
Executable file

#!/usr/bin/env bash
# 01-convert-pdfs.sh — Phase 2: PDF → PNG (at $DPI, set below) + OCR (pdftotext -layout)
#
# Usage:
#   ./01-convert-pdfs.sh --doc-id <doc-id>      # single doc
#   ./01-convert-pdfs.sh --filename <filename>  # single PDF by filename
#   ./01-convert-pdfs.sh --all                  # every PDF in raw/
#
# Idempotent: PNG/OCR outputs that already exist are skipped. Re-run with
# --force to overwrite them.
#
# Environment:
#   UFO_ROOT  project root containing raw/ and processing/
#             (defaults to /Users/guto/ufo when unset or empty)
set -euo pipefail

# Project root; overridable so the script also runs outside the author's machine.
UFO_ROOT="${UFO_ROOT:-/Users/guto/ufo}"
RAW_DIR="$UFO_ROOT/raw"                # input PDFs
PNG_BASE="$UFO_ROOT/processing/png"    # per-doc PNG output root
OCR_BASE="$UFO_ROOT/processing/ocr"    # per-doc OCR text output root
DPI=72 # LLM vision downscales internally; 72 DPI matches PDF point grid and is sufficient for OCR + vision

# Defaults for the command-line flags; overwritten during argument parsing.
FORCE=0
TARGET_FILENAME=""
TARGET_DOC_ID=""
PROCESS_ALL=0
# Print the command-line help on stdout and terminate the script.
# Globals:   RAW_DIR (read) - shown in the --all description
# Arguments: $1 - exit status to terminate with (defaults to 0)
usage() {
  local status="${1:-0}"
  # One argument per help line; printf reuses the format for each of them.
  printf '%s\n' \
    "Usage: $0 [--doc-id <doc-id>] [--filename <filename.pdf>] [--all] [--force]" \
    'Options:' \
    '--doc-id ID Process single PDF by doc_id (kebab-case)' \
    '--filename F Process single PDF by raw filename' \
    "--all Process every PDF in $RAW_DIR" \
    '--force Re-convert even if outputs exist' \
    '-h, --help Show this help'
  exit "$status"
}
# Turn a raw filename into its canonical doc_id (algorithm per CLAUDE.md).
# Arguments: $1 - filename; everything from the last dot onward is dropped
# Outputs:   the doc_id on stdout, without a trailing newline
filename_to_doc_id() {
  local stem="${1%.*}"
  local slug
  # Best-effort ASCII fold, then lowercase, then squash every run of
  # non [a-z0-9-] characters into a single dash and trim dashes at the edges.
  slug=$(
    printf '%s' "$stem" \
      | iconv -f UTF-8 -t ASCII//TRANSLIT 2>/dev/null \
      | tr '[:upper:]' '[:lower:]' \
      | sed -e 's/[^a-z0-9-]/-/g' -e 's/--*/-/g' -e 's/^-//' -e 's/-$//'
  )
  # Ids that begin with a digit get a doc- prefix.
  case "$slug" in
    [0-9]*) slug="doc-$slug" ;;
  esac
  printf '%s' "$slug"
}
# Reverse lookup: find the PDF in RAW_DIR whose canonical doc_id equals $1.
# Globals:   RAW_DIR (read) - directory scanned for *.pdf
# Arguments: $1 - doc_id to look for
# Outputs:   the matching filename (basename only, no trailing newline) on stdout
# Returns:   0 when a match was found, 1 otherwise (including an empty doc_id)
doc_id_to_filename() {
  local target_id="$1"
  # An empty id must never match a file whose derived id happens to be empty.
  [[ -n "$target_id" ]] || return 1

  # All loop state is local so a direct (non-subshell) call cannot clobber
  # variables of the caller.
  local f fname id
  for f in "$RAW_DIR"/*.pdf; do
    [[ -f "$f" ]] || continue   # unmatched glob stays literal; skip it
    fname=${f##*/}              # basename without forking a process
    id=$(filename_to_doc_id "$fname")
    if [[ "$id" == "$target_id" ]]; then
      printf '%s' "$fname"
      return 0
    fi
  done
  return 1
}
# Render one PDF to per-page PNGs and extract per-page text.
# Globals:
#   PNG_BASE, OCR_BASE (read) - output roots; files land in <root>/<doc_id>/
#   DPI (read)                - resolution handed to pdftoppm -r
#   FORCE (read)              - when 0, a stage is skipped if the output for
#                               the last page already exists
# Arguments:
#   $1 - path to the PDF
# Outputs:
#   progress lines on stdout; skip/failure diagnostics on stderr
# Returns:
#   0 on success; 1 when no doc_id or page count could be derived, the output
#   directories could not be created, or PNG rendering/renaming failed.
#   Failures are reported through the return status rather than left to
#   `set -e`, because errexit is ignored inside a function that is called as
#   `convert_one_pdf ... || true`.
convert_one_pdf() {
  local pdf_path="$1"
  local fname="${pdf_path##*/}"
  local doc_id
  doc_id=$(filename_to_doc_id "$fname")
  # With an empty doc_id every such PDF would write straight into the shared
  # base directories and overwrite the pages of the others.
  if [[ -z "$doc_id" ]]; then
    printf ' [skip] %s — could not derive doc_id\n' "$fname" >&2
    return 1
  fi

  # Get page count. `|| page_count=""` keeps a failing pdfinfo (pipefail) from
  # aborting the script before the skip message below can be printed.
  local page_count
  page_count=$(pdfinfo "$pdf_path" 2>/dev/null | awk -F': +' '/^Pages/ {print $2}') \
    || page_count=""
  if [[ ! "$page_count" =~ ^[0-9]+$ ]]; then
    printf ' [skip] %s — could not read pdfinfo\n' "$fname" >&2
    return 1
  fi
  page_count=$((10#$page_count))   # plain decimal, never octal

  local png_dir="$PNG_BASE/$doc_id"
  local ocr_dir="$OCR_BASE/$doc_id"
  mkdir -p "$png_dir" "$ocr_dir" || return 1

  printf '\n=== %s (%d pages) → %s ===\n' "$fname" "$page_count" "$doc_id"

  # Both stages use "output for the last page exists" as their done-marker.
  local last_page
  printf -v last_page '%03d' "$page_count"

  # --- PNG generation -------------------------------------------------------
  if [[ $FORCE -eq 0 && -f "$png_dir/p-$last_page.png" ]]; then
    printf ' PNG: skip (already generated)\n'
  else
    printf ' PNG: pdftoppm @ %d DPI...\n' "$DPI"
    if ! pdftoppm -r "$DPI" -png "$pdf_path" "$png_dir/p"; then
      printf ' [fail] %s — pdftoppm failed\n' "$fname" >&2
      return 1
    fi
    # pdftoppm may emit unpadded or partially padded page numbers (p-1.png,
    # p-08.png, ...) depending on the document; normalize everything to the
    # 3-digit, 1-based form p-001.png.
    local page_re='^p-([0-9]+)\.png$'
    local f bn new_name
    for f in "$png_dir"/p-*.png; do
      [[ -f "$f" ]] || continue
      bn=${f##*/}
      [[ "$bn" =~ $page_re ]] || continue
      # 10# forces decimal: a bare "008" would be rejected as invalid octal.
      printf -v new_name 'p-%03d.png' "$((10#${BASH_REMATCH[1]}))"
      if [[ "$bn" != "$new_name" ]]; then
        mv -- "$f" "$png_dir/$new_name" || return 1
      fi
    done
    printf ' PNG: done\n'
  fi

  # --- OCR per page (pdftotext -f N -l N -layout) ---------------------------
  if [[ $FORCE -eq 0 && -f "$ocr_dir/p-$last_page.txt" ]]; then
    printf ' OCR: skip (already generated)\n'
  else
    printf ' OCR: pdftotext -layout per page...\n'
    local p padded
    for (( p = 1; p <= page_count; p++ )); do
      printf -v padded '%03d' "$p"
      # Deliberately best-effort: a page that fails to extract must not stop
      # the remaining pages.
      pdftotext -f "$p" -l "$p" -layout "$pdf_path" "$ocr_dir/p-$padded.txt" 2>/dev/null || true
    done
    printf ' OCR: done\n'
  fi

  printf ' ✓ %s\n' "$doc_id"
}
# Entry point: parse the command line, then convert a single PDF (selected by
# --doc-id or --filename, in that order of precedence) or every PDF in RAW_DIR.
# Globals:   TARGET_DOC_ID, TARGET_FILENAME, PROCESS_ALL, FORCE (written);
#            RAW_DIR (read)
# Arguments: the script's command-line arguments
main() {
  # Parse args
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --doc-id | --filename)
        # Without this check a trailing flag dies under `set -u` with an
        # opaque "unbound variable" instead of showing the usage text.
        if [[ $# -lt 2 ]]; then
          printf 'Missing value for %s\n' "$1" >&2
          usage 1
        fi
        if [[ "$1" == "--doc-id" ]]; then
          TARGET_DOC_ID="$2"
        else
          TARGET_FILENAME="$2"
        fi
        shift 2
        ;;
      --all)
        PROCESS_ALL=1
        shift
        ;;
      --force)
        FORCE=1
        shift
        ;;
      -h | --help)
        usage 0
        ;;
      *)
        printf 'Unknown arg: %s\n' "$1" >&2
        usage 1
        ;;
    esac
  done

  # At least one selector is required.
  if [[ $PROCESS_ALL -eq 0 && -z "$TARGET_DOC_ID" && -z "$TARGET_FILENAME" ]]; then
    usage 1
  fi

  if [[ -n "$TARGET_DOC_ID" ]]; then
    local fname
    fname=$(doc_id_to_filename "$TARGET_DOC_ID") || {
      printf 'No PDF in %s matches doc_id %s\n' "$RAW_DIR" "$TARGET_DOC_ID" >&2
      exit 1
    }
    convert_one_pdf "$RAW_DIR/$fname"
  elif [[ -n "$TARGET_FILENAME" ]]; then
    if [[ ! -f "$RAW_DIR/$TARGET_FILENAME" ]]; then
      printf 'File not found: %s\n' "$RAW_DIR/$TARGET_FILENAME" >&2
      exit 1
    fi
    convert_one_pdf "$RAW_DIR/$TARGET_FILENAME"
  else
    # --all: keep going after a bad PDF, but remember it so the summary does
    # not hide the failure. The batch itself still exits 0, as before.
    local pdf
    local count=0
    local failed=0
    for pdf in "$RAW_DIR"/*.pdf; do
      [[ -f "$pdf" ]] || continue
      if ! convert_one_pdf "$pdf"; then
        failed=$((failed + 1))
      fi
      count=$((count + 1))
    done
    printf '\n=== Total processed: %d PDFs ===\n' "$count"
    if [[ $failed -gt 0 ]]; then
      printf '%d of %d PDFs failed or were skipped\n' "$failed" "$count" >&2
    fi
  fi
}

main "$@"