disclosure-bureau/scripts/01-convert-pdfs.sh

206 lines
5.9 KiB
Bash
Executable file

#!/usr/bin/env bash
# 01-convert-pdfs.sh — Phase 2: PDF → PNG (resolution set by DPI below, currently 72) + OCR (pdftotext -layout)
#
# Usage:
# ./01-convert-pdfs.sh --doc-id <doc-id> # single doc
# ./01-convert-pdfs.sh --filename <filename> # single PDF by filename
# ./01-convert-pdfs.sh --all # all 115 PDFs in raw/
#
# Idempotent: skips PNG/OCR outputs that already exist. Re-run with --force to overwrite.
set -euo pipefail

# Pipeline root. Defaults to the original location; export UFO_ROOT to run the
# script against a data tree that lives somewhere else.
UFO_ROOT="${UFO_ROOT:-/Users/guto/ufo}"
RAW_DIR="$UFO_ROOT/raw"             # input PDFs
PNG_BASE="$UFO_ROOT/processing/png" # one sub-directory of PNGs per doc_id
OCR_BASE="$UFO_ROOT/processing/ocr" # one sub-directory of text files per doc_id

# Resolution handed to pdftoppm -r.
# LLM vision downscales internally; 72 DPI matches PDF point grid and is
# sufficient for OCR + vision.
DPI=72

# Command-line state; the argument parser overwrites these.
FORCE=0
TARGET_FILENAME=""
TARGET_DOC_ID=""
PROCESS_ALL=0
# Print the command-line help and terminate the script.
#
# Globals:   RAW_DIR (read; shown in the --all description)
# Arguments: $1 - exit status to terminate with (default 0)
# Outputs:   help text on stdout when the status is 0, on stderr otherwise,
#            so usage printed because of an error never mixes into stdout
# Returns:   never returns; exits with the given status
usage() {
  local status="${1:-0}"
  local fd=1
  if [[ "$status" != "0" ]]; then
    fd=2
  fi
  cat >&"$fd" <<EOF
Usage: $0 [--doc-id <doc-id>] [--filename <filename.pdf>] [--all] [--force]
Options:
--doc-id ID Process single PDF by doc_id (kebab-case)
--filename F Process single PDF by raw filename
--all Process every PDF in $RAW_DIR
--force Re-convert even if outputs exist
-h, --help Show this help
EOF
  exit "$status"
}
# Canonicalize filename → doc_id (matches CLAUDE.md algorithm).
#
# Arguments: $1 - raw filename (extension optional)
# Outputs:   the doc_id on stdout, without a trailing newline
filename_to_doc_id() {
  local raw_name="$1"
  local stem="${raw_name%.*}" # everything before the last dot
  local slug

  # Best-effort ASCII fold, lowercase, then squash every run of characters
  # outside [a-z0-9-] into a single dash and trim dashes at both ends.
  slug=$(
    printf '%s' "$stem" \
      | iconv -f UTF-8 -t ASCII//TRANSLIT 2>/dev/null \
      | tr '[:upper:]' '[:lower:]' \
      | sed \
        -e 's/[^a-z0-9-]/-/g' \
        -e 's/--*/-/g' \
        -e 's/^-//' \
        -e 's/-$//'
  )

  # Ids that begin with a digit get a doc- prefix.
  case "$slug" in
    [0-9]*) slug="doc-$slug" ;;
  esac

  printf '%s' "$slug"
}
# Reverse lookup: doc_id → raw filename.
#
# Scans $RAW_DIR/*.pdf, canonicalizes each basename with filename_to_doc_id
# and reports the first file whose doc_id equals the target.
#
# Globals:   RAW_DIR (read)
# Arguments: $1 - doc_id to look for
# Outputs:   the matching filename (no directory, no trailing newline) on stdout
# Returns:   0 when a match is found, 1 otherwise (including an empty target)
doc_id_to_filename() {
  local target_id="$1"
  # Every loop variable is local so the scan cannot overwrite a variable of
  # the same name in the caller's scope.
  local pdf_path candidate candidate_id

  # An empty id names no document; without this guard it would match a file
  # whose name canonicalizes to nothing.
  [[ -n "$target_id" ]] || return 1

  for pdf_path in "$RAW_DIR"/*.pdf; do
    # Also skips the literal pattern left behind when the glob matches nothing.
    [[ -f "$pdf_path" ]] || continue
    candidate="${pdf_path##*/}" # basename without forking
    candidate_id=$(filename_to_doc_id "$candidate")
    if [[ "$candidate_id" == "$target_id" ]]; then
      printf '%s' "$candidate"
      return 0
    fi
  done
  return 1
}
# Convert one PDF into per-page PNGs (p-NNN.png) and per-page text files
# (p-NNN.txt) under $PNG_BASE/<doc_id> and $OCR_BASE/<doc_id>.
#
# Each stage is skipped when the file for the last page already exists,
# unless FORCE is non-zero.
#
# Globals:   PNG_BASE, OCR_BASE, DPI, FORCE (read)
# Arguments: $1 - path to the PDF
# Outputs:   progress on stdout; skip/failure/warning notices on stderr
# Returns:   0 on success; 1 when no doc_id can be derived, the output
#            directories cannot be created, the page count cannot be read,
#            or pdftoppm fails. pdftotext failures on individual pages are
#            reported but do not fail the document (best-effort).
convert_one_pdf() {
  local pdf_path="$1"
  local fname="${pdf_path##*/}"
  local doc_id png_dir ocr_dir page_count last_page_num
  local need_png=1 need_ocr=1 ocr_failures=0
  local f bn padded p
  local page_re='^p-([0-9]+)\.png$'

  doc_id=$(filename_to_doc_id "$fname")
  if [[ -z "$doc_id" ]]; then
    # An empty id would make the outputs land directly in the base dirs.
    printf ' [skip] %s — could not derive doc_id\n' "$fname" >&2
    return 1
  fi
  png_dir="$PNG_BASE/$doc_id"
  ocr_dir="$OCR_BASE/$doc_id"
  mkdir -p "$png_dir" "$ocr_dir" || return 1

  # Get page count. The "|| page_count=" keeps a failing pdfinfo (pipefail)
  # from aborting the script under set -e before the skip notice is printed.
  page_count=$(pdfinfo "$pdf_path" 2>/dev/null | awk -F': +' '/^Pages/ {print $2}') \
    || page_count=""
  if [[ ! "$page_count" =~ ^[0-9]+$ ]] || ((10#$page_count < 1)); then
    printf ' [skip] %s — could not read pdfinfo\n' "$fname" >&2
    return 1
  fi
  page_count=$((10#$page_count))
  printf '\n=== %s (%d pages) → %s ===\n' "$fname" "$page_count" "$doc_id"

  # The file for the last page doubles as the "stage already done" marker.
  printf -v last_page_num '%03d' "$page_count"

  # PNG generation
  if [[ "$FORCE" -eq 0 && -f "$png_dir/p-$last_page_num.png" ]]; then
    need_png=0
    printf ' PNG: skip (already generated)\n'
  fi
  if ((need_png)); then
    printf ' PNG: pdftoppm @ %d DPI...\n' "$DPI"
    if ! pdftoppm -r "$DPI" -png "$pdf_path" "$png_dir/p"; then
      printf ' [fail] %s — pdftoppm failed\n' "$fname" >&2
      return 1
    fi
    # The width of pdftoppm's page suffix varies with the document
    # (p-1.png, p-08.png, p-001.png, ...). Normalize to three digits.
    for f in "$png_dir"/p-*.png; do
      [[ -f "$f" ]] || continue
      bn="${f##*/}"
      [[ "$bn" =~ $page_re ]] || continue
      # Force base 10: printf '%03d' rejects "08" and "09" as invalid octal,
      # which used to collapse both pages onto p-000.png.
      printf -v padded '%03d' "$((10#${BASH_REMATCH[1]}))"
      if [[ "$bn" != "p-$padded.png" ]]; then
        mv "$f" "$png_dir/p-$padded.png"
      fi
    done
    printf ' PNG: done\n'
  fi

  # OCR per page (pdftotext -f N -l N -layout)
  if [[ "$FORCE" -eq 0 && -f "$ocr_dir/p-$last_page_num.txt" ]]; then
    need_ocr=0
    printf ' OCR: skip (already generated)\n'
  fi
  if ((need_ocr)); then
    printf ' OCR: pdftotext -layout per page...\n'
    for ((p = 1; p <= page_count; p++)); do
      printf -v padded '%03d' "$p"
      # Best-effort: a page that fails does not stop the document, but the
      # failure is counted so it can be reported instead of vanishing.
      if ! pdftotext -f "$p" -l "$p" -layout "$pdf_path" "$ocr_dir/p-$padded.txt" 2>/dev/null; then
        ocr_failures=$((ocr_failures + 1))
      fi
    done
    if ((ocr_failures > 0)); then
      printf ' OCR: warning — pdftotext failed on %d of %d page(s)\n' \
        "$ocr_failures" "$page_count" >&2
    fi
    printf ' OCR: done\n'
  fi

  printf ' ✓ %s\n' "$doc_id"
}
# Entry point: parse the command line, then convert the selected PDF(s).
#
# Globals:   TARGET_DOC_ID, TARGET_FILENAME, PROCESS_ALL, FORCE (written by
#            the parser); RAW_DIR (read)
# Arguments: the script's command-line arguments
# Outputs:   conversion progress on stdout; errors and the failure count on
#            stderr
# Returns:   exits 1 on bad usage or when the requested PDF does not exist;
#            single-document modes return convert_one_pdf's status; --all is
#            best-effort and returns 0 even when some PDFs fail
main() {
  local fname pdf
  local count=0 failed=0

  # Parse args
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --doc-id)
        # Without this guard a trailing --doc-id dies with an opaque
        # "unbound variable" error under set -u.
        if [[ -z "${2:-}" ]]; then
          printf 'Option %s requires a value\n' "$1" >&2
          usage 1
        fi
        TARGET_DOC_ID="$2"
        shift 2
        ;;
      --filename)
        if [[ -z "${2:-}" ]]; then
          printf 'Option %s requires a value\n' "$1" >&2
          usage 1
        fi
        TARGET_FILENAME="$2"
        shift 2
        ;;
      --all)
        PROCESS_ALL=1
        shift
        ;;
      --force)
        FORCE=1
        shift
        ;;
      -h | --help)
        usage 0
        ;;
      *)
        printf 'Unknown arg: %s\n' "$1" >&2
        usage 1
        ;;
    esac
  done

  if [[ "$PROCESS_ALL" -eq 0 && -z "$TARGET_DOC_ID" && -z "$TARGET_FILENAME" ]]; then
    usage 1
  fi

  # Precedence when several selectors are given: --doc-id, --filename, --all.
  if [[ -n "$TARGET_DOC_ID" ]]; then
    fname=$(doc_id_to_filename "$TARGET_DOC_ID") || {
      printf 'No PDF in %s matches doc_id %s\n' "$RAW_DIR" "$TARGET_DOC_ID" >&2
      exit 1
    }
    convert_one_pdf "$RAW_DIR/$fname"
  elif [[ -n "$TARGET_FILENAME" ]]; then
    if [[ ! -f "$RAW_DIR/$TARGET_FILENAME" ]]; then
      printf 'File not found: %s\n' "$RAW_DIR/$TARGET_FILENAME" >&2
      exit 1
    fi
    convert_one_pdf "$RAW_DIR/$TARGET_FILENAME"
  else
    # --all: keep going past individual failures, but count them separately
    # so "Total processed" only reports PDFs that actually converted.
    for pdf in "$RAW_DIR"/*.pdf; do
      [[ -f "$pdf" ]] || continue
      if convert_one_pdf "$pdf"; then
        count=$((count + 1))
      else
        failed=$((failed + 1))
      fi
    done
    printf '\n=== Total processed: %d PDFs ===\n' "$count"
    if ((failed > 0)); then
      printf '=== Failed: %d PDFs ===\n' "$failed" >&2
    fi
  fi
}

main "$@"