#!/usr/bin/env bash
# 01-convert-pdfs.sh — Phase 2: PDF → PNG (72 DPI, see DPI below) + per-page
# text extraction (pdftotext -layout)
#
# Usage:
#   ./01-convert-pdfs.sh --doc-id <ID>      # single doc
#   ./01-convert-pdfs.sh --filename <F>     # single PDF by filename
#   ./01-convert-pdfs.sh --all              # all 115 PDFs in raw/
#
# Idempotent: skips PNGs/OCR that were already generated. Re-run with --force
# to overwrite.
#
# Environment:
#   UFO_ROOT   Root of the data tree (default: /Users/guto/ufo).

set -euo pipefail

# Root directory; overridable from the environment, same default as before.
readonly UFO_ROOT="${UFO_ROOT:-/Users/guto/ufo}"
readonly RAW_DIR="$UFO_ROOT/raw"
readonly PNG_BASE="$UFO_ROOT/processing/png"
readonly OCR_BASE="$UFO_ROOT/processing/ocr"
# LLM vision downscales internally; 72 DPI matches PDF point grid and is
# sufficient for OCR + vision
readonly DPI=72

# Set by the argument parser in main().
FORCE=0
TARGET_FILENAME=""
TARGET_DOC_ID=""
PROCESS_ALL=0

#######################################
# Print the help text to stdout and exit.
# Globals:   RAW_DIR (read)
# Arguments: $1 - exit status (default 0)
#######################################
usage() {
  cat <<EOF
Usage: $0 [--doc-id <ID>] [--filename <F>] [--all] [--force]

Options:
  --doc-id ID     Process single PDF by doc_id (kebab-case)
  --filename F    Process single PDF by raw filename
  --all           Process every PDF in $RAW_DIR
  --force         Re-convert even if outputs exist
  -h, --help      Show this help
EOF
  exit "${1:-0}"
}

#######################################
# Verify that the external tools this script shells out to are on PATH.
# Outputs:   one line per missing tool on stderr
# Returns:   0 if all tools are present, 1 otherwise
#######################################
require_tools() {
  local tool missing=0
  for tool in pdfinfo pdftoppm pdftotext iconv; do
    if ! command -v "$tool" >/dev/null 2>&1; then
      printf 'Required tool not found in PATH: %s\n' "$tool" >&2
      missing=1
    fi
  done
  [[ $missing -eq 0 ]]
}

#######################################
# Canonicalize filename → doc_id. Per the original comment this is meant to
# match the algorithm described in CLAUDE.md (not visible from this file).
# Steps: strip the extension, best-effort ASCII fold via iconv, lowercase,
# replace every run of characters outside [a-z0-9-] with a single '-', trim
# leading/trailing '-', and prefix "doc-" when the result starts with a digit.
# Arguments: $1 - file name (with extension)
# Outputs:   doc_id on stdout, without trailing newline (may be empty)
#######################################
filename_to_doc_id() {
  local fname="$1"
  local base="${fname%.*}" # strip extension
  local id
  id=$(printf '%s' "$base" \
    | iconv -f UTF-8 -t ASCII//TRANSLIT 2>/dev/null \
    | tr '[:upper:]' '[:lower:]' \
    | sed -e 's/[^a-z0-9-]/-/g' -e 's/--*/-/g' -e 's/^-//' -e 's/-$//')
  # Prefix with doc- if starts with digit
  if [[ "$id" =~ ^[0-9] ]]; then
    id="doc-$id"
  fi
  printf '%s' "$id"
}

#######################################
# Reverse lookup: doc_id → filename, by scanning RAW_DIR/*.pdf and comparing
# each file's computed doc_id against the target.
# Globals:   RAW_DIR (read)
# Arguments: $1 - doc_id to look for
# Outputs:   first matching file name (no directory) on stdout
# Returns:   0 if a match was found, 1 otherwise
#######################################
doc_id_to_filename() {
  local target_id="$1"
  local f fname id
  for f in "$RAW_DIR"/*.pdf; do
    [[ -f "$f" ]] || continue # also covers the unmatched-glob literal
    fname=${f##*/}
    id=$(filename_to_doc_id "$fname")
    if [[ "$id" == "$target_id" ]]; then
      printf '%s' "$fname"
      return 0
    fi
  done
  return 1
}

#######################################
# Convert one PDF: render every page to PNG_BASE/<doc_id>/p-NNN.png and write
# each page's text to OCR_BASE/<doc_id>/p-NNN.txt.
# Each stage is skipped when the file for the last page already exists and
# FORCE is 0.
# Critical commands are checked explicitly because errexit is suppressed
# whenever the caller invokes this function on the left of '||' or inside 'if'.
# Globals:   PNG_BASE, OCR_BASE, DPI, FORCE (read)
# Arguments: $1 - path to the PDF
# Outputs:   progress on stdout, diagnostics on stderr
# Returns:   0 on success; 1 if the doc_id is empty, the page count cannot be
#            read, or PNG rendering/renaming fails
#######################################
convert_one_pdf() {
  local pdf_path="$1"
  local fname="${pdf_path##*/}"
  local doc_id
  doc_id=$(filename_to_doc_id "$fname")
  if [[ -z "$doc_id" ]]; then
    # An empty id would make the output directories collapse onto the base
    # directories themselves.
    printf ' [skip] %s — could not derive a doc_id\n' "$fname" >&2
    return 1
  fi

  local png_dir="$PNG_BASE/$doc_id"
  local ocr_dir="$OCR_BASE/$doc_id"
  mkdir -p "$png_dir" "$ocr_dir" || return 1

  # Get page count. Failure is captured here so the skip message below is
  # reachable even when errexit + pipefail are active.
  local page_count
  page_count=$(pdfinfo "$pdf_path" 2>/dev/null \
    | awk -F': +' '/^Pages/ {print $2}') || page_count=""
  if [[ ! "$page_count" =~ ^[0-9]+$ ]]; then
    printf ' [skip] %s — could not read pdfinfo\n' "$fname" >&2
    return 1
  fi
  printf '\n=== %s (%d pages) → %s ===\n' "$fname" "$page_count" "$doc_id"

  # Zero-padded number of the last page; its presence marks a stage as done.
  local last_page_num
  printf -v last_page_num '%03d' "$((10#$page_count))"

  # PNG generation
  if [[ $FORCE -eq 0 && -f "$png_dir/p-$last_page_num.png" ]]; then
    printf ' PNG: skip (already generated)\n'
  else
    printf ' PNG: pdftoppm @ %d DPI...\n' "$DPI"
    if ! pdftoppm -r "$DPI" -png "$pdf_path" "$png_dir/p"; then
      printf ' PNG: pdftoppm failed for %s\n' "$fname" >&2
      return 1
    fi
    # pdftoppm's page-number width is not fixed at 3 digits (e.g. p-1.png or
    # p-01.png — NOTE(review): presumably depends on the page count; confirm
    # against the pdftoppm man page). Normalize to zero-padded p-001.png.
    local f bn padded
    for f in "$png_dir"/p-*.png; do
      [[ -f "$f" ]] || continue
      bn=${f##*/}
      [[ "$bn" =~ ^p-([0-9]+)\.png$ ]] || continue
      # Force decimal interpretation: a leading zero ("008") would otherwise
      # be rejected as invalid octal.
      printf -v padded '%03d' "$((10#${BASH_REMATCH[1]}))"
      if [[ "$bn" != "p-$padded.png" ]]; then
        mv -- "$f" "$png_dir/p-$padded.png" || return 1
      fi
    done
    printf ' PNG: done\n'
  fi

  # OCR per page (pdftotext -f N -l N -layout)
  if [[ $FORCE -eq 0 && -f "$ocr_dir/p-$last_page_num.txt" ]]; then
    printf ' OCR: skip (already generated)\n'
  else
    printf ' OCR: pdftotext -layout per page...\n'
    local p page_padded ocr_failed=0
    for ((p = 1; p <= page_count; p++)); do
      printf -v page_padded '%03d' "$p"
      # Best-effort by design: a page that fails is counted, not fatal.
      if ! pdftotext -f "$p" -l "$p" -layout "$pdf_path" \
        "$ocr_dir/p-$page_padded.txt" 2>/dev/null; then
        ocr_failed=$((ocr_failed + 1))
      fi
    done
    if ((ocr_failed > 0)); then
      printf ' OCR: warning — pdftotext failed on %d of %d pages\n' \
        "$ocr_failed" "$page_count" >&2
    fi
    printf ' OCR: done\n'
  fi

  printf ' ✓ %s\n' "$doc_id"
}

#######################################
# Parse the command line and dispatch to single-doc or --all processing.
# Precedence when several selectors are given: --doc-id, then --filename,
# then --all.
# Globals:   FORCE, TARGET_DOC_ID, TARGET_FILENAME, PROCESS_ALL (written);
#            RAW_DIR (read)
# Arguments: the script's command-line arguments
#######################################
main() {
  # Parse args
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --doc-id)
        [[ $# -ge 2 ]] || {
          printf 'Missing value for %s\n' "$1" >&2
          usage 1
        }
        TARGET_DOC_ID="$2"
        shift 2
        ;;
      --filename)
        [[ $# -ge 2 ]] || {
          printf 'Missing value for %s\n' "$1" >&2
          usage 1
        }
        TARGET_FILENAME="$2"
        shift 2
        ;;
      --all)
        PROCESS_ALL=1
        shift
        ;;
      --force)
        FORCE=1
        shift
        ;;
      -h | --help)
        usage 0
        ;;
      *)
        printf 'Unknown arg: %s\n' "$1" >&2
        usage 1
        ;;
    esac
  done

  if [[ $PROCESS_ALL -eq 0 && -z "$TARGET_DOC_ID" && -z "$TARGET_FILENAME" ]]; then
    usage 1
  fi

  require_tools || exit 1

  if [[ -n "$TARGET_DOC_ID" ]]; then
    local fname
    fname=$(doc_id_to_filename "$TARGET_DOC_ID") || {
      printf 'No PDF in %s matches doc_id %s\n' "$RAW_DIR" "$TARGET_DOC_ID" >&2
      exit 1
    }
    convert_one_pdf "$RAW_DIR/$fname"
  elif [[ -n "$TARGET_FILENAME" ]]; then
    if [[ ! -f "$RAW_DIR/$TARGET_FILENAME" ]]; then
      printf 'File not found: %s\n' "$RAW_DIR/$TARGET_FILENAME" >&2
      exit 1
    fi
    convert_one_pdf "$RAW_DIR/$TARGET_FILENAME"
  else
    # --all: keep going after a failure, but count it separately so the total
    # only reflects PDFs that were converted successfully.
    local pdf count=0 failed=0
    for pdf in "$RAW_DIR"/*.pdf; do
      [[ -f "$pdf" ]] || continue
      if convert_one_pdf "$pdf"; then
        count=$((count + 1))
      else
        failed=$((failed + 1))
      fi
    done
    printf '\n=== Total processed: %d PDFs ===\n' "$count"
    if ((failed > 0)); then
      printf '=== Failed or skipped: %d PDFs ===\n' "$failed" >&2
    fi
  fi
}

main "$@"