disclosure-bureau/scripts/01-convert-pdfs.sh

#!/usr/bin/env bash
# 01-convert-pdfs.sh — Phase 2: PDF → PNG (72 DPI, see DPI below) + OCR (pdftotext -layout)
#
# Uso:
#   ./01-convert-pdfs.sh --doc-id <doc-id>       # single doc
#   ./01-convert-pdfs.sh --filename <filename>   # single PDF by filename
#   ./01-convert-pdfs.sh --all                   # all 115 PDFs in raw/
#
# Idempotent: skips PNG/OCR outputs that already exist. Re-run with --force to overwrite.

# Strict mode: abort on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Directory layout: source PDFs live in raw/, generated artifacts go under
# processing/png and processing/ocr.
UFO_ROOT="/Users/guto/ufo"
RAW_DIR="${UFO_ROOT}/raw"
PNG_BASE="${UFO_ROOT}/processing/png"
OCR_BASE="${UFO_ROOT}/processing/ocr"

# Resolution passed to pdftoppm -r. Author's rationale: LLM vision downscales
# internally; 72 DPI matches the PDF point grid and is sufficient for
# OCR + vision.
DPI=72

# Command-line state; these defaults are overwritten by the argument parser.
FORCE=0
PROCESS_ALL=0
TARGET_DOC_ID=""
TARGET_FILENAME=""

# Print the command-line help to stdout, then terminate the script.
# Globals:   RAW_DIR (read) — shown in the --all description
# Arguments: $1 - exit status to terminate with (defaults to 0)
usage() {
    local status="${1:-0}"
    printf '%s\n' \
        "Usage: $0 [--doc-id <doc-id>] [--filename <filename.pdf>] [--all] [--force]" \
        "" \
        "Options:" \
        "  --doc-id ID      Process single PDF by doc_id (kebab-case)" \
        "  --filename F     Process single PDF by raw filename" \
        "  --all            Process every PDF in $RAW_DIR" \
        "  --force          Re-convert even if outputs exist" \
        "  -h, --help       Show this help"
    exit "$status"
}

# Canonicalize filename → doc_id (matches CLAUDE.md algorithm).
#
# Steps: strip the extension, ASCII-fold, lowercase, replace every character
# outside [a-z0-9-] with '-', collapse runs of '-', trim one leading and one
# trailing '-', and prefix "doc-" when the result starts with a digit.
#
# Arguments: $1 - file name (with or without extension)
# Outputs:   the doc_id on stdout, without a trailing newline
filename_to_doc_id() {
    local fname="$1"
    local base="${fname%.*}"  # strip extension

    # ASCII fold is best-effort: if iconv is missing or rejects the input
    # (e.g. the name is not valid UTF-8), fall back to the raw name instead of
    # silently producing an empty or truncated id. Remaining non-ASCII bytes
    # are turned into '-' by the sed step below.
    local folded
    if ! folded=$(printf '%s' "$base" | iconv -f UTF-8 -t ASCII//TRANSLIT 2>/dev/null); then
        folded="$base"
    fi

    # LC_ALL=C keeps tr/sed byte-oriented, so the a-z range means ASCII only
    # and stray non-UTF-8 bytes cannot make sed fail.
    local id
    id=$(printf '%s' "$folded" \
        | LC_ALL=C tr '[:upper:]' '[:lower:]' \
        | LC_ALL=C sed -e 's/[^a-z0-9-]/-/g' -e 's/--*/-/g' -e 's/^-//' -e 's/-$//')

    # Prefix with doc- if starts with digit
    if [[ "$id" =~ ^[0-9] ]]; then
        id="doc-$id"
    fi
    printf '%s' "$id"
}

# Reverse: doc_id → filename (scan raw/ for match).
#
# Globals:   RAW_DIR (read)
# Arguments: $1 - doc_id to look up
# Outputs:   base name of the first matching PDF on stdout (no trailing newline)
# Returns:   0 when a match is found, 1 otherwise
doc_id_to_filename() {
    local target_id="$1"
    # Loop variables are local so the scan never clobbers a caller's variables.
    local f fname id
    for f in "$RAW_DIR"/*.pdf; do
        # Skips the literal pattern left by an unmatched glob, and non-files.
        [[ -f "$f" ]] || continue
        fname="${f##*/}"
        id=$(filename_to_doc_id "$fname")
        if [[ "$id" == "$target_id" ]]; then
            printf '%s' "$fname"
            return 0
        fi
    done
    return 1
}

# Convert one PDF into per-page PNG images and per-page text files.
#
# Globals:   PNG_BASE, OCR_BASE, DPI, FORCE (all read)
# Arguments: $1 - path to the PDF
# Outputs:   progress on stdout; skip/fail/warning diagnostics on stderr.
#            Writes $PNG_BASE/<doc_id>/p-NNN.png and $OCR_BASE/<doc_id>/p-NNN.txt
#            (NNN = page number zero-padded to at least 3 digits).
# Returns:   1 when the PDF is skipped (no doc_id, unreadable page count) or
#            pdftoppm fails; 0 otherwise. Per-page pdftotext failures are
#            reported but do not fail the document.
convert_one_pdf() {
    local pdf_path="$1"
    local fname="${pdf_path##*/}"
    local doc_id
    doc_id=$(filename_to_doc_id "$fname")
    if [[ -z "$doc_id" ]]; then
        # An empty id would make every such document write straight into the
        # shared base directories.
        printf '  [skip] %s — could not derive doc_id\n' "$fname" >&2
        return 1
    fi
    local png_dir="$PNG_BASE/$doc_id"
    local ocr_dir="$OCR_BASE/$doc_id"

    mkdir -p "$png_dir" "$ocr_dir"

    # Get page count. The "|| page_count=" keeps a pdfinfo failure (pipefail)
    # from aborting the script before the skip message below is printed.
    local page_count
    page_count=$(pdfinfo "$pdf_path" 2>/dev/null | awk -F': +' '/^Pages/ {print $2}') || page_count=""
    if [[ ! "$page_count" =~ ^[0-9]+$ ]]; then
        printf '  [skip] %s — could not read pdfinfo\n' "$fname" >&2
        return 1
    fi
    page_count=$((10#$page_count))  # force base 10

    printf '\n=== %s (%d pages) → %s ===\n' "$fname" "$page_count" "$doc_id"

    # Outputs count as "already generated" when the file for the last page exists.
    local last_page
    printf -v last_page '%03d' "$page_count"

    # PNG generation
    local need_png=1
    if [[ $FORCE -eq 0 && -f "$png_dir/p-$last_page.png" ]]; then
        need_png=0
        printf '  PNG: skip (already generated)\n'
    fi

    local f bn padded
    local png_re='^p-([0-9]+)\.png$'
    if [[ $need_png -eq 1 ]]; then
        printf '  PNG: pdftoppm @ %d DPI...\n' "$DPI"
        if ! pdftoppm -r "$DPI" -png "$pdf_path" "$png_dir/p"; then
            printf '  [fail] %s — pdftoppm exited with an error\n' "$fname" >&2
            return 1
        fi
        # pdftoppm zero-pads page numbers to the width of the page count
        # (p-1.png for a 9-page PDF, p-01.png for a 12-page one, ...).
        # Normalize everything to at least 3 digits: p-001.png.
        for f in "$png_dir"/p-*.png; do
            [[ -f "$f" ]] || continue
            bn="${f##*/}"
            if [[ "$bn" =~ $png_re ]]; then
                # 10# forces base 10: a bare "08"/"09" is rejected by printf
                # as invalid octal and "010" would be read as 8.
                printf -v padded '%03d' "$((10#${BASH_REMATCH[1]}))"
                if [[ "$bn" != "p-$padded.png" ]]; then
                    mv -- "$f" "$png_dir/p-$padded.png"
                fi
            fi
        done
        printf '  PNG: done\n'
    fi

    # OCR per page (pdftotext -f N -l N -layout)
    local need_ocr=1
    if [[ $FORCE -eq 0 && -f "$ocr_dir/p-$last_page.txt" ]]; then
        need_ocr=0
        printf '  OCR: skip (already generated)\n'
    fi

    if [[ $need_ocr -eq 1 ]]; then
        printf '  OCR: pdftotext -layout per page...\n'
        local p
        local ocr_failed=0
        for (( p = 1; p <= page_count; p++ )); do
            printf -v padded '%03d' "$p"
            # Best effort: a page that fails is counted, not fatal.
            if ! pdftotext -f "$p" -l "$p" -layout "$pdf_path" "$ocr_dir/p-$padded.txt" 2>/dev/null; then
                ocr_failed=$((ocr_failed + 1))
            fi
        done
        if (( ocr_failed > 0 )); then
            printf '  OCR: warning — pdftotext failed on %d of %d pages\n' \
                "$ocr_failed" "$page_count" >&2
        fi
        printf '  OCR: done\n'
    fi

    printf '  ✓ %s\n' "$doc_id"
}

# Parse args. Later occurrences of an option overwrite earlier ones.
while [[ $# -gt 0 ]]; do
    case "$1" in
        --doc-id|--filename)
            # Both options need a value; without this check a trailing
            # "--doc-id" dies with "unbound variable" under set -u.
            if [[ $# -lt 2 ]]; then
                printf 'Missing value for %s\n' "$1" >&2
                usage 1
            fi
            if [[ "$1" == "--doc-id" ]]; then
                TARGET_DOC_ID="$2"
            else
                TARGET_FILENAME="$2"
            fi
            shift 2
            ;;
        --all)
            PROCESS_ALL=1
            shift
            ;;
        --force)
            FORCE=1
            shift
            ;;
        -h|--help)
            usage 0
            ;;
        *)
            printf 'Unknown arg: %s\n' "$1" >&2
            usage 1
            ;;
    esac
done

# At least one selection mode is required.
if [[ $PROCESS_ALL -eq 0 && -z "$TARGET_DOC_ID" && -z "$TARGET_FILENAME" ]]; then
    usage 1
fi

# Dispatch. Precedence when several modes are given: --doc-id, then
# --filename, then --all.
if [[ -n "$TARGET_DOC_ID" ]]; then
    fname=$(doc_id_to_filename "$TARGET_DOC_ID") || {
        printf 'No PDF in %s matches doc_id %s\n' "$RAW_DIR" "$TARGET_DOC_ID" >&2
        exit 1
    }
    convert_one_pdf "$RAW_DIR/$fname"
elif [[ -n "$TARGET_FILENAME" ]]; then
    if [[ ! -f "$RAW_DIR/$TARGET_FILENAME" ]]; then
        printf 'File not found: %s\n' "$RAW_DIR/$TARGET_FILENAME" >&2
        exit 1
    fi
    convert_one_pdf "$RAW_DIR/$TARGET_FILENAME"
else
    # --all: best effort — one bad PDF must not abort the batch, but failures
    # are counted and reported instead of being silently discarded.
    count=0
    failed=0
    for pdf in "$RAW_DIR"/*.pdf; do
        [[ -f "$pdf" ]] || continue
        convert_one_pdf "$pdf" || failed=$((failed + 1))
        count=$((count + 1))
    done
    printf '\n=== Total processed: %d PDFs ===\n' "$count"
    if [[ $failed -gt 0 ]]; then
        printf '=== %d of %d PDFs could not be converted (see messages above) ===\n' \
            "$failed" "$count" >&2
    fi
fi