206 lines
5.9 KiB
Bash
Executable file
206 lines
5.9 KiB
Bash
Executable file
#!/usr/bin/env bash
|
|
# 01-convert-pdfs.sh — Phase 2: PDF → PNG (72 DPI, see DPI below) + OCR (pdftotext -layout)
|
|
#
|
|
# Usage:
|
|
# ./01-convert-pdfs.sh --doc-id <doc-id> # single doc
|
|
# ./01-convert-pdfs.sh --filename <filename> # single PDF by filename
|
|
# ./01-convert-pdfs.sh --all # all 115 PDFs in raw/
|
|
#
|
|
# Idempotent: skips PNG/OCR outputs that already exist. Re-run with --force to overwrite them.
|
|
|
|
# Strict mode: abort on unhandled errors, unset variables and failed pipeline stages.
set -euo pipefail

# Project layout. UFO_ROOT can be overridden from the environment; when it is
# not set, the original hard-coded location is used.
UFO_ROOT="${UFO_ROOT:-/Users/guto/ufo}"
RAW_DIR="$UFO_ROOT/raw"             # input PDFs
PNG_BASE="$UFO_ROOT/processing/png" # page images, one sub-directory per doc_id
OCR_BASE="$UFO_ROOT/processing/ocr" # page text files, one sub-directory per doc_id

# Render resolution handed to pdftoppm (overridable via the DPI environment variable).
# LLM vision downscales internally; 72 DPI matches PDF point grid and is sufficient for OCR + vision
DPI="${DPI:-72}"
if [[ ! "$DPI" =~ ^[1-9][0-9]*$ ]]; then
  printf 'Invalid DPI: %s (expected a positive integer)\n' "$DPI" >&2
  exit 2
fi

# Command-line state; the argument parser further down overwrites these.
FORCE=0
TARGET_FILENAME=""
TARGET_DOC_ID=""
PROCESS_ALL=0
|
|
|
|
#######################################
# Print the command-line help and terminate the script.
# Globals:   RAW_DIR (read; interpolated into the help text)
# Arguments: $1 - exit status to terminate with (default 0)
# Outputs:   help text on stdout when the status is 0, on stderr otherwise
# Returns:   never returns; exits with the requested status
#######################################
usage() {
  local status="${1:-0}"

  # Help that was asked for goes to stdout; help shown because of a usage
  # error goes to stderr, next to the diagnostic that triggered it.
  local fd=1
  if [[ "$status" != "0" ]]; then
    fd=2
  fi

  cat >&"$fd" <<EOF
Usage: $0 [--doc-id <doc-id>] [--filename <filename.pdf>] [--all] [--force]

Options:
--doc-id ID Process single PDF by doc_id (kebab-case)
--filename F Process single PDF by raw filename
--all Process every PDF in $RAW_DIR
--force Re-convert even if outputs exist
-h, --help Show this help
EOF

  exit "$status"
}
|
|
|
|
#######################################
# Canonicalize a PDF filename into a doc_id (the original comment states this
# matches the algorithm described in CLAUDE.md).
#
# Steps: strip the last extension, fold to ASCII with iconv (best effort),
# lowercase, turn every character outside [a-z0-9-] into '-', collapse runs
# of '-', trim a leading/trailing '-', and prefix "doc-" when the result
# starts with a digit.
#
# Arguments: $1 - filename (no directory part expected)
# Outputs:   the doc_id on stdout, without a trailing newline
# Returns:   0 (the id can be empty when the name has no usable characters)
#######################################
filename_to_doc_id() {
  local fname="$1"
  local base="${fname%.*}" # strip extension

  # ASCII fold is best effort: keep whatever iconv managed to emit, and fall
  # back to the raw name when it emitted nothing (iconv missing or failing
  # outright). Without the fallback the id would silently become empty.
  local folded=""
  folded=$(printf '%s' "$base" | iconv -f UTF-8 -t ASCII//TRANSLIT 2>/dev/null) || true
  if [[ -z "$folded" ]]; then
    folded="$base"
  fi

  # LC_ALL=C makes tr/sed work byte-wise, so bytes that are still non-ASCII
  # after the fallback are replaced by '-' instead of tripping a
  # locale-aware sed on an invalid sequence.
  local id
  id=$(printf '%s' "$folded" \
    | LC_ALL=C tr '[:upper:]' '[:lower:]' \
    | LC_ALL=C sed -e 's/[^a-z0-9-]/-/g' -e 's/--*/-/g' -e 's/^-//' -e 's/-$//')

  # Prefix with doc- if the id starts with a digit.
  if [[ "$id" =~ ^[0-9] ]]; then
    id="doc-$id"
  fi

  printf '%s' "$id"
}
|
|
|
|
#######################################
# Reverse lookup: find the PDF in RAW_DIR whose canonical doc_id equals the
# requested one. The first match in glob order wins.
# Globals:   RAW_DIR (read)
# Arguments: $1 - doc_id to look for
# Outputs:   the matching filename (basename only) on stdout, no newline
# Returns:   0 when a match was found, 1 otherwise
#######################################
doc_id_to_filename() {
  local target_id="$1"
  # Loop state is local so the lookup cannot clobber a caller's variables.
  local candidate name

  for candidate in "$RAW_DIR"/*.pdf; do
    # An unmatched glob stays literal; skip anything that is not a real file.
    [[ -f "$candidate" ]] || continue
    name="${candidate##*/}"
    if [[ "$(filename_to_doc_id "$name")" == "$target_id" ]]; then
      printf '%s' "$name"
      return 0
    fi
  done

  return 1
}
|
|
|
|
#######################################
# Convert one PDF into per-page PNG images and per-page text files.
#
# Outputs land in $PNG_BASE/<doc_id>/p-NNN.png and $OCR_BASE/<doc_id>/p-NNN.txt
# (NNN = page number zero-padded to 3 digits). Unless FORCE is non-zero, a
# stage is skipped when the file for the last page already exists.
#
# Failures are checked explicitly rather than left to `set -e`, because
# errexit is suppressed when the function is called as `convert_one_pdf … || …`.
#
# Globals:   PNG_BASE, OCR_BASE, DPI, FORCE (read)
# Arguments: $1 - path to the PDF
# Outputs:   progress on stdout, diagnostics on stderr
# Returns:   0 on success; 1 when the doc_id is empty, output directories
#            cannot be created, the page count cannot be read, pdftoppm fails
#            or a rename fails. Per-page pdftotext failures are only reported.
#######################################
convert_one_pdf() {
  local pdf_path="$1"
  local fname="${pdf_path##*/}"
  local doc_id
  doc_id=$(filename_to_doc_id "$fname")
  if [[ -z "$doc_id" ]]; then
    # An empty id would make every output land directly in the base directories.
    printf ' [skip] %s — could not derive doc_id\n' "$fname" >&2
    return 1
  fi

  local png_dir="$PNG_BASE/$doc_id"
  local ocr_dir="$OCR_BASE/$doc_id"
  mkdir -p "$png_dir" "$ocr_dir" || return 1

  # Get page count. The `|| page_count=""` keeps a failing pdfinfo (with
  # pipefail active) from aborting the script before the skip message.
  local page_count
  page_count=$(pdfinfo "$pdf_path" 2>/dev/null | awk -F': +' '/^Pages/ {print $2}') || page_count=""
  if [[ ! "$page_count" =~ ^[0-9]+$ ]]; then
    printf ' [skip] %s — could not read pdfinfo\n' "$fname" >&2
    return 1
  fi
  page_count=$((10#$page_count)) # force decimal

  printf '\n=== %s (%d pages) → %s ===\n' "$fname" "$page_count" "$doc_id"

  local padded last_page
  printf -v last_page '%03d' "$page_count"

  # --- PNG generation -------------------------------------------------------
  local need_png=1
  if [[ $FORCE -eq 0 && -f "$png_dir/p-$last_page.png" ]]; then
    need_png=0
    printf ' PNG: skip (already generated)\n'
  fi

  if [[ $need_png -eq 1 ]]; then
    printf ' PNG: pdftoppm @ %d DPI...\n' "$DPI"
    if ! pdftoppm -r "$DPI" -png "$pdf_path" "$png_dir/p"; then
      printf ' [fail] %s — pdftoppm failed\n' "$fname" >&2
      return 1
    fi

    # Normalize whatever numbering width pdftoppm used (p-1.png, p-01.png, …)
    # to zero-padded p-001.png.
    local page_re='^p-([0-9]+)\.png$'
    local png bn
    for png in "$png_dir"/p-*.png; do
      [[ -f "$png" ]] || continue
      bn="${png##*/}"
      [[ "$bn" =~ $page_re ]] || continue
      # 10# forces base 10: printf would otherwise read a zero-padded number
      # as octal ("08"/"09" are errors, "010" becomes 8) and rename pages
      # onto each other.
      printf -v padded '%03d' "$((10#${BASH_REMATCH[1]}))"
      if [[ "$bn" != "p-$padded.png" ]]; then
        mv -- "$png" "$png_dir/p-$padded.png" || return 1
      fi
    done
    printf ' PNG: done\n'
  fi

  # --- OCR per page (pdftotext -f N -l N -layout) ---------------------------
  local need_ocr=1
  if [[ $FORCE -eq 0 && -f "$ocr_dir/p-$last_page.txt" ]]; then
    need_ocr=0
    printf ' OCR: skip (already generated)\n'
  fi

  if [[ $need_ocr -eq 1 ]]; then
    printf ' OCR: pdftotext -layout per page...\n'
    local p ocr_failed=0
    for ((p = 1; p <= page_count; p++)); do
      printf -v padded '%03d' "$p"
      # Best effort per page: one bad page must not stop the others, but the
      # number of failures is reported instead of being swallowed.
      if ! pdftotext -f "$p" -l "$p" -layout "$pdf_path" "$ocr_dir/p-$padded.txt" 2>/dev/null; then
        ocr_failed=$((ocr_failed + 1))
      fi
    done
    if [[ $ocr_failed -gt 0 ]]; then
      printf ' OCR: %d page(s) failed text extraction\n' "$ocr_failed" >&2
    fi
    printf ' OCR: done\n'
  fi

  printf ' ✓ %s\n' "$doc_id"
}
|
|
|
|
#######################################
# Parse command-line arguments into the global option variables.
# Globals:   TARGET_DOC_ID, TARGET_FILENAME, PROCESS_ALL, FORCE (written)
# Arguments: the script's positional parameters ("$@")
# Outputs:   a diagnostic on stderr for an unknown argument or a missing value
# Returns:   0 on success; does not return on -h/--help or on a usage error
#            (usage exits the script)
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --doc-id)
        # Check for the value first: under `set -u` a bare "$2" would abort
        # with an unhelpful "unbound variable" error.
        if [[ $# -lt 2 ]]; then
          printf 'Missing value for %s\n' "$1" >&2
          usage 1
        fi
        TARGET_DOC_ID="$2"
        shift 2
        ;;
      --filename)
        if [[ $# -lt 2 ]]; then
          printf 'Missing value for %s\n' "$1" >&2
          usage 1
        fi
        TARGET_FILENAME="$2"
        shift 2
        ;;
      --all)
        PROCESS_ALL=1
        shift
        ;;
      --force)
        FORCE=1
        shift
        ;;
      -h | --help)
        usage 0
        ;;
      *)
        printf 'Unknown arg: %s\n' "$1" >&2
        usage 1
        ;;
    esac
  done
}

parse_args "$@"
|
|
|
|
#######################################
# Dispatch on the parsed options: convert one PDF selected by doc_id, one PDF
# selected by filename, or every PDF in RAW_DIR.
# Globals:   PROCESS_ALL, TARGET_DOC_ID, TARGET_FILENAME, RAW_DIR (read)
# Arguments: none
# Outputs:   conversion progress on stdout; diagnostics and the failure
#            summary on stderr
# Returns:   single-document modes: exits 1 when the document cannot be found.
#            --all is best effort: failing documents are counted and reported,
#            the run continues and the status stays 0.
#######################################
main() {
  # At least one selector is required.
  if [[ $PROCESS_ALL -eq 0 && -z "$TARGET_DOC_ID" && -z "$TARGET_FILENAME" ]]; then
    usage 1
  fi

  if [[ -n "$TARGET_DOC_ID" ]]; then
    local fname
    fname=$(doc_id_to_filename "$TARGET_DOC_ID") || {
      printf 'No PDF in %s matches doc_id %s\n' "$RAW_DIR" "$TARGET_DOC_ID" >&2
      exit 1
    }
    convert_one_pdf "$RAW_DIR/$fname"
  elif [[ -n "$TARGET_FILENAME" ]]; then
    if [[ ! -f "$RAW_DIR/$TARGET_FILENAME" ]]; then
      printf 'File not found: %s\n' "$RAW_DIR/$TARGET_FILENAME" >&2
      exit 1
    fi
    convert_one_pdf "$RAW_DIR/$TARGET_FILENAME"
  else
    # --all
    local pdf
    local count=0  # documents attempted
    local failed=0 # documents whose conversion returned non-zero
    for pdf in "$RAW_DIR"/*.pdf; do
      [[ -f "$pdf" ]] || continue
      count=$((count + 1))
      # One bad document must not stop the batch, but it is counted so the
      # summary does not present failures as processed without comment.
      if ! convert_one_pdf "$pdf"; then
        failed=$((failed + 1))
      fi
    done
    printf '\n=== Total processed: %d PDFs ===\n' "$count"
    if [[ $failed -gt 0 ]]; then
      printf '=== Failed: %d PDFs ===\n' "$failed" >&2
    fi
  fi
}

main
|