#!/usr/bin/env python3 """ 05-crop-bboxes.py — Eager crop generation from bounding boxes For each page.md, read `images_detected[]`, `tables_detected[]`, and `signatures_observed[]` (the elements whose visual content is worth showing inline in chat replies). For each element with a bbox, crop the corresponding region from the page PNG using Pillow and save to: processing/crops//.png Where follows the convention: IMG--p- for images_detected TBL--p- for tables_detected SIG--p- for signatures_observed Padding: 1% of page dimensions around each bbox to avoid tight clipping. Idempotent: skips crops whose output PNG already exists with non-zero size (unless --force). Usage: ./05-crop-bboxes.py # all docs ./05-crop-bboxes.py --doc-id # single doc ./05-crop-bboxes.py --force # overwrite existing crops """ from __future__ import annotations import argparse import re import sys from datetime import datetime, timezone from pathlib import Path try: import yaml except ImportError: sys.stderr.write("Missing pyyaml. Run: pip3 install pyyaml\n") sys.exit(1) try: from PIL import Image except ImportError: sys.stderr.write("Missing pillow. Run: pip3 install pillow\n") sys.exit(1) UFO_ROOT = Path("/Users/guto/ufo") PAGES_BASE = UFO_ROOT / "wiki" / "pages" PNG_BASE = UFO_ROOT / "processing" / "png" CROPS_BASE = UFO_ROOT / "processing" / "crops" LOG_PATH = UFO_ROOT / "wiki" / "log.md" PADDING_FRACTION = 0.01 # 1% padding around bbox def utc_now_iso() -> str: return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") def read_frontmatter(path: Path) -> dict: c = path.read_text(encoding="utf-8") if not c.startswith("---"): return {} end = c.find("---", 4) if end == -1: return {} try: return yaml.safe_load(c[3:end].strip()) or {} except yaml.YAMLError: return {} def doc_id_short(doc_id: str) -> str: """Compact uppercase tag for use inside crop ids. 'dow-uap-d54-mission-report-mediterranean-sea-na' → 'DOWD54' 'doc-65-hs1-...' → 'D65HS1' (drop common prefixes, keep first signal) """ s = doc_id.upper() # Remove common prefixes / fillers for prefix in ("DOW-UAP-", "DOS-UAP-", "NASA-UAP-", "FBI-PHOTO-", "DOC-"): if s.startswith(prefix): s = s[len(prefix):] break # Take first ~6 alphanumeric chars s = re.sub(r"[^A-Z0-9]", "", s)[:8] return s or "X" def make_crop_id(prefix: str, doc_id: str, page_num: int, idx: int) -> str: return f"{prefix}-{doc_id_short(doc_id)}-p{page_num:03d}-{idx:02d}" def crop_bbox( *, src_png: Path, dest_png: Path, bbox: dict, padding: float, force: bool, ) -> tuple[bool, str | None]: """Crop src_png by bbox and save to dest_png. Returns (created, reason_skipped_or_error). """ if not force and dest_png.exists() and dest_png.stat().st_size > 0: return (False, "exists") try: with Image.open(src_png) as im: W, H = im.size x = float(bbox.get("x", 0)) y = float(bbox.get("y", 0)) w = float(bbox.get("w", 0)) h = float(bbox.get("h", 0)) if w <= 0 or h <= 0: return (False, "zero-size-bbox") # Apply padding x_pad = max(0.0, x - padding) y_pad = max(0.0, y - padding) w_pad = min(1.0 - x_pad, w + 2 * padding) h_pad = min(1.0 - y_pad, h + 2 * padding) # Pixel coords px = int(round(x_pad * W)) py = int(round(y_pad * H)) pw = max(1, int(round(w_pad * W))) ph = max(1, int(round(h_pad * H))) crop = im.crop((px, py, px + pw, py + ph)) dest_png.parent.mkdir(parents=True, exist_ok=True) crop.save(dest_png, "PNG", optimize=True) return (True, None) except Exception as e: return (False, f"error: {e}") def process_page(page_md: Path, force: bool) -> dict: """Returns counts {created, skipped, error} for this page.""" fm = read_frontmatter(page_md) if not fm or fm.get("type") != "page": return {"created": 0, "skipped": 0, "error": 0} doc_id = fm.get("doc_id", "") page_id = fm.get("page_id", "") png_rel = fm.get("png_path", "") if not doc_id or not page_id or not png_rel: return {"created": 0, "skipped": 0, "error": 0} src_png = (page_md.parent / png_rel).resolve() if not src_png.exists(): sys.stderr.write(f" ✗ source PNG missing: {src_png}\n") return {"created": 0, "skipped": 0, "error": 1} page_num = int(fm.get("page_number", 0)) counts = {"created": 0, "skipped": 0, "error": 0} # 1. Images detected for idx, item in enumerate(fm.get("images_detected") or [], start=1): bbox = item.get("bbox") if not bbox: continue crop_id = make_crop_id("IMG", doc_id, page_num, idx) dest = CROPS_BASE / doc_id / f"{crop_id}.png" created, reason = crop_bbox( src_png=src_png, dest_png=dest, bbox=bbox, padding=PADDING_FRACTION, force=force, ) if created: counts["created"] += 1 elif reason == "exists": counts["skipped"] += 1 else: counts["error"] += 1 # 2. Tables detected for idx, item in enumerate(fm.get("tables_detected") or [], start=1): bbox = item.get("bbox") if not bbox: continue crop_id = make_crop_id("TBL", doc_id, page_num, idx) dest = CROPS_BASE / doc_id / f"{crop_id}.png" created, reason = crop_bbox( src_png=src_png, dest_png=dest, bbox=bbox, padding=PADDING_FRACTION * 2, force=force, ) if created: counts["created"] += 1 elif reason == "exists": counts["skipped"] += 1 else: counts["error"] += 1 # 3. Signatures observed for idx, item in enumerate(fm.get("signatures_observed") or [], start=1): bbox = item.get("bbox") if not bbox: continue crop_id = make_crop_id("SIG", doc_id, page_num, idx) dest = CROPS_BASE / doc_id / f"{crop_id}.png" created, reason = crop_bbox( src_png=src_png, dest_png=dest, bbox=bbox, padding=PADDING_FRACTION, force=force, ) if created: counts["created"] += 1 elif reason == "exists": counts["skipped"] += 1 else: counts["error"] += 1 return counts def main(): ap = argparse.ArgumentParser(description="Eager-crop bounding boxes from page PNGs.") g = ap.add_mutually_exclusive_group() g.add_argument("--doc-id", help="restrict to a single doc_id") g.add_argument("--all", action="store_true", help="process all pages (default)") ap.add_argument("--force", action="store_true", help="overwrite existing crops") args = ap.parse_args() if args.doc_id: glob = PAGES_BASE / args.doc_id / "*.md" pages = sorted(Path(str(glob).rsplit("/", 1)[0]).glob("*.md")) else: pages = sorted(PAGES_BASE.rglob("*.md")) totals = {"created": 0, "skipped": 0, "error": 0} doc_summary: dict[str, dict] = {} print(f"Cropping bboxes from {len(pages)} page(s)...", flush=True) for page_md in pages: c = process_page(page_md, args.force) doc_id = page_md.parent.name doc_summary.setdefault(doc_id, {"created": 0, "skipped": 0, "error": 0}) for k in c: totals[k] += c[k] doc_summary[doc_id][k] += c[k] for doc_id, c in sorted(doc_summary.items()): if c["created"] or c["error"]: print(f" {doc_id}: created={c['created']} skipped={c['skipped']} error={c['error']}", flush=True) print(f"\nTotal: created={totals['created']}, skipped={totals['skipped']}, error={totals['error']}", flush=True) if totals["created"] > 0 or totals["error"] > 0: with open(LOG_PATH, "a", encoding="utf-8") as fh: fh.write(f"\n## {utc_now_iso()} — CROP BBOXES (Phase 5.5)\n") fh.write(f"- operator: archivist\n- script: scripts/05-crop-bboxes.py\n") fh.write(f"- target: {args.doc_id or '(all)'}\n") fh.write(f"- created: {totals['created']}\n- skipped: {totals['skipped']}\n- error: {totals['error']}\n") if __name__ == "__main__": main()