#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
rebuild_doc65.py — Rebuild doc-65-hs1-834228961-62-hq-83894-section-4

Processes all 179 pages, writes chunks/, images/, _index.json, document.md

NOTE(review): the code in this file only creates the output directories,
builds the page map and defines small I/O helpers; running it as a script
prints a summary of the first pages.  The chunk/index/markdown writing
described above is not performed here — confirm where it lives.
"""

import os
import sys
import json
import base64
import datetime
import time
from pathlib import Path

DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-4"
DOC_TITLE = "65 HS1-834228961 62-HQ-83894 Section 4 — FBI Flying Discs Investigation"

# Input locations (page images and their OCR text) and the output root.
RAW_DIR = Path(f"/Users/guto/ufo/raw/{DOC_ID}")
PNG_DIR = Path(f"/Users/guto/ufo/processing/png/{DOC_ID}")
OCR_DIR = Path(f"/Users/guto/ufo/processing/ocr/{DOC_ID}")

CHUNKS_DIR = RAW_DIR / "chunks"
IMAGES_DIR = RAW_DIR / "images"
TABLES_DIR = RAW_DIR / "tables"

# Ensure dirs exist.  This runs at import time, as in the original script.
CHUNKS_DIR.mkdir(parents=True, exist_ok=True)
IMAGES_DIR.mkdir(parents=True, exist_ok=True)
TABLES_DIR.mkdir(parents=True, exist_ok=True)


def build_page_map(png_dir=None, ocr_dir=None) -> dict:
    """Build the ordered page map for the document.

    PNG and OCR files are each sorted by filename and paired by position,
    so page N is the N-th ``.png`` together with the N-th ``.txt``.

    Args:
        png_dir: Directory holding the page images.  Defaults to ``PNG_DIR``.
        ocr_dir: Directory holding the OCR text files.  Defaults to
            ``OCR_DIR``.

    Returns:
        A dict mapping the 1-based page number to a dict with the keys
        ``'png'`` and ``'ocr'`` (full paths as strings) and
        ``'png_filename'`` and ``'ocr_filename'`` (bare file names).

    Raises:
        OSError: If either directory cannot be listed.
    """
    png_dir = PNG_DIR if png_dir is None else Path(png_dir)
    ocr_dir = OCR_DIR if ocr_dir is None else Path(ocr_dir)

    pngs = sorted(f for f in os.listdir(png_dir) if f.endswith('.png'))
    ocrs = sorted(f for f in os.listdir(ocr_dir) if f.endswith('.txt'))

    # zip() stops at the shorter list, so a count mismatch would silently
    # drop pages; surface it without changing the returned map.
    if len(pngs) != len(ocrs):
        print(
            f"WARNING: {len(pngs)} PNG files but {len(ocrs)} OCR files; "
            f"only the first {min(len(pngs), len(ocrs))} pages are mapped",
            file=sys.stderr,
        )

    page_map = {}
    for page_number, (png, ocr) in enumerate(zip(pngs, ocrs), 1):
        page_map[page_number] = {
            'png': str(png_dir / png),
            'ocr': str(ocr_dir / ocr),
            'png_filename': png,
            'ocr_filename': ocr,
        }
    return page_map


def read_ocr(path) -> str:
    """Return the stripped UTF-8 text of the OCR file at *path*.

    Best-effort: a missing, unreadable or undecodable file yields an empty
    string rather than an exception.
    """
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return f.read().strip()
    except (OSError, UnicodeDecodeError):
        # Only I/O and decoding failures are treated as "no text"; anything
        # else (e.g. KeyboardInterrupt or a bad argument type) propagates.
        return ""


def encode_image_b64(path) -> str:
    """Return the contents of the file at *path* as a base64 ASCII string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(path, 'rb') as f:
        return base64.standard_b64encode(f.read()).decode('utf-8')


def now_iso() -> str:
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``."""
    # datetime.utcnow() is deprecated since Python 3.12; an aware UTC
    # datetime produces the same formatted string.
    return datetime.datetime.now(datetime.timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')


def main() -> None:
    """Build the page map and print a short summary of the first five pages."""
    page_map = build_page_map()
    print(f"Total pages: {len(page_map)}")
    for p, info in list(page_map.items())[:5]:
        print(f"  Page {p:03d}: png={info['png_filename']}, ocr={info['ocr_filename']}")
    print("Script loaded OK")


if __name__ == "__main__":
    main()