disclosure-bureau/scripts/rebuild_doc65.py

64 lines
2 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
rebuild_doc65.py — Rebuild doc-65-hs1-834228961-62-hq-83894-section-4
Processes all 179 pages, writes chunks/, images/, _index.json, document.md
"""
import os
import sys
import json
import base64
import datetime
import time
from pathlib import Path
# --- Document identity -------------------------------------------------------
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-4"
DOC_TITLE = "65 HS1-834228961 62-HQ-83894 Section 4 — FBI Flying Discs Investigation"

# --- Input locations (page renders and per-page OCR text) --------------------
RAW_DIR = Path(f"/Users/guto/ufo/raw/{DOC_ID}")
PNG_DIR = Path(f"/Users/guto/ufo/processing/png/{DOC_ID}")
OCR_DIR = Path(f"/Users/guto/ufo/processing/ocr/{DOC_ID}")

# --- Output locations, all nested under the document's raw directory ---------
CHUNKS_DIR = RAW_DIR / "chunks"
IMAGES_DIR = RAW_DIR / "images"
TABLES_DIR = RAW_DIR / "tables"

# Create the output directories up front (including missing parents);
# directories that already exist are left untouched.
for _out_dir in (CHUNKS_DIR, IMAGES_DIR, TABLES_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)
# Build ordered page map: page_number (1-based) -> (png_filename, ocr_filename)
def build_page_map(png_dir=None, ocr_dir=None):
    """Pair page images with OCR text files, keyed by 1-based page number.

    Files are paired positionally: the ``.png`` names and the ``.txt`` names
    are each sorted lexicographically, and the i-th image is matched with the
    i-th text file.

    Args:
        png_dir: Directory holding the page ``.png`` files. Defaults to the
            module-level ``PNG_DIR``.
        ocr_dir: Directory holding the per-page ``.txt`` OCR files. Defaults
            to the module-level ``OCR_DIR``.

    Returns:
        A dict mapping page number (starting at 1) to a dict with the keys
        ``'png'`` and ``'ocr'`` (full paths as strings) and
        ``'png_filename'`` and ``'ocr_filename'`` (bare file names).

    Raises:
        OSError: If either directory cannot be listed.
    """
    png_dir = PNG_DIR if png_dir is None else Path(png_dir)
    ocr_dir = OCR_DIR if ocr_dir is None else Path(ocr_dir)

    pngs = sorted(f for f in os.listdir(png_dir) if f.endswith('.png'))
    ocrs = sorted(f for f in os.listdir(ocr_dir) if f.endswith('.txt'))

    # zip() stops at the shorter list, so a count mismatch silently drops
    # pages (and may pair an image with the wrong text). Keep the historical
    # best-effort result, but make the problem visible.
    if len(pngs) != len(ocrs):
        print(
            f"WARNING: {len(pngs)} PNG file(s) but {len(ocrs)} OCR file(s); "
            f"only the first {min(len(pngs), len(ocrs))} page(s) are paired",
            file=sys.stderr,
        )

    return {
        page: {
            'png': str(png_dir / png),
            'ocr': str(ocr_dir / ocr),
            'png_filename': png,
            'ocr_filename': ocr,
        }
        for page, (png, ocr) in enumerate(zip(pngs, ocrs), 1)
    }
def read_ocr(path):
    """Return the UTF-8 text of an OCR file with surrounding whitespace removed.

    Reading is best-effort: a page whose OCR file is missing, unreadable, or
    not valid UTF-8 yields an empty string instead of raising.

    Args:
        path: Location of the OCR ``.txt`` file.

    Returns:
        The stripped file contents, or ``""`` if the file could not be read
        or decoded.
    """
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return f.read().strip()
    # Only I/O and decoding failures are expected here. A bare ``except``
    # would also swallow KeyboardInterrupt/SystemExit and hide real bugs.
    except (OSError, UnicodeDecodeError):
        return ""
def encode_image_b64(path):
    """Read the file at *path* and return its bytes as a Base64 text string.

    The standard Base64 alphabet (``+`` and ``/``) is used, with padding.
    """
    with open(path, 'rb') as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return str(encoded, 'utf-8')
def now_iso():
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ``.

    Returns:
        A second-resolution ISO-8601 style timestamp string with a literal
        ``Z`` suffix.
    """
    # datetime.utcnow() is deprecated since Python 3.12; a timezone-aware
    # "now" in UTC formats to exactly the same string.
    return datetime.datetime.now(datetime.timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
if __name__ == "__main__":
    # Smoke test: build the page map, report how many pages were paired,
    # and list the first five pairings.
    pages = build_page_map()
    print(f"Total pages: {len(pages)}")
    preview = list(pages.items())[:5]
    for number, entry in preview:
        png_name = entry['png_filename']
        ocr_name = entry['ocr_filename']
        print(f" Page {number:03d}: png={png_name}, ocr={ocr_name}")
    print("Script loaded OK")