#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
rebuild_doc65.py — Rebuild doc-65-hs1-834228961-62-hq-83894-section-4

Processes all 179 pages, writes chunks/, images/, _index.json, document.md
"""
|
|
|
|
import os
|
|
import sys
|
|
import json
|
|
import base64
|
|
import datetime
|
|
import time
|
|
from pathlib import Path
|
|
|
|
# Identity of the document this script rebuilds.
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-4"
DOC_TITLE = "65 HS1-834228961 62-HQ-83894 Section 4 — FBI Flying Discs Investigation"

# Per-document locations: raw output tree, page PNGs, and page OCR text.
RAW_DIR = Path(f"/Users/guto/ufo/raw/{DOC_ID}")
PNG_DIR = Path(f"/Users/guto/ufo/processing/png/{DOC_ID}")
OCR_DIR = Path(f"/Users/guto/ufo/processing/ocr/{DOC_ID}")

# Output sub-directories that live under the raw tree.
CHUNKS_DIR = RAW_DIR / "chunks"
IMAGES_DIR = RAW_DIR / "images"
TABLES_DIR = RAW_DIR / "tables"

# Create the output directories up front (a no-op when they already exist).
for _out_dir in (CHUNKS_DIR, IMAGES_DIR, TABLES_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
# Build ordered page map: page_number (1-based) -> (png_filename, ocr_filename)
|
|
def build_page_map():
    """Pair page PNGs with OCR text files by sorted position.

    Lists the ``.png`` files in ``PNG_DIR`` and the ``.txt`` files in
    ``OCR_DIR``, sorts each list by filename, and pairs them positionally.
    Pairing stops at the shorter of the two lists.

    Returns:
        dict mapping a 1-based page number to a dict with keys ``'png'`` and
        ``'ocr'`` (full paths as strings) and ``'png_filename'`` and
        ``'ocr_filename'`` (bare filenames).
    """
    png_names = sorted(name for name in os.listdir(PNG_DIR) if name.endswith('.png'))
    ocr_names = sorted(name for name in os.listdir(OCR_DIR) if name.endswith('.txt'))
    paired = zip(png_names, ocr_names)
    return {
        page_no: {
            'png': str(PNG_DIR / png_name),
            'ocr': str(OCR_DIR / ocr_name),
            'png_filename': png_name,
            'ocr_filename': ocr_name
        }
        for page_no, (png_name, ocr_name) in enumerate(paired, start=1)
    }
|
|
|
|
def read_ocr(path):
    """Return the stripped UTF-8 text of the OCR file at *path*.

    Reading is best-effort: a file that cannot be opened or decoded yields
    an empty string instead of raising.

    Args:
        path: Location of the OCR text file (anything ``open`` accepts).

    Returns:
        The file contents with leading/trailing whitespace removed, or ``""``
        when the file is missing, unreadable, or not valid UTF-8.
    """
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return f.read().strip()
    # OSError covers missing/unreadable files; ValueError covers
    # UnicodeDecodeError (its subclass) and malformed path strings.
    # A bare ``except`` would also swallow KeyboardInterrupt/SystemExit.
    except (OSError, ValueError):
        return ""
|
|
|
|
def encode_image_b64(path):
    """Read the file at *path* in binary mode and return it as base64 text.

    Args:
        path: Location of the file to encode.

    Returns:
        The standard-alphabet base64 encoding of the file's bytes, as a str.
    """
    with open(path, 'rb') as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
|
|
|
|
def now_iso():
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ``.

    Uses a timezone-aware ``datetime.now(timezone.utc)`` rather than
    ``datetime.utcnow()``, which is deprecated since Python 3.12. The
    formatted string is the same.
    """
    return datetime.datetime.now(datetime.timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
|
|
|
|
if __name__ == "__main__":
    # Smoke check: build the page map and show the first few pairings.
    page_map = build_page_map()
    print(f"Total pages: {len(page_map)}")
    preview = list(page_map.items())[:5]
    for page_no, entry in preview:
        print(f"  Page {page_no:03d}: png={entry['png_filename']}, ocr={entry['ocr_filename']}")
    print("Script loaded OK")
|