#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
rebuild_doc65_section8.py
Direct Gemini-powered rebuild of doc-65-hs1-834228961-62-hq-83894-section-8.
Produces: chunks/, images/, tables/, _index.json, document.md
"""
|
|
|
|
import os
|
|
import sys
|
|
import json
|
|
import re
|
|
import time
|
|
import base64
|
|
import datetime
|
|
from pathlib import Path
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed, TimeoutError as FuturesTimeout
|
|
|
|
from PIL import Image
|
|
import google.genai as genai
|
|
from google.genai import types
|
|
|
|
# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
# Identifier of the document being rebuilt. It doubles as the directory name
# under the raw/, processing/png/ and processing/ocr/ trees defined below.
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-8"
# Human-readable title; interpolated into the page prompt and document.md.
DOC_TITLE = "65 HS1-834228961/62-HQ-83894 Section 8"
# NOTE(review): not referenced anywhere in the visible part of this file.
HIGHEST_CLASS = "TOP SECRET"

# NOTE(review): absolute, machine-specific paths — the script only runs
# unmodified on a host that has this exact directory layout.
RAW_DIR = Path(f"/Users/guto/ufo/raw/{DOC_ID}")
# Input page images, globbed as "p-*.png" by get_page_files().
PNG_DIR = Path(f"/Users/guto/ufo/processing/png/{DOC_ID}")
# Per-page OCR text files, looked up as "p-NNN.txt" or "p-N.txt".
OCR_DIR = Path(f"/Users/guto/ufo/processing/ocr/{DOC_ID}")
# Output locations, all below RAW_DIR.
CHUNKS_DIR = RAW_DIR / "chunks"
IMAGES_DIR = RAW_DIR / "images"
TABLES_DIR = RAW_DIR / "tables"
# Phase 1 checkpoint: a JSON list with one result dict per processed page.
PAGES_RAW = RAW_DIR / "pages_raw.json"

# Model identifier passed to client.models.generate_content().
MODEL = "models/gemini-3.1-flash-lite"
# Number of pages processed concurrently in phase 1.
MAX_WORKERS = 4
PAGE_TIMEOUT = 150  # seconds per page

# Closed set of chunk types. parse_page_json() rewrites any other value
# returned by the model to "unknown". Mirrors the enum listed in PAGE_PROMPT.
VALID_TYPES = {
    "letterhead", "address_block", "classification_marking", "heading",
    "paragraph", "form_field", "bulleted_item", "numbered_item", "quote_block",
    "caption", "table_marker", "image", "stamp", "signature", "marginalia",
    "redaction", "footer", "blank_area", "unknown",
}

# ---------------------------------------------------------------------------
# Gemini client
# ---------------------------------------------------------------------------
# Created at import time. The key comes from GEMINI_API_KEY, falling back to
# GOOGLE_API_KEY; if neither is set, api_key is passed as None.
client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY"))
|
|
|
|
# ---------------------------------------------------------------------------
# Page-rebuilder prompt
# ---------------------------------------------------------------------------
# Template filled by call_gemini() via str.format() with the fields doc_id,
# page_number, total_pages, doc_title and ocr_text. Literal JSON braces are
# doubled ({{ / }}) so that str.format() emits them as single braces.
PAGE_PROMPT = """\
You are a forensic document reconstruction agent for The Disclosure Bureau.
Given a single page image (PNG) and its raw OCR text from a US Department of War
declassified UAP/UFO document, decompose it into LOSSLESS agentic chunks.

## Chunk types — STRICT enum (use EXACTLY one of these 19 strings):
letterhead, address_block, classification_marking, heading, paragraph,
form_field, bulleted_item, numbered_item, quote_block, caption, table_marker,
image, stamp, signature, marginalia, redaction, footer, blank_area, unknown

## Output: ONE JSON object — NO markdown fences, NO prose before/after.
{{
"page_number": {page_number},
"page_summary_en": "1-2 sentences describing this page",
"page_summary_pt_br": "1-2 frases em português brasileiro",
"page_layout": {{
"columns": 1,
"orientation": "portrait",
"page_dimensions_approx": "letter"
}},
"chunks": [
{{
"order_in_page": 1,
"type": "paragraph",
"bbox": {{"x": 0.05, "y": 0.10, "w": 0.90, "h": 0.08}},
"content_en": "verbatim English text of this chunk",
"content_pt_br": "Texto em português brasileiro",
"metadata": {{
"ocr_confidence": 0.95,
"ocr_source_lines": [1, 2, 3],
"classification": null,
"redaction_code": null,
"redaction_inferred_content_type": null,
"image_type": null,
"formatting": [],
"cross_page_hint": "self_contained",
"prev_chunk_hint": null,
"next_chunk_hint": null,
"language_in_source": "en"
}}
}}
]
}}

## Rules:
1. Order by reading order (top→bottom, left→right). order_in_page is 1-indexed.
2. One semantic unit per chunk (one paragraph, one address block, one image, etc.).
3. ALL content accounted for — never skip anything, even blank areas if significant.
4. content_en: verbatim/near-verbatim. No paraphrasing.
5. content_pt_br: Brazilian Portuguese (pt-BR). Preserve UTF-8 accents: ç ã á é í ó ú â ê ô à.
Proper nouns and verbatim quoted passages stay in source language inside pt-br.
6. Redacted blocks: content_en = "[REDACTED — <code>]". Never fabricate hidden content.
7. bbox: normalized 0..1 relative to page PNG size. Tight around the chunk.
8. cross_page_hint: self_contained | continues_from_prev | continues_to_next
9. image chunks: content_en = brief 1-sentence placeholder description (will be analyzed separately).
10. classification field: exact string as it appears (e.g. "TOP SECRET", "SECRET//NOFORN") or null.

Document context:
doc_id: {doc_id}
page_number: {page_number} of {total_pages}
doc_title: {doc_title}

OCR text (layout-preserved, may have errors — trust the image when they disagree):
---
{ocr_text}
---

Now analyze the image + OCR and output the JSON:"""
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def get_page_files(png_dir=None, ocr_dir=None):
    """Discover page images and their OCR text files.

    Args:
        png_dir: Directory holding ``p-<n>.png`` page images. Defaults to
            the module-level ``PNG_DIR``.
        ocr_dir: Directory holding the OCR text files. Defaults to the
            module-level ``OCR_DIR``.

    Returns:
        A list of ``(page_number, png_path, ocr_path_or_None)`` tuples sorted
        by numeric page number. ``ocr_path`` is the first existing file among
        ``p-NNN.txt`` (zero-padded to three digits) and ``p-N.txt``.
    """
    png_dir = PNG_DIR if png_dir is None else Path(png_dir)
    ocr_dir = OCR_DIR if ocr_dir is None else Path(ocr_dir)

    pages = []
    for png in png_dir.glob("p-*.png"):
        m = re.fullmatch(r"p-0*(\d+)\.png", png.name)
        if not m:
            continue
        pn = int(m.group(1))
        # OCR: try zero-padded 3-digit, then bare number
        candidates = (ocr_dir / f"p-{pn:03d}.txt", ocr_dir / f"p-{pn}.txt")
        ocr = next((path for path in candidates if path.exists()), None)
        pages.append((pn, png, ocr))

    # Sort numerically: a plain path sort would put p-10.png before p-2.png
    # whenever the file names are not zero-padded.
    pages.sort(key=lambda entry: (entry[0], entry[1].name))
    return pages
|
|
|
|
|
|
def encode_png(path):
    """Return the contents of the file at *path* as a base64 text string."""
    with open(path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode()
|
|
|
|
|
|
def call_gemini(png_path, ocr_text, page_num, total_pages):
    """Send one page image plus its OCR text to the model and return the raw reply.

    Args:
        png_path: Path of the page PNG; its bytes are sent inline.
        ocr_text: OCR text for the page. Only the first 5000 characters are
            placed in the prompt.
        page_num: Page number interpolated into the prompt.
        total_pages: Page count interpolated into the prompt.

    Returns:
        The response text, or ``None`` when the response carries no text at
        all (``resp.text`` is ``None`` and no part has text).

    Raises:
        concurrent.futures.TimeoutError: if no reply arrives within
            ``PAGE_TIMEOUT`` seconds.
        Exception: whatever the client call itself raises.
    """
    prompt = PAGE_PROMPT.format(
        doc_id=DOC_ID,
        page_number=page_num,
        total_pages=total_pages,
        doc_title=DOC_TITLE,
        ocr_text=ocr_text[:5000],
    )

    with open(png_path, "rb") as f:
        img_bytes = f.read()

    contents = [
        types.Part(
            inline_data=types.Blob(mime_type="image/png", data=img_bytes)
        ),
        types.Part(text=prompt),
    ]
    config = types.GenerateContentConfig(
        temperature=0.1,
        max_output_tokens=8192,
    )

    def _call():
        resp = client.models.generate_content(
            model=MODEL, contents=contents, config=config
        )
        if resp.text is not None:
            return resp.text
        # Safety block or empty response — extract any available text from parts
        try:
            parts = resp.candidates[0].content.parts
            joined = "\n".join(p.text for p in parts if getattr(p, "text", None))
        except Exception:
            return None
        # An empty join means there was no text at all; report it as "no
        # response" instead of handing an empty string to the JSON parser.
        return joined or None

    # Do NOT use the executor as a context manager here: its __exit__ calls
    # shutdown(wait=True), which blocks until the request finishes and so
    # defeats the timeout. The worker thread cannot be killed; after a
    # timeout it keeps running in the background until the request returns.
    ex = ThreadPoolExecutor(max_workers=1)
    try:
        return ex.submit(_call).result(timeout=PAGE_TIMEOUT)
    finally:
        ex.shutdown(wait=False)
|
|
|
|
|
|
def parse_page_json(raw_text, page_num, valid_types=None):
    """Parse the model reply for one page into a normalized page dict.

    Markdown code fences are stripped first. If the remaining text is not
    valid JSON, the span from the first ``{`` to the last ``}`` is tried.

    Args:
        raw_text: Raw reply text from the model (``None`` is treated as empty).
        page_num: Page number; always overrides any ``page_number`` in the reply.
        valid_types: Allowed chunk ``type`` values. Defaults to the
            module-level ``VALID_TYPES``.

    Returns:
        On success, the parsed dict with ``page_number`` set and ``chunks``
        guaranteed to be a list of dicts whose ``type`` is in the allowed
        set (anything else becomes ``"unknown"``). On failure, a dict with
        ``error`` set to ``"no_json_found"``, ``"json_parse_failed"`` or
        ``"json_not_object"``, an empty ``chunks`` list and a 300-character
        ``raw`` excerpt.
    """
    allowed = VALID_TYPES if valid_types is None else valid_types

    text = (raw_text or "").strip()
    text = re.sub(r"^```(?:json)?\s*", "", text, flags=re.MULTILINE)
    text = re.sub(r"\s*```\s*$", "", text, flags=re.MULTILINE)
    text = text.strip()

    def _failure(code):
        return {"page_number": page_num, "error": code,
                "chunks": [], "raw": text[:300]}

    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        # Try to extract the largest {...} block
        m = re.search(r"\{[\s\S]*\}", text)
        if not m:
            return _failure("no_json_found")
        try:
            data = json.loads(m.group(0))
        except json.JSONDecodeError:
            return _failure("json_parse_failed")

    # Valid JSON that is not an object (a list, string, number, ...) cannot
    # be a page record; report it instead of raising further down.
    if not isinstance(data, dict):
        return _failure("json_not_object")

    data["page_number"] = page_num

    # Validate and normalize chunk types. Entries that are not objects are
    # dropped because later phases attach keys to every chunk.
    chunks = data.get("chunks")
    if not isinstance(chunks, list):
        chunks = []
    chunks = [c for c in chunks if isinstance(c, dict)]
    for c in chunks:
        kind = c.get("type")
        if not isinstance(kind, str) or kind not in allowed:
            c["type"] = "unknown"
    data["chunks"] = chunks
    return data
|
|
|
|
|
|
def fallback_chunk(page_num, ocr_text):
    """Build a placeholder page holding a single full-page "unknown" chunk.

    Used when Gemini fails persistently. The chunk text embeds the first 200
    characters of *ocr_text* (stripped), or a fixed notice when there is no
    non-blank OCR text.
    """
    has_ocr = bool(ocr_text and ocr_text.strip())
    if has_ocr:
        preview = ocr_text[:200].strip()
    else:
        preview = "(page content unavailable)"

    metadata = {
        "ocr_confidence": 0.0,
        "ocr_source_lines": [],
        "classification": None,
        "redaction_code": None,
        "redaction_inferred_content_type": None,
        "image_type": None,
        "formatting": [],
        "cross_page_hint": "self_contained",
        "prev_chunk_hint": None,
        "next_chunk_hint": None,
        "language_in_source": "en",
    }
    # One chunk whose bounding box covers the whole page.
    placeholder = {
        "order_in_page": 1,
        "type": "unknown",
        "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
        "content_en": f"[Vision analysis failed — OCR excerpt: {preview}]",
        "content_pt_br": f"[Análise de visão falhou — trecho OCR: {preview}]",
        "metadata": metadata,
    }
    layout = {"columns": 1, "orientation": "portrait", "page_dimensions_approx": "letter"}
    return {
        "page_number": page_num,
        "page_summary_en": f"Page {page_num} — content could not be parsed by vision model.",
        "page_summary_pt_br": f"Página {page_num} — conteúdo não pôde ser analisado pelo modelo de visão.",
        "page_layout": layout,
        "chunks": [placeholder],
    }
|
|
|
|
|
|
def process_page(page_num, png_path, ocr_path, total_pages, use_fallback=False):
    """Produce the page dict for one page.

    Reads the OCR text (or uses a fixed notice when *ocr_path* is falsy),
    then either returns the fallback placeholder page (``use_fallback``) or
    calls the model and parses its reply. Failures of the model call or of
    parsing are returned as a dict with an ``error`` string and no chunks
    rather than raised.
    """
    if ocr_path:
        ocr_text = ocr_path.read_text(encoding="utf-8", errors="replace")
    else:
        ocr_text = "(OCR not available)"

    if use_fallback:
        return fallback_chunk(page_num, ocr_text)

    def _failed(reason):
        return {"page_number": page_num, "error": reason, "chunks": []}

    try:
        reply = call_gemini(png_path, ocr_text, page_num, total_pages)
        if reply is not None:
            return parse_page_json(reply, page_num)
        return _failed("gemini_none_response")
    except FuturesTimeout:
        return _failed("timeout")
    except Exception as exc:
        return _failed(str(exc)[:200])
|
|
|
|
|
|
def is_valid_page(p):
    """Return True when page dict *p* has at least one chunk and no error."""
    if not p.get("chunks"):
        return False
    return not p.get("error")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Phase 1: process all pages
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def phase_process_pages(pages):
    """Phase 1: run every page through the model, resuming from the checkpoint.

    Pages already stored as valid in ``PAGES_RAW`` are reused. Pages stored
    with an error are not sent to the model again; they are replaced by the
    fallback placeholder page. All other pages are processed concurrently
    with ``MAX_WORKERS`` threads, and the checkpoint is rewritten after every
    10 completed pages and once more at the end.

    Args:
        pages: List of ``(page_number, png_path, ocr_path)`` tuples.

    Returns:
        Dict mapping page number to its page dict.
    """
    total = len(pages)
    print(f"[Phase 1] Processing {total} pages with {MODEL} ...")

    # Load existing checkpoint
    existing_map = {}
    failed_pages = set()
    if PAGES_RAW.exists():
        try:
            existing = json.loads(PAGES_RAW.read_text(encoding="utf-8"))
        except (OSError, ValueError) as exc:
            # Best effort: an unreadable checkpoint must not abort the run,
            # but say so instead of silently redoing every page.
            print(f"  WARN checkpoint unreadable, starting fresh: {exc}")
            existing = []
        if not isinstance(existing, list):
            print("  WARN checkpoint is not a JSON list, starting fresh")
            existing = []
        for p in existing:
            if not isinstance(p, dict) or "page_number" not in p:
                continue
            if is_valid_page(p):
                existing_map[p["page_number"]] = p
            elif p.get("error"):
                failed_pages.add(p["page_number"])
        print(f"  Checkpoint: {len(existing_map)} valid pages loaded, {len(failed_pages)} previously failed")

    to_process = [(pn, pp, op) for pn, pp, op in pages if pn not in existing_map]
    print(f"  Remaining: {len(to_process)} pages")

    results_map = dict(existing_map)

    def _save_checkpoint():
        """Write results in page order; replace the file in one step."""
        ordered = [results_map[p[0]] for p in pages if p[0] in results_map]
        tmp_path = PAGES_RAW.with_name(PAGES_RAW.name + ".tmp")
        tmp_path.write_text(
            json.dumps(ordered, ensure_ascii=False, indent=2), encoding="utf-8"
        )
        # Rename over the old file so an interrupted run never leaves a
        # half-written checkpoint behind.
        tmp_path.replace(PAGES_RAW)
        return ordered

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = {
            executor.submit(process_page, pn, pp, op, total, pn in failed_pages): pn
            for pn, pp, op in to_process
        }
        done = 0
        for future in as_completed(futures):
            pn = futures[future]
            done += 1
            try:
                result = future.result(timeout=PAGE_TIMEOUT + 30)
            except Exception as exc:
                result = {"page_number": pn, "error": str(exc)[:200], "chunks": []}
            results_map[pn] = result
            nchunks = len(result.get("chunks") or [])
            if is_valid_page(result):
                status = "OK"
            else:
                # "error" may be missing or None (e.g. a page with no chunks).
                status = f"ERR({str(result.get('error') or '?')[:40]})"
            print(f"  [{done}/{len(to_process)}] p-{pn:03d}: {status} chunks={nchunks}")
            # Checkpoint every 10 pages
            if done % 10 == 0:
                _save_checkpoint()

    # Final save
    ordered = _save_checkpoint()
    print(f"  Saved {len(ordered)} pages to pages_raw.json")
    return results_map
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Phase 2: globally number chunks
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def phase_number_chunks(pages, results_map):
    """Phase 2: assign document-wide ids and prev/next links to every chunk.

    Chunks are taken page by page (in the order of *pages*), sorted within a
    page by ``order_in_page``, and numbered ``c0001``, ``c0002``, ... Each
    chunk dict is updated in place with ``chunk_id``, ``order_global``,
    ``page``, ``prev_chunk`` and ``next_chunk``.

    Args:
        pages: List of ``(page_number, png_path, ocr_path)`` tuples.
        results_map: Dict mapping page number to its page dict.

    Returns:
        List of ``(page_number, chunk_dict)`` tuples in global order.
    """
    print("[Phase 2] Globally numbering chunks ...")

    def _order_key(chunk):
        # order_in_page comes from the model and may be missing, null or a
        # numeric string; anything unusable sorts as 0 instead of raising.
        try:
            return float(chunk.get("order_in_page", 0))
        except (TypeError, ValueError):
            return 0.0

    all_chunks = []  # list of (page_num, chunk_dict)
    for pn, _, _ in pages:
        page_chunks = results_map.get(pn, {}).get("chunks") or []
        all_chunks.extend((pn, c) for c in sorted(page_chunks, key=_order_key))

    total_chunks = len(all_chunks)
    for i, (pn, c) in enumerate(all_chunks, 1):
        c["chunk_id"] = f"c{i:04d}"
        c["order_global"] = i
        c["page"] = pn
        c["prev_chunk"] = f"c{i-1:04d}" if i > 1 else None
        c["next_chunk"] = f"c{i+1:04d}" if i < total_chunks else None
    print(f"  Total chunks: {total_chunks}")
    return all_chunks
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Phase 3: crop image chunks
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def phase_crop_images(all_chunks, pages):
    """Phase 3: crop every ``image`` chunk out of its page PNG.

    For each image chunk the normalized bbox (padded by 0.005 on each side
    and clamped to the page) is cropped and saved as
    ``IMAGES_DIR / "IMG-<chunk_id>.png"``; the file name is stored on the
    chunk as ``related_image``. Crop failures are printed as warnings and do
    not stop the run.

    Args:
        all_chunks: List of ``(page_number, chunk_dict)`` tuples.
        pages: List of ``(page_number, png_path, ocr_path)`` tuples.
    """
    png_map = {pn: pp for pn, pp, _ in pages}
    image_chunks = [(pn, c) for pn, c in all_chunks if c.get("type") == "image"]
    print(f"[Phase 3] Cropping {len(image_chunks)} image chunks ...")

    for pn, c in image_chunks:
        cid = c["chunk_id"]
        file_name = f"IMG-{cid}.png"
        out_path = IMAGES_DIR / file_name
        if out_path.exists():
            # Already cropped by an earlier run: still record the link,
            # otherwise re-runs write related_image as null everywhere.
            # NOTE(review): chunk ids are renumbered on every run, so an
            # existing file may belong to a different chunk if page results
            # changed in between — confirm whether re-cropping is preferable.
            c["related_image"] = file_name
            continue
        png_path = png_map.get(pn)
        if not png_path:
            continue
        bbox = c.get("bbox", {})
        if not bbox:
            continue
        try:
            # Context manager so the page file handle is released per chunk.
            with Image.open(png_path) as im:
                W, H = im.size
                pad = 0.005
                x = bbox.get("x", 0)
                y = bbox.get("y", 0)
                w = bbox.get("w", 1)
                h = bbox.get("h", 1)
                left = max(0, int((x - pad) * W))
                top = max(0, int((y - pad) * H))
                right = min(W, int((x + w + pad) * W))
                bottom = min(H, int((y + h + pad) * H))
                # Skip degenerate boxes (zero or negative area after clamping).
                if right > left and bottom > top:
                    im.crop((left, top, right, bottom)).save(out_path)
                    c["related_image"] = file_name
        except Exception as exc:
            print(f"  WARN crop {cid}: {exc}")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Phase 4: write chunk files
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def phase_write_chunks(all_chunks, pages):
    """Phase 4: write one ``<chunk_id>.md`` file per chunk into ``CHUNKS_DIR``.

    Each file is a front-matter block (chunk identity, bbox, metadata and
    anomaly/image fields) followed by the English and pt-BR content.
    Missing or malformed model fields (null bbox values, null metadata, null
    content) are written as zeros / empty values instead of raising.

    Args:
        all_chunks: List of ``(page_number, chunk_dict)`` tuples, already
            numbered (each chunk has ``chunk_id``).
        pages: List of ``(page_number, png_path, ocr_path)`` tuples.
    """

    def _coord(box, key):
        # bbox values come from the model; fall back to 0 when unusable.
        try:
            return float(box.get(key, 0))
        except (TypeError, ValueError):
            return 0.0

    png_map = {pn: pp for pn, pp, _ in pages}
    print(f"[Phase 4] Writing {len(all_chunks)} chunk files ...")
    for pn, c in all_chunks:
        cid = c["chunk_id"]
        chunk_path = CHUNKS_DIR / f"{cid}.md"
        meta = c.get("metadata")
        if not isinstance(meta, dict):
            meta = {}
        bbox = c.get("bbox")
        if not isinstance(bbox, dict):
            bbox = {}
        png_path = png_map.get(pn, "")
        rel_png = f"../../processing/png/{DOC_ID}/{Path(png_path).name}" if png_path else "null"

        yaml_lines = [
            "---",
            f"chunk_id: {cid}",
            f"type: {c.get('type', 'unknown')}",
            f"page: {pn}",
            f"order_in_page: {c.get('order_in_page', 0)}",
            f"order_global: {c.get('order_global', 0)}",
            f"bbox: {{x: {_coord(bbox, 'x'):.4f}, y: {_coord(bbox, 'y'):.4f}, w: {_coord(bbox, 'w'):.4f}, h: {_coord(bbox, 'h'):.4f}}}",
            f"classification: {json.dumps(meta.get('classification'))}",
            f"formatting: {json.dumps(meta.get('formatting', []))}",
            f"cross_page_hint: {meta.get('cross_page_hint', 'self_contained')}",
            f"prev_chunk: {json.dumps(c.get('prev_chunk'))}",
            f"next_chunk: {json.dumps(c.get('next_chunk'))}",
            f"related_image: {json.dumps(c.get('related_image'))}",
            f"related_table: {json.dumps(c.get('related_table'))}",
            # json.dumps so a null confidence is written as "null", not "None".
            f"ocr_confidence: {json.dumps(meta.get('ocr_confidence', 0.0))}",
            f"ocr_source_lines: {json.dumps(meta.get('ocr_source_lines', []))}",
            f"redaction_code: {json.dumps(meta.get('redaction_code'))}",
            f"redaction_inferred_content_type: {json.dumps(meta.get('redaction_inferred_content_type'))}",
            f"image_type: {json.dumps(meta.get('image_type'))}",
            f"ufo_anomaly_detected: {str(c.get('ufo_anomaly_detected', False)).lower()}",
            f"cryptid_anomaly_detected: {str(c.get('cryptid_anomaly_detected', False)).lower()}",
            f"ufo_anomaly_type: {json.dumps(c.get('ufo_anomaly_type'))}",
            f"ufo_anomaly_rationale: {json.dumps(c.get('ufo_anomaly_rationale'))}",
            f"cryptid_anomaly_type: {json.dumps(c.get('cryptid_anomaly_type'))}",
            f"cryptid_anomaly_rationale: {json.dumps(c.get('cryptid_anomaly_rationale'))}",
            f"image_description_en: {json.dumps(c.get('image_description_en'))}",
            f"image_description_pt_br: {json.dumps(c.get('image_description_pt_br'))}",
            f"extracted_text: {json.dumps(c.get('extracted_text'))}",
            f"source_png: {rel_png}",
            "---",
        ]
        # "or ''" keeps a null content field from being written as "None".
        body = "\n".join(yaml_lines) + "\n\n"
        body += f"**EN:** {c.get('content_en') or ''}\n\n"
        body += f"**PT-BR:** {c.get('content_pt_br') or ''}\n"
        chunk_path.write_text(body, encoding="utf-8")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Phase 5: write _index.json
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def phase_write_index(all_chunks, pages):
    """Phase 5: write ``RAW_DIR / "_index.json"`` listing every chunk.

    Args:
        all_chunks: List of ``(page_number, chunk_dict)`` tuples, already
            numbered.
        pages: List of page tuples; only its length is used.

    Returns:
        The build timestamp string (UTC, ``YYYY-MM-DDTHH:MM:SSZ``) that was
        written into the index.
    """
    total_pages = len(pages)
    total_chunks = len(all_chunks)
    # datetime.utcnow() is deprecated since Python 3.12; an aware UTC "now"
    # formats to exactly the same string.
    build_at = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    index = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": total_pages,
        "total_chunks": total_chunks,
        "build_approach": "subagents",
        "build_model": MODEL,
        "build_at": build_at,
        "chunks": [],
    }

    for pn, c in all_chunks:
        cid = c["chunk_id"]
        # First 80 characters of the English content as a quick preview.
        preview = (c.get("content_en") or "")[:80]
        index["chunks"].append({
            "chunk_id": cid,
            "type": c.get("type", "unknown"),
            "page": pn,
            "order_in_page": c.get("order_in_page", 0),
            "order_global": c.get("order_global", 0),
            "file": f"chunks/{cid}.md",
            "bbox": c.get("bbox", {}),
            "preview": preview,
        })

    index_path = RAW_DIR / "_index.json"
    index_path.write_text(json.dumps(index, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"[Phase 5] Written _index.json ({total_chunks} entries)")
    return build_at
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Phase 6: assemble document.md
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def phase_assemble_document(all_chunks, pages, results_map, build_at):
    """Phase 6: assemble ``RAW_DIR / "document.md"`` from all chunks.

    The file starts with a front-matter block (counts, chunk-type histogram,
    flagged chunk ids, build info) and then has one ``## Page N`` section per
    page containing every chunk's EN / PT-BR text, an image link for cropped
    image chunks and a collapsible JSON metadata block. Null or malformed
    model fields (bbox values, metadata, content) are rendered as zeros /
    empty values instead of raising.

    Args:
        all_chunks: List of ``(page_number, chunk_dict)`` tuples, already
            numbered.
        pages: List of ``(page_number, png_path, ocr_path)`` tuples.
        results_map: Dict mapping page number to its page dict (source of
            the page summaries).
        build_at: Build timestamp string written into the front matter.

    Returns:
        ``(doc_bytes, ufo_flagged, cryptid_flagged)``: the UTF-8 size of the
        document and the chunk ids flagged for each anomaly kind.
    """
    total_pages = len(pages)
    total_chunks = len(all_chunks)

    def _coord(box, key):
        # bbox values come from the model; fall back to 0 when unusable.
        try:
            return float(box.get(key, 0))
        except (TypeError, ValueError):
            return 0.0

    # Histograms + anomaly lists, and chunks grouped by page
    type_hist = {}
    ufo_flagged = []
    cryptid_flagged = []
    chunks_by_page = {}
    for pn, c in all_chunks:
        ctype = c.get("type", "unknown")
        type_hist[ctype] = type_hist.get(ctype, 0) + 1
        if c.get("ufo_anomaly_detected"):
            ufo_flagged.append(c["chunk_id"])
        if c.get("cryptid_anomaly_detected"):
            cryptid_flagged.append(c["chunk_id"])
        chunks_by_page.setdefault(pn, []).append(c)

    # Front matter; the two trailing empty items yield "---\n\n".
    frontmatter = "\n".join([
        "---",
        'schema_version: "0.2.0"',
        "type: master_document",
        f"doc_id: {DOC_ID}",
        f'canonical_title: "{DOC_TITLE}"',
        f"total_pages: {total_pages}",
        f"total_chunks: {total_chunks}",
        f"chunk_types_histogram: {json.dumps(type_hist, ensure_ascii=False)}",
        "multi_page_tables: []",
        f"ufo_anomalies_flagged: {json.dumps(ufo_flagged)}",
        f"cryptid_anomalies_flagged: {json.dumps(cryptid_flagged)}",
        'build_approach: "subagents"',
        f'build_model: "{MODEL}"',
        f'build_at: "{build_at}"',
        "---",
        "",
        "",
    ])

    body_parts = []
    for pn, _, _ in pages:
        pg = results_map.get(pn, {})
        summary_en = pg.get("page_summary_en", "")
        summary_pt = pg.get("page_summary_pt_br", "")
        body_parts.append(f"\n## Page {pn}\n")
        if summary_en:
            body_parts.append(f"<!-- page_summary_en: {summary_en} -->\n")
        if summary_pt:
            body_parts.append(f"<!-- page_summary_pt_br: {summary_pt} -->\n")
        body_parts.append("\n")

        for c in chunks_by_page.get(pn, []):
            cid = c["chunk_id"]
            ctype = c.get("type", "unknown")
            bbox = c.get("bbox", {})
            box = bbox if isinstance(bbox, dict) else {}
            meta = c.get("metadata")
            if not isinstance(meta, dict):
                meta = {}
            bbox_str = (
                f"{_coord(box, 'x'):.2f}/{_coord(box, 'y'):.2f}/"
                f"{_coord(box, 'w'):.2f}/{_coord(box, 'h'):.2f}"
            )

            body_parts.append(f"<!-- chunk:{cid} src:./chunks/{cid}.md -->\n")
            body_parts.append(f'<a id="{cid}"></a>\n')
            body_parts.append(f"### Chunk {cid} — {ctype} · p{pn} · bbox: {bbox_str}\n\n")
            # "or ''" keeps a null content field from being rendered as "None".
            body_parts.append(f"**EN:** {c.get('content_en') or ''}\n\n")
            body_parts.append(f"**PT-BR:** {c.get('content_pt_br') or ''}\n\n")

            if ctype == "image" and c.get("related_image"):
                body_parts.append(f"\n\n")
                if c.get("image_description_en"):
                    body_parts.append(f"*Image (EN): {c['image_description_en']}*\n\n")
                if c.get("image_description_pt_br"):
                    body_parts.append(f"*Imagem (PT-BR): {c['image_description_pt_br']}*\n\n")

            # Metadata details block
            meta_json = {
                "chunk_id": cid,
                "type": ctype,
                "page": pn,
                "order_global": c.get("order_global"),
                "bbox": bbox,
                "classification": meta.get("classification"),
                "formatting": meta.get("formatting", []),
                "cross_page_hint": meta.get("cross_page_hint"),
                "ocr_confidence": meta.get("ocr_confidence"),
                "ufo_anomaly_detected": c.get("ufo_anomaly_detected", False),
                "cryptid_anomaly_detected": c.get("cryptid_anomaly_detected", False),
            }
            body_parts.append("<details><summary>metadata</summary>\n\n")
            body_parts.append("```json\n")
            body_parts.append(json.dumps(meta_json, ensure_ascii=False, indent=2))
            body_parts.append("\n```\n\n</details>\n\n---\n\n")

    doc_content = frontmatter + "".join(body_parts)
    doc_path = RAW_DIR / "document.md"
    doc_path.write_text(doc_content, encoding="utf-8")
    doc_bytes = len(doc_content.encode("utf-8"))
    print(f"[Phase 6] Written document.md ({doc_bytes:,} bytes)")
    return doc_bytes, ufo_flagged, cryptid_flagged
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Main
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def main():
    """Run the six rebuild phases in order and print a one-line STATS summary.

    Exits with status 1 when no page PNGs are found.
    """
    started = time.time()

    # Ensure output dirs exist
    for out_dir in (CHUNKS_DIR, IMAGES_DIR, TABLES_DIR):
        out_dir.mkdir(parents=True, exist_ok=True)

    pages = get_page_files()
    if not pages:
        print("ERROR: no PNG pages found", file=sys.stderr)
        sys.exit(1)
    total_pages = len(pages)
    print(f"Document: {DOC_ID}")
    print(f"Pages found: {total_pages}")

    # Phase 1: vision + OCR per page
    results_map = phase_process_pages(pages)
    # Phase 2: global chunk numbering
    all_chunks = phase_number_chunks(pages, results_map)
    # Phase 3: crop image chunks
    phase_crop_images(all_chunks, pages)
    # Phase 4: write chunk .md files
    phase_write_chunks(all_chunks, pages)
    # Phase 5: write _index.json
    build_at = phase_write_index(all_chunks, pages)
    # Phase 6: assemble document.md
    assembled = phase_assemble_document(all_chunks, pages, results_map, build_at)
    doc_bytes, ufo_flagged, cryptid_flagged = assembled

    wall = int(time.time() - started)
    # Counts are taken from what is on disk in the output directories.
    images_count = sum(1 for _ in IMAGES_DIR.glob("IMG-*.png"))
    tables_count = sum(1 for _ in TABLES_DIR.glob("TBL-*.csv"))

    stats_line = (
        f"\nSTATS pages_done={total_pages} chunks_total={len(all_chunks)} "
        f"images_extracted={images_count} tables_stitched={tables_count} "
        f"ufo_anomalies={len(ufo_flagged)} cryptid_anomalies={len(cryptid_flagged)} "
        f"wall_seconds={wall}"
    )
    print(stats_line)


if __name__ == "__main__":
    main()
|