514 lines
20 KiB
Python
514 lines
20 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
rebuild_doc65_gemini.py
|
|
Full pipeline to rebuild doc-65-hs1-834228961-62-hq-83894-section-4.
|
|
Uses Google Gemini flash for vision analysis of each page.
|
|
CRITICAL: Always wraps Gemini calls with thread timeout (known hang issue).
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import json
|
|
import datetime
|
|
import time
|
|
import re
|
|
import concurrent.futures
|
|
from pathlib import Path
|
|
from PIL import Image as PILImage
|
|
|
|
import warnings
|
|
warnings.filterwarnings('ignore')
|
|
|
|
# Identity of the document being rebuilt; DOC_ID also names its working folders.
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-4"
DOC_TITLE = "65 HS1-834228961 62-HQ-83894 Section 4 — FBI Flying Discs / UAP Investigation Files"

# RAW_DIR receives the generated output; PNG_DIR / OCR_DIR are the per-page inputs.
RAW_DIR = Path("/Users/guto/ufo/raw") / DOC_ID
PNG_DIR = Path("/Users/guto/ufo/processing/png") / DOC_ID
OCR_DIR = Path("/Users/guto/ufo/processing/ocr") / DOC_ID
CHUNKS_DIR = RAW_DIR.joinpath("chunks")
IMAGES_DIR = RAW_DIR.joinpath("images")
TABLES_DIR = RAW_DIR.joinpath("tables")

# Gemini settings. The key is read from the environment (empty string if unset).
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
GEMINI_MODEL = "gemini-2.5-flash"
BATCH_SIZE = 4            # pages submitted per batch in main()
MAX_WORKERS = 4           # worker threads per batch
GEMINI_TIMEOUT_SEC = 120  # per-call wait limit used by analyze_page()

# Output folders are created at import time so later writes can assume they exist.
for d in (CHUNKS_DIR, IMAGES_DIR, TABLES_DIR):
    d.mkdir(parents=True, exist_ok=True)
|
|
|
|
def build_page_map():
    """Pair each page PNG with an OCR text file and number the pages from 1.

    Files are paired by position after sorting the ``.png`` names in PNG_DIR
    and the ``.txt`` names in OCR_DIR.

    Returns:
        dict mapping 1-based page number to
        ``{'png': <full path>, 'ocr': <full path>, 'png_filename': <name>}``.
    """
    pngs = sorted(f for f in os.listdir(PNG_DIR) if f.endswith('.png'))
    ocrs = sorted(f for f in os.listdir(OCR_DIR) if f.endswith('.txt'))

    # zip() stops at the shorter list, so a missing file would silently drop
    # trailing pages (and may shift the pairing). Make that visible.
    if len(pngs) != len(ocrs):
        print(
            f"WARNING: {len(pngs)} PNG files but {len(ocrs)} OCR files; "
            f"only the first {min(len(pngs), len(ocrs))} pairs will be used",
            flush=True,
        )

    return {
        page_num: {
            'png': str(PNG_DIR / png),
            'ocr': str(OCR_DIR / ocr),
            'png_filename': png,
        }
        for page_num, (png, ocr) in enumerate(zip(pngs, ocrs), 1)
    }
|
|
|
|
def read_ocr(path):
    """Return the stripped UTF-8 text of the OCR file at *path*.

    Reading is best-effort: a missing/unreadable file, undecodable bytes, or
    an unusable path value yields ``""`` instead of raising.
    """
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return f.read().strip()
    # OSError: missing/unreadable file; ValueError: covers UnicodeDecodeError;
    # TypeError: path of an unsupported type. KeyboardInterrupt and
    # SystemExit are deliberately no longer swallowed.
    except (OSError, ValueError, TypeError):
        return ""
|
|
|
|
def now_iso():
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ``."""
    # datetime.utcnow() is deprecated since Python 3.12; an aware "now" in UTC
    # formats to exactly the same string.
    return datetime.datetime.now(datetime.timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
|
|
|
|
# Compact prompt that minimizes token usage in the response.
# It is a %-format template with exactly two %d placeholders, both filled with
# the page number (see analyze_page). Any literal percent sign added to this
# text must be written as %%.
PAGE_ANALYSIS_PROMPT = "\n".join([
    'Analyze this FBI declassified document page. Return ONLY raw JSON (no markdown fences).',
    '',
    'JSON format (keep content_en values SHORT — max 300 chars per chunk, truncate with "..." if needed):',
    '{"page_number":%d,"chunks":[{"type":"<cover|letterhead|heading|subheading|body_text|caption|signature|stamp|redaction|table_marker|image|handwriting|footer|header|separator|blank>","order_in_page":<int>,"content_en":"<text>","content_pt_br":"<pt-br>","bbox":{"x":<0-1>,"y":<0-1>,"w":<0-1>,"h":<0-1>},"classification":null,"formatting":[],"cross_page_hint":"self_contained","ocr_confidence":<0-1>,"ocr_source_lines":[],"redaction_code":null,"redaction_inferred_content_type":null,"image_type":null,"ufo_anomaly_detected":<true|false>,"cryptid_anomaly_detected":false,"ufo_anomaly_type":null,"ufo_anomaly_rationale":null,"cryptid_anomaly_type":null,"cryptid_anomaly_rationale":null,"image_description_en":null,"image_description_pt_br":null,"extracted_text":null}]}',
    '',
    'Rules:',
    '- Each paragraph/section = separate chunk',
    '- Redacted: type=redaction, content_en="[REDACTED]"',
    '- Blank page: one chunk type=blank',
    '- Flying disc/UAP reports: ufo_anomaly_detected=true',
    '- bbox: x=left, y=top, w=width, h=height, all 0.0-1.0',
    '- Page %d of 179, FBI flying discs 1947',
])
|
|
|
|
def fallback_chunk(page_num):
    """Build the placeholder page result used when vision analysis fails.

    The result has the same shape as a successful analysis: a dict with
    ``page_number`` and a ``chunks`` list holding one ``body_text`` chunk that
    covers most of the page and carries a bilingual failure note.
    """
    placeholder = dict(
        type="body_text",
        order_in_page=1,
        content_en=f"[Page {page_num} — vision analysis failed]",
        content_pt_br=f"[Página {page_num} — análise visual falhou]",
        bbox=dict(x=0.05, y=0.05, w=0.90, h=0.90),
        classification=None,
        formatting=[],
        cross_page_hint="self_contained",
        ocr_confidence=0.0,
        ocr_source_lines=[],
        redaction_code=None,
        redaction_inferred_content_type=None,
        image_type=None,
        ufo_anomaly_detected=False,
        cryptid_anomaly_detected=False,
        ufo_anomaly_type=None,
        ufo_anomaly_rationale=None,
        cryptid_anomaly_type=None,
        cryptid_anomaly_rationale=None,
        image_description_en=None,
        image_description_pt_br=None,
        extracted_text=None,
    )
    return dict(page_number=page_num, chunks=[placeholder])
|
|
|
|
def _gemini_call_inner(page_num, png_path, prompt_text):
    """Send one page image plus prompt to Gemini and return the response text.

    Intended to be run inside a worker thread so the caller can stop waiting
    on it after a timeout. ``page_num`` is accepted but not used here.
    """
    # Imported lazily, as in the rest of this function's history: the SDK is
    # only needed when a call is actually made.
    from google import genai
    from google.genai import types as genai_types

    client = genai.Client(api_key=GEMINI_API_KEY)

    image_bytes = Path(png_path).read_bytes()
    image_part = genai_types.Part.from_bytes(data=image_bytes, mime_type='image/png')
    generation_config = genai_types.GenerateContentConfig(
        temperature=0.1,
        max_output_tokens=16384,
    )

    response = client.models.generate_content(
        model=GEMINI_MODEL,
        contents=[image_part, prompt_text],
        config=generation_config,
    )
    return response.text
|
|
|
|
def clean_json_text(text):
    """Extract a JSON value from a model response that may be fenced or truncated.

    Steps, in order:
      1. strip whitespace and Markdown code fences, then parse the whole text;
      2. parse the first JSON value that starts at the first ``{`` (handles
         prose before and/or after the JSON);
      3. repair a truncated response: cut the text at each ``}`` from the end
         backwards and try to close the ``chunks`` array and root object.

    Args:
        text: response text, or None.

    Returns:
        The parsed value (normally a dict), or None when *text* is None or
        nothing parseable is found.
    """
    if text is None:
        return None
    text = text.strip()
    # Remove markdown fences
    text = re.sub(r'^```(?:json)?\s*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\s*```\s*$', '', text, flags=re.MULTILINE)
    text = text.strip()

    # json.JSONDecodeError is a ValueError; RecursionError covers pathological
    # nesting. Anything else (e.g. KeyboardInterrupt) must propagate.
    parse_errors = (ValueError, RecursionError)

    # Try direct parse first
    try:
        return json.loads(text)
    except parse_errors:
        pass

    start = text.find('{')
    if start == -1:
        return None
    text_from_start = text[start:]

    # Fast path: a complete JSON value followed by trailing text. One linear
    # scan instead of re-parsing ever-shorter slices below.
    try:
        value, _end = json.JSONDecoder().raw_decode(text_from_start)
        return value
    except parse_errors:
        pass

    # Truncated output: find the last complete chunk object and close the
    # chunks array / root object after it.
    last_bracket = text_from_start.rfind('}')
    while last_bracket > 0:
        candidate = text_from_start[:last_bracket + 1]
        for suffix in ('', ']}', ']}}'):
            try:
                return json.loads(candidate + suffix)
            except parse_errors:
                continue
        last_bracket = text_from_start.rfind('}', 0, last_bracket)

    return None
|
|
|
|
def analyze_page(page_num, png_path, ocr_text, retry=3):
    """Run the Gemini vision analysis for one page, with timeout and retries.

    Args:
        page_num: page number, substituted into the prompt.
        png_path: path of the page image sent to the model.
        ocr_text: accepted for interface compatibility; not used in the prompt.
        retry: maximum number of attempts.

    Returns:
        The parsed page dict (with a non-empty ``chunks`` list of dicts), or
        ``fallback_chunk(page_num)`` when every attempt fails.
    """
    prompt = PAGE_ANALYSIS_PROMPT % (page_num, page_num)

    for attempt in range(retry):
        is_last_attempt = attempt >= retry - 1

        # Deliberately NOT a `with` block: leaving a `with ThreadPoolExecutor`
        # calls shutdown(wait=True), which blocks until the worker returns and
        # so defeats the timeout when the Gemini call hangs. A hung worker
        # thread is abandoned instead (it may still delay interpreter exit).
        pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)
        try:
            future = pool.submit(_gemini_call_inner, page_num, png_path, prompt)
            text = future.result(timeout=GEMINI_TIMEOUT_SEC)

            data = clean_json_text(text)
            if isinstance(data, dict) and isinstance(data.get('chunks'), list):
                # Later stages call .get() on every chunk, so keep dicts only.
                chunks = [c for c in data['chunks'] if isinstance(c, dict)]
                if chunks:
                    data['chunks'] = chunks
                    return data

            print(f" P{page_num} no valid JSON (attempt {attempt+1})", flush=True)
            if not is_last_attempt:
                time.sleep(2)

        except concurrent.futures.TimeoutError:
            print(f" P{page_num} TIMEOUT (attempt {attempt+1}/{retry})", flush=True)
            if not is_last_attempt:
                time.sleep(5)
        except Exception as e:
            err = str(e)
            print(f" P{page_num} error (attempt {attempt+1}): {err[:120]}", flush=True)
            if is_last_attempt:
                # Nothing left to retry, so waiting would only delay the fallback.
                pass
            elif '429' in err or 'RESOURCE_EXHAUSTED' in err:
                wait = 15 * (attempt + 1)
                print(f" Rate limit hit, waiting {wait}s...", flush=True)
                time.sleep(wait)
            else:
                time.sleep(3)
        finally:
            pool.shutdown(wait=False)

    return fallback_chunk(page_num)
|
|
|
|
def process_page_task(args):
    """Worker entry point: analyze one page and report its chunk count.

    Args:
        args: ``(page_num, png_path, ocr_path)`` tuple.

    Returns:
        ``(page_num, result)`` where *result* is the page dict from analyze_page.
    """
    page_num, png_path, ocr_path = args
    result = analyze_page(page_num, png_path, read_ocr(ocr_path))
    chunk_count = len(result.get('chunks', []))
    print(f" P{page_num:03d}: {chunk_count} chunks", flush=True)
    return page_num, result
|
|
|
|
def crop_image_chunk(page_png, bbox, out_path):
    """Crop the *bbox* region out of a page image and save it to *out_path*.

    Args:
        page_png: path of the full-page image.
        bbox: dict with fractional ``x``, ``y``, ``w``, ``h`` (fractions of the
            page width/height); missing keys default to 0, 0, 0.5, 0.5.
        out_path: destination file for the crop.

    Returns:
        True when a crop was written; False when the box is empty or any
        error occurred (the error is printed, not raised).
    """
    try:
        # Context manager so the image file handle is closed after cropping
        # (the original left it open).
        with PILImage.open(page_png) as im:
            W, H = im.size
            x = max(0.0, float(bbox.get('x', 0)))
            y = max(0.0, float(bbox.get('y', 0)))
            w = max(0.01, float(bbox.get('w', 0.5)))
            h = max(0.01, float(bbox.get('h', 0.5)))

            pad = 0.005  # small margin so the crop does not clip the content edge
            left = max(0, int((x - pad) * W))
            top = max(0, int((y - pad) * H))
            right = min(W, int((x + w + pad) * W))
            bottom = min(H, int((y + h + pad) * H))
            if right <= left or bottom <= top:
                return False

            crop = im.crop((left, top, right, bottom))
            crop.save(str(out_path))
        return True
    except Exception as e:
        # Best-effort: one bad image or bbox must not stop the whole build.
        print(f" Crop error: {e}", flush=True)
        return False
|
|
|
|
# Plain (unquoted) strings that YAML would read as null/bool instead of text.
_YAML_AMBIGUOUS_WORDS = frozenset({"", "~", "null", "true", "false", "yes", "no", "on", "off"})
# Characters that force quoting; ',' matters because lists/dicts are emitted in flow style.
_YAML_SPECIAL_CHARS = frozenset(':#[]{}|>*&!\'"\n\r,')
# Indicator characters that may not start a plain scalar.
_YAML_LEADING_INDICATORS = frozenset("-?@`%")
# Strings that YAML would resolve as a number or a date rather than text.
_YAML_NUMBER_OR_DATE_RE = re.compile(
    r"[-+]?(?:\.inf|\.nan|0x[0-9a-f_]+|0o[0-7_]+"
    r"|[0-9][0-9_]*(?:\.[0-9_]*)?(?:e[-+]?[0-9]+)?"
    r"|\.[0-9][0-9_]*(?:e[-+]?[0-9]+)?)"
    r"|[0-9]{4}-[0-9]{1,2}-[0-9]{1,2}(?:[Tt ].*)?",
    re.IGNORECASE,
)


def _yaml_needs_quotes(s):
    """Return True when string *s* cannot safely be written as a plain YAML scalar."""
    if s.strip() != s:
        return True  # leading/trailing whitespace would be lost
    if s.lower() in _YAML_AMBIGUOUS_WORDS:
        return True  # also catches the empty string
    if s[0] in _YAML_LEADING_INDICATORS:
        return True
    if any(ch in _YAML_SPECIAL_CHARS for ch in s):
        return True
    return _YAML_NUMBER_OR_DATE_RE.fullmatch(s) is not None


def yaml_scalar(v):
    """Render a Python value as inline (flow-style) YAML text.

    None -> ``null``; bool -> ``true``/``false``; int/float -> ``str(v)``;
    list -> ``[a, b]``; dict -> ``{k: v}`` (keys are written as-is); any other
    value is converted with ``str()`` and double-quoted when needed.

    When quoting, backslashes and double quotes are escaped, newlines become
    ``\\n`` and carriage returns are dropped.
    """
    if v is None:
        return "null"
    # bool must be tested before int: bool is a subclass of int.
    if isinstance(v, bool):
        return "true" if v else "false"
    if isinstance(v, (int, float)):
        return str(v)
    if isinstance(v, list):
        if not v:
            return "[]"
        return "[" + ", ".join(yaml_scalar(i) for i in v) + "]"
    if isinstance(v, dict):
        return "{" + ", ".join(f"{k}: {yaml_scalar(vv)}" for k, vv in v.items()) + "}"

    s = str(v)
    if _yaml_needs_quotes(s):
        s = s.replace('\\', '\\\\').replace('"', '\\"').replace('\n', '\\n').replace('\r', '')
        return f'"{s}"'
    return s
|
|
|
|
def bbox_safe(bbox):
    """Return a bbox dict with float ``x``, ``y``, ``w``, ``h`` values.

    A missing, empty or non-dict *bbox* yields the default box
    ``{x: 0.05, y: 0.05, w: 0.90, h: 0.90}``. Inside a dict, any coordinate
    that is missing, null, non-numeric or not finite falls back to its
    default instead of raising, so one malformed value from the model cannot
    abort the output phase. Values are not clamped to the 0-1 range.
    """
    import math  # local import: the module does not import math at file level

    defaults = {"x": 0.05, "y": 0.05, "w": 0.90, "h": 0.90}
    if not bbox or not isinstance(bbox, dict):
        return dict(defaults)

    safe = {}
    for key, fallback in defaults.items():
        try:
            value = float(bbox.get(key, fallback))
        except (TypeError, ValueError):
            value = fallback
        # NaN/Infinity would later be serialized as invalid JSON.
        safe[key] = value if math.isfinite(value) else fallback
    return safe
|
|
|
|
def write_chunk_file(chunk_id, chunk_data, page_num, order_global, prev_chunk, next_chunk, source_png_filename):
    """Write one chunk as ``<chunk_id>.md`` in CHUNKS_DIR and return its metadata.

    The file is a YAML front-matter block (the metadata, one key per line,
    nested values in flow style) followed by the English and Brazilian
    Portuguese content lines.

    Args:
        chunk_id: identifier used for the file name and ``chunk_id`` field.
        chunk_data: chunk dict produced by the page analysis.
        page_num: page the chunk belongs to.
        order_global: 1-based position of the chunk in the whole document.
        prev_chunk / next_chunk: neighbouring chunk ids, or None at the ends.
        source_png_filename: file name of the page image the chunk came from.

    Returns:
        The metadata dict that was written to the front matter.
    """
    bbox = bbox_safe(chunk_data.get('bbox'))
    ctype = chunk_data.get('type', 'body_text')
    related_image = f"IMG-{chunk_id}.png" if ctype == 'image' else None

    meta = {
        "chunk_id": chunk_id,
        "type": ctype,
        "page": page_num,
        "order_in_page": chunk_data.get('order_in_page', 1),
        "order_global": order_global,
        "bbox": bbox,
        "classification": chunk_data.get('classification'),
        "formatting": chunk_data.get('formatting', []),
        "cross_page_hint": chunk_data.get('cross_page_hint', 'self_contained'),
        "prev_chunk": prev_chunk,
        "next_chunk": next_chunk,
        "related_image": related_image,
        "related_table": None,
        "ocr_confidence": chunk_data.get('ocr_confidence', 0.8),
        "ocr_source_lines": chunk_data.get('ocr_source_lines', []),
        "redaction_code": chunk_data.get('redaction_code'),
        "redaction_inferred_content_type": chunk_data.get('redaction_inferred_content_type'),
        "image_type": chunk_data.get('image_type'),
        "ufo_anomaly_detected": bool(chunk_data.get('ufo_anomaly_detected', False)),
        "cryptid_anomaly_detected": bool(chunk_data.get('cryptid_anomaly_detected', False)),
        "ufo_anomaly_type": chunk_data.get('ufo_anomaly_type'),
        "ufo_anomaly_rationale": chunk_data.get('ufo_anomaly_rationale'),
        "cryptid_anomaly_type": chunk_data.get('cryptid_anomaly_type'),
        "cryptid_anomaly_rationale": chunk_data.get('cryptid_anomaly_rationale'),
        "image_description_en": chunk_data.get('image_description_en'),
        "image_description_pt_br": chunk_data.get('image_description_pt_br'),
        "extracted_text": chunk_data.get('extracted_text'),
        "source_png": f"../../processing/png/{DOC_ID}/{source_png_filename}",
    }

    # `or ''` rather than a .get() default: the model may return an explicit
    # JSON null, which would otherwise be written as the literal text "None".
    content_en = chunk_data.get('content_en') or ''
    content_pt_br = chunk_data.get('content_pt_br') or ''

    lines = ["---"]
    for k, v in meta.items():
        if isinstance(v, dict):
            pairs = ", ".join(f"{kk}: {yaml_scalar(vv)}" for kk, vv in v.items())
            lines.append(f"{k}: {{{pairs}}}")
        else:
            lines.append(f"{k}: {yaml_scalar(v)}")
    lines.append("---")
    lines.append("")
    lines.append(f"**EN:** {content_en}")
    lines.append("")
    lines.append(f"**PT-BR:** {content_pt_br}")
    lines.append("")

    out_path = CHUNKS_DIR / f"{chunk_id}.md"
    with open(str(out_path), 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines))

    return meta
|
|
|
|
def _load_page_cache(cache_file, page_map):
    """Load cached per-page results ({page_num: result}); {} when absent or unusable."""
    if not cache_file.exists():
        return {}
    print("Loading partial cache...", flush=True)
    try:
        with open(str(cache_file), 'r', encoding='utf-8') as f:
            cached = json.load(f)
        results = {int(k): v for k, v in cached.items()}
    except (OSError, ValueError, AttributeError) as e:
        # A run killed mid-write can leave a corrupt file; start fresh
        # instead of crashing on it.
        print(f" Cache unreadable, ignoring it: {e}", flush=True)
        return {}
    # Pages no longer present in page_map would raise KeyError later on.
    results = {p: r for p, r in results.items() if p in page_map and isinstance(r, dict)}
    print(f" Loaded {len(results)} cached pages", flush=True)
    return results


def _save_page_cache(cache_file, page_results):
    """Write the per-page results cache atomically (temp file + rename)."""
    tmp_file = cache_file.with_name(cache_file.name + ".tmp")
    with open(str(tmp_file), 'w', encoding='utf-8') as f:
        json.dump({str(k): v for k, v in page_results.items()}, f, ensure_ascii=False)
    os.replace(str(tmp_file), str(cache_file))


def _run_batch(batch_args, page_results):
    """Analyze one batch of pages in parallel, storing each result in *page_results*.

    A page whose task raises, or that is still unfinished when the batch
    timeout expires, gets the fallback result instead of aborting the run.
    """
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)
    try:
        futures = {executor.submit(process_page_task, args): args[0] for args in batch_args}
        try:
            for future in concurrent.futures.as_completed(futures, timeout=600):
                page_num = futures[future]
                try:
                    pn, result = future.result()
                    page_results[pn] = result
                except Exception as e:
                    print(f" P{page_num} future error: {e}", flush=True)
                    page_results[page_num] = fallback_chunk(page_num)
        except concurrent.futures.TimeoutError:
            # as_completed() raises once the overall timeout expires; this
            # used to propagate out of main().
            for page_num in futures.values():
                if page_num not in page_results:
                    print(f" P{page_num} batch timeout, using fallback", flush=True)
                    page_results[page_num] = fallback_chunk(page_num)
    finally:
        # wait=False: do not block on a worker that may be hung.
        executor.shutdown(wait=False)


def _order_in_page_key(chunk):
    """Sort key for chunks; a missing or non-numeric order_in_page counts as 1."""
    try:
        return float(chunk.get('order_in_page', 1))
    except (TypeError, ValueError):
        return 1.0


def _render_chunk_section(chunk_id, chunk_data, meta, page_num, has_image_file):
    """Return the document.md lines for a single chunk."""
    ctype = chunk_data.get('type', 'body_text')
    bbox = bbox_safe(chunk_data.get('bbox'))
    bbox_str = f"{bbox['x']:.2f}/{bbox['y']:.2f}/{bbox['w']:.2f}/{bbox['h']:.2f}"

    section = [
        f"<!-- chunk:{chunk_id} src:./chunks/{chunk_id}.md -->",
        f'<a id="{chunk_id}"></a>',
        f"### Chunk {chunk_id} — {ctype} · p{page_num} · bbox: {bbox_str}",
        "",
        # `or ''`: an explicit JSON null must not be rendered as "None".
        f"**EN:** {chunk_data.get('content_en') or ''}",
        "",
        f"**PT-BR:** {chunk_data.get('content_pt_br') or ''}",
        "",
    ]

    if ctype == 'image':
        # SOURCE appended an empty f-string here, which looks like a Markdown
        # image reference lost in transit. document.md sits in RAW_DIR next to
        # images/, so link the crop when one was actually written.
        section.append(f"![{chunk_id}](./images/IMG-{chunk_id}.png)" if has_image_file else "")
        section.append("")
        desc = chunk_data.get('image_description_en', '')
        if desc:
            section.append(f"*{desc}*")
            section.append("")

    section.extend([
        "<details><summary>metadata</summary>",
        "",
        "```json",
        json.dumps(meta, indent=2, ensure_ascii=False),
        "```",
        "",
        "</details>",
        "",
        "---",
        "",
    ])
    return section


def main():
    """Run the full rebuild: analyze pages, then write chunks, index and document.md.

    Phases:
      1. analyze every page not already in the resume cache, in batches,
         saving the cache after each batch;
      2. order all chunks and assign global ids ``c0001``, ``c0002``, ...;
      3. crop image chunks, write one Markdown file per chunk;
      4. write ``_index.json`` and ``document.md`` under RAW_DIR;
      5. delete the resume cache and print run statistics.
    """
    start_time = time.time()
    page_map = build_page_map()
    total_pages = len(page_map)
    print(f"Pages: {total_pages}, Model: {GEMINI_MODEL}", flush=True)

    # ---- Phase 1: per-page analysis (resumable) ----
    cache_file = RAW_DIR / "_page_results_cache.json"
    all_page_results = _load_page_cache(cache_file, page_map)

    pages_to_process = [p for p in page_map if p not in all_page_results]
    print(f"Pages to process: {len(pages_to_process)}", flush=True)

    total_batches = (len(pages_to_process) + BATCH_SIZE - 1) // BATCH_SIZE

    for batch_idx, batch_start in enumerate(range(0, len(pages_to_process), BATCH_SIZE)):
        batch = pages_to_process[batch_start:batch_start + BATCH_SIZE]
        batch_args = [(p, page_map[p]['png'], page_map[p]['ocr']) for p in batch]

        print(f"Batch {batch_idx+1}/{total_batches}: pages {batch[0]}-{batch[-1]}", flush=True)
        _run_batch(batch_args, all_page_results)

        _save_page_cache(cache_file, all_page_results)
        print(f" Cache: {len(all_page_results)} pages", flush=True)

        if batch_start + BATCH_SIZE < len(pages_to_process):
            time.sleep(1)

    print("\nAll pages processed. Building output...", flush=True)

    # ---- Phase 2: global chunk ordering ----
    all_chunks_ordered = []
    for page_num in sorted(all_page_results):
        result = all_page_results[page_num]
        # Non-dict entries cannot be rendered; the safe key avoids a TypeError
        # when order_in_page is null or of mixed types.
        chunks = sorted(
            (c for c in result.get('chunks', []) if isinstance(c, dict)),
            key=_order_in_page_key,
        )
        source_png = page_map[page_num]['png_filename']
        all_chunks_ordered.extend((page_num, chunk, source_png) for chunk in chunks)

    total_chunks = len(all_chunks_ordered)
    print(f"Total chunks: {total_chunks}", flush=True)

    chunk_id_list = [f"c{(i+1):04d}" for i in range(total_chunks)]

    # ---- Phase 3: image crops and chunk files ----
    print("Cropping image chunks...", flush=True)
    cropped_ids = set()
    for chunk_id, (page_num, chunk_data, _source_png) in zip(chunk_id_list, all_chunks_ordered):
        if chunk_data.get('type') != 'image':
            continue
        bbox = bbox_safe(chunk_data.get('bbox'))
        img_out = IMAGES_DIR / f"IMG-{chunk_id}.png"
        if crop_image_chunk(page_map[page_num]['png'], bbox, img_out):
            cropped_ids.add(chunk_id)
    images_extracted = len(cropped_ids)

    print("Writing chunk files...", flush=True)
    index_entries = []
    all_chunk_meta = []
    ufo_anomalies = []
    cryptid_anomalies = []

    for i, (page_num, chunk_data, source_png) in enumerate(all_chunks_ordered):
        chunk_id = chunk_id_list[i]
        order_global = i + 1
        prev_chunk = chunk_id_list[i-1] if i > 0 else None
        next_chunk = chunk_id_list[i+1] if i < total_chunks - 1 else None

        meta = write_chunk_file(chunk_id, chunk_data, page_num, order_global,
                                prev_chunk, next_chunk, source_png)
        all_chunk_meta.append(meta)

        if chunk_data.get('ufo_anomaly_detected'):
            ufo_anomalies.append(chunk_id)
        if chunk_data.get('cryptid_anomaly_detected'):
            cryptid_anomalies.append(chunk_id)

        content_en = str(chunk_data.get('content_en') or '')
        preview = content_en[:80].replace('\n', ' ')
        index_entries.append({
            "chunk_id": chunk_id,
            "type": chunk_data.get('type', 'body_text'),
            "page": page_num,
            "order_in_page": chunk_data.get('order_in_page', 1),
            "order_global": order_global,
            "file": f"chunks/{chunk_id}.md",
            "bbox": bbox_safe(chunk_data.get('bbox')),
            "preview": preview
        })

    # ---- Phase 4a: _index.json ----
    print("Writing _index.json...", flush=True)
    build_at = now_iso()
    type_hist = {}
    for entry in index_entries:
        t = entry['type']
        type_hist[t] = type_hist.get(t, 0) + 1

    # NOTE(review): build_approach/build_model say "subagents" and
    # "claude-sonnet-4-6" although this script calls GEMINI_MODEL. Left
    # unchanged in case downstream tooling depends on it; confirm intent.
    index_data = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": total_pages,
        "total_chunks": total_chunks,
        "build_approach": "subagents",
        "build_model": "claude-sonnet-4-6",
        "build_at": build_at,
        "chunks": index_entries
    }
    with open(str(RAW_DIR / "_index.json"), 'w', encoding='utf-8') as f:
        json.dump(index_data, f, indent=2, ensure_ascii=False)

    # ---- Phase 4b: document.md ----
    print("Assembling document.md...", flush=True)
    hist_str = ", ".join(f'"{k}": {v}' for k, v in sorted(type_hist.items()))
    doc_lines = [
        "---",
        'schema_version: "0.2.0"',
        "type: master_document",
        f"doc_id: {DOC_ID}",
        f'canonical_title: "{DOC_TITLE}"',
        f"total_pages: {total_pages}",
        f"total_chunks: {total_chunks}",
        f"chunk_types_histogram: {{{hist_str}}}",
        "multi_page_tables: []",
        f"ufo_anomalies_flagged: [{', '.join(ufo_anomalies)}]",
        f"cryptid_anomalies_flagged: [{', '.join(cryptid_anomalies)}]",
        'build_approach: "subagents"',
        "build_model: claude-sonnet-4-6",
        f"build_at: {build_at}",
        "---",
        "",
    ]

    chunks_by_page = {}
    for i, (page_num, chunk_data, _source_png) in enumerate(all_chunks_ordered):
        chunks_by_page.setdefault(page_num, []).append(
            (chunk_id_list[i], chunk_data, all_chunk_meta[i])
        )

    for page_num in sorted(chunks_by_page):
        doc_lines.append(f"## Page {page_num}")
        doc_lines.append("")
        for chunk_id, chunk_data, meta in chunks_by_page[page_num]:
            doc_lines.extend(
                _render_chunk_section(chunk_id, chunk_data, meta, page_num,
                                      chunk_id in cropped_ids)
            )

    doc_content = '\n'.join(doc_lines)
    with open(str(RAW_DIR / "document.md"), 'w', encoding='utf-8') as f:
        f.write(doc_content)

    # ---- Phase 5: cleanup and stats ----
    doc_bytes = len(doc_content.encode('utf-8'))
    wall_seconds = int(time.time() - start_time)

    # The cache only exists to resume an interrupted run.
    if cache_file.exists():
        os.remove(str(cache_file))

    print(f"\nSTATS pages={total_pages} chunks={total_chunks} images={images_extracted} tables=0 ufo={len(ufo_anomalies)} cryptid={len(cryptid_anomalies)} doc_md_bytes={doc_bytes}")
    print(f"Wall time: {wall_seconds}s")


if __name__ == "__main__":
    main()
|