disclosure-bureau/scripts/rebuild_doc65_section8.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
rebuild_doc65_section8.py
Direct Gemini-powered rebuild of doc-65-hs1-834228961-62-hq-83894-section-8.
Produces: chunks/, images/, tables/, _index.json, document.md
"""

import os
import sys
import json
import re
import time
import base64
import datetime
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed, TimeoutError as FuturesTimeout

from PIL import Image
import google.genai as genai
from google.genai import types

# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
# Identity of the single document this script rebuilds. DOC_ID is embedded in
# every input/output path below and in the prompt sent to the model.
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-section-8"
DOC_TITLE = "65 HS1-834228961/62-HQ-83894 Section 8"
# NOTE(review): not referenced anywhere in the visible part of this script.
HIGHEST_CLASS = "TOP SECRET"

# Inputs: per-page PNG renders (PNG_DIR) and per-page OCR text (OCR_DIR).
# Outputs: everything under RAW_DIR. All paths are absolute and machine-specific.
RAW_DIR   = Path(f"/Users/guto/ufo/raw/{DOC_ID}")
PNG_DIR   = Path(f"/Users/guto/ufo/processing/png/{DOC_ID}")
OCR_DIR   = Path(f"/Users/guto/ufo/processing/ocr/{DOC_ID}")
CHUNKS_DIR = RAW_DIR / "chunks"
IMAGES_DIR = RAW_DIR / "images"
TABLES_DIR = RAW_DIR / "tables"
# Phase 1 checkpoint: the list of per-page results, reloaded on the next run.
PAGES_RAW  = RAW_DIR / "pages_raw.json"

MODEL       = "models/gemini-3.1-flash-lite"
# Size of the thread pool that processes pages concurrently in Phase 1.
MAX_WORKERS = 4
PAGE_TIMEOUT = 150  # seconds per page

# Closed set of chunk types. parse_page_json rewrites any other value returned
# by the model to "unknown". Keep in sync with the list inside PAGE_PROMPT.
VALID_TYPES = {
    "letterhead", "address_block", "classification_marking", "heading",
    "paragraph", "form_field", "bulleted_item", "numbered_item", "quote_block",
    "caption", "table_marker", "image", "stamp", "signature", "marginalia",
    "redaction", "footer", "blank_area", "unknown",
}

# ---------------------------------------------------------------------------
# Gemini client
# ---------------------------------------------------------------------------
# Module-level client, created at import time. GEMINI_API_KEY wins; when it is
# unset or empty, GOOGLE_API_KEY is used. If neither is set, api_key is None.
# NOTE(review): how the SDK reacts to a None key is not visible here — confirm.
client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY"))

# ---------------------------------------------------------------------------
# Page-rebuilder prompt
# ---------------------------------------------------------------------------
# Template rendered with str.format() in call_gemini. The placeholders are
# {doc_id}, {page_number}, {total_pages}, {doc_title} and {ocr_text}; every
# literal JSON brace is doubled ({{ / }}) so that format() leaves it in place.
# The chunk-type list below must stay in sync with VALID_TYPES.
PAGE_PROMPT = """\
You are a forensic document reconstruction agent for The Disclosure Bureau.
Given a single page image (PNG) and its raw OCR text from a US Department of War
declassified UAP/UFO document, decompose it into LOSSLESS agentic chunks.

## Chunk types — STRICT enum (use EXACTLY one of these 19 strings):
letterhead, address_block, classification_marking, heading, paragraph,
form_field, bulleted_item, numbered_item, quote_block, caption, table_marker,
image, stamp, signature, marginalia, redaction, footer, blank_area, unknown

## Output: ONE JSON object — NO markdown fences, NO prose before/after.
{{
  "page_number": {page_number},
  "page_summary_en": "1-2 sentences describing this page",
  "page_summary_pt_br": "1-2 frases em português brasileiro",
  "page_layout": {{
    "columns": 1,
    "orientation": "portrait",
    "page_dimensions_approx": "letter"
  }},
  "chunks": [
    {{
      "order_in_page": 1,
      "type": "paragraph",
      "bbox": {{"x": 0.05, "y": 0.10, "w": 0.90, "h": 0.08}},
      "content_en": "verbatim English text of this chunk",
      "content_pt_br": "Texto em português brasileiro",
      "metadata": {{
        "ocr_confidence": 0.95,
        "ocr_source_lines": [1, 2, 3],
        "classification": null,
        "redaction_code": null,
        "redaction_inferred_content_type": null,
        "image_type": null,
        "formatting": [],
        "cross_page_hint": "self_contained",
        "prev_chunk_hint": null,
        "next_chunk_hint": null,
        "language_in_source": "en"
      }}
    }}
  ]
}}

## Rules:
1. Order by reading order (top→bottom, left→right). order_in_page is 1-indexed.
2. One semantic unit per chunk (one paragraph, one address block, one image, etc.).
3. ALL content accounted for — never skip anything, even blank areas if significant.
4. content_en: verbatim/near-verbatim. No paraphrasing.
5. content_pt_br: Brazilian Portuguese (pt-BR). Preserve UTF-8 accents: ç ã á é í ó ú â ê ô à.
   Proper nouns and verbatim quoted passages stay in source language inside pt-br.
6. Redacted blocks: content_en = "[REDACTED — <code>]". Never fabricate hidden content.
7. bbox: normalized 0..1 relative to page PNG size. Tight around the chunk.
8. cross_page_hint: self_contained | continues_from_prev | continues_to_next
9. image chunks: content_en = brief 1-sentence placeholder description (will be analyzed separately).
10. classification field: exact string as it appears (e.g. "TOP SECRET", "SECRET//NOFORN") or null.

Document context:
  doc_id: {doc_id}
  page_number: {page_number} of {total_pages}
  doc_title: {doc_title}

OCR text (layout-preserved, may have errors — trust the image when they disagree):
---
{ocr_text}
---

Now analyze the image + OCR and output the JSON:"""

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def get_page_files():
    """Discover the page PNGs and pair each one with its OCR text file.

    Scans PNG_DIR for files named ``p-<number>.png`` (leading zeros allowed).
    For each page the OCR file is looked up in OCR_DIR, first as ``p-NNN.txt``
    (zero-padded to three digits) and then as ``p-N.txt``.

    Returns:
        A list of ``(page_number, png_path, ocr_path)`` tuples ordered by
        numeric page number. ``ocr_path`` is None when no OCR file exists.
    """
    pages = []
    for png in PNG_DIR.glob("p-*.png"):
        m = re.match(r"p-0*(\d+)\.png", png.name)
        if not m:
            continue
        pn = int(m.group(1))
        # OCR: try zero-padded 3-digit, then bare number
        candidates = (OCR_DIR / f"p-{pn:03d}.txt", OCR_DIR / f"p-{pn}.txt")
        ocr = next((path for path in candidates if path.exists()), None)
        pages.append((pn, png, ocr))
    # Order by the parsed page number, not by file name: a lexicographic sort
    # puts "p-10.png" before "p-2.png" (and "p-1000.png" before "p-101.png"),
    # which would scramble the global chunk order. The name is a tie-breaker
    # so the result stays deterministic if two files map to the same number.
    pages.sort(key=lambda item: (item[0], item[1].name))
    return pages


def encode_png(path):
    """Read the file at *path* and return its bytes as a base64 text string."""
    with open(path, mode="rb") as png_file:
        payload = png_file.read()
    encoded = base64.b64encode(payload)
    return encoded.decode()


def call_gemini(png_path, ocr_text, page_num, total_pages):
    """Send one page image plus its OCR text to the model and return the reply.

    Args:
        png_path: Path of the page PNG; its bytes are sent inline.
        ocr_text: OCR text of the page. Only the first 5000 characters are
            inserted into the prompt.
        page_num: Page number inserted into the prompt.
        total_pages: Page count inserted into the prompt.

    Returns:
        The response text, or None when the response carries no text at all.

    Raises:
        concurrent.futures.TimeoutError: no reply within PAGE_TIMEOUT seconds.
        Exception: anything raised by the SDK call is propagated unchanged.
    """
    prompt = PAGE_PROMPT.format(
        doc_id=DOC_ID,
        page_number=page_num,
        total_pages=total_pages,
        doc_title=DOC_TITLE,
        ocr_text=ocr_text[:5000],
    )

    with open(png_path, "rb") as f:
        img_bytes = f.read()

    contents = [
        types.Part(
            inline_data=types.Blob(mime_type="image/png", data=img_bytes)
        ),
        types.Part(text=prompt),
    ]
    config = types.GenerateContentConfig(
        temperature=0.1,
        max_output_tokens=8192,
    )

    def _call():
        resp = client.models.generate_content(
            model=MODEL, contents=contents, config=config
        )
        if resp.text is not None:
            return resp.text
        # Safety block or empty response — extract any available text from parts
        try:
            parts = resp.candidates[0].content.parts
            salvaged = "\n".join(
                p.text for p in parts if getattr(p, "text", None)
            )
        except (AttributeError, IndexError, TypeError):
            return None
        # An empty join means there was nothing to salvage; report that as
        # "no response" rather than handing an empty string to the JSON parser.
        return salvaged or None

    # The executor is managed by hand on purpose: leaving a ``with`` block
    # calls shutdown(wait=True), which would block here until the hung request
    # finally returns and so defeat the timeout. The worker thread cannot be
    # killed; after a timeout it is left to finish in the background.
    ex = ThreadPoolExecutor(max_workers=1)
    try:
        return ex.submit(_call).result(timeout=PAGE_TIMEOUT)
    finally:
        ex.shutdown(wait=False)


def parse_page_json(raw_text, page_num):
    """Turn the model's raw reply for one page into a page dict.

    Markdown code fences are stripped, then the text is parsed as JSON. If
    that fails, the span from the first ``{`` to the last ``}`` is tried.
    Model output is untrusted, so a structurally wrong reply never raises: it
    is reported as an error dict instead.

    Args:
        raw_text: Text returned by the model.
        page_num: Page number; it always overrides the value in the reply.

    Returns:
        On success, the parsed dict with ``page_number`` set and every chunk
        ``type`` outside VALID_TYPES rewritten to ``"unknown"``. On failure,
        ``{"page_number", "error", "chunks": [], "raw"}`` where ``error`` is
        one of ``no_json_found``, ``json_parse_failed``, ``json_not_object``,
        ``chunks_not_list`` or ``chunk_not_object`` and ``raw`` holds the
        first 300 characters of the cleaned reply.
    """
    text = raw_text.strip()
    text = re.sub(r"^```(?:json)?\s*", "", text, flags=re.MULTILINE)
    text = re.sub(r"\s*```\s*$", "", text, flags=re.MULTILINE)
    text = text.strip()

    def _failure(code):
        return {"page_number": page_num, "error": code,
                "chunks": [], "raw": text[:300]}

    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        # Try to extract the largest {...} block
        m = re.search(r"\{[\s\S]*\}", text)
        if not m:
            return _failure("no_json_found")
        try:
            data = json.loads(m.group(0))
        except json.JSONDecodeError:
            return _failure("json_parse_failed")

    # Valid JSON is not necessarily the requested object (e.g. a bare list).
    if not isinstance(data, dict):
        return _failure("json_not_object")

    chunks = data.get("chunks", [])
    if not isinstance(chunks, list):
        return _failure("chunks_not_list")
    # Later phases call .get() on every chunk, so reject anything else here
    # instead of silently dropping content from the page.
    if not all(isinstance(c, dict) for c in chunks):
        return _failure("chunk_not_object")

    data["page_number"] = page_num
    # Validate and normalize chunk types
    for c in chunks:
        ctype = c.get("type")
        # The isinstance check also guards against unhashable values, which
        # would make the set membership test raise.
        if not isinstance(ctype, str) or ctype not in VALID_TYPES:
            c["type"] = "unknown"
    return data


def fallback_chunk(page_num, ocr_text):
    """Build a placeholder page holding one full-page ``unknown`` chunk.

    Used when the vision model keeps failing on a page. The chunk text embeds
    the stripped first 200 characters of *ocr_text*, or a fixed notice when
    *ocr_text* is empty, None or whitespace only.
    """
    if ocr_text and ocr_text.strip():
        excerpt = ocr_text[:200].strip()
    else:
        excerpt = "(page content unavailable)"

    chunk_metadata = {
        "ocr_confidence": 0.0,
        "ocr_source_lines": [],
        "classification": None,
        "redaction_code": None,
        "redaction_inferred_content_type": None,
        "image_type": None,
        "formatting": [],
        "cross_page_hint": "self_contained",
        "prev_chunk_hint": None,
        "next_chunk_hint": None,
        "language_in_source": "en",
    }
    # A single chunk whose bounding box spans the whole page.
    placeholder = {
        "order_in_page": 1,
        "type": "unknown",
        "bbox": {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
        "content_en": f"[Vision analysis failed — OCR excerpt: {excerpt}]",
        "content_pt_br": f"[Análise de visão falhou — trecho OCR: {excerpt}]",
        "metadata": chunk_metadata,
    }
    layout = {"columns": 1, "orientation": "portrait", "page_dimensions_approx": "letter"}

    page = {}
    page["page_number"] = page_num
    page["page_summary_en"] = f"Page {page_num} — content could not be parsed by vision model."
    page["page_summary_pt_br"] = f"Página {page_num} — conteúdo não pôde ser analisado pelo modelo de visão."
    page["page_layout"] = layout
    page["chunks"] = [placeholder]
    return page


def process_page(page_num, png_path, ocr_path, total_pages, use_fallback=False):
    """Produce the page dict for one page.

    Reads the OCR text (a fixed notice is used when *ocr_path* is falsy). With
    *use_fallback* set, the model is skipped and the placeholder page from
    fallback_chunk is returned. Otherwise the model is called and its reply is
    parsed; a timeout or any other exception from that step is turned into an
    error dict with an empty chunk list.
    """
    if ocr_path:
        ocr_text = ocr_path.read_text(encoding="utf-8", errors="replace")
    else:
        ocr_text = "(OCR not available)"

    if use_fallback:
        return fallback_chunk(page_num, ocr_text)

    def _failed(reason):
        return {"page_number": page_num, "error": reason, "chunks": []}

    try:
        reply = call_gemini(png_path, ocr_text, page_num, total_pages)
        if reply is not None:
            return parse_page_json(reply, page_num)
        return _failed("gemini_none_response")
    except FuturesTimeout:
        return _failed("timeout")
    except Exception as exc:
        # Error text is capped at 200 characters.
        return _failed(str(exc)[:200])


def is_valid_page(p):
    """Return True when page dict *p* has at least one chunk and no error."""
    if not p.get("chunks"):
        return False
    return not p.get("error")


# ---------------------------------------------------------------------------
# Phase 1: process all pages
# ---------------------------------------------------------------------------

def phase_process_pages(pages):
    """Run the per-page vision pass, resuming from the pages_raw.json checkpoint.

    Pages stored as valid in the checkpoint are reused as-is. Pages stored
    with an error are not sent to the model again: they are rebuilt with
    ``use_fallback=True``. All other pages are processed normally, up to
    MAX_WORKERS at a time. The checkpoint is rewritten after every 10
    completed pages and once more at the end.

    Args:
        pages: List of ``(page_number, png_path, ocr_path)`` tuples.

    Returns:
        Dict mapping page number to that page's result dict.
    """
    total = len(pages)
    print(f"[Phase 1] Processing {total} pages with {MODEL} ...")

    # Load existing checkpoint
    existing_map = {}
    failed_pages = set()
    if PAGES_RAW.exists():
        try:
            existing = json.loads(PAGES_RAW.read_text(encoding="utf-8"))
        except (OSError, ValueError) as exc:
            # ValueError covers both JSON and UTF-8 decoding errors. Report it
            # rather than silently reprocessing everything.
            print(f"  WARN: checkpoint {PAGES_RAW.name} unreadable, starting fresh: {exc}")
            existing = []
        if not isinstance(existing, list):
            print(f"  WARN: checkpoint {PAGES_RAW.name} is not a list, ignoring it")
            existing = []
        skipped = 0
        for p in existing:
            # Tolerate individual malformed entries instead of discarding the
            # whole checkpoint because of one of them.
            if not isinstance(p, dict) or not isinstance(p.get("page_number"), int):
                skipped += 1
                continue
            if is_valid_page(p):
                existing_map[p["page_number"]] = p
            elif p.get("error"):
                failed_pages.add(p["page_number"])
        print(f"  Checkpoint: {len(existing_map)} valid pages loaded, {len(failed_pages)} previously failed")
        if skipped:
            print(f"  WARN: skipped {skipped} malformed checkpoint entries")

    to_process = [(pn, pp, op) for pn, pp, op in pages if pn not in existing_map]
    print(f"  Remaining: {len(to_process)} pages")

    results_map = dict(existing_map)

    def _save_checkpoint():
        """Write the results collected so far, in page order; return the count."""
        ordered = [results_map[p[0]] for p in pages if p[0] in results_map]
        # Write to a sibling file and rename it over the checkpoint so that an
        # interrupted write cannot leave a truncated pages_raw.json behind.
        tmp_path = PAGES_RAW.with_name(PAGES_RAW.name + ".tmp")
        tmp_path.write_text(
            json.dumps(ordered, ensure_ascii=False, indent=2), encoding="utf-8"
        )
        tmp_path.replace(PAGES_RAW)
        return len(ordered)

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = {
            executor.submit(process_page, pn, pp, op, total, pn in failed_pages): pn
            for pn, pp, op in to_process
        }
        done = 0
        for future in as_completed(futures):
            pn = futures[future]
            done += 1
            try:
                # as_completed only yields finished futures, so this never waits.
                result = future.result()
            except Exception as exc:
                result = {"page_number": pn, "error": str(exc)[:200], "chunks": []}
            results_map[pn] = result
            nchunks = len(result.get("chunks") or [])
            if is_valid_page(result):
                status = "OK"
            else:
                # "error" may be absent, None or a non-string in a model reply.
                status = f"ERR({str(result.get('error') or '?')[:40]})"
            print(f"  [{done}/{len(to_process)}] p-{pn:03d}: {status} chunks={nchunks}")
            # Checkpoint every 10 pages
            if done % 10 == 0:
                _save_checkpoint()

    # Final save
    saved = _save_checkpoint()
    print(f"  Saved {saved} pages to pages_raw.json")
    return results_map


# ---------------------------------------------------------------------------
# Phase 2: globally number chunks
# ---------------------------------------------------------------------------

def phase_number_chunks(pages, results_map):
    """Assign document-wide ids and neighbour links to every chunk.

    Pages are walked in the order given by *pages*. Within a page, chunks are
    sorted by ``order_in_page`` (stable, so ties keep the model's order). Each
    chunk dict is updated in place with ``chunk_id`` (``c0001``, ...),
    ``order_global``, ``page``, ``prev_chunk`` and ``next_chunk``.

    Args:
        pages: List of ``(page_number, png_path, ocr_path)`` tuples.
        results_map: Dict mapping page number to its page result dict.

    Returns:
        List of ``(page_number, chunk_dict)`` tuples in global order.
    """

    def _order_key(chunk):
        # order_in_page comes from the model and may be null or a string;
        # comparing those with ints would make sorted() raise TypeError.
        # Anything that is not a usable number sorts as 0, which is the same
        # default a missing key gets.
        try:
            value = float(chunk.get("order_in_page", 0))
        except (TypeError, ValueError):
            return 0.0
        # NaN compares unequal to itself and would make the ordering undefined.
        return value if value == value else 0.0

    print("[Phase 2] Globally numbering chunks ...")
    all_chunks = []  # list of (page_num, chunk_dict)
    for pn, _, _ in pages:
        pg = results_map.get(pn, {})
        raw_chunks = pg.get("chunks")
        if not isinstance(raw_chunks, list):
            raw_chunks = []
        # Only dict chunks can be numbered; ignore anything else.
        usable = [c for c in raw_chunks if isinstance(c, dict)]
        all_chunks.extend((pn, c) for c in sorted(usable, key=_order_key))

    total_chunks = len(all_chunks)
    for i, (pn, c) in enumerate(all_chunks, 1):
        c["chunk_id"] = f"c{i:04d}"
        c["order_global"] = i
        c["page"] = pn
        c["prev_chunk"] = f"c{i-1:04d}" if i > 1 else None
        c["next_chunk"] = f"c{i+1:04d}" if i < total_chunks else None
    print(f"  Total chunks: {total_chunks}")
    return all_chunks


# ---------------------------------------------------------------------------
# Phase 3: crop image chunks
# ---------------------------------------------------------------------------

def phase_crop_images(all_chunks, pages):
    """Crop every ``image`` chunk out of its page PNG into IMAGES_DIR.

    The crop is the chunk's normalized bbox scaled to the page's pixel size,
    padded by 0.5% of the page on each side and clamped to the page. On
    success (or when the crop file already exists) the chunk gets a
    ``related_image`` entry holding the file name. Failures are reported as
    warnings and do not stop the phase.

    Args:
        all_chunks: List of ``(page_number, chunk_dict)``; chunks are updated
            in place.
        pages: List of ``(page_number, png_path, ocr_path)`` tuples.
    """
    png_map = {pn: pp for pn, pp, _ in pages}
    image_chunks = [(pn, c) for pn, c in all_chunks if c.get("type") == "image"]
    print(f"[Phase 3] Cropping {len(image_chunks)} image chunks ...")

    for pn, c in image_chunks:
        cid = c["chunk_id"]
        image_name = f"IMG-{cid}.png"
        out_path = IMAGES_DIR / image_name
        if out_path.exists():
            # Crop left over from an earlier run: skip the work but still link
            # it, otherwise a resumed run writes chunk files and document.md
            # without any image references.
            c["related_image"] = image_name
            continue
        png_path = png_map.get(pn)
        if not png_path:
            continue
        bbox = c.get("bbox", {})
        if not bbox:
            continue
        try:
            # Context manager so the page file handle is released per chunk.
            with Image.open(png_path) as im:
                W, H = im.size
                pad = 0.005
                x = bbox.get("x", 0)
                y = bbox.get("y", 0)
                w = bbox.get("w", 1)
                h = bbox.get("h", 1)
                left   = max(0, int((x - pad) * W))
                top    = max(0, int((y - pad) * H))
                right  = min(W, int((x + w + pad) * W))
                bottom = min(H, int((y + h + pad) * H))
                # Skip degenerate boxes that would produce an empty image.
                if right > left and bottom > top:
                    im.crop((left, top, right, bottom)).save(out_path)
                    c["related_image"] = image_name
        except Exception as exc:
            # Best effort: a malformed bbox or unreadable PNG only costs this
            # one crop.
            print(f"  WARN crop {cid}: {exc}")


# ---------------------------------------------------------------------------
# Phase 4: write chunk files
# ---------------------------------------------------------------------------

def phase_write_chunks(all_chunks, pages):
    """Write one Markdown file per chunk into CHUNKS_DIR.

    Each file is a YAML front-matter block followed by the English and pt-BR
    content. Fields the model left out, set to null, or returned in an
    unusable shape are written with neutral defaults instead of aborting the
    phase.

    Args:
        all_chunks: List of ``(page_number, chunk_dict)`` in global order.
        pages: List of ``(page_number, png_path, ocr_path)`` tuples.
    """

    def _num(mapping, key):
        # bbox values come from the model; null or non-numeric text would make
        # the ".4f" format raise, so fall back to 0.0.
        try:
            return float(mapping.get(key, 0))
        except (TypeError, ValueError):
            return 0.0

    png_map = {pn: pp for pn, pp, _ in pages}
    print(f"[Phase 4] Writing {len(all_chunks)} chunk files ...")
    for pn, c in all_chunks:
        cid = c["chunk_id"]
        chunk_path = CHUNKS_DIR / f"{cid}.md"
        # "metadata": null and "bbox": null are possible in model output.
        meta = c.get("metadata")
        if not isinstance(meta, dict):
            meta = {}
        bbox = c.get("bbox")
        if not isinstance(bbox, dict):
            bbox = {}
        png_path = png_map.get(pn, "")
        # NOTE(review): this relative path resolves to PNG_DIR from RAW_DIR,
        # not from the chunks/ folder the file is written to — confirm the
        # intended base directory.
        rel_png = f"../../processing/png/{DOC_ID}/{Path(str(png_path)).name}" if png_path else "null"

        yaml_lines = [
            "---",
            f"chunk_id: {cid}",
            f"type: {c.get('type', 'unknown')}",
            f"page: {pn}",
            f"order_in_page: {c.get('order_in_page', 0)}",
            f"order_global: {c.get('order_global', 0)}",
            f"bbox: {{x: {_num(bbox, 'x'):.4f}, y: {_num(bbox, 'y'):.4f}, w: {_num(bbox, 'w'):.4f}, h: {_num(bbox, 'h'):.4f}}}",
            f"classification: {json.dumps(meta.get('classification'))}",
            f"formatting: {json.dumps(meta.get('formatting', []))}",
            f"cross_page_hint: {meta.get('cross_page_hint', 'self_contained')}",
            f"prev_chunk: {json.dumps(c.get('prev_chunk'))}",
            f"next_chunk: {json.dumps(c.get('next_chunk'))}",
            f"related_image: {json.dumps(c.get('related_image'))}",
            f"related_table: {json.dumps(c.get('related_table'))}",
            # json.dumps renders numbers exactly as before and turns a null
            # confidence into YAML "null" instead of the string "None".
            f"ocr_confidence: {json.dumps(meta.get('ocr_confidence', 0.0))}",
            f"ocr_source_lines: {json.dumps(meta.get('ocr_source_lines', []))}",
            f"redaction_code: {json.dumps(meta.get('redaction_code'))}",
            f"redaction_inferred_content_type: {json.dumps(meta.get('redaction_inferred_content_type'))}",
            f"image_type: {json.dumps(meta.get('image_type'))}",
            f"ufo_anomaly_detected: {str(c.get('ufo_anomaly_detected', False)).lower()}",
            f"cryptid_anomaly_detected: {str(c.get('cryptid_anomaly_detected', False)).lower()}",
            f"ufo_anomaly_type: {json.dumps(c.get('ufo_anomaly_type'))}",
            f"ufo_anomaly_rationale: {json.dumps(c.get('ufo_anomaly_rationale'))}",
            f"cryptid_anomaly_type: {json.dumps(c.get('cryptid_anomaly_type'))}",
            f"cryptid_anomaly_rationale: {json.dumps(c.get('cryptid_anomaly_rationale'))}",
            f"image_description_en: {json.dumps(c.get('image_description_en'))}",
            f"image_description_pt_br: {json.dumps(c.get('image_description_pt_br'))}",
            f"extracted_text: {json.dumps(c.get('extracted_text'))}",
            f"source_png: {rel_png}",
            "---",
        ]
        body = "\n".join(yaml_lines) + "\n\n"
        # "or ''" keeps a null content field from being written as "None",
        # matching how phase_write_index builds its preview.
        body += f"**EN:** {c.get('content_en') or ''}\n\n"
        body += f"**PT-BR:** {c.get('content_pt_br') or ''}\n"
        chunk_path.write_text(body, encoding="utf-8")


# ---------------------------------------------------------------------------
# Phase 5: write _index.json
# ---------------------------------------------------------------------------

def phase_write_index(all_chunks, pages):
    """Write ``_index.json`` in RAW_DIR: build info plus one entry per chunk.

    Each entry carries the chunk's id, type, page, ordering, relative file
    path, bbox and the first 80 characters of its English content.

    Args:
        all_chunks: List of ``(page_number, chunk_dict)`` in global order.
        pages: List of page tuples; only its length is used.

    Returns:
        The build timestamp as a ``YYYY-MM-DDTHH:MM:SSZ`` string (UTC).
    """
    total_pages = len(pages)
    total_chunks = len(all_chunks)
    # datetime.utcnow() is deprecated since Python 3.12; an aware UTC "now"
    # formats to exactly the same string.
    build_at = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    index = {
        "doc_id": DOC_ID,
        "schema_version": "0.2.0",
        "total_pages": total_pages,
        "total_chunks": total_chunks,
        "build_approach": "subagents",
        "build_model": MODEL,
        "build_at": build_at,
        "chunks": [],
    }

    for pn, c in all_chunks:
        cid = c["chunk_id"]
        # "or ''" covers both a missing and a null content field.
        preview = (c.get("content_en") or "")[:80]
        index["chunks"].append({
            "chunk_id": cid,
            "type": c.get("type", "unknown"),
            "page": pn,
            "order_in_page": c.get("order_in_page", 0),
            "order_global": c.get("order_global", 0),
            "file": f"chunks/{cid}.md",
            "bbox": c.get("bbox", {}),
            "preview": preview,
        })

    index_path = RAW_DIR / "_index.json"
    index_path.write_text(json.dumps(index, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"[Phase 5] Written _index.json ({total_chunks} entries)")
    return build_at


# ---------------------------------------------------------------------------
# Phase 6: assemble document.md
# ---------------------------------------------------------------------------

def phase_assemble_document(all_chunks, pages, results_map, build_at):
    """Assemble ``document.md`` in RAW_DIR from all chunks.

    The file starts with a YAML front matter (counts, chunk-type histogram,
    ids of anomaly-flagged chunks, build info). It is followed by one section
    per page, with the page summaries as HTML comments, and one subsection
    per chunk holding its content, optional image and a collapsible metadata
    block. Null or malformed model fields are rendered with neutral defaults
    instead of aborting the phase.

    Args:
        all_chunks: List of ``(page_number, chunk_dict)`` in global order.
        pages: List of ``(page_number, png_path, ocr_path)`` tuples.
        results_map: Dict mapping page number to its page result dict.
        build_at: Build timestamp string written into the front matter.

    Returns:
        ``(doc_bytes, ufo_flagged, cryptid_flagged)``: the UTF-8 size of the
        written document and the two lists of flagged chunk ids.
    """

    def _num(mapping, key):
        # bbox values come from the model; null or non-numeric text would make
        # the ".2f" format raise, so fall back to 0.0.
        try:
            return float(mapping.get(key, 0))
        except (TypeError, ValueError):
            return 0.0

    def _comment_safe(text):
        # A "-->" inside the text would close the HTML comment early and leak
        # the remainder into the rendered document.
        return str(text).replace("-->", "--&gt;")

    total_pages = len(pages)
    total_chunks = len(all_chunks)

    # Histograms + anomaly lists
    type_hist = {}
    ufo_flagged = []
    cryptid_flagged = []
    for pn, c in all_chunks:
        ctype = c.get("type", "unknown")
        type_hist[ctype] = type_hist.get(ctype, 0) + 1
        if c.get("ufo_anomaly_detected"):
            ufo_flagged.append(c["chunk_id"])
        if c.get("cryptid_anomaly_detected"):
            cryptid_flagged.append(c["chunk_id"])

    frontmatter = f"""---
schema_version: "0.2.0"
type: master_document
doc_id: {DOC_ID}
canonical_title: "{DOC_TITLE}"
total_pages: {total_pages}
total_chunks: {total_chunks}
chunk_types_histogram: {json.dumps(type_hist, ensure_ascii=False)}
multi_page_tables: []
ufo_anomalies_flagged: {json.dumps(ufo_flagged)}
cryptid_anomalies_flagged: {json.dumps(cryptid_flagged)}
build_approach: "subagents"
build_model: "{MODEL}"
build_at: "{build_at}"
---

"""

    # Group chunks by page
    chunks_by_page = {}
    for pn, c in all_chunks:
        chunks_by_page.setdefault(pn, []).append(c)

    body_parts = []
    for pn, _, _ in pages:
        pg = results_map.get(pn, {})
        summary_en = pg.get("page_summary_en", "")
        summary_pt = pg.get("page_summary_pt_br", "")
        body_parts.append(f"\n## Page {pn}\n")
        if summary_en:
            body_parts.append(f"<!-- page_summary_en: {_comment_safe(summary_en)} -->\n")
        if summary_pt:
            body_parts.append(f"<!-- page_summary_pt_br: {_comment_safe(summary_pt)} -->\n")
        body_parts.append("\n")

        for c in chunks_by_page.get(pn, []):
            cid = c["chunk_id"]
            ctype = c.get("type", "unknown")
            # Keep the raw bbox for the metadata block; use a dict view of it
            # for the formatted header so a null bbox cannot crash.
            raw_bbox = c.get("bbox", {})
            bbox = raw_bbox if isinstance(raw_bbox, dict) else {}
            bbox_str = (
                f"{_num(bbox, 'x'):.2f}/{_num(bbox, 'y'):.2f}/"
                f"{_num(bbox, 'w'):.2f}/{_num(bbox, 'h'):.2f}"
            )
            # "metadata": null is possible in model output.
            meta = c.get("metadata")
            if not isinstance(meta, dict):
                meta = {}

            body_parts.append(f"<!-- chunk:{cid} src:./chunks/{cid}.md -->\n")
            body_parts.append(f'<a id="{cid}"></a>\n')
            body_parts.append(f"### Chunk {cid} — {ctype} · p{pn} · bbox: {bbox_str}\n\n")
            # "or ''" keeps a null content field from being written as "None",
            # matching how phase_write_index builds its preview.
            body_parts.append(f"**EN:** {c.get('content_en') or ''}\n\n")
            body_parts.append(f"**PT-BR:** {c.get('content_pt_br') or ''}\n\n")

            if ctype == "image" and c.get("related_image"):
                body_parts.append(f"![{cid}](./images/{c['related_image']})\n\n")
                if c.get("image_description_en"):
                    body_parts.append(f"*Image (EN): {c['image_description_en']}*\n\n")
                if c.get("image_description_pt_br"):
                    body_parts.append(f"*Imagem (PT-BR): {c['image_description_pt_br']}*\n\n")

            # Metadata details block
            meta_json = {
                "chunk_id": cid,
                "type": ctype,
                "page": pn,
                "order_global": c.get("order_global"),
                "bbox": raw_bbox,
                "classification": meta.get("classification"),
                "formatting": meta.get("formatting", []),
                "cross_page_hint": meta.get("cross_page_hint"),
                "ocr_confidence": meta.get("ocr_confidence"),
                "ufo_anomaly_detected": c.get("ufo_anomaly_detected", False),
                "cryptid_anomaly_detected": c.get("cryptid_anomaly_detected", False),
            }
            body_parts.append("<details><summary>metadata</summary>\n\n")
            body_parts.append("```json\n")
            body_parts.append(json.dumps(meta_json, ensure_ascii=False, indent=2))
            body_parts.append("\n```\n\n</details>\n\n---\n\n")

    doc_content = frontmatter + "".join(body_parts)
    doc_path = RAW_DIR / "document.md"
    doc_path.write_text(doc_content, encoding="utf-8")
    doc_bytes = len(doc_content.encode("utf-8"))
    print(f"[Phase 6] Written document.md ({doc_bytes:,} bytes)")
    return doc_bytes, ufo_flagged, cryptid_flagged


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main():
    """Run the six rebuild phases in order and print a one-line STATS summary.

    Creates the output directories first, then exits with status 1 when no
    page PNGs are found.
    """
    started_at = time.time()

    # Output folders must exist before any phase writes into them.
    for out_dir in (CHUNKS_DIR, IMAGES_DIR, TABLES_DIR):
        out_dir.mkdir(parents=True, exist_ok=True)

    page_files = get_page_files()
    if not page_files:
        print("ERROR: no PNG pages found", file=sys.stderr)
        sys.exit(1)
    page_count = len(page_files)
    print(f"Document: {DOC_ID}")
    print(f"Pages found: {page_count}")

    # Phases 1-2: per-page model pass, then document-wide chunk numbering.
    page_results = phase_process_pages(page_files)
    numbered = phase_number_chunks(page_files, page_results)

    # Phases 3-5: image crops, chunk files, index.
    phase_crop_images(numbered, page_files)
    phase_write_chunks(numbered, page_files)
    stamp = phase_write_index(numbered, page_files)

    # Phase 6: the master document; its byte size is not needed here.
    _, ufo_ids, cryptid_ids = phase_assemble_document(
        numbered, page_files, page_results, stamp
    )

    elapsed = int(time.time() - started_at)
    n_images = sum(1 for _ in IMAGES_DIR.glob("IMG-*.png"))
    n_tables = sum(1 for _ in TABLES_DIR.glob("TBL-*.csv"))

    stats = " ".join([
        f"pages_done={page_count}",
        f"chunks_total={len(numbered)}",
        f"images_extracted={n_images}",
        f"tables_stitched={n_tables}",
        f"ufo_anomalies={len(ufo_ids)}",
        f"cryptid_anomalies={len(cryptid_ids)}",
        f"wall_seconds={elapsed}",
    ])
    print("\nSTATS " + stats)


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()