disclosure-bureau/scripts/26-chunk-harness.py

369 lines
13 KiB
Python
Executable file

#!/usr/bin/env python3
"""
26-chunk-harness.py — Deterministic harness that assembles document.md
from raw/<doc-id>/chunks/*.md + _index.json.
Use to:
- Verify chunks are losslessly assemblable
- Re-render document.md after manual chunk edits
- Generate alternate views (HTML, PDF, single-language)
Usage:
./26-chunk-harness.py --doc-id <id> # rebuild document.md
./26-chunk-harness.py --doc-id <id> --validate # just check structure
./26-chunk-harness.py --doc-id <id> --lang pt-br # render only PT-BR
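./26-chunk-harness.py --doc-id <id> --prose --lang en # text-only reading view (requires --lang en or pt-br)
./26-chunk-harness.py --doc-id <id> --format html # render to HTML (NOTE: --format is not defined by this script's CLI yet)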
"""
from __future__ import annotations
import argparse
import json
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
try:
import yaml
except ImportError:
sys.stderr.write("pip3 install pyyaml\n"); sys.exit(1)
# Hard-coded absolute project root; main() defaults --root to UFO_ROOT / "raw".
UFO_ROOT = Path("/Users/guto/ufo")

# Chunk type names that canonicalize_type() returns unchanged.
CANONICAL_TYPES = {
    "letterhead",
    "address_block",
    "classification_marking",
    "heading",
    "paragraph",
    "form_field",
    "bulleted_item",
    "numbered_item",
    "quote_block",
    "caption",
    "table_marker",
    "image",
    "stamp",
    "signature",
    "marginalia",
    "redaction",
    "footer",
    "blank_area",
    "unknown",
}

# Alias -> canonical type. Written as canonical -> aliases groups and flattened
# by the comprehension; group and alias order is kept so the resulting dict has
# the same insertion order as a flat literal would.
TYPE_NORMALIZER = {
    alias: canonical
    for canonical, aliases in {
        "paragraph": (
            "body_paragraph",
            "narrative",
            "prose",
            "body_text",
        ),
        "classification_marking": (
            "classification_banner",
            "security_banner",
            "classification_label",
        ),
        "heading": (
            "header_block",
            "section_header",
            "subject_line",
            "doc_title",
            "agenda_heading",
        ),
        "address_block": (
            "addressee_block",
            "distribution_list",
            "routing_block",
            "to_block",
            "from_block",
        ),
        "signature": (
            "signature_block",
            "sig",
        ),
        "form_field": (
            "form_reference",
            "field",
            "label_value",
            "kv_field",
        ),
    }.items()
    for alias in aliases
}
def canonicalize_type(t: str) -> str:
    """Map a chunk type name onto its canonical form.

    Members of CANONICAL_TYPES are returned as-is. Any other name is looked up
    in TYPE_NORMALIZER; names with no alias entry pass through unchanged.
    """
    return t if t in CANONICAL_TYPES else TYPE_NORMALIZER.get(t, t)
def _shallow_yaml_extract(text: str) -> dict:
"""Best-effort key:value extraction when full yaml parse fails (broken quotes etc).
Only handles top-level scalar fields — drops broken arrays / objects.
Enough for the harness to render bodies + render basic metadata.
"""
out: dict = {}
for line in text.splitlines():
# only treat lines that look like `key: value` (no indentation)
if not line or line[0] in (" ", "\t", "-"):
continue
m = re.match(r"^([a-zA-Z_][a-zA-Z0-9_]*)\s*:\s*(.*)$", line)
if not m:
continue
k, v = m.group(1), m.group(2).strip()
if v.startswith("{") or v.startswith("["):
# complex — skip rather than parse partial
continue
if v == "null" or v == "":
out[k] = None
elif v.lower() == "true":
out[k] = True
elif v.lower() == "false":
out[k] = False
elif re.match(r"^-?\d+\.\d+$", v):
out[k] = float(v)
elif re.match(r"^-?\d+$", v):
out[k] = int(v)
elif (v[0] == v[-1]) and v[0] in ('"', "'"):
out[k] = v[1:-1]
else:
out[k] = v
return out
# Closing frontmatter delimiter: `---` alone on a line (trailing blanks allowed).
_FRONTMATTER_CLOSE_RE = re.compile(r"^---[ \t]*$", re.MULTILINE)


def read_chunk(path: Path) -> tuple[dict, str]:
    """Read one chunk file and split it into (frontmatter, body).

    Args:
        path: Chunk markdown file, read as UTF-8.

    Returns:
        A `(frontmatter, body)` tuple.
        - File does not start with `---`: `({}, <whole file>)`.
        - Opening `---` but no closing `---` line: `({"_yaml_error": True},
          <whole file>)`. Nothing is discarded, and the flag lets callers
          report the problem.
        - Frontmatter that fails YAML parsing, or parses to something other
          than a mapping: the result of `_shallow_yaml_extract` plus
          `_yaml_error: True`.
        The body has leading newlines stripped.

    Raises:
        OSError / UnicodeDecodeError: propagated from `Path.read_text`.
    """
    content = path.read_text(encoding="utf-8")
    if not content.startswith("---"):
        return {}, content

    # Search after the opening marker. Anchoring to a whole line prevents a
    # `---` inside a frontmatter value from being taken as the delimiter.
    closing = _FRONTMATTER_CLOSE_RE.search(content, 3)
    if closing is None:
        # Unterminated frontmatter: slicing with a -1 index would silently
        # produce a garbage split, so flag it and keep the text intact.
        return {"_yaml_error": True}, content

    fm_text = content[3:closing.start()].strip()
    body = content[closing.end():].lstrip("\n")

    try:
        fm = yaml.safe_load(fm_text) or {}
    except yaml.YAMLError:
        # Malformed frontmatter (quoted strings, unclosed brackets).
        fm = None
    if not isinstance(fm, dict):
        # Covers both the parse failure above and valid YAML that is a list or
        # scalar; callers rely on `fm.get(...)`, so degrade gracefully.
        fm = _shallow_yaml_extract(fm_text)
        fm["_yaml_error"] = True
    return fm, body
def validate(doc_dir: Path) -> list[str]:
    """Check that a document directory is structurally sound.

    Verifies that `_index.json` exists and is a JSON object, that every index
    entry has a `chunk_id` with a readable `chunks/<chunk_id>.md` file carrying
    a `type` and a non-empty body, that any `related_image` exists under
    `images/`, and that no `chunks/c*.md` file is missing from the index.

    Args:
        doc_dir: Directory holding `_index.json`, `chunks/` and `images/`.

    Returns:
        List of human-readable error strings; empty when the document is valid.
        Problems are reported in the list rather than raised.
    """
    errors: list[str] = []
    index_path = doc_dir / "_index.json"
    if not index_path.exists():
        errors.append("missing _index.json")
        return errors
    try:
        # Explicit UTF-8, matching how chunk files are read.
        index = json.loads(index_path.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        errors.append(f"_index.json malformed: {e}")
        return errors
    if not isinstance(index, dict):
        # Valid JSON but not an object (e.g. `[]`): report it instead of
        # crashing on `.get` below.
        errors.append(
            f"_index.json malformed: top-level value must be an object, got {type(index).__name__}"
        )
        return errors

    chunks_dir = doc_dir / "chunks"
    expected_ids = set()
    # `or []` also tolerates an explicit `"chunks": null`.
    for entry in index.get("chunks") or []:
        cid = entry.get("chunk_id") if isinstance(entry, dict) else None
        if not cid:
            errors.append(f"index entry missing chunk_id: {entry}")
            continue
        expected_ids.add(cid)
        chunk_path = chunks_dir / f"{cid}.md"
        if not chunk_path.exists():
            errors.append(f"chunk file missing: {chunk_path}")
            continue
        try:
            fm, body = read_chunk(chunk_path)
        except Exception as e:
            # Deliberately broad: validation reports every unreadable chunk
            # instead of aborting on the first one.
            errors.append(f"chunk {cid} unreadable: {e}")
            continue
        if fm.get("_yaml_error"):
            errors.append(f"chunk {cid}: YAML frontmatter malformed (shallow-parsed; body OK)")
        if not fm.get("type"):
            errors.append(f"chunk {cid}: missing type")
        if not body.strip():
            errors.append(f"chunk {cid}: empty body")
        related_image = fm.get("related_image")
        if related_image:
            # str() guards against a non-string YAML value breaking the path join.
            img_path = doc_dir / "images" / str(related_image)
            if not img_path.exists():
                errors.append(f"chunk {cid}: related_image missing on disk: {related_image}")

    # Check chunk files that aren't in the index (orphans). Sorted so the
    # report is deterministic regardless of filesystem listing order.
    if chunks_dir.exists():
        for chunk_file in sorted(chunks_dir.glob("c*.md")):
            cid = chunk_file.stem
            if cid not in expected_ids:
                errors.append(f"orphan chunk file (not in index): {cid}")
    return errors
# Chunk types that assemble_prose() renders; chunks of any other type are
# skipped in the reading view.
TEXTUAL_TYPES = {
    # Canonical
    "letterhead",
    "address_block",
    "classification_marking",
    "heading",
    "paragraph",
    "form_field",
    "bulleted_item",
    "numbered_item",
    "quote_block",
    "caption",
    "footer",
} | {
    # Variations the agent invented (kept as-is)
    "body_paragraph",
    "header_block",
    "header",
    "section_header",
    "subject_line",
    "addressee_block",
    "form_reference",
    "distribution_list",
    "transcript_block",
    "to_from_line",
    "date_line",
    "list_item",
    "page_number",
    "title_block",
    "narrative_paragraph",
    "signature_block",
    "handwriting",
    "marginalia_note",
}
def assemble_prose(doc_dir: Path, lang: str) -> str:
    """Build a clean reading view: textual chunks only, one language, page by page.

    For each textual chunk, the first body line starting with the language
    marker (`**EN:**` when `lang == "en"`, otherwise `**PT-BR:**`) is taken,
    the marker is removed, and the text is lightly formatted by chunk type.
    Chunks with no marker line, or an empty one, are omitted.

    Args:
        doc_dir: Directory holding `_index.json` and `chunks/`.
        lang: `"en"` or `"pt-br"`.

    Returns:
        The assembled markdown as a single string.

    Raises:
        OSError, json.JSONDecodeError: if `_index.json` or a referenced chunk
            file cannot be read or parsed.
        KeyError: if an index entry for a textual chunk lacks `chunk_id`.
    """
    index = json.loads((doc_dir / "_index.json").read_text(encoding="utf-8"))
    chunks_meta = index.get("chunks", [])

    by_page: dict[int, list[dict]] = {}
    for meta in chunks_meta:
        by_page.setdefault(meta.get("page", 0), []).append(meta)
    for page_chunks in by_page.values():
        page_chunks.sort(key=lambda m: m.get("order_in_page", 0))

    chunks_dir = doc_dir / "chunks"
    out: list[str] = []
    out.append(f"# {index.get('doc_id')}")
    out.append("")
    # Localise the summary line the same way the page headings are localised;
    # previously it was Portuguese even in the English view.
    if lang == "en":
        out.append(f"> {index.get('total_pages')} pages · {len(chunks_meta)} chunks · language: {lang}")
    else:
        out.append(f"> {index.get('total_pages')} páginas · {len(chunks_meta)} chunks · idioma: {lang}")
    out.append("")

    marker = "**EN:**" if lang == "en" else "**PT-BR:**"
    for page_num in sorted(by_page.keys()):
        out.append(f"## Página {page_num}" if lang == "pt-br" else f"## Page {page_num}")
        out.append("")
        for meta in by_page[page_num]:
            raw_type = meta.get("type", "")
            canonical = canonicalize_type(raw_type)
            # Test the raw name too: TEXTUAL_TYPES lists variations such as
            # "signature_block" whose canonical form ("signature") is not
            # textual, and canonicalising first would wrongly drop them.
            if canonical not in TEXTUAL_TYPES and raw_type not in TEXTUAL_TYPES:
                continue
            _, body = read_chunk(chunks_dir / f"{meta['chunk_id']}.md")
            text = ""
            for line in body.split("\n"):
                stripped = line.strip()
                if stripped.startswith(marker):
                    text = stripped.removeprefix(marker).strip()
                    break
            if not text:
                continue
            if canonical == "heading":
                out.append(f"### {text}")
            elif canonical == "classification_marking":
                out.append(f"_{text}_")
            elif canonical in ("bulleted_item", "numbered_item"):
                out.append(f"- {text}")
            elif canonical == "quote_block":
                out.append(f"> {text}")
            else:
                out.append(text)
            out.append("")
        out.append("")
    return "\n".join(out)
def _format_bbox(bbox: object) -> str:
    """Render a bbox mapping as `x/y/w/h` with two decimals; unusable values become 0.00."""
    if not isinstance(bbox, dict):
        bbox = {}
    parts: list[str] = []
    for key in ("x", "y", "w", "h"):
        try:
            parts.append(f"{float(bbox.get(key) or 0):.2f}")
        except (TypeError, ValueError):
            parts.append("0.00")
    return "/".join(parts)


def assemble_markdown(doc_dir: Path, lang: str = "both") -> str:
    """Read `_index.json` + `chunks/` and return the assembled master markdown.

    The output has a YAML frontmatter block with summary statistics, a title
    and summary line, then one `## Page N` section per page. Within a page,
    chunks are ordered by `order_in_page`; each gets an anchor, a heading with
    its id, type, page and bbox, its body, and an image embed when
    `related_image` is set.

    Args:
        doc_dir: Directory holding `_index.json` and `chunks/`.
        lang: `"both"` keeps each chunk body as-is; `"en"` / `"pt-br"` keep
            only body lines starting with `**EN:**` / `**PT-BR:**`.

    Returns:
        The assembled markdown as a single string.

    Raises:
        OSError, json.JSONDecodeError: if `_index.json` or a referenced chunk
            file cannot be read or parsed.
    """
    index = json.loads((doc_dir / "_index.json").read_text(encoding="utf-8"))
    doc_id = index.get("doc_id", doc_dir.name)
    chunks_meta = index.get("chunks", [])

    # Group by page
    by_page: dict[int, list[dict]] = {}
    for meta in chunks_meta:
        by_page.setdefault(meta.get("page", 0), []).append(meta)
    for page_chunks in by_page.values():
        page_chunks.sort(key=lambda m: m.get("order_in_page", 0))

    # Compute summary stats
    type_hist: dict[str, int] = {}
    for meta in chunks_meta:
        kind = meta.get("type", "unknown")
        type_hist[kind] = type_hist.get(kind, 0) + 1

    chunks_dir = doc_dir / "chunks"
    # Parse each chunk file once; the stats pass and the render pass share it.
    parsed: dict = {}
    ufo_flags: list[str] = []
    cryptid_flags: list[str] = []
    for meta in chunks_meta:
        cid = meta.get("chunk_id")
        if cid not in parsed:
            parsed[cid] = read_chunk(chunks_dir / f"{cid}.md")
        fm, _ = parsed[cid]
        if fm.get("ufo_anomaly_detected"):
            ufo_flags.append(cid)
        if fm.get("cryptid_anomaly_detected"):
            cryptid_flags.append(cid)

    out: list[str] = []
    out.append("---")
    out.append(yaml.dump({
        "schema_version": "0.2.0",
        "type": "master_document",
        "doc_id": doc_id,
        "total_pages": index.get("total_pages"),
        "total_chunks": len(chunks_meta),
        "chunk_types_histogram": type_hist,
        "ufo_anomalies_flagged": ufo_flags,
        "cryptid_anomalies_flagged": cryptid_flags,
        "build_approach": "subagents+harness",
        "build_model": "claude-sonnet-4-6",
        "assembled_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
    }, sort_keys=False, allow_unicode=True).rstrip())
    out.append("---")
    out.append("")
    out.append(f"# {doc_id}")
    out.append("")
    out.append(f"> **{len(chunks_meta)} chunks** across **{index.get('total_pages', '?')} pages** · types: {type_hist}")
    if ufo_flags:
        out.append(f"> 🛸 **UAP anomalies flagged in chunks:** {', '.join(ufo_flags)}")
    out.append("")

    for page_num in sorted(by_page.keys()):
        out.append(f"## Page {page_num}")
        out.append("")
        for meta in by_page[page_num]:
            cid = meta.get("chunk_id")
            fm, body = parsed[cid]
            bbox_str = _format_bbox(fm.get("bbox"))
            out.append(f'<a id="{cid}"></a>')
            # Separator between id and type; they used to be fused together
            # ("Chunk c0001paragraph").
            out.append(f"### Chunk {cid} · {fm.get('type','?')} · p{page_num} · bbox: {bbox_str}")
            out.append("")
            # Render body — body already has **EN:** and **PT-BR:** sections
            if lang == "en":
                out.extend(
                    line for line in body.split("\n")
                    if line.strip().startswith("**EN:**")
                )
            elif lang == "pt-br":
                out.extend(
                    line for line in body.split("\n")
                    if line.strip().startswith("**PT-BR:**")
                )
            else:
                out.append(body.rstrip())
            out.append("")
            # Embed image if applicable
            if fm.get("related_image"):
                out.append(f"![chunk image](./images/{fm['related_image']})")
                out.append("")
        out.append("---")
        out.append("")
    return "\n".join(out)
def main():
    """CLI entry point: validate a document directory or (re)build its markdown.

    Flags:
        --doc-id    Directory name under --root (required).
        --validate  Only run `validate`; print the errors and exit 1 if any.
        --lang      `both` (default), `en` or `pt-br`.
        --prose     Write the text-only reading view; needs --lang en or pt-br.
        --root      Parent directory of document dirs (default UFO_ROOT / "raw").

    Output goes to `document.md`, `document.<lang>.md` or
    `document.prose.<lang>.md` inside the document directory. The process
    exits with status 1 on a missing directory, validation errors, or
    `--prose` combined with `--lang both`.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--doc-id", required=True)
    ap.add_argument("--validate", action="store_true")
    ap.add_argument("--lang", choices=["both", "en", "pt-br"], default="both")
    ap.add_argument("--prose", action="store_true", help="Produce text-only reading view (no bbox/metadata, only textual chunks)")
    ap.add_argument("--root", default=str(UFO_ROOT / "raw"))
    args = ap.parse_args()

    doc_dir = Path(args.root) / args.doc_id
    if not doc_dir.exists():
        sys.stderr.write(f"✗ Doc dir not found: {doc_dir}\n")
        sys.exit(1)

    if args.validate:
        errs = validate(doc_dir)
        if errs:
            max_shown = 50
            print(f"{len(errs)} validation errors:")
            for e in errs[:max_shown]:
                print(f" · {e}")
            if len(errs) > max_shown:
                # Make the truncation visible instead of silently dropping the rest.
                print(f" · … and {len(errs) - max_shown} more")
            sys.exit(1)
        index = json.loads((doc_dir / "_index.json").read_text(encoding="utf-8"))
        print(f"{len(index.get('chunks', []))} chunks validated across {index.get('total_pages', '?')} pages")
        return

    if args.prose:
        if args.lang == "both":
            sys.stderr.write("--prose requires --lang en or --lang pt-br\n")
            sys.exit(1)
        md = assemble_prose(doc_dir, lang=args.lang)
        out_path = doc_dir / f"document.prose.{args.lang}.md"
    else:
        md = assemble_markdown(doc_dir, lang=args.lang)
        out_path = doc_dir / ("document.md" if args.lang == "both" else f"document.{args.lang}.md")

    out_path.write_text(md, encoding="utf-8")
    # Report the encoded size: len(md) counts characters, which undercounts
    # bytes for non-ASCII text such as the PT-BR content.
    print(f"✓ Wrote {out_path} ({len(md.encode('utf-8'))} bytes)")


if __name__ == "__main__":
    main()