369 lines
13 KiB
Python
Executable file
369 lines
13 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
26-chunk-harness.py — Deterministic harness that assembles document.md
|
|
from raw/<doc-id>/chunks/*.md + _index.json.
|
|
|
|
Use to:
|
|
- Verify chunks are losslessly assemblable
|
|
- Re-render document.md after manual chunk edits
|
|
- Generate alternate views (HTML, PDF, single-language)
|
|
|
|
Usage:
|
|
./26-chunk-harness.py --doc-id <id> # rebuild document.md
|
|
./26-chunk-harness.py --doc-id <id> --validate # just check structure
|
|
./26-chunk-harness.py --doc-id <id> --lang pt-br # render only PT-BR
|
|
./26-chunk-harness.py --doc-id <id> --prose --lang en # text-only reading view (HTML output via --format is not implemented)
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import re
|
|
import sys
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
try:
|
|
import yaml
|
|
except ImportError:
|
|
sys.stderr.write("pip3 install pyyaml\n"); sys.exit(1)
|
|
|
|
|
|
# Base directory of the project; main() uses UFO_ROOT / "raw" as the default
# value of --root.
UFO_ROOT = Path("/Users/guto/ufo")

# Chunk type labels that canonicalize_type() returns unchanged.
CANONICAL_TYPES = set(
    """
    letterhead address_block classification_marking heading
    paragraph form_field bulleted_item numbered_item quote_block
    caption table_marker image stamp signature marginalia
    redaction footer blank_area unknown
    """.split()
)

# Canonical type -> the alias labels that are folded into it.
_ALIASES_BY_CANONICAL = {
    "paragraph": ("body_paragraph", "narrative", "prose", "body_text"),
    "classification_marking": (
        "classification_banner",
        "security_banner",
        "classification_label",
    ),
    "heading": (
        "header_block",
        "section_header",
        "subject_line",
        "doc_title",
        "agenda_heading",
    ),
    "address_block": (
        "addressee_block",
        "distribution_list",
        "routing_block",
        "to_block",
        "from_block",
    ),
    "signature": ("signature_block", "sig"),
    "form_field": ("form_reference", "field", "label_value", "kv_field"),
}

# Flat alias -> canonical lookup consumed by canonicalize_type().
TYPE_NORMALIZER = {
    alias: canonical
    for canonical, aliases in _ALIASES_BY_CANONICAL.items()
    for alias in aliases
}
|
|
|
|
|
|
def canonicalize_type(t: str) -> str:
    """Map a chunk type label onto its canonical name.

    A label that is already in CANONICAL_TYPES is returned as-is. Otherwise
    it is looked up in TYPE_NORMALIZER; labels without an entry there are
    returned unchanged.
    """
    already_canonical = t in CANONICAL_TYPES
    return t if already_canonical else TYPE_NORMALIZER.get(t, t)
|
|
|
|
|
|
def _shallow_yaml_extract(text: str) -> dict:
    """Best-effort ``key: value`` extraction for frontmatter that YAML parsing rejects.

    Only unindented scalar fields are recovered. Indented lines, list items
    (lines starting with ``-``) and values that open a flow mapping or
    sequence (``{`` / ``[``) are skipped rather than partially parsed.

    Scalars are coerced as follows:
      * empty, ``null`` / ``Null`` / ``NULL`` / ``~``  -> None
      * ``true`` / ``false`` (any letter case)          -> bool
      * plain integers and ``digits.digits`` decimals   -> int / float
      * a value wrapped in one matching pair of quotes  -> the unquoted text
      * anything else                                   -> the stripped string

    Args:
        text: Frontmatter text without the surrounding ``---`` fences.

    Returns:
        Mapping of recovered keys to coerced values. When a key occurs more
        than once, the last occurrence wins.
    """
    out: dict = {}
    for line in text.splitlines():
        # Only unindented `key: value` lines are considered.
        if not line or line[0] in (" ", "\t", "-"):
            continue
        m = re.match(r"^([a-zA-Z_][a-zA-Z0-9_]*)\s*:\s*(.*)$", line)
        if not m:
            continue
        k, v = m.group(1), m.group(2).strip()
        if v.startswith(("{", "[")):
            # Complex value: skip rather than parse a partial structure.
            continue
        if v in ("", "~", "null", "Null", "NULL"):
            out[k] = None
        elif v.lower() == "true":
            out[k] = True
        elif v.lower() == "false":
            out[k] = False
        elif re.fullmatch(r"-?\d+\.\d+", v):
            out[k] = float(v)
        elif re.fullmatch(r"-?\d+", v):
            out[k] = int(v)
        elif len(v) >= 2 and v[0] == v[-1] and v[0] in ('"', "'"):
            # Require two characters so a lone quote is not "unquoted" into
            # an empty string.
            out[k] = v[1:-1]
        else:
            out[k] = v
    return out
|
|
|
|
|
|
def read_chunk(path: Path) -> tuple[dict, str]:
    """Read a chunk file and split it into ``(frontmatter, body)``.

    Args:
        path: Chunk markdown file, decoded as UTF-8.

    Returns:
        A ``(fm, body)`` tuple:
          * File does not start with ``---``: ``({}, <whole file>)``.
          * Opening fence without any closing ``---``:
            ``({"_yaml_error": True}, <whole file>)``.
          * Otherwise ``fm`` is the parsed frontmatter mapping and ``body``
            is the text after the closing fence with leading newlines
            removed. If the frontmatter cannot be parsed as YAML, or parses
            to something other than a mapping, ``fm`` is the result of
            ``_shallow_yaml_extract`` plus ``"_yaml_error": True``.

    Raises:
        OSError / UnicodeDecodeError: propagated from reading the file.
    """
    c = path.read_text(encoding="utf-8")
    if not c.startswith("---"):
        return {}, c

    # Prefer a closing fence that sits on its own line, so a literal "---"
    # inside a frontmatter value does not end the block early.
    fence = re.compile(r"^---[ \t]*$", re.MULTILINE).search(c, 3)
    if fence is not None:
        fm_end, body_start = fence.start(), fence.end()
    else:
        # Fall back to the first "---" substring after the opening fence so
        # files with an unusual closing fence keep working.
        fm_end = c.find("---", 4)
        if fm_end == -1:
            # No closing fence at all: slicing with -1 would silently
            # corrupt both parts, so flag the file and keep its text intact.
            return {"_yaml_error": True}, c
        body_start = fm_end + 3

    fm_text = c[3:fm_end].strip()
    body = c[body_start:].lstrip("\n")
    try:
        fm = yaml.safe_load(fm_text) or {}
    except yaml.YAMLError:
        # Malformed frontmatter (bad quoting, unclosed brackets): degrade
        # gracefully to the scalar-only extractor.
        fm = _shallow_yaml_extract(fm_text)
        fm["_yaml_error"] = True
    else:
        if not isinstance(fm, dict):
            # Valid YAML that is not a mapping would break every caller
            # that uses fm.get(...); treat it as malformed.
            fm = _shallow_yaml_extract(fm_text)
            fm["_yaml_error"] = True
    return fm, body
|
|
|
|
|
|
def validate(doc_dir: Path) -> list[str]:
    """Check that a document directory is structurally assemblable.

    Checks performed:
      * ``_index.json`` exists, is valid UTF-8 JSON and is a JSON object;
      * every entry of its ``chunks`` list is an object with a ``chunk_id``;
      * ``chunks/<chunk_id>.md`` exists, is readable, has a ``type`` in its
        frontmatter and a non-empty body;
      * a ``related_image`` named in the frontmatter exists under ``images/``;
      * no ``chunks/c*.md`` file is absent from the index (orphans).

    Args:
        doc_dir: Directory containing ``_index.json`` and ``chunks/``.

    Returns:
        Human-readable error strings; empty when the document is valid.
    """
    errors: list[str] = []

    index_path = doc_dir / "_index.json"
    if not index_path.exists():
        errors.append("missing _index.json")
        return errors

    try:
        # Decode explicitly as UTF-8 to match how chunks are read, instead
        # of depending on the platform locale.
        index = json.loads(index_path.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        errors.append(f"_index.json malformed: {e}")
        return errors
    if not isinstance(index, dict):
        errors.append(
            f"_index.json malformed: expected a JSON object, got {type(index).__name__}"
        )
        return errors

    chunks_dir = doc_dir / "chunks"
    expected_ids = set()
    # `or []` also covers an explicit `"chunks": null`.
    for entry in index.get("chunks") or []:
        if not isinstance(entry, dict):
            errors.append(f"index entry is not an object: {entry!r}")
            continue
        cid = entry.get("chunk_id")
        if not cid:
            errors.append(f"index entry missing chunk_id: {entry}")
            continue
        expected_ids.add(cid)
        chunk_path = chunks_dir / f"{cid}.md"
        if not chunk_path.exists():
            errors.append(f"chunk file missing: {chunk_path}")
            continue
        try:
            fm, body = read_chunk(chunk_path)
        except Exception as e:
            # Deliberately broad: any read/parse failure becomes a report
            # line so the remaining chunks are still checked.
            errors.append(f"chunk {cid} unreadable: {e}")
            continue
        if not isinstance(fm, dict):
            errors.append(f"chunk {cid}: YAML frontmatter is not a mapping")
            continue
        if fm.get("_yaml_error"):
            errors.append(f"chunk {cid}: YAML frontmatter malformed (shallow-parsed; body OK)")
        if not fm.get("type"):
            errors.append(f"chunk {cid}: missing type")
        if not body.strip():
            errors.append(f"chunk {cid}: empty body")
        related_image = fm.get("related_image")
        if related_image:
            img_path = doc_dir / "images" / related_image
            if not img_path.exists():
                errors.append(f"chunk {cid}: related_image missing on disk: {related_image}")

    # Chunk files on disk that the index does not mention (orphans). Sorted
    # so the report does not depend on filesystem enumeration order.
    if chunks_dir.exists():
        for chunk_file in sorted(chunks_dir.glob("c*.md")):
            cid = chunk_file.stem
            if cid not in expected_ids:
                errors.append(f"orphan chunk file (not in index): {cid}")

    return errors
|
|
|
|
|
|
# Type labels that assemble_prose() renders; every other type is skipped in
# the prose view.
# NOTE(review): assemble_prose() tests membership on the result of
# canonicalize_type(). TYPE_NORMALIZER maps "signature_block" to "signature",
# which is not listed here, so the "signature_block" entry cannot match on
# that path. Confirm whether that is intended.

# Canonical labels.
_CANONICAL_TEXTUAL = (
    "letterhead",
    "address_block",
    "classification_marking",
    "heading",
    "paragraph",
    "form_field",
    "bulleted_item",
    "numbered_item",
    "quote_block",
    "caption",
    "footer",
)

# Variations the agent invented (kept as-is).
_AGENT_TEXTUAL_VARIANTS = (
    "body_paragraph",
    "header_block",
    "header",
    "section_header",
    "subject_line",
    "addressee_block",
    "form_reference",
    "distribution_list",
    "transcript_block",
    "to_from_line",
    "date_line",
    "list_item",
    "page_number",
    "title_block",
    "narrative_paragraph",
    "signature_block",
    "handwriting",
    "marginalia_note",
)

TEXTUAL_TYPES = {*_CANONICAL_TEXTUAL, *_AGENT_TEXTUAL_VARIANTS}
|
|
|
|
|
|
def assemble_prose(doc_dir: Path, lang: str) -> str:
    """Build the clean reading view: textual content only, page by page.

    Chunks are grouped by their ``page`` index field (default 0) and ordered
    by ``order_in_page`` (default 0). Only chunks whose canonicalized type is
    in TEXTUAL_TYPES are rendered. For each of those, the first body line
    starting with the language marker (``**EN:**`` when ``lang == "en"``,
    ``**PT-BR:**`` otherwise) supplies the text; chunks without such a line,
    or whose marker line is empty, are skipped.

    Args:
        doc_dir: Directory containing ``_index.json`` and ``chunks/``.
        lang: ``"en"`` or ``"pt-br"``.

    Returns:
        The assembled markdown as a single string.

    Raises:
        KeyError: if a rendered index entry has no ``chunk_id``.
        OSError: if ``_index.json`` or a referenced chunk file cannot be read.
    """
    # Decode explicitly as UTF-8 (chunks are read and output is written as
    # UTF-8) rather than relying on the platform locale.
    index = json.loads((doc_dir / "_index.json").read_text(encoding="utf-8"))
    chunks_meta = index.get("chunks", [])

    by_page: dict[int, list[dict]] = {}
    for c in chunks_meta:
        by_page.setdefault(c.get("page", 0), []).append(c)
    for page_chunks in by_page.values():
        page_chunks.sort(key=lambda x: x.get("order_in_page", 0))

    chunks_dir = doc_dir / "chunks"
    out: list[str] = [
        f"# {index.get('doc_id')}",
        "",
        f"> {index.get('total_pages')} páginas · {len(chunks_meta)} chunks · idioma: {lang}",
        "",
    ]

    marker = "**EN:**" if lang == "en" else "**PT-BR:**"
    page_label = "Página" if lang == "pt-br" else "Page"
    # (prefix, suffix) wrapped around the text per canonical type; types not
    # listed are emitted as plain text.
    decorations = {
        "heading": ("### ", ""),
        "classification_marking": ("_", "_"),
        "bulleted_item": ("- ", ""),
        "numbered_item": ("- ", ""),
        "quote_block": ("> ", ""),
    }

    for page_num in sorted(by_page):
        out.append(f"## {page_label} {page_num}")
        out.append("")
        for c in by_page[page_num]:
            canonical = canonicalize_type(c.get("type", ""))
            if canonical not in TEXTUAL_TYPES:
                continue
            _, body = read_chunk(chunks_dir / f"{c['chunk_id']}.md")
            # Only the first marker line counts, even if it turns out empty.
            text = next(
                (
                    s.removeprefix(marker).strip()
                    for s in (line.strip() for line in body.split("\n"))
                    if s.startswith(marker)
                ),
                "",
            )
            if not text:
                continue
            prefix, suffix = decorations.get(canonical, ("", ""))
            out.append(f"{prefix}{text}{suffix}")
            out.append("")
        out.append("")
    return "\n".join(out)
|
|
|
|
|
|
def _format_bbox(bbox) -> str:
    """Render a bbox mapping as ``x/y/w/h`` with two decimals per component.

    Missing or None components render as ``0.00``; components that cannot be
    converted to a number render as ``?`` instead of raising.
    """
    if not isinstance(bbox, dict):
        bbox = {}
    parts = []
    for key in ("x", "y", "w", "h"):
        try:
            parts.append(f"{float(bbox.get(key) or 0):.2f}")
        except (TypeError, ValueError):
            parts.append("?")
    return "/".join(parts)


def assemble_markdown(doc_dir: Path, lang: str = "both") -> str:
    """Read ``_index.json`` and ``chunks/`` and return the assembled document.

    The result starts with a YAML frontmatter block (chunk count, type
    histogram, anomaly flags, build metadata, UTC assembly timestamp),
    followed by a title, a summary line and one ``## Page N`` section per
    page. Each chunk gets an HTML anchor, a heading with its frontmatter
    type and bbox, its body, an image link when the frontmatter names a
    ``related_image``, and a ``---`` rule.

    Args:
        doc_dir: Directory containing ``_index.json`` and ``chunks/``.
        lang: ``"en"`` or ``"pt-br"`` keeps only body lines starting with
            ``**EN:**`` / ``**PT-BR:**``; any other value keeps the full body.

    Returns:
        The assembled markdown as a single string.

    Raises:
        OSError: if ``_index.json`` or a chunk file listed in it cannot be read.
    """
    # Decode explicitly as UTF-8 to match how chunks are read.
    index = json.loads((doc_dir / "_index.json").read_text(encoding="utf-8"))
    doc_id = index.get("doc_id", doc_dir.name)
    chunks_meta = index.get("chunks", [])

    # Group by page, then order within each page.
    by_page: dict[int, list[dict]] = {}
    for c in chunks_meta:
        by_page.setdefault(c.get("page", 0), []).append(c)
    for page_chunks in by_page.values():
        page_chunks.sort(key=lambda x: x.get("order_in_page", 0))

    # Summary stats. The histogram stays a plain dict because it is both
    # dumped to YAML and interpolated into the summary line.
    type_hist: dict[str, int] = {}
    for c in chunks_meta:
        chunk_type = c.get("type", "unknown")
        type_hist[chunk_type] = type_hist.get(chunk_type, 0) + 1

    # Read every chunk once; the parsed result is reused when rendering.
    chunks_dir = doc_dir / "chunks"
    parsed: dict = {}
    ufo_flags: list[str] = []
    cryptid_flags: list[str] = []
    for entry in chunks_meta:
        cid = entry.get("chunk_id")
        fm, body = read_chunk(chunks_dir / f"{cid}.md")
        parsed[cid] = (fm, body)
        if fm.get("ufo_anomaly_detected"):
            ufo_flags.append(cid)
        if fm.get("cryptid_anomaly_detected"):
            cryptid_flags.append(cid)

    out: list[str] = []
    out.append("---")
    out.append(yaml.dump({
        "schema_version": "0.2.0",
        "type": "master_document",
        "doc_id": doc_id,
        "total_pages": index.get("total_pages"),
        "total_chunks": len(chunks_meta),
        "chunk_types_histogram": type_hist,
        "ufo_anomalies_flagged": ufo_flags,
        "cryptid_anomalies_flagged": cryptid_flags,
        "build_approach": "subagents+harness",
        "build_model": "claude-sonnet-4-6",
        "assembled_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
    }, sort_keys=False, allow_unicode=True).rstrip())
    out.append("---")
    out.append("")
    out.append(f"# {doc_id}")
    out.append("")
    out.append(f"> **{len(chunks_meta)} chunks** across **{index.get('total_pages', '?')} pages** · types: {type_hist}")
    if ufo_flags:
        out.append(f"> 🛸 **UAP anomalies flagged in chunks:** {', '.join(ufo_flags)}")
    out.append("")

    # None means "keep the whole body" (lang == "both" or any other value).
    lang_marker = {"en": "**EN:**", "pt-br": "**PT-BR:**"}.get(lang)

    for page_num in sorted(by_page):
        out.append(f"## Page {page_num}")
        out.append("")
        for c in by_page[page_num]:
            cid = c.get("chunk_id")
            fm, body = parsed[cid]
            bbox_str = _format_bbox(fm.get("bbox"))
            out.append(f'<a id="{cid}"></a>')
            out.append(f"### Chunk {cid} — {fm.get('type','?')} · p{page_num} · bbox: {bbox_str}")
            out.append("")

            # The body already carries **EN:** and **PT-BR:** lines.
            if lang_marker is None:
                out.append(body.rstrip())
            else:
                out.extend(
                    line for line in body.split("\n")
                    if line.strip().startswith(lang_marker)
                )
            out.append("")

            # Embed the related image. The link is relative to doc_dir,
            # where main() writes the output and where validate() looks for
            # images/<related_image>.
            related_image = fm.get("related_image")
            if related_image:
                out.append(f"![{cid}](images/{related_image})")
                out.append("")
            out.append("---")
            out.append("")

    return "\n".join(out)
|
|
|
|
|
|
def main():
    """Command-line entry point.

    Modes:
      * ``--validate``: run validate() and print either the errors (exit
        status 1) or a success summary; nothing is written.
      * ``--prose``: write ``document.prose.<lang>.md`` via assemble_prose();
        requires ``--lang en`` or ``--lang pt-br``.
      * default: write ``document.md`` (or ``document.<lang>.md`` for a
        single language) via assemble_markdown().

    Exits with status 1 when the document directory does not exist, when
    validation fails, or when ``--prose`` is combined with ``--lang both``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--doc-id", required=True)
    ap.add_argument("--validate", action="store_true")
    ap.add_argument("--lang", choices=["both", "en", "pt-br"], default="both")
    ap.add_argument("--prose", action="store_true", help="Produce text-only reading view (no bbox/metadata, only textual chunks)")
    ap.add_argument("--root", default=str(UFO_ROOT / "raw"))
    args = ap.parse_args()

    doc_dir = Path(args.root) / args.doc_id
    if not doc_dir.exists():
        sys.stderr.write(f"✗ Doc dir not found: {doc_dir}\n")
        sys.exit(1)

    if args.validate:
        max_shown = 50
        errs = validate(doc_dir)
        if errs:
            print(f"✗ {len(errs)} validation errors:")
            for e in errs[:max_shown]:
                print(f" · {e}")
            hidden = len(errs) - max_shown
            if hidden > 0:
                # Make the truncation visible instead of dropping errors silently.
                print(f" · … and {hidden} more")
            sys.exit(1)
        index = json.loads((doc_dir / "_index.json").read_text(encoding="utf-8"))
        print(f"✓ {len(index.get('chunks', []))} chunks validated across {index.get('total_pages', '?')} pages")
        return

    if args.prose:
        if args.lang == "both":
            sys.stderr.write("--prose requires --lang en or --lang pt-br\n")
            sys.exit(1)
        md = assemble_prose(doc_dir, lang=args.lang)
        out_path = doc_dir / f"document.prose.{args.lang}.md"
    else:
        md = assemble_markdown(doc_dir, lang=args.lang)
        out_path = doc_dir / ("document.md" if args.lang == "both" else f"document.{args.lang}.md")
    out_path.write_text(md, encoding="utf-8")
    # len(md) counts characters; report the encoded size so "bytes" is
    # accurate for non-ASCII content.
    print(f"✓ Wrote {out_path} ({len(md.encode('utf-8'))} bytes)")


if __name__ == "__main__":
    main()
|