disclosure-bureau/scripts/25-master-doc-test.py

#!/usr/bin/env python3
"""
25-master-doc-test.py — A/B test: rebuild a document using either subagents or
agent teams, measure cost + time + output quality.

Both approaches must produce raw/<doc-id>/document.md with the same schema.

Usage:
  ./25-master-doc-test.py --doc-id <id> --approach subagent --max-pages 20
  ./25-master-doc-test.py --doc-id <id> --approach team --max-pages 20
  ./25-master-doc-test.py --doc-id <id> --both --max-pages 20    # runs both
"""
from __future__ import annotations

import argparse
import json
import os
import shutil
import subprocess
import sys
import time
from datetime import datetime, timezone
from pathlib import Path


# Root of the project tree. Run outputs, archives and A/B reports are all
# created under UFO_ROOT / "raw", and the directory is passed to the CLI via
# --add-dir. NOTE(review): this is a machine-specific absolute path, and the
# prompt templates below embed the same path as literal text.
UFO_ROOT = Path("/Users/guto/ufo")


def utc_now_iso() -> str:
    """Return the current UTC time as a second-resolution ISO-8601 string.

    The result has the shape ``YYYY-MM-DDTHH:MM:SSZ``.
    """
    moment = datetime.now(tz=timezone.utc)
    # datetime.__format__ delegates a non-empty spec to strftime.
    return format(moment, "%Y-%m-%dT%H:%M:%SZ")


# Prompt template for the "subagent" approach. It is a str.format template
# with two placeholders: {doc_id} (document identifier, used in paths) and
# {max_pages} (an int; also rendered zero-padded to three digits for the last
# page label). run_approach() fills it in and passes it to the CLI as the
# final positional argument.
PROMPT_SUBAGENT = """Rebuild the document `{doc_id}` into `raw/{doc_id}/document.md` using the `doc-rebuilder` subagent.

Constraints:
- Process ONLY the first {max_pages} pages (p001 .. p{max_pages:03d}).
- The doc-rebuilder agent will spawn `page-rebuilder`, `image-analyst`, and `table-stitcher` subagents in parallel via the Task tool.
- Output schema: as defined in the doc-rebuilder agent's system prompt.
- Target file: `/Users/guto/ufo/raw/{doc_id}/document.md`.
- Image crops go to `/Users/guto/ufo/raw/{doc_id}/images/`.
- Set frontmatter `build_approach: "subagents"`.

Begin. When done, output a single line with stats: pages_done, chunks_total, images_extracted, tables_stitched, ufo_anomalies, cryptid_anomalies, wall_seconds."""


# Prompt template for the "team" (agent-teams) approach. It is a str.format
# template with the same placeholders as the subagent prompt: {doc_id} and
# {max_pages} (an int; also rendered zero-padded to three digits).
#
# The image-crop destination is stated explicitly, mirroring the subagent
# prompt: run_approach() counts files in raw/<doc_id>/images for both
# approaches, so both prompts must direct crops to the same place for the
# images_extracted metric to be comparable.
PROMPT_TEAM = """Create an agent team to rebuild the document `{doc_id}` into `raw/{doc_id}/document.md`.

Constraints:
- Process ONLY the first {max_pages} pages (p001 .. p{max_pages:03d}).
- Spawn an agent team with these teammates (use the subagent definitions from .claude/agents/):
  - 1× lead (you) coordinating
  - 4× page-rebuilder teammates working different page subsets in parallel
  - 1× image-analyst teammate processing all image chunks after page-rebuilders finish
  - 1× table-stitcher teammate for multi-page tables
- Use the shared task list to coordinate work.
- Output schema: as defined in the doc-rebuilder agent's system prompt.
- Target file: `/Users/guto/ufo/raw/{doc_id}/document.md`.
- Image crops go to `/Users/guto/ufo/raw/{doc_id}/images/`.
- Set frontmatter `build_approach: "agent-teams"`.

When all teammates complete, clean up the team and output a single line of stats: pages_done, chunks_total, images_extracted, tables_stitched, ufo_anomalies, cryptid_anomalies, wall_seconds."""


def _as_text(stream) -> str:
    """Coerce captured subprocess output (str, bytes or None) to str."""
    if stream is None:
        return ""
    if isinstance(stream, bytes):
        return stream.decode("utf-8", errors="replace")
    return stream


def _parse_cli_json(stdout: str) -> dict:
    """Parse the CLI's stdout as a JSON object, else keep a raw excerpt."""
    if not stdout:
        return {}
    try:
        parsed = json.loads(stdout)
    except json.JSONDecodeError:
        return {"raw_stdout": stdout[-3000:]}
    # Anything other than an object cannot be queried with .get() below.
    if not isinstance(parsed, dict):
        return {"raw_stdout": stdout[-3000:]}
    return parsed


def run_approach(approach: str, doc_id: str, max_pages: int) -> dict:
    """Invoke the claude CLI for one approach and collect metrics.

    Any existing ``raw/<doc_id>`` directory is moved aside to a timestamped
    backup, a fresh output directory (with ``images/`` and ``tables/``) is
    created, and the CLI is run with the prompt for ``approach``. If the run
    produced ``document.md``, the output directory is renamed to
    ``raw/<doc_id>--<approach>`` so both approaches can coexist.

    Args:
        approach: ``"subagent"`` selects the subagent prompt; any other value
            selects the team prompt and enables the agent-teams env flag.
        doc_id: Document identifier, used in paths and in the prompt.
        max_pages: Number of leading pages the prompt asks the agent to process.

    Returns:
        A dict of metrics: wall time, return code, stderr tail, fields copied
        from the CLI's JSON output, and facts about the produced files.
        ``returncode`` is always an int; a run that exceeds the time limit is
        reported as ``124`` with ``timed_out`` set to True instead of raising,
        so results gathered earlier in the same session are not lost.

    Raises:
        OSError: If the ``claude`` executable cannot be launched, or if the
            directory moves fail.
    """
    timeout_seconds = 3600

    out_dir = UFO_ROOT / "raw" / doc_id
    if out_dir.exists():
        # Move existing aside so we don't clobber
        backup = UFO_ROOT / "raw" / f"{doc_id}.backup-{int(time.time())}"
        shutil.move(str(out_dir), str(backup))
    out_dir.mkdir(parents=True, exist_ok=True)
    images_dir = out_dir / "images"
    images_dir.mkdir(exist_ok=True)
    (out_dir / "tables").mkdir(exist_ok=True)

    env = dict(os.environ)
    if approach == "subagent":
        prompt = PROMPT_SUBAGENT.format(doc_id=doc_id, max_pages=max_pages)
    else:
        prompt = PROMPT_TEAM.format(doc_id=doc_id, max_pages=max_pages)
        env["CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS"] = "1"

    cmd = [
        "claude", "-p",
        "--model", "sonnet",
        "--output-format", "json",
        "--max-turns", "60",
        "--allowedTools", "Read,Write,Bash,Task",
        "--add-dir", str(UFO_ROOT),
    ]
    if approach == "team":
        # Teammate-mode flag; some experimental features need it. It must
        # come before the "--" separator that introduces the prompt.
        cmd += ["--teammate-mode", "in-process"]
    cmd += ["--", prompt]

    print(f"\n{'=' * 70}")
    print(f"  APPROACH: {approach.upper()}")
    print(f"{'=' * 70}")
    print(f"  cmd: {' '.join(cmd[:8])} … (prompt truncated)")
    print(f"  starting at {utc_now_iso()}")
    sys.stdout.flush()

    # Monotonic clock: the measured duration must not be affected by
    # wall-clock adjustments during a long run.
    t0 = time.monotonic()
    timed_out = False
    try:
        proc = subprocess.run(
            cmd, capture_output=True, text=True, env=env, check=False,
            timeout=timeout_seconds,
        )
        returncode = proc.returncode
        stdout = proc.stdout
        stderr = proc.stderr
    except subprocess.TimeoutExpired as exc:
        # Output attached to TimeoutExpired may be bytes even with text=True.
        timed_out = True
        returncode = 124  # conventional "timed out" exit status
        stdout = _as_text(exc.stdout)
        stderr = _as_text(exc.stderr)
    wall = time.monotonic() - t0

    metrics = {
        "approach": approach,
        "wall_seconds": round(wall, 1),
        "returncode": returncode,
        "timed_out": timed_out,
        "stderr_tail": stderr[-1000:] if stderr else "",
    }

    cli = _parse_cli_json(stdout)
    if "raw_stdout" in cli:
        # Keep the unparseable output in the report for debugging.
        metrics["raw_stdout"] = cli["raw_stdout"]

    metrics["is_error"] = cli.get("is_error", returncode != 0)
    metrics["duration_ms"] = cli.get("duration_ms")
    metrics["duration_api_ms"] = cli.get("duration_api_ms")
    metrics["total_cost_usd"] = cli.get("total_cost_usd")
    metrics["num_turns"] = cli.get("num_turns")
    metrics["usage"] = cli.get("usage")
    metrics["result_excerpt"] = (cli.get("result") or "")[:2000]

    # Inspect output
    doc_md = out_dir / "document.md"
    produced = doc_md.exists()
    metrics["output_exists"] = produced
    metrics["output_size_bytes"] = doc_md.stat().st_size if produced else 0
    metrics["images_extracted"] = (
        sum(1 for _ in images_dir.glob("*")) if images_dir.exists() else 0
    )

    # Rename output so both approaches can coexist
    if produced:
        archive = UFO_ROOT / "raw" / f"{doc_id}--{approach}"
        if archive.exists():
            shutil.rmtree(archive)
        shutil.move(str(out_dir), str(archive))
        metrics["archived_at"] = str(archive)

    print(f"  finished in {wall:.1f}s · rc={returncode}")
    print(f"  output exists: {metrics['output_exists']} · size: {metrics['output_size_bytes']} bytes")
    print(f"  images_extracted: {metrics['images_extracted']}")
    return metrics


def main():
    """Parse CLI options, run the requested approach(es), and report.

    Prints a side-by-side table when both approaches ran (otherwise dumps the
    single result as JSON) and writes the full results to a timestamped JSON
    file under ``raw/_ab-test-reports``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--doc-id", required=True)
    parser.add_argument("--max-pages", type=int, default=20)
    parser.add_argument("--approach", choices=["subagent", "team"])
    parser.add_argument("--both", action="store_true")
    opts = parser.parse_args()

    if not (opts.approach or opts.both):
        parser.error("provide --approach or --both")

    # Run order is fixed: subagent first, then team.
    results: dict[str, dict] = {}
    for name in ("subagent", "team"):
        if opts.both or opts.approach == name:
            results[name] = run_approach(name, opts.doc_id, opts.max_pages)

    # Comparison table
    banner = "=" * 70
    print(f"\n{banner}")
    print(f"  COMPARISON — {opts.doc_id} first {opts.max_pages} pages")
    print(f"{banner}")
    if {"subagent", "team"} <= results.keys():
        sub = results["subagent"]
        team = results["team"]
        rule = "-" * 20
        print(f"  {'metric':<25} {'subagent':>20} {'team':>20}")
        print(f"  {'-' * 25} {rule:>20} {rule:>20}")
        table_rows = (
            "wall_seconds",
            "returncode",
            "is_error",
            "total_cost_usd",
            "num_turns",
            "output_size_bytes",
            "images_extracted",
        )
        for metric in table_rows:
            left = str(sub.get(metric))
            right = str(team.get(metric))
            print(f"  {metric:<25} {left:>20} {right:>20}")
    else:
        for label, metrics in results.items():
            print(json.dumps({label: metrics}, indent=2, default=str))

    # Save full result JSON
    report_dir = UFO_ROOT / "raw" / "_ab-test-reports"
    report_dir.mkdir(parents=True, exist_ok=True)
    stamp = int(time.time())
    report = report_dir / f"{opts.doc_id}--{stamp}.json"
    report.write_text(json.dumps(results, indent=2, default=str))
    print(f"\nFull report: {report}")


if __name__ == "__main__":
    main()