#!/usr/bin/env python3
"""
25-master-doc-test.py — A/B test: rebuild a document using either subagents
or agent teams, measure cost + time + output quality.

Both approaches must produce raw/<doc_id>/document.md with the same schema.

Usage:
  ./25-master-doc-test.py --doc-id <doc_id> --approach subagent --max-pages 20
  ./25-master-doc-test.py --doc-id <doc_id> --approach team --max-pages 20
  ./25-master-doc-test.py --doc-id <doc_id> --both --max-pages 20 # runs both
"""
from __future__ import annotations

import argparse
import json
import os
import shutil
import subprocess
import sys
import time
from datetime import datetime, timezone
from pathlib import Path

UFO_ROOT = Path("/Users/guto/ufo")

# Upper bound (seconds) for a single `claude` CLI invocation.
CLI_TIMEOUT_SECONDS = 3600


def utc_now_iso() -> str:
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``."""
    return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")


PROMPT_SUBAGENT = """Rebuild the document `{doc_id}` into `raw/{doc_id}/document.md` using the `doc-rebuilder` subagent.

Constraints:
- Process ONLY the first {max_pages} pages (p001 .. p{max_pages:03d}).
- The doc-rebuilder agent will spawn `page-rebuilder`, `image-analyst`, and `table-stitcher` subagents in parallel via the Task tool.
- Output schema: as defined in the doc-rebuilder agent's system prompt.
- Target file: `/Users/guto/ufo/raw/{doc_id}/document.md`.
- Image crops go to `/Users/guto/ufo/raw/{doc_id}/images/`.
- Set frontmatter `build_approach: "subagents"`.

Begin. When done, output a single line with stats: pages_done, chunks_total, images_extracted, tables_stitched, ufo_anomalies, cryptid_anomalies, wall_seconds."""

PROMPT_TEAM = """Create an agent team to rebuild the document `{doc_id}` into `raw/{doc_id}/document.md`.

Constraints:
- Process ONLY the first {max_pages} pages (p001 .. p{max_pages:03d}).
- Spawn an agent team with these teammates (use the subagent definitions from .claude/agents/):
  - 1× lead (you) coordinating
  - 4× page-rebuilder teammates working different page subsets in parallel
  - 1× image-analyst teammate processing all image chunks after page-rebuilders finish
  - 1× table-stitcher teammate for multi-page tables
- Use the shared task list to coordinate work.
- Output schema: as defined in the doc-rebuilder agent's system prompt.
- Target file: `/Users/guto/ufo/raw/{doc_id}/document.md`.
- Set frontmatter `build_approach: "agent-teams"`.

When all teammates complete, clean up the team and output a single line of stats: pages_done, chunks_total, images_extracted, tables_stitched, ufo_anomalies, cryptid_anomalies, wall_seconds."""


def _as_text(stream: str | bytes | None) -> str:
    """Coerce a captured stream (str, bytes or None) to str."""
    if stream is None:
        return ""
    if isinstance(stream, bytes):
        # TimeoutExpired may carry raw bytes even when text=True was requested.
        return stream.decode("utf-8", errors="replace")
    return stream


def _unique_backup_path(doc_id: str) -> Path:
    """Return a not-yet-existing backup path for ``raw/<doc_id>``."""
    stamp = int(time.time())
    raw_root = UFO_ROOT / "raw"
    candidate = raw_root / f"{doc_id}.backup-{stamp}"
    attempt = 0
    # Two backups within the same second would otherwise share a name, and
    # shutil.move() into an existing directory nests the source inside it.
    while candidate.exists():
        attempt += 1
        candidate = raw_root / f"{doc_id}.backup-{stamp}-{attempt}"
    return candidate


def _build_command(approach: str, prompt: str) -> list[str]:
    """Assemble the `claude` CLI argument vector for one approach."""
    cmd = [
        "claude", "-p",
        "--model", "sonnet",
        "--output-format", "json",
        "--max-turns", "60",
        "--allowedTools", "Read,Write,Bash,Task",
        "--add-dir", str(UFO_ROOT),
    ]
    if approach == "team":
        # Inject teammate-mode flag; some experimental features need it
        cmd += ["--teammate-mode", "in-process"]
    # "--" ends option parsing so the prompt is always taken as positional.
    cmd += ["--", prompt]
    return cmd


def _invoke_cli(
    cmd: list[str], env: dict[str, str]
) -> tuple[int | None, str, str, str | None]:
    """Run the CLI and return ``(returncode, stdout, stderr, failure)``.

    ``failure`` is None when the process ran to completion (whatever its exit
    status). On timeout or launch failure ``returncode`` is None and
    ``failure`` holds a short human-readable reason.
    """
    try:
        proc = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            env=env,
            check=False,
            timeout=CLI_TIMEOUT_SECONDS,
        )
    except subprocess.TimeoutExpired as exc:
        return (
            None,
            _as_text(exc.stdout),
            _as_text(exc.stderr),
            f"timed out after {CLI_TIMEOUT_SECONDS}s",
        )
    except OSError as exc:
        # e.g. the `claude` executable is missing or not executable.
        return None, "", "", f"could not launch claude CLI: {exc}"
    return proc.returncode, proc.stdout or "", proc.stderr or "", None


def _parse_cli_output(stdout: str) -> dict:
    """Parse the CLI's JSON stdout into a dict.

    Empty output yields ``{}``. Output that is not valid JSON, or is valid
    JSON but not an object, yields ``{"raw_stdout": <last 3000 chars>}``.
    """
    if not stdout:
        return {}
    try:
        parsed = json.loads(stdout)
    except json.JSONDecodeError:
        return {"raw_stdout": stdout[-3000:]}
    if not isinstance(parsed, dict):
        return {"raw_stdout": stdout[-3000:]}
    return parsed


def run_approach(approach: str, doc_id: str, max_pages: int) -> dict:
    """Invoke claude CLI for one approach. Returns metrics.

    Args:
        approach: ``"subagent"`` selects PROMPT_SUBAGENT; any other value
            selects PROMPT_TEAM and sets CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS=1
            in the child environment. The teammate-mode flag is added only
            for ``"team"``.
        doc_id: Document identifier; output is written under
            ``UFO_ROOT/raw/<doc_id>``.
        max_pages: Number of leading pages the prompt asks the agent to
            process.

    Returns:
        A metrics dict: wall-clock time, return code, the tail of stderr,
        fields copied from the CLI's JSON result, and facts about the
        produced output directory. If the CLI timed out or could not be
        started, ``returncode`` is None and a ``failure`` key explains why;
        no exception is raised for those cases, so a second approach (and
        the final report) can still run.

    Side effects:
        An existing ``raw/<doc_id>`` directory is moved to a timestamped
        backup first. If ``document.md`` was produced, the output directory
        is moved to ``raw/<doc_id>--<approach>``, replacing any previous
        archive of that name.
    """
    out_dir = UFO_ROOT / "raw" / doc_id
    if out_dir.exists():
        # Move existing aside so we don't clobber
        shutil.move(str(out_dir), str(_unique_backup_path(doc_id)))
    out_dir.mkdir(parents=True, exist_ok=True)
    images_dir = out_dir / "images"
    images_dir.mkdir(exist_ok=True)
    (out_dir / "tables").mkdir(exist_ok=True)

    if approach == "subagent":
        prompt = PROMPT_SUBAGENT.format(doc_id=doc_id, max_pages=max_pages)
        env = {**os.environ}
    else:
        prompt = PROMPT_TEAM.format(doc_id=doc_id, max_pages=max_pages)
        env = {**os.environ, "CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS": "1"}

    cmd = _build_command(approach, prompt)

    print(f"\n{'=' * 70}")
    print(f" APPROACH: {approach.upper()}")
    print(f"{'=' * 70}")
    print(f" cmd: {' '.join(cmd[:8])} … (prompt truncated)")
    print(f" starting at {utc_now_iso()}")
    sys.stdout.flush()

    t0 = time.time()
    returncode, stdout, stderr, failure = _invoke_cli(cmd, env)
    wall = time.time() - t0

    metrics = {
        "approach": approach,
        "wall_seconds": round(wall, 1),
        "returncode": returncode,
        "stderr_tail": stderr[-1000:] if stderr else "",
    }
    if failure is not None:
        metrics["failure"] = failure

    cli = _parse_cli_output(stdout)
    if "raw_stdout" in cli:
        # Keep unparseable output in the report instead of dropping it.
        metrics["raw_stdout_tail"] = cli["raw_stdout"]

    # returncode is None on timeout/launch failure, so this defaults to True.
    metrics["is_error"] = cli.get("is_error", returncode != 0)
    metrics["duration_ms"] = cli.get("duration_ms")
    metrics["duration_api_ms"] = cli.get("duration_api_ms")
    metrics["total_cost_usd"] = cli.get("total_cost_usd")
    metrics["num_turns"] = cli.get("num_turns")
    metrics["usage"] = cli.get("usage")
    metrics["result_excerpt"] = str(cli.get("result") or "")[:2000]

    # Inspect output
    doc_md = out_dir / "document.md"
    produced = doc_md.exists()
    metrics["output_exists"] = produced
    metrics["output_size_bytes"] = doc_md.stat().st_size if produced else 0
    metrics["images_extracted"] = (
        len(list(images_dir.glob("*"))) if images_dir.exists() else 0
    )

    # Rename output so both approaches can coexist
    if produced:
        archive = UFO_ROOT / "raw" / f"{doc_id}--{approach}"
        if archive.exists():
            shutil.rmtree(archive)
        shutil.move(str(out_dir), str(archive))
        metrics["archived_at"] = str(archive)

    print(f" finished in {wall:.1f}s · rc={returncode}")
    if failure is not None:
        print(f" failure: {failure}")
    print(f" output exists: {metrics['output_exists']} · size: {metrics['output_size_bytes']} bytes")
    print(f" images_extracted: {metrics['images_extracted']}")
    return metrics


def main(argv: list[str] | None = None) -> None:
    """Parse arguments, run the requested approach(es), print and save results.

    Args:
        argv: Argument list to parse; None (the default) makes argparse read
            ``sys.argv[1:]``.

    Exits with status 2 (via argparse) when required arguments are missing or
    neither ``--approach`` nor ``--both`` is given. The full results are
    written as JSON under ``UFO_ROOT/raw/_ab-test-reports``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--doc-id", required=True)
    ap.add_argument("--max-pages", type=int, default=20)
    ap.add_argument("--approach", choices=["subagent", "team"])
    ap.add_argument("--both", action="store_true")
    args = ap.parse_args(argv)

    if not args.approach and not args.both:
        ap.error("provide --approach or --both")

    results: dict[str, dict] = {}
    if args.both or args.approach == "subagent":
        results["subagent"] = run_approach("subagent", args.doc_id, args.max_pages)
    if args.both or args.approach == "team":
        results["team"] = run_approach("team", args.doc_id, args.max_pages)

    # Comparison table
    print(f"\n{'=' * 70}")
    print(f" COMPARISON — {args.doc_id} first {args.max_pages} pages")
    print(f"{'=' * 70}")
    if "subagent" in results and "team" in results:
        s = results["subagent"]
        t = results["team"]
        print(f" {'metric':<25} {'subagent':>20} {'team':>20}")
        print(f" {'-' * 25} {'-' * 20:>20} {'-' * 20:>20}")
        # str() first: a value may be None (e.g. returncode after a timeout),
        # and None does not accept the ">20" format spec.
        for key in (
            "wall_seconds",
            "returncode",
            "is_error",
            "total_cost_usd",
            "num_turns",
            "output_size_bytes",
            "images_extracted",
        ):
            print(f" {key:<25} {str(s.get(key)):>20} {str(t.get(key)):>20}")
    else:
        for k, v in results.items():
            print(json.dumps({k: v}, indent=2, default=str))

    # Save full result JSON
    report_dir = UFO_ROOT / "raw" / "_ab-test-reports"
    report_dir.mkdir(parents=True, exist_ok=True)
    report = report_dir / f"{args.doc_id}--{int(time.time())}.json"
    report.write_text(json.dumps(results, indent=2, default=str), encoding="utf-8")
    print(f"\nFull report: {report}")


if __name__ == "__main__":
    main()