192 lines
7.7 KiB
Python
Executable file
192 lines
7.7 KiB
Python
Executable file
#!/usr/bin/env python3
|
||
"""
|
||
25-master-doc-test.py — A/B test: rebuild a document using either subagents or
|
||
agent teams, measure cost + time + output quality.
|
||
|
||
Both approaches must produce raw/<doc-id>/document.md with the same schema.
|
||
|
||
Usage:
|
||
./25-master-doc-test.py --doc-id <id> --approach subagent --max-pages 20
|
||
./25-master-doc-test.py --doc-id <id> --approach team --max-pages 20
|
||
./25-master-doc-test.py --doc-id <id> --both --max-pages 20 # runs both
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import json
|
||
import os
|
||
import shutil
|
||
import subprocess
|
||
import sys
|
||
import time
|
||
from datetime import datetime, timezone
|
||
from pathlib import Path
|
||
|
||
|
||
# Project root. Run outputs, backups, archives and A/B reports are all placed
# under UFO_ROOT / "raw". NOTE: the prompt templates in this file spell the same
# path out literally, so changing it here alone does not redirect the agents.
UFO_ROOT = Path("/Users/guto/ufo")
|
||
|
||
|
||
def utc_now_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (second precision)."""
    current = datetime.now(tz=timezone.utc)
    # The format spec is handed to strftime, so this matches an explicit call.
    return f"{current:%Y-%m-%dT%H:%M:%SZ}"
|
||
|
||
|
||
PROMPT_SUBAGENT = """Rebuild the document `{doc_id}` into `raw/{doc_id}/document.md` using the `doc-rebuilder` subagent.
|
||
|
||
Constraints:
|
||
- Process ONLY the first {max_pages} pages (p001 .. p{max_pages:03d}).
|
||
- The doc-rebuilder agent will spawn `page-rebuilder`, `image-analyst`, and `table-stitcher` subagents in parallel via the Task tool.
|
||
- Output schema: as defined in the doc-rebuilder agent's system prompt.
|
||
- Target file: `/Users/guto/ufo/raw/{doc_id}/document.md`.
|
||
- Image crops go to `/Users/guto/ufo/raw/{doc_id}/images/`.
|
||
- Set frontmatter `build_approach: "subagents"`.
|
||
|
||
Begin. When done, output a single line with stats: pages_done, chunks_total, images_extracted, tables_stitched, ufo_anomalies, cryptid_anomalies, wall_seconds."""
|
||
|
||
|
||
PROMPT_TEAM = """Create an agent team to rebuild the document `{doc_id}` into `raw/{doc_id}/document.md`.
|
||
|
||
Constraints:
|
||
- Process ONLY the first {max_pages} pages (p001 .. p{max_pages:03d}).
|
||
- Spawn an agent team with these teammates (use the subagent definitions from .claude/agents/):
|
||
- 1× lead (you) coordinating
|
||
- 4× page-rebuilder teammates working different page subsets in parallel
|
||
- 1× image-analyst teammate processing all image chunks after page-rebuilders finish
|
||
- 1× table-stitcher teammate for multi-page tables
|
||
- Use the shared task list to coordinate work.
|
||
- Output schema: as defined in the doc-rebuilder agent's system prompt.
|
||
- Target file: `/Users/guto/ufo/raw/{doc_id}/document.md`.
|
||
- Set frontmatter `build_approach: "agent-teams"`.
|
||
|
||
When all teammates complete, clean up the team and output a single line of stats: pages_done, chunks_total, images_extracted, tables_stitched, ufo_anomalies, cryptid_anomalies, wall_seconds."""
|
||
|
||
|
||
def _decode_stream(data: str | bytes | None) -> str:
    """Return captured subprocess output as text, tolerating bytes and None."""
    if data is None:
        return ""
    if isinstance(data, bytes):
        # TimeoutExpired can carry raw bytes even when text=True was requested.
        return data.decode("utf-8", errors="replace")
    return data


def run_approach(approach: str, doc_id: str, max_pages: int) -> dict:
    """Invoke the claude CLI for one approach and return its metrics.

    Any existing ``raw/<doc_id>`` directory is first moved to a timestamped
    backup, then a fresh output directory (with ``images/`` and ``tables/``)
    is created for the run. If the run produces ``document.md`` the output
    directory is archived as ``raw/<doc_id>--<approach>`` so the results of
    both approaches can coexist.

    Args:
        approach: ``"subagent"`` selects the subagent prompt; any other value
            selects the agent-team prompt and sets the experimental
            agent-teams environment variable. The ``--teammate-mode`` flag is
            added only for ``"team"``.
        doc_id: Document identifier; also the directory name under ``raw/``.
        max_pages: Number of leading pages the prompt asks the agent to process.

    Returns:
        A dict of metrics. Always present: ``approach``, ``wall_seconds``,
        ``returncode`` (always an int; 124 when the CLI timed out),
        ``timed_out``, ``stderr_tail``, ``is_error``, ``duration_ms``,
        ``duration_api_ms``, ``total_cost_usd``, ``num_turns``, ``usage``,
        ``result_excerpt``, ``output_exists``, ``output_size_bytes`` and
        ``images_extracted``. Optional: ``backup_path`` (a previous output was
        moved aside), ``raw_stdout_tail`` (stdout was not a JSON object) and
        ``archived_at`` (the output directory was archived).

    Raises:
        OSError: If the ``claude`` executable cannot be launched.
    """
    out_dir = UFO_ROOT / "raw" / doc_id
    backup = None
    if out_dir.exists():
        # Move existing aside so we don't clobber
        backup = UFO_ROOT / "raw" / f"{doc_id}.backup-{int(time.time())}"
        shutil.move(str(out_dir), str(backup))
    out_dir.mkdir(parents=True, exist_ok=True)
    (out_dir / "images").mkdir(exist_ok=True)
    (out_dir / "tables").mkdir(exist_ok=True)

    if approach == "subagent":
        prompt = PROMPT_SUBAGENT.format(doc_id=doc_id, max_pages=max_pages)
        env = {**os.environ}
    else:
        prompt = PROMPT_TEAM.format(doc_id=doc_id, max_pages=max_pages)
        env = {**os.environ, "CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS": "1"}

    cmd = [
        "claude", "-p",
        "--model", "sonnet",
        "--output-format", "json",
        "--max-turns", "60",
        "--allowedTools", "Read,Write,Bash,Task",
        "--add-dir", str(UFO_ROOT),
    ]
    if approach == "team":
        # Inject teammate-mode flag; some experimental features need it
        cmd += ["--teammate-mode", "in-process"]
    # Options must precede "--"; the prompt is the single positional argument.
    cmd += ["--", prompt]

    print(f"\n{'=' * 70}")
    print(f" APPROACH: {approach.upper()}")
    print(f"{'=' * 70}")
    print(f" cmd: {' '.join(cmd[:8])} … (prompt truncated)")
    print(f" starting at {utc_now_iso()}")
    if backup is not None:
        print(f" previous output moved to {backup}")
    sys.stdout.flush()

    timeout_s = 3600
    timed_out = False
    t0 = time.time()
    try:
        proc = subprocess.run(
            cmd, capture_output=True, text=True, env=env, check=False, timeout=timeout_s
        )
    except subprocess.TimeoutExpired as exc:
        # Record the timeout as a failed run instead of aborting the whole
        # A/B test; 124 mirrors the exit status of coreutils `timeout` and
        # keeps `returncode` an int for the comparison table.
        timed_out = True
        returncode = 124
        stdout = _decode_stream(exc.stdout)
        stderr = _decode_stream(exc.stderr)
    else:
        returncode = proc.returncode
        stdout = proc.stdout or ""
        stderr = proc.stderr or ""
    wall = time.time() - t0

    metrics = {
        "approach": approach,
        "wall_seconds": round(wall, 1),
        "returncode": returncode,
        "timed_out": timed_out,
        "stderr_tail": stderr[-1000:],
    }
    if backup is not None:
        metrics["backup_path"] = str(backup)

    try:
        cli = json.loads(stdout) if stdout else {}
    except json.JSONDecodeError:
        cli = None
    if not isinstance(cli, dict):
        # Keep the tail of unparseable (or non-object) output for debugging.
        metrics["raw_stdout_tail"] = stdout[-3000:]
        cli = {}

    metrics["is_error"] = cli.get("is_error", returncode != 0)
    metrics["duration_ms"] = cli.get("duration_ms")
    metrics["duration_api_ms"] = cli.get("duration_api_ms")
    metrics["total_cost_usd"] = cli.get("total_cost_usd")
    metrics["num_turns"] = cli.get("num_turns")
    metrics["usage"] = cli.get("usage")
    metrics["result_excerpt"] = str(cli.get("result") or "")[:2000]

    # Inspect output
    doc_md = out_dir / "document.md"
    images_dir = out_dir / "images"
    output_exists = doc_md.exists()
    metrics["output_exists"] = output_exists
    metrics["output_size_bytes"] = doc_md.stat().st_size if output_exists else 0
    metrics["images_extracted"] = (
        sum(1 for _ in images_dir.glob("*")) if images_dir.exists() else 0
    )

    # Rename output so both approaches can coexist
    if output_exists:
        archive = UFO_ROOT / "raw" / f"{doc_id}--{approach}"
        if archive.exists():
            shutil.rmtree(archive)
        shutil.move(str(out_dir), str(archive))
        metrics["archived_at"] = str(archive)

    if timed_out:
        print(f" timed out after {timeout_s}s")
    print(f" finished in {wall:.1f}s · rc={returncode}")
    print(f" output exists: {metrics['output_exists']} · size: {metrics['output_size_bytes']} bytes")
    print(f" images_extracted: {metrics['images_extracted']}")
    return metrics
|
||
|
||
|
||
def main():
    """Parse CLI arguments, run the requested approach(es), print a summary and save a JSON report."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--doc-id", required=True)
    parser.add_argument("--max-pages", type=int, default=20)
    parser.add_argument("--approach", choices=["subagent", "team"])
    parser.add_argument("--both", action="store_true")
    opts = parser.parse_args()

    if not (opts.approach or opts.both):
        parser.error("provide --approach or --both")

    # Subagent always runs before team, matching the column order of the table.
    results: dict[str, dict] = {}
    for name in ("subagent", "team"):
        if opts.both or opts.approach == name:
            results[name] = run_approach(name, opts.doc_id, opts.max_pages)

    # Comparison table
    banner = "=" * 70
    print(f"\n{banner}")
    print(f" COMPARISON — {opts.doc_id} first {opts.max_pages} pages")
    print(banner)
    if {"subagent", "team"} <= results.keys():
        sub, team = results["subagent"], results["team"]
        dashes = "-" * 20
        print(f" {'metric':<25} {'subagent':>20} {'team':>20}")
        print(f" {'-' * 25} {dashes:>20} {dashes:>20}")
        # (metric name, must_exist): required metrics are indexed directly so
        # a missing key still fails loudly; optional ones print as "None".
        rows = (
            ("wall_seconds", True),
            ("returncode", True),
            ("is_error", False),
            ("total_cost_usd", False),
            ("num_turns", False),
            ("output_size_bytes", True),
            ("images_extracted", True),
        )
        for metric, must_exist in rows:
            left = sub[metric] if must_exist else sub.get(metric)
            right = team[metric] if must_exist else team.get(metric)
            print(f" {metric:<25} {str(left):>20} {str(right):>20}")
    else:
        for name, metrics in results.items():
            print(json.dumps({name: metrics}, indent=2, default=str))

    # Save full result JSON
    report_dir = UFO_ROOT / "raw" / "_ab-test-reports"
    report_dir.mkdir(parents=True, exist_ok=True)
    stamp = int(time.time())
    report = report_dir / f"{opts.doc_id}--{stamp}.json"
    report.write_text(json.dumps(results, indent=2, default=str))
    print(f"\nFull report: {report}")
|
||
|
||
|
||
# Run the A/B test only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|