disclosure-bureau/scripts/25-master-doc-test.py

193 lines
7.7 KiB
Python
Raw Permalink Normal View History

#!/usr/bin/env python3
"""
25-master-doc-test.py — A/B test: rebuild a document using either subagents or
agent teams, then compare cost, wall time, and basic output metrics (file size, image count).
Both approaches must produce raw/<doc-id>/document.md with the same schema.
Usage:
./25-master-doc-test.py --doc-id <id> --approach subagent --max-pages 20
./25-master-doc-test.py --doc-id <id> --approach team --max-pages 20
./25-master-doc-test.py --doc-id <id> --both --max-pages 20 # runs both
"""
from __future__ import annotations
import argparse
import json
import os
import shutil
import subprocess
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
# Root of the workspace. Rebuilt documents live under UFO_ROOT / "raw" / <doc-id>
# and A/B reports under UFO_ROOT / "raw" / "_ab-test-reports".
# NOTE: the prompt templates in this file spell out this same absolute path
# literally, so changing it here alone does not change where the agents write.
UFO_ROOT = Path("/Users/guto/ufo")
def utc_now_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (second precision)."""
    now_utc = datetime.now(tz=timezone.utc)
    return now_utc.strftime("%Y-%m-%dT%H:%M:%SZ")
# Prompt template for the "subagent" approach. It is rendered with
# str.format(doc_id=..., max_pages=...); max_pages must be an int because the
# template formats it with `:03d`.
# NOTE: the target paths inside the text hard-code /Users/guto/ufo instead of
# being derived from UFO_ROOT.
PROMPT_SUBAGENT = """Rebuild the document `{doc_id}` into `raw/{doc_id}/document.md` using the `doc-rebuilder` subagent.
Constraints:
- Process ONLY the first {max_pages} pages (p001 .. p{max_pages:03d}).
- The doc-rebuilder agent will spawn `page-rebuilder`, `image-analyst`, and `table-stitcher` subagents in parallel via the Task tool.
- Output schema: as defined in the doc-rebuilder agent's system prompt.
- Target file: `/Users/guto/ufo/raw/{doc_id}/document.md`.
- Image crops go to `/Users/guto/ufo/raw/{doc_id}/images/`.
- Set frontmatter `build_approach: "subagents"`.
Begin. When done, output a single line with stats: pages_done, chunks_total, images_extracted, tables_stitched, ufo_anomalies, cryptid_anomalies, wall_seconds."""
# Prompt template for the "team" (agent teams) approach. It is rendered with
# str.format(doc_id=..., max_pages=...); max_pages must be an int because the
# template formats it with `:03d`.
# NOTE: the target path inside the text hard-codes /Users/guto/ufo instead of
# being derived from UFO_ROOT.
PROMPT_TEAM = """Create an agent team to rebuild the document `{doc_id}` into `raw/{doc_id}/document.md`.
Constraints:
- Process ONLY the first {max_pages} pages (p001 .. p{max_pages:03d}).
- Spawn an agent team with these teammates (use the subagent definitions from .claude/agents/):
- 1× lead (you) coordinating
- 4× page-rebuilder teammates working different page subsets in parallel
- 1× image-analyst teammate processing all image chunks after page-rebuilders finish
- 1× table-stitcher teammate for multi-page tables
- Use the shared task list to coordinate work.
- Output schema: as defined in the doc-rebuilder agent's system prompt.
- Target file: `/Users/guto/ufo/raw/{doc_id}/document.md`.
- Set frontmatter `build_approach: "agent-teams"`.
When all teammates complete, clean up the team and output a single line of stats: pages_done, chunks_total, images_extracted, tables_stitched, ufo_anomalies, cryptid_anomalies, wall_seconds."""
def _as_text(data) -> str:
    """Coerce captured subprocess output (str, bytes, or None) to str."""
    if data is None:
        return ""
    if isinstance(data, bytes):
        # TimeoutExpired may carry raw bytes even when text=True was requested.
        return data.decode("utf-8", errors="replace")
    return data


def run_approach(approach: str, doc_id: str, max_pages: int) -> dict:
    """Invoke the claude CLI for one approach and return its metrics.

    Any existing ``raw/<doc_id>`` directory is first moved aside to
    ``raw/<doc_id>.backup-<epoch>``; a fresh output directory (with ``images``
    and ``tables`` subdirectories) is then created. After the run, if
    ``document.md`` was produced, the whole output directory is moved to
    ``raw/<doc_id>--<approach>`` (replacing any previous archive) so both
    approaches can coexist.

    Args:
        approach: ``"subagent"`` selects the subagent prompt; any other value
            selects the team prompt. The team-only CLI flags are added only
            when the value is exactly ``"team"``.
        doc_id: Document identifier, used for directory names and the prompt.
        max_pages: Number of leading pages the prompt asks the agent to process.

    Returns:
        A dict of metrics. ``returncode`` is always an int: the CLI's exit
        status, ``-1`` if the run hit the one-hour timeout, or ``127`` if the
        CLI could not be launched at all. ``timed_out`` flags the timeout case,
        and ``raw_stdout`` is included when stdout was not a JSON object.
    """
    raw_root = UFO_ROOT / "raw"
    out_dir = raw_root / doc_id
    if out_dir.exists():
        # Move existing aside so we don't clobber
        backup = raw_root / f"{doc_id}.backup-{int(time.time())}"
        shutil.move(str(out_dir), str(backup))
    out_dir.mkdir(parents=True, exist_ok=True)
    images_dir = out_dir / "images"
    images_dir.mkdir(exist_ok=True)
    (out_dir / "tables").mkdir(exist_ok=True)

    if approach == "subagent":
        prompt = PROMPT_SUBAGENT.format(doc_id=doc_id, max_pages=max_pages)
        env = {**os.environ}
    else:
        prompt = PROMPT_TEAM.format(doc_id=doc_id, max_pages=max_pages)
        env = {**os.environ, "CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS": "1"}

    cmd = [
        "claude", "-p",
        "--model", "sonnet",
        "--output-format", "json",
        "--max-turns", "60",
        "--allowedTools", "Read,Write,Bash,Task",
        "--add-dir", str(UFO_ROOT),
    ]
    if approach == "team":
        # Teammate-mode flag; some experimental features need it. Appended
        # before the "--" separator so it is parsed as an option, not as prompt.
        cmd += ["--teammate-mode", "in-process"]
    cmd += ["--", prompt]

    print(f"\n{'=' * 70}")
    print(f" APPROACH: {approach.upper()}")
    print(f"{'=' * 70}")
    print(f" cmd: {' '.join(cmd[:8])} … (prompt truncated)")
    print(f" starting at {utc_now_iso()}")
    sys.stdout.flush()

    # Monotonic clock: the elapsed time must not be skewed by wall-clock changes.
    t0 = time.monotonic()
    timed_out = False
    try:
        proc = subprocess.run(
            cmd, capture_output=True, text=True, env=env, check=False, timeout=3600
        )
    except subprocess.TimeoutExpired as exc:
        # Record the failure instead of aborting, so a --both run still
        # reports whatever the other approach produced.
        timed_out = True
        returncode = -1
        stdout = _as_text(exc.stdout)
        stderr = _as_text(exc.stderr)
    except OSError as exc:
        # The CLI could not be started (e.g. not on PATH, not executable).
        returncode = 127
        stdout = ""
        stderr = f"failed to launch {cmd[0]!r}: {exc}"
    else:
        returncode = proc.returncode
        stdout = _as_text(proc.stdout)
        stderr = _as_text(proc.stderr)
    wall = time.monotonic() - t0

    metrics = {
        "approach": approach,
        "wall_seconds": round(wall, 1),
        "returncode": returncode,
        "timed_out": timed_out,
        "stderr_tail": stderr[-1000:] if stderr else "",
    }

    try:
        cli = json.loads(stdout) if stdout else {}
    except json.JSONDecodeError:
        cli = {"raw_stdout": stdout[-3000:]}
    if not isinstance(cli, dict):
        # Valid JSON but not an object; treat it like unparseable output.
        cli = {"raw_stdout": stdout[-3000:]}
    if "raw_stdout" in cli:
        # Keep the unparsed tail in the report; otherwise it would be lost.
        metrics["raw_stdout"] = cli["raw_stdout"]

    metrics["is_error"] = cli.get("is_error", returncode != 0)
    metrics["duration_ms"] = cli.get("duration_ms")
    metrics["duration_api_ms"] = cli.get("duration_api_ms")
    metrics["total_cost_usd"] = cli.get("total_cost_usd")
    metrics["num_turns"] = cli.get("num_turns")
    metrics["usage"] = cli.get("usage")
    metrics["result_excerpt"] = (cli.get("result") or "")[:2000]

    # Inspect output
    doc_md = out_dir / "document.md"
    produced = doc_md.exists()
    metrics["output_exists"] = produced
    metrics["output_size_bytes"] = doc_md.stat().st_size if produced else 0
    metrics["images_extracted"] = (
        sum(1 for _ in images_dir.glob("*")) if images_dir.exists() else 0
    )

    # Rename output so both approaches can coexist
    if produced:
        archive = raw_root / f"{doc_id}--{approach}"
        if archive.exists():
            shutil.rmtree(archive)
        shutil.move(str(out_dir), str(archive))
        metrics["archived_at"] = str(archive)

    print(f" finished in {wall:.1f}s · rc={returncode}")
    print(f" output exists: {metrics['output_exists']} · size: {metrics['output_size_bytes']} bytes")
    print(f" images_extracted: {metrics['images_extracted']}")
    return metrics
def main():
    """Parse CLI arguments, run the requested approach(es), and report.

    Prints a side-by-side comparison table when both approaches were run,
    otherwise dumps the single result as JSON. The full results are always
    written to ``raw/_ab-test-reports/<doc-id>--<epoch>.json`` under UFO_ROOT.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--doc-id", required=True)
    parser.add_argument("--max-pages", type=int, default=20)
    parser.add_argument("--approach", choices=["subagent", "team"])
    parser.add_argument("--both", action="store_true")
    opts = parser.parse_args()
    if not (opts.approach or opts.both):
        parser.error("provide --approach or --both")

    # Run in a fixed order: subagent first, then team.
    results: dict[str, dict] = {}
    for name in ("subagent", "team"):
        if opts.both or opts.approach == name:
            results[name] = run_approach(name, opts.doc_id, opts.max_pages)

    # Comparison table
    banner = "=" * 70
    print(f"\n{banner}")
    print(f" COMPARISON — {opts.doc_id} first {opts.max_pages} pages")
    print(banner)
    if {"subagent", "team"} <= results.keys():
        s = results["subagent"]
        t = results["team"]
        # Optional metrics may be None, so they are stringified before alignment.
        rows = [
            ("wall_seconds", s["wall_seconds"], t["wall_seconds"]),
            ("returncode", s["returncode"], t["returncode"]),
            ("is_error", str(s.get("is_error")), str(t.get("is_error"))),
            ("total_cost_usd", str(s.get("total_cost_usd")), str(t.get("total_cost_usd"))),
            ("num_turns", str(s.get("num_turns")), str(t.get("num_turns"))),
            ("output_size_bytes", s["output_size_bytes"], t["output_size_bytes"]),
            ("images_extracted", s["images_extracted"], t["images_extracted"]),
        ]
        print(f" {'metric':<25} {'subagent':>20} {'team':>20}")
        print(f" {'-' * 25} {'-' * 20} {'-' * 20}")
        for label, left, right in rows:
            print(f" {label:<25} {left:>20} {right:>20}")
    else:
        for name, metrics in results.items():
            print(json.dumps({name: metrics}, indent=2, default=str))

    # Save full result JSON
    report_dir = UFO_ROOT / "raw" / "_ab-test-reports"
    report_dir.mkdir(parents=True, exist_ok=True)
    stamp = int(time.time())
    report = report_dir / f"{opts.doc_id}--{stamp}.json"
    report.write_text(json.dumps(results, indent=2, default=str))
    print(f"\nFull report: {report}")
# Script entry point: run the CLI only when executed directly, not on import.
# main() returns None, so the process still exits with status 0.
if __name__ == "__main__":
    sys.exit(main())