disclosure-bureau/scripts/25-master-doc-test.py

192 lines
7.7 KiB
Python
Executable file
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
25-master-doc-test.py — A/B test: rebuild a document using either subagents or
agent teams, measure cost + time + output quality.
Both approaches must produce raw/<doc-id>/document.md with the same schema.
Usage:
./25-master-doc-test.py --doc-id <id> --approach subagent --max-pages 20
./25-master-doc-test.py --doc-id <id> --approach team --max-pages 20
./25-master-doc-test.py --doc-id <id> --both --max-pages 20 # runs both
"""
from __future__ import annotations
import argparse
import json
import os
import shutil
import subprocess
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
# Absolute root of the project tree. Output, backup, archive and report paths
# are all built under its `raw/` subdirectory. NOTE: the prompt templates in
# this file spell out the same `/Users/guto/ufo` path literally, so they must
# be kept in sync by hand if this value changes.
UFO_ROOT = Path("/Users/guto/ufo")
def utc_now_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (second precision)."""
    current = datetime.now(tz=timezone.utc)
    # datetime's format-spec protocol delegates to strftime, so this yields
    # the same text as an explicit strftime call.
    return f"{current:%Y-%m-%dT%H:%M:%SZ}"
# Prompt template for the "subagent" approach. It is filled in with
# str.format() using two fields: `doc_id` and `max_pages` (which must be an
# int, because it is also rendered with the `:03d` format spec). The target
# paths inside the text are written out literally rather than derived from
# UFO_ROOT.
PROMPT_SUBAGENT = """Rebuild the document `{doc_id}` into `raw/{doc_id}/document.md` using the `doc-rebuilder` subagent.
Constraints:
- Process ONLY the first {max_pages} pages (p001 .. p{max_pages:03d}).
- The doc-rebuilder agent will spawn `page-rebuilder`, `image-analyst`, and `table-stitcher` subagents in parallel via the Task tool.
- Output schema: as defined in the doc-rebuilder agent's system prompt.
- Target file: `/Users/guto/ufo/raw/{doc_id}/document.md`.
- Image crops go to `/Users/guto/ufo/raw/{doc_id}/images/`.
- Set frontmatter `build_approach: "subagents"`.
Begin. When done, output a single line with stats: pages_done, chunks_total, images_extracted, tables_stitched, ufo_anomalies, cryptid_anomalies, wall_seconds."""
# Prompt template for the "team" (agent-teams) approach. It is filled in with
# str.format() using two fields: `doc_id` and `max_pages` (which must be an
# int, because it is also rendered with the `:03d` format spec). It requests
# the same stats line and target document as the subagent prompt so the two
# runs can be compared.
PROMPT_TEAM = """Create an agent team to rebuild the document `{doc_id}` into `raw/{doc_id}/document.md`.
Constraints:
- Process ONLY the first {max_pages} pages (p001 .. p{max_pages:03d}).
- Spawn an agent team with these teammates (use the subagent definitions from .claude/agents/):
- 1× lead (you) coordinating
- 4× page-rebuilder teammates working different page subsets in parallel
- 1× image-analyst teammate processing all image chunks after page-rebuilders finish
- 1× table-stitcher teammate for multi-page tables
- Use the shared task list to coordinate work.
- Output schema: as defined in the doc-rebuilder agent's system prompt.
- Target file: `/Users/guto/ufo/raw/{doc_id}/document.md`.
- Set frontmatter `build_approach: "agent-teams"`.
When all teammates complete, clean up the team and output a single line of stats: pages_done, chunks_total, images_extracted, tables_stitched, ufo_anomalies, cryptid_anomalies, wall_seconds."""
def _as_text(data) -> str:
    """Coerce captured process output (str, bytes or None) to str."""
    if data is None:
        return ""
    if isinstance(data, bytes):
        # TimeoutExpired may carry bytes even when text=True was requested.
        return data.decode("utf-8", errors="replace")
    return data


def run_approach(approach: str, doc_id: str, max_pages: int) -> dict:
    """Invoke the claude CLI for one approach and return its metrics.

    An existing ``raw/<doc_id>`` directory is first moved aside to a
    ``<doc_id>.backup-<timestamp>`` sibling, then a fresh output directory with
    ``images`` and ``tables`` subdirectories is created and the CLI is run with
    the prompt for *approach*. If ``document.md`` was produced, the whole
    output directory is renamed to ``raw/<doc_id>--<approach>`` (replacing any
    previous archive of that name) so both approaches' outputs can coexist.

    Args:
        approach: ``"subagent"`` selects the subagent prompt. Any other value
            selects the team prompt and sets the experimental agent-teams
            environment variable; the ``--teammate-mode`` flag is added only
            for exactly ``"team"``.
        doc_id: Document identifier; names the directory under ``raw/``.
        max_pages: Number of leading pages the prompt asks for (must be an
            int, the prompt templates format it with ``:03d``).

    Returns:
        A dict with ``approach``, ``wall_seconds``, ``returncode``,
        ``timed_out``, ``stderr_tail``, the fields copied from the CLI's JSON
        output (``is_error``, ``duration_ms``, ``duration_api_ms``,
        ``total_cost_usd``, ``num_turns``, ``usage``, ``result_excerpt``),
        ``output_exists``, ``output_size_bytes``, ``images_extracted`` and,
        when the output was archived, ``archived_at``. When stdout is not a
        JSON object, its tail is kept under ``raw_stdout_tail``. If the CLI
        exceeds the timeout, ``timed_out`` is True and ``returncode`` is -1.
    """
    timeout_seconds = 3600
    raw_root = UFO_ROOT / "raw"
    out_dir = raw_root / doc_id
    if out_dir.exists():
        # Move existing aside so we don't clobber. Bump the stamp until the
        # name is free: shutil.move() into an existing directory would nest
        # the old output inside the earlier backup instead of next to it.
        stamp = int(time.time())
        backup = raw_root / f"{doc_id}.backup-{stamp}"
        while backup.exists():
            stamp += 1
            backup = raw_root / f"{doc_id}.backup-{stamp}"
        shutil.move(str(out_dir), str(backup))
    out_dir.mkdir(parents=True, exist_ok=True)
    images_dir = out_dir / "images"
    images_dir.mkdir(exist_ok=True)
    (out_dir / "tables").mkdir(exist_ok=True)

    env = {**os.environ}
    if approach == "subagent":
        prompt = PROMPT_SUBAGENT.format(doc_id=doc_id, max_pages=max_pages)
    else:
        prompt = PROMPT_TEAM.format(doc_id=doc_id, max_pages=max_pages)
        env["CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS"] = "1"

    cmd = [
        "claude", "-p",
        "--model", "sonnet",
        "--output-format", "json",
        "--max-turns", "60",
        "--allowedTools", "Read,Write,Bash,Task",
        "--add-dir", str(UFO_ROOT),
    ]
    if approach == "team":
        # Inject teammate-mode flag; some experimental features need it
        cmd += ["--teammate-mode", "in-process"]
    # "--" ends option parsing so the prompt is always taken as positional.
    cmd += ["--", prompt]

    print(f"\n{'=' * 70}")
    print(f" APPROACH: {approach.upper()}")
    print(f"{'=' * 70}")
    print(f" cmd: {' '.join(cmd[:8])} … (prompt truncated)")
    print(f" starting at {utc_now_iso()}")
    sys.stdout.flush()

    timed_out = False
    # Monotonic clock: the measurement must not be affected by wall-clock jumps.
    t0 = time.monotonic()
    try:
        proc = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            env=env,
            check=False,
            timeout=timeout_seconds,
        )
    except subprocess.TimeoutExpired as exc:
        # Record the timeout as a failed run instead of aborting the script,
        # so metrics already gathered for another approach still get reported.
        timed_out = True
        returncode = -1
        stdout = _as_text(exc.stdout)
        stderr = _as_text(exc.stderr)
    else:
        returncode = proc.returncode
        stdout = proc.stdout
        stderr = proc.stderr
    wall = time.monotonic() - t0

    metrics = {
        "approach": approach,
        "wall_seconds": round(wall, 1),
        "returncode": returncode,
        "timed_out": timed_out,
        "stderr_tail": stderr[-1000:] if stderr else "",
    }

    cli: dict = {}
    if stdout:
        try:
            parsed = json.loads(stdout)
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict):
            cli = parsed
        else:
            # Keep the unparseable output for debugging rather than dropping it.
            metrics["raw_stdout_tail"] = stdout[-3000:]

    metrics["is_error"] = cli.get("is_error", returncode != 0)
    metrics["duration_ms"] = cli.get("duration_ms")
    metrics["duration_api_ms"] = cli.get("duration_api_ms")
    metrics["total_cost_usd"] = cli.get("total_cost_usd")
    metrics["num_turns"] = cli.get("num_turns")
    metrics["usage"] = cli.get("usage")
    metrics["result_excerpt"] = (cli.get("result") or "")[:2000]

    # Inspect output
    doc_md = out_dir / "document.md"
    produced = doc_md.exists()
    metrics["output_exists"] = produced
    metrics["output_size_bytes"] = doc_md.stat().st_size if produced else 0
    metrics["images_extracted"] = (
        len(list(images_dir.glob("*"))) if images_dir.exists() else 0
    )

    # Rename output so both approaches can coexist
    if produced:
        archive = raw_root / f"{doc_id}--{approach}"
        if archive.exists():
            shutil.rmtree(archive)
        shutil.move(str(out_dir), str(archive))
        metrics["archived_at"] = str(archive)

    print(f" finished in {wall:.1f}s · rc={returncode}")
    if timed_out:
        print(f" timed out after {timeout_seconds}s")
    print(f" output exists: {metrics['output_exists']} · size: {metrics['output_size_bytes']} bytes")
    print(f" images_extracted: {metrics['images_extracted']}")
    return metrics
def main():
    """Parse CLI arguments, run the requested approach(es) and report results.

    Runs the subagent approach, the team approach, or both (``--both``),
    prints a side-by-side comparison when both were run (otherwise the raw
    metrics as JSON), and writes the full metrics to
    ``raw/_ab-test-reports/<doc-id>--<timestamp>.json``.

    Exits with a usage error (status 2) when neither ``--approach`` nor
    ``--both`` is given, when ``--doc-id`` is not a plain directory name, or
    when ``--max-pages`` is less than 1.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--doc-id", required=True)
    ap.add_argument("--max-pages", type=int, default=20)
    ap.add_argument("--approach", choices=["subagent", "team"])
    ap.add_argument("--both", action="store_true")
    args = ap.parse_args()
    if not args.approach and not args.both:
        ap.error("provide --approach or --both")
    # The doc id is joined into paths that get moved and deleted, and into the
    # report file name; reject anything that is not a single path component
    # before any expensive or destructive work starts.
    if args.doc_id in {"", ".", ".."} or Path(args.doc_id).name != args.doc_id:
        ap.error("--doc-id must be a single directory name, not a path")
    if args.max_pages < 1:
        ap.error("--max-pages must be >= 1")

    results: dict[str, dict] = {}
    if args.both or args.approach == "subagent":
        results["subagent"] = run_approach("subagent", args.doc_id, args.max_pages)
    if args.both or args.approach == "team":
        results["team"] = run_approach("team", args.doc_id, args.max_pages)

    # Comparison table
    print(f"\n{'=' * 70}")
    print(f" COMPARISON — {args.doc_id} first {args.max_pages} pages")
    print(f"{'=' * 70}")
    if "subagent" in results and "team" in results:
        s = results["subagent"]
        t = results["team"]
        print(f" {'metric':<25} {'subagent':>20} {'team':>20}")
        print(f" {'-' * 25} {'-' * 20:>20} {'-' * 20:>20}")
        # Every value goes through str() so that None (e.g. a missing cost)
        # cannot break the width format spec.
        for key in (
            "wall_seconds",
            "returncode",
            "is_error",
            "total_cost_usd",
            "num_turns",
            "output_size_bytes",
            "images_extracted",
        ):
            print(f" {key:<25} {str(s.get(key)):>20} {str(t.get(key)):>20}")
    else:
        for k, v in results.items():
            print(json.dumps({k: v}, indent=2, default=str))

    # Save full result JSON
    report_dir = UFO_ROOT / "raw" / "_ab-test-reports"
    report_dir.mkdir(parents=True, exist_ok=True)
    report = report_dir / f"{args.doc_id}--{int(time.time())}.json"
    report.write_text(json.dumps(results, indent=2, default=str), encoding="utf-8")
    print(f"\nFull report: {report}")
# Script entry point: run the A/B test only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()