disclosure-bureau/scripts/34-generate-doc-pitches.py

#!/usr/bin/env python3
"""
34-generate-doc-pitches.py — Generate Johnny Harris-style "enthusiast pitches"
(50-150 words, PT-BR + EN) for every document in wiki/documents/*.md.

Each pitch is injected into the doc's frontmatter as:
  enthusiast_pitch_pt_br: "..."
  enthusiast_pitch_en: "..."

Style guide (encoded in the prompt):
  - Mystery hook opening (date + place)
  - Concrete sensory details, real witness names with credentials
  - Staccato pacing, repetition for emphasis
  - Cliffhanger question at the end
  - Length adapts to doc richness: single dense case → ~140w · multi-case → focus on
    pattern + zoom on one · sparse doc → 50-80w

Pattern: each doc in its OWN `claude -p` subprocess (clean context). Workers
parallel for throughput. Idempotent: skips docs that already have pitch.

Usage:
  ./34-generate-doc-pitches.py                    # all docs missing pitch
  ./34-generate-doc-pitches.py --workers 4
  ./34-generate-doc-pitches.py --doc-id doc-X     # single doc
  ./34-generate-doc-pitches.py --force            # regenerate even if exists
  ./34-generate-doc-pitches.py --model haiku      # cheaper, faster
"""
from __future__ import annotations

import argparse
import json
import os
import re
import subprocess
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from threading import Lock

try:
    import yaml
except ImportError:
    sys.stderr.write("pip3 install pyyaml\n"); sys.exit(1)


# Project root. Defaults to the original checkout location; set the UFO_ROOT
# environment variable to run the script from a different machine or path.
UFO_ROOT = Path(os.environ.get("UFO_ROOT") or "/Users/guto/ufo")
# Markdown documents whose frontmatter receives the generated pitches.
WIKI_DOCS = UFO_ROOT / "wiki" / "documents"
# Per-run bookkeeping lives next to the raw data; created on import so that
# worker threads can append to the log without checking for the directory.
LOG_DIR = UFO_ROOT / "raw" / "_pitch-generation"
LOG_DIR.mkdir(parents=True, exist_ok=True)
# One JSON record per processed document (append-only).
PROGRESS_LOG = LOG_DIR / "progress.jsonl"

# Serialises appends to PROGRESS_LOG across worker threads.
progress_lock = Lock()
quota_exhausted = False  # detected globally → stops new spawns


# System prompt handed to the CLI via `--system-prompt` in call_claude().
# It fixes the voice, the style rules, the length bands and the exact JSON
# shape ("pitch_pt_br" / "pitch_en") that call_claude() later extracts.
# The braces below are literal: this string is never passed through str.format.
SYSTEM_PROMPT = """You are writing in the voice of JOHNNY HARRIS — independent journalist, Vox/YouTube-style visual storyteller. Brazilian Portuguese (pt-br, NOT European). Companion piece for an English version.

Your goal: produce a SHORT (50-150 words) "enthusiast pitch" for a single declassified UAP/UFO document, designed to hook a curious lay reader on a website card.

STYLE RULES:
1. Open with a MYSTERY HOOK — date + place, concrete and grounded ("24 de abril de 1964, fim de tarde. Socorro, Novo México.")
2. Use staccato sentences. Forward motion. Plain language, no jargon.
3. Anchor in specific details: witness names + credentials, altitudes, coordinates, formal stamps (CONFIDENTIAL/RESTRICTED), unit numbers.
4. Repetition for emphasis. ("Sem som. Sem rastro.")
5. End with a CLIFFHANGER question or final stamp/document marker. Never a summary sentence.
6. Bold key facts with **markdown** if helpful (3-5 max).
7. Preserve verbatim quotes from the source in English when they're vivid (e.g., RESTRICTED, callsigns, military jargon).

LENGTH ADAPTS TO DOC RICHNESS:
- Sparse doc / form fragment: 50-80 words
- One dense case: 100-150 words
- Multi-case doc: lead with scale ("100 incidentes em uma pasta"), zoom on ONE vivid case, signal the rest ("e há mais 99 desses"), pattern recognition, final question.

OUTPUT FORMAT — return EXACTLY this JSON, nothing else:
{
  "pitch_pt_br": "...",
  "pitch_en": "..."
}

Both versions should hit roughly the same word count. PT-BR is the primary; EN is a faithful adaptation, NOT a literal translation."""


# Per-document user prompt. call_claude() fills the placeholders with
# str.format, so any literal brace added to this template must be doubled.
# The {body} slot receives at most the first 6000 characters of the document.
USER_PROMPT_TEMPLATE = """Generate the enthusiast pitch for this declassified document:

DOC ID: {doc_id}
TITLE: {canonical_title}
COLLECTION: {collection}
PAGES: {page_count}
CLASSIFICATION: {classification}

DOCUMENT BODY (truncated to first 6000 chars — focus on substantive content):
{body}

Return the JSON now."""


def utc_iso() -> str:
    """Return the current UTC time as `YYYY-MM-DDTHH:MM:SSZ` (second precision)."""
    import datetime as _dt

    now_utc = _dt.datetime.now(tz=_dt.timezone.utc)
    return now_utc.strftime("%Y-%m-%dT%H:%M:%SZ")


def append_jsonl(path: Path, record: dict) -> None:
    """Append `record` to `path` as a single JSON line.

    The write happens while holding the module-level `progress_lock`, so
    lines coming from different worker threads are never interleaved.
    Non-ASCII characters are written as-is (UTF-8), not as \\u escapes.
    """
    line = json.dumps(record, ensure_ascii=False) + "\n"
    with progress_lock, path.open("a", encoding="utf-8") as log_file:
        log_file.write(line)


def read_doc(doc_path: Path) -> tuple[dict, str, str]:
    """Split a markdown document into frontmatter and body.

    Args:
        doc_path: Path of the markdown file to read (UTF-8).

    Returns:
        A tuple `(fm, fm_text, body)`:
          * `fm` — the frontmatter parsed as a dict; `{}` when the file has no
            frontmatter, the YAML is invalid, or it is not a mapping.
          * `fm_text` — the raw (stripped) frontmatter text, `""` when the
            file has no frontmatter block. A non-empty `fm_text` together
            with an empty `fm` means the frontmatter could not be used.
          * `body` — everything after the closing delimiter with leading
            newlines removed, or the whole file when there is no frontmatter.
    """
    raw = doc_path.read_text(encoding="utf-8")
    if not raw.startswith("---"):
        return {}, "", raw

    # The closing delimiter has to be a line of its own. A plain substring
    # search would stop at the first "---" inside a frontmatter value (for
    # instance a title or a previously generated pitch) and cut the YAML short.
    # Searching from index 3 keeps "^" from matching the opening delimiter.
    closing = re.compile(r"^---[ \t]*$", re.MULTILINE).search(raw, 3)
    if closing is None:
        return {}, "", raw

    fm_text = raw[3:closing.start()].strip()
    body = raw[closing.end():].lstrip("\n")
    try:
        fm = yaml.safe_load(fm_text) or {}
    except yaml.YAMLError:
        fm = {}
    # Valid YAML that is a list or scalar is not usable frontmatter; callers
    # rely on dict methods such as .get().
    if not isinstance(fm, dict):
        fm = {}
    return fm, fm_text, body


def write_doc(doc_path: Path, fm: dict, body: str) -> None:
    """Rewrite `doc_path` as YAML frontmatter (`fm`) followed by `body`.

    The frontmatter is dumped with key order preserved, Unicode kept
    verbatim and a 120-column wrap. The new content is first written to a
    sibling temporary file and then moved over the original, so an
    interruption mid-write cannot leave a truncated document behind.

    Note: because the file is replaced rather than rewritten in place, a
    symlink at `doc_path` would be replaced by a regular file.

    Raises:
        OSError: if the temporary file cannot be written or swapped in.
    """
    new_yaml = yaml.safe_dump(fm, sort_keys=False, allow_unicode=True, width=120)
    new_raw = f"---\n{new_yaml}---\n{body}"

    # Hidden name with a ".tmp" suffix: never matched by the "*.md" glob.
    tmp_path = doc_path.with_name(f".{doc_path.name}.tmp")
    try:
        tmp_path.write_text(new_raw, encoding="utf-8")
        os.replace(tmp_path, doc_path)
    except BaseException:
        # Do not leave a stray temp file next to the documents.
        try:
            tmp_path.unlink()
        except FileNotFoundError:
            pass
        raise


def _decode_json_string(raw: str) -> str:
    """Decode the inside of a JSON string literal captured by a regex."""
    try:
        # strict=False accepts raw newlines/tabs that the model left unescaped,
        # while still decoding every standard escape (\n, \", \\, \uXXXX).
        return json.loads(f'"{raw}"', strict=False)
    except json.JSONDecodeError:
        # Not valid JSON escaping (e.g. a stray \'): unescape in ONE pass so
        # that an escaped backslash followed by "n" is not read as a newline.
        simple = {"n": "\n", "t": "\t", "r": "\r", '"': '"', "\\": "\\", "/": "/"}
        return re.sub(
            r"\\(.)",
            lambda m: simple.get(m.group(1), m.group(0)),
            raw,
            flags=re.DOTALL,
        )


def _extract_pitch_payload(result: str) -> dict | None:
    """Return the first usable JSON object found in the model reply, or None."""
    # Strategy 1: the whole reply is the JSON object.
    try:
        parsed = json.loads(result.strip())
    except json.JSONDecodeError:
        parsed = None
    if isinstance(parsed, dict) and parsed:
        return parsed

    # Strategy 2: pull the two fields out directly. This survives code fences,
    # surrounding prose and unescaped control characters inside the values.
    pt_match = re.search(
        r'"pitch_pt_br"\s*:\s*"((?:[^"\\]|\\.)*)"\s*,\s*"pitch_en"',
        result, re.DOTALL
    )
    en_match = re.search(
        r'"pitch_en"\s*:\s*"((?:[^"\\]|\\.)*)"\s*[},]',
        result, re.DOTALL
    )
    if pt_match and en_match:
        return {
            "pitch_pt_br": _decode_json_string(pt_match.group(1)),
            "pitch_en": _decode_json_string(en_match.group(1)),
        }

    # Strategy 3: decode a JSON object starting at each "{" in turn. The JSON
    # decoder understands braces inside string values, unlike brace counting.
    decoder = json.JSONDecoder()
    start = result.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(result[start:])
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict) and candidate:
            return candidate
        start = result.find("{", start + 1)
    return None


def call_claude(doc_id: str, fm: dict, body: str, model: str, timeout_s: int) -> tuple[bool, dict]:
    """Call `claude -p` for ONE pitch.

    Args:
        doc_id: Document identifier, used in the prompt and as title fallback.
        fm: Parsed frontmatter; title, collection, page count and
            classification are read from it when present.
        body: Document body; only the first 6000 characters are sent.
        model: Model alias passed to `--model`.
        timeout_s: Wall-clock limit for the subprocess, in seconds.

    Returns:
        `(True, payload)` with keys `pitch_pt_br`, `pitch_en`, `cost_usd`,
        `num_turns`, `usage` on success, otherwise `(False, info)` where
        `info["error"]` names the failure. Failures are reported through the
        return value; subprocess and parsing errors are not raised.

    Side effects:
        Sets the module-level `quota_exhausted` flag when the CLI output
        mentions a usage limit, which makes later calls return immediately.
    """
    global quota_exhausted
    if quota_exhausted:
        return False, {"error": "quota_exhausted_early_abort"}

    prompt = USER_PROMPT_TEMPLATE.format(
        doc_id=doc_id,
        canonical_title=fm.get("canonical_title") or doc_id,
        collection=fm.get("collection") or "—",
        page_count=fm.get("page_count") or "?",
        classification=fm.get("highest_classification") or fm.get("classification") or "—",
        body=body[:6000],
    )

    cmd = [
        "claude", "-p",
        "--model", model,
        "--output-format", "json",
        "--max-turns", "2",
        "--system-prompt", SYSTEM_PROMPT,
        "--",  # everything after this is the prompt, even if it starts with "-"
        prompt,
    ]

    try:
        proc = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            # Decode explicitly: the pitches contain accented characters and
            # the locale default is not guaranteed to be UTF-8.
            encoding="utf-8",
            errors="replace",
            check=False,
            timeout=timeout_s,
        )
    except subprocess.TimeoutExpired:
        return False, {"error": "timeout", "wall_seconds": timeout_s}
    except OSError as exc:
        # E.g. the `claude` executable is not on PATH.
        return False, {"error": "spawn_failed", "detail": str(exc)}

    stdout = proc.stdout or ""
    stderr = proc.stderr or ""

    if proc.returncode != 0:
        stdout_tail = stdout[-500:]
        stderr_tail = stderr[-500:]
        # The limit message may arrive on either stream.
        if "usage limit" in stdout_tail.lower() or "usage limit" in stderr_tail.lower():
            quota_exhausted = True
            return False, {"error": "quota_exhausted", "result_excerpt": stdout_tail or stderr_tail}
        return False, {
            "error": "rc_nonzero",
            "rc": proc.returncode,
            "stderr": stderr_tail,
            "stdout": stdout_tail,
        }

    try:
        cli = json.loads(stdout)
    except json.JSONDecodeError:
        return False, {"error": "cli_json_parse", "raw": stdout[-500:]}
    if not isinstance(cli, dict):
        return False, {"error": "cli_json_parse", "raw": stdout[-500:]}

    result = cli.get("result", "")
    if not isinstance(result, str) or not result:
        return False, {"error": "empty_result", "cli": cli}

    payload = _extract_pitch_payload(result)
    if payload is None:
        # NOTE(review): a quota message might also come back with exit code 0
        # as the plain-text result — confirm against the CLI version in use.
        if "usage limit" in result.lower():
            quota_exhausted = True
            return False, {"error": "quota_exhausted", "result_excerpt": result[:600]}
        return False, {"error": "no_extractable_json", "result_excerpt": result[:600]}

    pt = payload.get("pitch_pt_br")
    en = payload.get("pitch_en")
    # Reject absent, non-string or blank pitches instead of crashing on
    # .strip() or writing an empty pitch into the document.
    if not isinstance(pt, str) or not isinstance(en, str) or not pt.strip() or not en.strip():
        return False, {"error": "missing_fields", "payload": payload}

    return True, {
        "pitch_pt_br": pt.strip(),
        "pitch_en": en.strip(),
        "cost_usd": cli.get("total_cost_usd"),
        "num_turns": cli.get("num_turns"),
        "usage": cli.get("usage"),
    }


def word_count(text: str) -> int:
    """Count the whitespace-separated tokens in `text` (0 for blank input)."""
    pieces = re.split(r"\s+", text)
    # Splitting yields empty strings at leading/trailing whitespace; skip them.
    return sum(1 for piece in pieces if piece)


def process_doc(doc_id: str, force: bool, model: str, timeout_s: int) -> dict:
    """Generate + inject pitch for ONE doc.

    Args:
        doc_id: File stem of the document inside WIKI_DOCS.
        force: Regenerate even when both pitch fields already exist.
        model: Model alias forwarded to call_claude().
        timeout_s: Per-document subprocess timeout in seconds.

    Returns:
        A result record. `success` is always present; failures carry an
        `error` key, skipped docs carry `skipped: True`. Every attempt that
        reached the model is also appended to PROGRESS_LOG.
    """
    doc_path = WIKI_DOCS / f"{doc_id}.md"
    if not doc_path.exists():
        return {"doc_id": doc_id, "success": False, "error": "doc_not_found"}

    fm, fm_text, body = read_doc(doc_path)

    # Frontmatter text exists but produced no usable mapping: writing back
    # would replace the document's original metadata with just the pitch
    # fields, so refuse to touch the file.
    if not isinstance(fm, dict) or (fm_text.strip() and not fm):
        return {"doc_id": doc_id, "success": False, "error": "frontmatter_unparseable"}

    if not force and fm.get("enthusiast_pitch_pt_br") and fm.get("enthusiast_pitch_en"):
        return {"doc_id": doc_id, "success": True, "skipped": True, "reason": "already_has_pitch"}

    # Stamp the start BEFORE the (slow) model call so the field means what
    # its name says.
    started_at = utc_iso()
    t0 = time.monotonic()
    ok, result = call_claude(doc_id, fm, body, model, timeout_s)
    wall = round(time.monotonic() - t0, 1)

    rec = {
        "doc_id": doc_id,
        "started_at": started_at,
        "wall_seconds": wall,
        "model": model,
    }
    if not ok:
        rec.update({"success": False, **result})
        append_jsonl(PROGRESS_LOG, rec)
        return rec

    pt = result["pitch_pt_br"]
    en = result["pitch_en"]
    pt_words = word_count(pt)
    en_words = word_count(en)
    rec.update({
        "success": True,
        "pt_words": pt_words,
        "en_words": en_words,
        "cost_usd": result.get("cost_usd"),
    })

    # Validate word count (warning only: the pitch is still written).
    if not (40 <= pt_words <= 200) or not (40 <= en_words <= 200):
        rec["warning"] = f"word_count_oob pt={pt_words} en={en_words}"

    # Inject into frontmatter
    fm["enthusiast_pitch_pt_br"] = pt
    fm["enthusiast_pitch_en"] = en
    fm["enthusiast_pitch_generated_at"] = utc_iso()
    fm["enthusiast_pitch_model"] = model
    try:
        write_doc(doc_path, fm, body)
    except OSError as exc:
        # Keep a trace of the failed write (and of the money already spent).
        rec.update({"success": False, "error": "write_failed", "detail": str(exc)})
        append_jsonl(PROGRESS_LOG, rec)
        return rec

    append_jsonl(PROGRESS_LOG, rec)
    return rec


def list_target_docs(only: str | None) -> list[str]:
    """Return the document ids to process.

    When `only` is given it is returned as the single target; otherwise the
    stems of every `*.md` file in WIKI_DOCS are returned in sorted path order.
    """
    if only:
        return [only]
    return [md_file.stem for md_file in sorted(WIKI_DOCS.glob("*.md"))]


def main():
    """CLI entry point: queue the target docs and generate pitches in parallel.

    Docs that already carry both pitch fields are filtered out up front
    (unless --force), the remaining queue is optionally capped by --limit,
    and each doc is processed in its own worker thread. The run stops
    scheduling new work as soon as a quota error is detected.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--workers", type=int, default=4)
    ap.add_argument("--doc-id", default=None)
    ap.add_argument("--force", action="store_true")
    ap.add_argument("--model", default="sonnet", choices=["sonnet", "haiku"])
    ap.add_argument("--timeout-per-doc", type=int, default=180)
    ap.add_argument("--limit", type=int, default=None, help="Smoke test: process at most N")
    args = ap.parse_args()

    docs = list_target_docs(args.doc_id)

    skipped = 0
    if not args.force:
        # Skip docs already done
        keep: list[str] = []
        for d in docs:
            doc_path = WIKI_DOCS / f"{d}.md"
            try:
                fm, _, _ = read_doc(doc_path)
            except (OSError, UnicodeDecodeError):
                # Missing/unreadable file: keep it queued so the failure is
                # reported per-doc instead of crashing the whole run here.
                keep.append(d)
                continue
            done = isinstance(fm, dict) and fm.get("enthusiast_pitch_pt_br") and fm.get("enthusiast_pitch_en")
            if not done:
                keep.append(d)
        skipped = len(docs) - len(keep)
        docs = keep

    # Cap AFTER filtering so "--limit N" yields N docs that still need work,
    # not the first N files of the corpus (which may all be done already).
    if args.limit and args.limit > 0:
        docs = docs[: args.limit]

    print("=" * 70)
    print(f"  ENTHUSIAST PITCH GENERATOR — {len(docs)} docs queued, {skipped} already done")
    print(f"  workers: {args.workers} · model: {args.model} · timeout: {args.timeout_per_doc}s/doc")
    print(f"  started: {utc_iso()}")
    print("=" * 70)
    sys.stdout.flush()

    t0 = time.time()
    ok = err = 0
    total_cost = 0.0

    # ThreadPoolExecutor rejects max_workers <= 0.
    with ThreadPoolExecutor(max_workers=max(1, args.workers)) as ex:
        futures = {ex.submit(process_doc, d, args.force, args.model, args.timeout_per_doc): d for d in docs}
        for fut in as_completed(futures):
            doc_id = futures[fut]
            try:
                r = fut.result()
            except Exception as e:
                # Top-level boundary: one broken doc must not stop the batch.
                r = {"doc_id": doc_id, "success": False, "exception": str(e)}

            if r.get("success"):
                ok += 1
                total_cost += r.get("cost_usd") or 0
                marker = "⊘" if r.get("skipped") else "✓"
                wc = f"pt={r.get('pt_words','?')}w en={r.get('en_words','?')}w" if not r.get("skipped") else "(cached)"
                print(f"  [{ok+err}/{len(docs)}] {marker} {doc_id[:55]} · {wc} · ${r.get('cost_usd') or 0:.3f}")
            else:
                err += 1
                marker = "💸" if r.get("error") == "quota_exhausted" else "✗"
                # Worker exceptions are stored under "exception", not "error".
                reason = r.get("error") or r.get("exception")
                print(f"  [{ok+err}/{len(docs)}] {marker} {doc_id[:55]} · {reason}")
            sys.stdout.flush()

            if quota_exhausted:
                # Cancel pending — early abort
                for f in futures:
                    if not f.done():
                        f.cancel()
                print("\n  ⚠ QUOTA EXHAUSTED — aborting. Re-run later.")
                break

    print(f"\n{'=' * 70}")
    print(f"  DONE — {ok}/{len(docs)} succeeded · ${total_cost:.2f} · {round(time.time() - t0, 1)}s")
    print(f"{'=' * 70}")


if __name__ == "__main__":
    main()
baseline: Disclosure Bureau pipeline + Next.js UI + Supabase stack 2026-05-18 01:44:36 +00:00			`#!/usr/bin/env python3`
			`"""`
			`34-generate-doc-pitches.py — Generate Johnny Harris-style "enthusiast pitches"`
			`(50-150 words, PT-BR + EN) for every document in wiki/documents/*.md.`

			`Each pitch is injected into the doc's frontmatter as:`
			`enthusiast_pitch_pt_br: "..."`
			`enthusiast_pitch_en: "..."`

			`Style guide (encoded in the prompt):`
			`- Mystery hook opening (date + place)`
			`- Concrete sensory details, real witness names with credentials`
			`- Staccato pacing, repetition for emphasis`
			`- Cliffhanger question at the end`
			`- Length adapts to doc richness: single dense case → ~140w · multi-case → focus on`
			`pattern + zoom on one · sparse doc → 50-80w`

			Pattern: each doc in its OWN `claude -p` subprocess (clean context). Workers
			`parallel for throughput. Idempotent: skips docs that already have pitch.`

			`Usage:`
			`./34-generate-doc-pitches.py # all docs missing pitch`
			`./34-generate-doc-pitches.py --workers 4`
			`./34-generate-doc-pitches.py --doc-id doc-X # single doc`
			`./34-generate-doc-pitches.py --force # regenerate even if exists`
			`./34-generate-doc-pitches.py --model haiku # cheaper, faster`
			`"""`
			`from __future__ import annotations`

			`import argparse`
			`import json`
			`import os`
			`import re`
			`import subprocess`
			`import sys`
			`import time`
			`from concurrent.futures import ThreadPoolExecutor, as_completed`
			`from pathlib import Path`
			`from threading import Lock`

			`try:`
			`import yaml`
			`except ImportError:`
			`sys.stderr.write("pip3 install pyyaml\n"); sys.exit(1)`


			`UFO_ROOT = Path("/Users/guto/ufo")`
			`WIKI_DOCS = UFO_ROOT / "wiki" / "documents"`
			`LOG_DIR = UFO_ROOT / "raw" / "_pitch-generation"`
			`LOG_DIR.mkdir(parents=True, exist_ok=True)`
			`PROGRESS_LOG = LOG_DIR / "progress.jsonl"`

			`progress_lock = Lock()`
			`quota_exhausted = False # detected globally → stops new spawns`


			`SYSTEM_PROMPT = """You are writing in the voice of JOHNNY HARRIS — independent journalist, Vox/YouTube-style visual storyteller. Brazilian Portuguese (pt-br, NOT European). Companion piece for an English version.`

			`Your goal: produce a SHORT (50-150 words) "enthusiast pitch" for a single declassified UAP/UFO document, designed to hook a curious lay reader on a website card.`

			`STYLE RULES:`
			`1. Open with a MYSTERY HOOK — date + place, concrete and grounded ("24 de abril de 1964, fim de tarde. Socorro, Novo México.")`
			`2. Use staccato sentences. Forward motion. Plain language, no jargon.`
			`3. Anchor in specific details: witness names + credentials, altitudes, coordinates, formal stamps (CONFIDENTIAL/RESTRICTED), unit numbers.`
			`4. Repetition for emphasis. ("Sem som. Sem rastro.")`
			`5. End with a CLIFFHANGER question or final stamp/document marker. Never a summary sentence.`
			`6. Bold key facts with markdown if helpful (3-5 max).`
			`7. Preserve verbatim quotes from the source in English when they're vivid (e.g., RESTRICTED, callsigns, military jargon).`

			`LENGTH ADAPTS TO DOC RICHNESS:`
			`- Sparse doc / form fragment: 50-80 words`
			`- One dense case: 100-150 words`
			`- Multi-case doc: lead with scale ("100 incidentes em uma pasta"), zoom on ONE vivid case, signal the rest ("e há mais 99 desses"), pattern recognition, final question.`

			`OUTPUT FORMAT — return EXACTLY this JSON, nothing else:`
			`{`
			`"pitch_pt_br": "...",`
			`"pitch_en": "..."`
			`}`

			`Both versions should hit roughly the same word count. PT-BR is the primary; EN is a faithful adaptation, NOT a literal translation."""`


			`USER_PROMPT_TEMPLATE = """Generate the enthusiast pitch for this declassified document:`

			`DOC ID: {doc_id}`
			`TITLE: {canonical_title}`
			`COLLECTION: {collection}`
			`PAGES: {page_count}`
			`CLASSIFICATION: {classification}`

			`DOCUMENT BODY (truncated to first 6000 chars — focus on substantive content):`
			`{body}`

			`Return the JSON now."""`


			`def utc_iso() -> str:`
			`from datetime import datetime, timezone`
			`return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")`


			`def append_jsonl(path: Path, record: dict) -> None:`
			`with progress_lock:`
			`with path.open("a", encoding="utf-8") as fh:`
			`fh.write(json.dumps(record, ensure_ascii=False) + "\n")`


			`def read_doc(doc_path: Path) -> tuple[dict, str, str]:`
			`raw = doc_path.read_text(encoding="utf-8")`
			`if not raw.startswith("---"):`
			`return {}, "", raw`
			`end = raw.find("---", 4)`
			`if end < 0:`
			`return {}, "", raw`
			`fm_text = raw[3:end].strip()`
			`body = raw[end + 3 :].lstrip("\n")`
			`try:`
			`fm = yaml.safe_load(fm_text) or {}`
			`except yaml.YAMLError:`
			`fm = {}`
			`return fm, fm_text, body`


			`def write_doc(doc_path: Path, fm: dict, body: str) -> None:`
			`new_yaml = yaml.safe_dump(fm, sort_keys=False, allow_unicode=True, width=120)`
			`new_raw = f"---\n{new_yaml}---\n{body}"`
			`doc_path.write_text(new_raw, encoding="utf-8")`


			`def call_claude(doc_id: str, fm: dict, body: str, model: str, timeout_s: int) -> tuple[bool, dict]:`
			"""Call `claude -p` for ONE pitch. Returns (success, payload)."""
			`global quota_exhausted`
			`if quota_exhausted:`
			`return False, {"error": "quota_exhausted_early_abort"}`

			`prompt = USER_PROMPT_TEMPLATE.format(`
			`doc_id=doc_id,`
			`canonical_title=fm.get("canonical_title") or doc_id,`
			`collection=fm.get("collection") or "—",`
			`page_count=fm.get("page_count") or "?",`
			`classification=fm.get("highest_classification") or fm.get("classification") or "—",`
			`body=body[:6000],`
			`)`

			`cmd = [`
			`"claude", "-p",`
			`"--model", model,`
			`"--output-format", "json",`
			`"--max-turns", "2",`
			`"--system-prompt", SYSTEM_PROMPT,`
			`"--",`
			`prompt,`
			`]`

			`try:`
			`proc = subprocess.run(`
			`cmd,`
			`capture_output=True,`
			`text=True,`
			`env={**os.environ},`
			`check=False,`
			`timeout=timeout_s,`
			`)`
			`except subprocess.TimeoutExpired:`
			`return False, {"error": "timeout", "wall_seconds": timeout_s}`

			`if proc.returncode != 0:`
			`excerpt = (proc.stdout or "")[-500:]`
			`if "monthly usage limit" in excerpt.lower() or "usage limit" in excerpt.lower():`
			`quota_exhausted = True`
			`return False, {"error": "quota_exhausted", "result_excerpt": excerpt}`
			`return False, {"error": "rc_nonzero", "rc": proc.returncode, "stderr": (proc.stderr or "")[-500:]}`

			`try:`
			`cli = json.loads(proc.stdout)`
			`except json.JSONDecodeError:`
			`return False, {"error": "cli_json_parse", "raw": proc.stdout[-500:]}`

			`result = cli.get("result", "")`
			`if not result:`
			`return False, {"error": "empty_result", "cli": cli}`

			`# Try multiple strategies to extract the pitches robustly`
			`payload = None`

			`# Strategy 1: try parsing the whole result as JSON`
			`try:`
			`payload = json.loads(result.strip())`
			`except json.JSONDecodeError:`
			`pass`

			`# Strategy 2: regex for the two fields directly (handles unescaped chars in values)`
			`if not payload:`
			`# Match: "pitch_pt_br": "<content until next ", " ... pitch_en or end of object>"`
			`# We use a more flexible approach: split on the field names`
			`pt_match = re.search(`
			`r'"pitch_pt_br"\s:\s"((?:[^"\\]\|\\.))"\s,\s*"pitch_en"',`
			`result, re.DOTALL`
			`)`
			`en_match = re.search(`
			`r'"pitch_en"\s:\s"((?:[^"\\]\|\\.))"\s[},]',`
			`result, re.DOTALL`
			`)`
			`if pt_match and en_match:`
			`pt = pt_match.group(1).replace('\\"', '"').replace('\\n', '\n').replace('\\\\', '\\')`
			`en = en_match.group(1).replace('\\"', '"').replace('\\n', '\n').replace('\\\\', '\\')`
			`payload = {"pitch_pt_br": pt, "pitch_en": en}`

			`# Strategy 3: balanced-brace extraction`
			`if not payload:`
			`try:`
			`start = result.index("{")`
			`depth = 0`
			`for i in range(start, len(result)):`
			`if result[i] == "{": depth += 1`
			`elif result[i] == "}":`
			`depth -= 1`
			`if depth == 0:`
			`try:`
			`payload = json.loads(result[start : i + 1])`
			`except json.JSONDecodeError:`
			`pass`
			`break`
			`except ValueError:`
			`pass`

			`if not payload:`
			`return False, {"error": "no_extractable_json", "result_excerpt": result[:600]}`

			`if "pitch_pt_br" not in payload or "pitch_en" not in payload:`
			`return False, {"error": "missing_fields", "payload": payload}`

			`return True, {`
			`"pitch_pt_br": payload["pitch_pt_br"].strip(),`
			`"pitch_en": payload["pitch_en"].strip(),`
			`"cost_usd": cli.get("total_cost_usd"),`
			`"num_turns": cli.get("num_turns"),`
			`"usage": cli.get("usage"),`
			`}`


			`def word_count(text: str) -> int:`
			`return len([w for w in re.split(r"\s+", text) if w])`


			`def process_doc(doc_id: str, force: bool, model: str, timeout_s: int) -> dict:`
			`"""Generate + inject pitch for ONE doc."""`
			`doc_path = WIKI_DOCS / f"{doc_id}.md"`
			`if not doc_path.exists():`
			`return {"doc_id": doc_id, "success": False, "error": "doc_not_found"}`

			`fm, _, body = read_doc(doc_path)`

			`if not force and fm.get("enthusiast_pitch_pt_br") and fm.get("enthusiast_pitch_en"):`
			`return {"doc_id": doc_id, "success": True, "skipped": True, "reason": "already_has_pitch"}`

			`t0 = time.time()`
			`ok, result = call_claude(doc_id, fm, body, model, timeout_s)`
			`wall = round(time.time() - t0, 1)`

			`rec = {`
			`"doc_id": doc_id,`
			`"started_at": utc_iso(),`
			`"wall_seconds": wall,`
			`"model": model,`
			`}`
			`if not ok:`
			`rec.update({"success": False, **result})`
			`append_jsonl(PROGRESS_LOG, rec)`
			`return rec`

			`pt = result["pitch_pt_br"]`
			`en = result["pitch_en"]`
			`rec.update({`
			`"success": True,`
			`"pt_words": word_count(pt),`
			`"en_words": word_count(en),`
			`"cost_usd": result.get("cost_usd"),`
			`})`

			`# Validate word count`
			`if not (40 <= word_count(pt) <= 200) or not (40 <= word_count(en) <= 200):`
			`rec["warning"] = f"word_count_oob pt={word_count(pt)} en={word_count(en)}"`

			`# Inject into frontmatter`
			`fm["enthusiast_pitch_pt_br"] = pt`
			`fm["enthusiast_pitch_en"] = en`
			`fm["enthusiast_pitch_generated_at"] = utc_iso()`
			`fm["enthusiast_pitch_model"] = model`
			`write_doc(doc_path, fm, body)`

			`append_jsonl(PROGRESS_LOG, rec)`
			`return rec`


			`def list_target_docs(only: str \| None) -> list[str]:`
			`if only:`
			`return [only]`
			`docs: list[str] = []`
			`for p in sorted(WIKI_DOCS.glob("*.md")):`
			`docs.append(p.stem)`
			`return docs`


			`def main():`
			`ap = argparse.ArgumentParser()`
			`ap.add_argument("--workers", type=int, default=4)`
			`ap.add_argument("--doc-id", default=None)`
			`ap.add_argument("--force", action="store_true")`
			`ap.add_argument("--model", default="sonnet", choices=["sonnet", "haiku"])`
			`ap.add_argument("--timeout-per-doc", type=int, default=180)`
			`ap.add_argument("--limit", type=int, default=None, help="Smoke test: process at most N")`
			`args = ap.parse_args()`

			`docs = list_target_docs(args.doc_id)`
			`if args.limit:`
			`docs = docs[: args.limit]`

			`if not args.force:`
			`# Skip docs already done`
			`keep: list[str] = []`
			`for d in docs:`
			`fm, _, _ = read_doc(WIKI_DOCS / f"{d}.md")`
			`if not (fm.get("enthusiast_pitch_pt_br") and fm.get("enthusiast_pitch_en")):`
			`keep.append(d)`
			`skipped = len(docs) - len(keep)`
			`docs = keep`
			`else:`
			`skipped = 0`

			`print(f"=" * 70)`
			`print(f" ENTHUSIAST PITCH GENERATOR — {len(docs)} docs queued, {skipped} already done")`
			`print(f" workers: {args.workers} · model: {args.model} · timeout: {args.timeout_per_doc}s/doc")`
			`print(f" started: {utc_iso()}")`
			`print(f"=" * 70)`
			`sys.stdout.flush()`

			`t0 = time.time()`
			`ok = err = 0`
			`total_cost = 0.0`

			`with ThreadPoolExecutor(max_workers=args.workers) as ex:`
			`futures = {ex.submit(process_doc, d, args.force, args.model, args.timeout_per_doc): d for d in docs}`
			`for fut in as_completed(futures):`
			`doc_id = futures[fut]`
			`try:`
			`r = fut.result()`
			`except Exception as e:`
			`r = {"doc_id": doc_id, "success": False, "exception": str(e)}`

			`if r.get("success"):`
			`ok += 1`
			`total_cost += r.get("cost_usd") or 0`
			`marker = "⊘" if r.get("skipped") else "✓"`
			`wc = f"pt={r.get('pt_words','?')}w en={r.get('en_words','?')}w" if not r.get("skipped") else "(cached)"`
			`print(f" [{ok+err}/{len(docs)}] {marker} {doc_id[:55]} · {wc} · ${r.get('cost_usd') or 0:.3f}")`
			`else:`
			`err += 1`
			`marker = "💸" if r.get("error") == "quota_exhausted" else "✗"`
			`print(f" [{ok+err}/{len(docs)}] {marker} {doc_id[:55]} · {r.get('error')}")`
			`sys.stdout.flush()`

			`if quota_exhausted:`
			`# Cancel pending — early abort`
			`for f in futures:`
			`if not f.done():`
			`f.cancel()`
			`print("\n ⚠ QUOTA EXHAUSTED — aborting. Re-run later.")`
			`break`

			`print(f"\n{'=' * 70}")`
			`print(f" DONE — {ok}/{len(docs)} succeeded · ${total_cost:.2f} · {round(time.time() - t0, 1)}s")`
			`print(f"{'=' * 70}")`


			`if __name__ == "__main__":`
			`main()`