#!/usr/bin/env python3
"""
41_strip_stubs.py — Eliminate "Will be enriched in Phase N" placeholders from
wiki/entities/**/*.md.

For each entity markdown file:
  Frontmatter:
    - narrative_summary: "_Stub. Will be enriched in Phase 7._" → null
    - narrative_summary_pt_br: "_Stub. Será enriquecido na Fase 7._" → null
    - drop narrative_summary_confidence
    - add summary_status: 'none'
    - add summary_confidence: null (only if the key is not already present)
    - set last_lint to the current UTC timestamp

  Body:
    - Remove lines that consist solely of "_Stub generated by entity dedup.
      Will be enriched in Phase 6._" / "_Stub gerado pela deduplicação de
      entidades. Será enriquecido na Fase 6._" (surrounding whitespace is
      ignored).
    - Collapse runs of 3+ newlines into a single blank line; all other body
      text is kept as-is (curated narrative is preserved).

Idempotent: a file already migrated (has `summary_status` field) is skipped.

Usage:
  ./41_strip_stubs.py                   # process all entities
  ./41_strip_stubs.py --class events    # only one class
  ./41_strip_stubs.py --dry-run         # report counts, don't write
  ./41_strip_stubs.py --verbose         # list each touched file
"""
from __future__ import annotations

import argparse
import re
import sys
from datetime import datetime, timezone
from pathlib import Path

try:
    import yaml
except ImportError:
    sys.stderr.write("pip3 install pyyaml\n")
    sys.exit(1)

# Project root, taken as parents[2] of this script's resolved path.
UFO_ROOT = Path(__file__).resolve().parents[2]
ENTITIES_BASE = UFO_ROOT.joinpath("wiki", "entities")
LOG_PATH = UFO_ROOT.joinpath("wiki", "log.md")

# Placeholder values found in the frontmatter summary fields (English / pt-BR).
STUB_EN = "_Stub. Will be enriched in Phase 7._"
STUB_PT = "_Stub. Será enriquecido na Fase 7._"
# Placeholder lines found in the markdown body (English / pt-BR).
BODY_STUB_EN = "_Stub generated by entity dedup. Will be enriched in Phase 6._"
BODY_STUB_PT = "_Stub gerado pela deduplicação de entidades. Será enriquecido na Fase 6._"

# Matches a line whose only content is one of the body stubs. The surrounding
# ``\s*`` tolerates indentation / trailing whitespace and, because ``\s`` also
# matches newlines, may absorb adjacent blank lines as part of the match.
BODY_STUB_LINE_RE = re.compile(
    r"^\s*(?:" + "|".join(map(re.escape, (BODY_STUB_EN, BODY_STUB_PT))) + r")\s*$",
    flags=re.MULTILINE,
)


def utc_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (second resolution)."""
    now_utc = datetime.now(tz=timezone.utc)
    return now_utc.strftime("%Y-%m-%dT%H:%M:%SZ")


def split_md(path: Path) -> tuple[dict, str, bool]:
    """Split a markdown file into its YAML frontmatter and its body.

    The frontmatter must open with a ``---`` line at the very start of the
    file and close with a later line that consists only of ``---`` (trailing
    spaces/tabs allowed). Requiring whole-line fences prevents a ``---`` that
    merely appears inside a frontmatter value from being mistaken for the
    closing fence, which would truncate the metadata and leak the remainder
    into the body.

    Args:
        path: Markdown file to read (decoded as UTF-8).

    Returns:
        ``(frontmatter, body, had_frontmatter)``. When the file has no opening
        fence, the fence is never closed, the YAML fails to parse, or the YAML
        document is not a mapping, the result is ``({}, raw_text, False)``.
        Otherwise ``body`` is the text after the closing fence with leading
        newlines removed.
    """
    raw = path.read_text(encoding="utf-8")

    first_line, newline, _ = raw.partition("\n")
    if not newline or first_line.rstrip() != "---":
        return {}, raw, False

    yaml_start = len(first_line) + 1
    # With MULTILINE, ``^`` only matches at real line starts even when the
    # search begins at an offset, so an inline "---" can never match.
    closing = re.compile(r"^---[ \t]*$", re.MULTILINE).search(raw, yaml_start)
    if closing is None:
        return {}, raw, False

    try:
        fm = yaml.safe_load(raw[yaml_start:closing.start()]) or {}
    except yaml.YAMLError:
        return {}, raw, False
    # Callers treat the frontmatter as a dict; a scalar or list document is
    # not usable metadata, so report it as "no frontmatter" instead.
    if not isinstance(fm, dict):
        return {}, raw, False

    body = raw[closing.end():].lstrip("\n")
    return fm, body, True


def render_md(fm: dict, body: str) -> str:
    """Serialise frontmatter and body back into a single markdown document.

    The mapping is dumped as block-style YAML in its existing key order with
    non-ASCII characters left unescaped, wrapped in ``---`` fences. A newline
    is inserted after the closing fence unless *body* already begins with one.
    """
    front = yaml.dump(fm, default_flow_style=False, sort_keys=False, allow_unicode=True)
    if not body.startswith("\n"):
        body = "\n" + body
    return "---\n" + front + "---\n" + body


def clean_body(body: str) -> str:
    """Strip stub-only lines from *body* and tidy the gaps they leave.

    Every match of ``BODY_STUB_LINE_RE`` is deleted, then any run of three or
    more consecutive newlines is reduced to exactly two.
    """
    without_stubs = BODY_STUB_LINE_RE.sub("", body)
    return re.sub(r"\n{3,}", "\n\n", without_stubs)


def migrate_file(path: Path, dry_run: bool, verbose: bool) -> tuple[bool, bool]:
    """Migrate a single entity file away from stub placeholders.

    Args:
        path: Entity markdown file to process.
        dry_run: When true, nothing is written to disk.
        verbose: When true, print one line per file that is (or would be)
            rewritten.

    Returns:
        ``(changed, already_migrated)``:
        ``(False, False)`` if the file has no parseable frontmatter,
        ``(False, True)`` if ``summary_status`` is already present, and
        ``(True, False)`` for every other file, because the status fields are
        always added even when there was no stub text to strip.
    """
    frontmatter, body, has_frontmatter = split_md(path)
    if not has_frontmatter:
        return (False, False)

    # A file carrying summary_status was handled by a previous run.
    if "summary_status" in frontmatter:
        return (False, True)

    # Work on a copy; dict() keeps the original key order.
    updated = dict(frontmatter)

    # Blank out summary fields that still hold the placeholder text.
    placeholders = (
        ("narrative_summary", STUB_EN),
        ("narrative_summary_pt_br", STUB_PT),
    )
    for key, stub in placeholders:
        if updated.get(key) == stub:
            updated[key] = None

    # The legacy confidence key is superseded by summary_confidence.
    updated.pop("narrative_summary_confidence", None)

    # These fields double as the "already migrated" marker for later runs.
    # NOTE(review): status is set to 'none' even when narrative_summary holds
    # non-stub text — confirm that is the intended schema.
    updated["summary_status"] = "none"
    updated.setdefault("summary_confidence", None)
    # Refresh last_lint so downstream tools notice the modification.
    updated["last_lint"] = utc_iso()

    stripped_body = clean_body(body)

    if not dry_run:
        path.write_text(render_md(updated, stripped_body), encoding="utf-8")

    if verbose:
        if dry_run:
            print(f"  (dry) {path.relative_to(UFO_ROOT)}")
        else:
            print(f"  ✓ {path.relative_to(UFO_ROOT)}")
    return (True, False)


def main() -> int:
    """Run the stub-stripping migration from the command line.

    Options:
        ``--class NAME``  restrict the run to ``wiki/entities/NAME``.
        ``--dry-run``     report counts without writing any file or log entry.
        ``--verbose``     print each file that is (or would be) rewritten.

    Every ``*.md`` file directly inside each class folder is passed to
    ``migrate_file``. A summary is printed, and when at least one file was
    actually rewritten an entry is appended to ``LOG_PATH``.

    Returns:
        Process exit status: ``0`` on success, ``1`` when the entities base
        directory or the requested ``--class`` folder does not exist.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--class", dest="cls", default=None,
                   help="restrict to one class folder under wiki/entities/")
    p.add_argument("--dry-run", action="store_true")
    p.add_argument("--verbose", action="store_true")
    args = p.parse_args()

    if not ENTITIES_BASE.exists():
        sys.stderr.write(f"missing {ENTITIES_BASE}\n")
        return 1

    if args.cls:
        class_dir = ENTITIES_BASE / args.cls
        # A mistyped class name would otherwise scan nothing and still exit 0.
        if not class_dir.is_dir():
            sys.stderr.write(f"missing {class_dir}\n")
            return 1
        roots = [class_dir]
    else:
        # iterdir() order is arbitrary; sort so runs are reproducible.
        roots = sorted(d for d in ENTITIES_BASE.iterdir() if d.is_dir())

    total = 0
    migrated = 0
    skipped_already = 0
    for root in roots:
        for path in sorted(root.glob("*.md")):
            total += 1
            changed, already = migrate_file(path, args.dry_run, args.verbose)
            if already:
                skipped_already += 1
            elif changed:
                migrated += 1

    print()
    print(f"  total scanned:        {total}")
    print(f"  migrated:             {migrated}")
    print(f"  already migrated:     {skipped_already}")
    print(f"  dry-run:              {args.dry_run}")

    # Only record a log entry when files were really modified on disk.
    if not args.dry_run and migrated > 0:
        LOG_PATH.parent.mkdir(parents=True, exist_ok=True)
        with LOG_PATH.open("a", encoding="utf-8") as f:
            f.write(
                f"\n## {utc_iso()} · STRIP_STUBS\n"
                f"- script: scripts/maintain/41_strip_stubs.py\n"
                f"- scanned: {total}\n"
                f"- migrated: {migrated}\n"
                f"- already_migrated: {skipped_already}\n"
            )

    return 0


# Script entry point: exit with the status code returned by main().
if __name__ == "__main__":
    raise SystemExit(main())