sanitize entities: single YAML source of truth, signal_strength badge

The corpus had two parallel reverse-reference signals: the wiki/pages entities_extracted blocks (Haiku page-level) and public.entity_mentions (Sonnet chunk-level, ILIKE-matched). The entity page only consulted the DB, so it showed "0 menções" for thousands of entities that were anchored in pages or in cross-entity links the DB never indexed. Resolved by collapsing all signals into the YAML frontmatter, which is now the single runtime source for entity metadata. scripts/maintain/42_sync_entity_stats.py walks every entity and writes: mentioned_in: [...] # consolidated page refs total_mentions: max(db, pages) documents_count: max(db_docs, distinct page docs) signal_sources: db_chunks: int page_refs: int cross_refs: int signal_strength: strong | weak | orphan | unverified referenced_by: [[class/id]] # cross-entity backlinks Outgoing wikilinks (e.g. OBJ.observed_in_event → EV) count toward the entity's own cross_refs so anchored-but-not-mentioned entities don't register as orphan. OBJ canonical names like "7m long, 1.3m high, two rocket motors, smooth flow, rotary drive null UAP (OBJ-EV1945-PEYERLSHOTDOWN-01)" are rewritten to "Peyerl shot down UAP" derived from observed_in_event, preserving the full description as an alias. --fix-obj-names did this for every OBJ-* with >80 char canonical_name. Default behaviour is conservative: --archive-only-junk archives only single/double-char names and pure-numeric noise. Everything else stays on disk with signal_strength marked, so the user can review later. web/lib/retrieval/entity-pages.ts swapped from db-first to yaml-first. The /e/[cls]/[id] page now reads counts straight from YAML and renders a "força do sinal" badge with the per-source breakdown. Orphan entities get a banner explaining they have no cross-references. DB is still queried for ONE thing: the chunk text for preview cards on the entity page, so we don't re-parse 21k markdown files on every render. 
First-pass result: 9020 strong / 14520 weak / 10814 orphan; OBJ-EV1945-PEYERLSHOTDOWN-01 now reads "Peyerl shot down UAP · fraca · 1 backlink" in the live UI.
2026-05-18 19:49:31 -03:00 · 2026-05-18 19:49:31 -03:00 · 291748df63
commit 291748df63
parent c0c6652dd5
3 changed files with 698 additions and 59 deletions
--- a/scripts/maintain/42_sync_entity_stats.py
+++ b/scripts/maintain/42_sync_entity_stats.py
@ -0,0 +1,455 @@
+#!/usr/bin/env python3
+"""
+42_sync_entity_stats.py — Bulletproof sync of every entity's reverse-reference
+signals.
+
+Three independent signal sources exist for an entity. Until now the UI used
+only one of them and showed "0 menções" whenever the others disagreed. This
+script rebuilds them all in a single pass:
+
+  1. wiki_page_refs   — pages whose entities_extracted[] lists this entity.
+                        Materialised back into the entity's mentioned_in[].
+
+  2. db_chunk_mentions — count of rows in public.entity_mentions whose
+                        chunk_pk matches a chunk that textually contains the
+                        entity (ILIKE on canonical_name + aliases). Source of
+                        truth for chat / search retrieval.
+
+  3. cross_entity_refs — reverse-links discovered by traversing other entity
+                        YAMLs: an event's uap_objects[] / observers[] /
+                        organizations_involved[]; a location's events_here[];
+                        a document's key_entities[].
+
+After scanning, each entity's frontmatter is rewritten with:
+
+    mentioned_in:        [...]   # the page refs (canonical, not generated noise)
+    total_mentions:      <int>   # max(db_chunk_mentions, len(mentioned_in))
+    documents_count:     <int>   # distinct docs across both signals
+    signal_sources:
+      db_chunks:         <int>
+      page_refs:         <int>
+      cross_refs:        <int>
+    signal_strength:     strong | weak | orphan
+    last_lint:           <utc>
+
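+When all three signals are zero the entity is marked signal_strength: orphan
+and kept on disk. It is moved to wiki/entities/_archived/<folder>/<id>.md only
+when --archive is passed (or --archive-only-junk, for orphans with junk
+names). Every non-dry run appends one summary block to wiki/log.md.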
+
+Idempotent: re-running converges. Safe to interrupt — writes are atomic.
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import re
+import shutil
+import sys
+import unicodedata
+from collections import defaultdict
+from datetime import datetime, timezone
+from pathlib import Path
+
+try:
+    import yaml
+    import psycopg
+except ImportError as e:
+    sys.stderr.write(f"pip3 install pyyaml psycopg[binary]  # missing: {e}\n")
+    sys.exit(1)
+
+
+UFO_ROOT = Path(__file__).resolve().parents[2]
+ENTITIES_BASE = UFO_ROOT / "wiki" / "entities"
+ARCHIVED_BASE = UFO_ROOT / "wiki" / "entities" / "_archived"
+PAGES_BASE   = UFO_ROOT / "wiki" / "pages"
+DOCS_BASE    = UFO_ROOT / "wiki" / "documents"
+LOG_PATH     = UFO_ROOT / "wiki" / "log.md"
+
+DATABASE_URL = os.getenv("DATABASE_URL") or os.getenv("SUPABASE_DB_URL")
+
+# Map plural folder names to the entity_class singular used in DB
+FOLDER_TO_CLASS = {
+    "people":        "person",
+    "organizations": "organization",
+    "locations":     "location",
+    "events":        "event",
+    "uap-objects":   "uap_object",
+    "vehicles":      "vehicle",
+    "operations":    "operation",
+    "concepts":      "concept",
+}
+CLASS_TO_FOLDER = {v: k for k, v in FOLDER_TO_CLASS.items()}
+
+ID_FIELD_BY_CLASS = {
+    "person":       "person_id",
+    "organization": "organization_id",
+    "location":     "location_id",
+    "event":        "event_id",
+    "uap_object":   "uap_object_id",
+    "vehicle":      "vehicle_id",
+    "operation":    "operation_id",
+    "concept":      "concept_id",
+}
+
+# Cross-entity fields that contain wikilinks pointing TO another entity.
+CROSS_REF_FIELDS = {
+    "event":      ["uap_objects", "observers", "organizations_involved",
+                   "vehicles_involved", "witnesses_analyses", "preceded_by",
+                   "followed_by", "related_events", "documented_in",
+                   "primary_location"],
+    "location":   ["events_here"],
+    "uap_object": ["observed_in_event", "secondary_events"],
+    "operation":  ["documents"],
+    "document":   ["key_entities", "key_events"],
+}
+
+WIKILINK_RE = re.compile(r"\[\[([^\]|]+?)(?:\|[^\]]+)?\]\]")
+
+
+def canonicalize_name(name: str) -> str:
+    """Fold a display name into a kebab-case id.
+
+    Steps: NFKD-normalise and drop combining marks, lowercase, turn every
+    character outside [a-z0-9-] into a dash, collapse dash runs, trim edge
+    dashes, and prefix "x-" when the result starts with a digit. Falsy input
+    yields "". (The original note says this matches 03-dedup-entities.py.)
+    """
+    if not name:
+        return ""
+    decomposed = unicodedata.normalize("NFKD", str(name))
+    folded = "".join(ch for ch in decomposed if not unicodedata.combining(ch)).lower()
+    slug = re.sub(r"-+", "-", re.sub(r"[^a-z0-9-]", "-", folded)).strip("-")
+    return "x-" + slug if slug and slug[0].isdigit() else slug
+
+
+def utc_iso() -> str:
+    """Current UTC time as a second-resolution timestamp ending in "Z"."""
+    now_utc = datetime.now(timezone.utc)
+    return now_utc.strftime("%Y-%m-%dT%H:%M:%SZ")
+
+
+def read_md(path: Path) -> tuple[dict, str]:
+    """Split a markdown file into (frontmatter mapping, body).
+
+    Returns ({}, raw_text) when the file has no leading "---" fence, when the
+    closing fence is missing, when the YAML fails to parse, or when the YAML
+    is not a mapping. Callers can therefore always use `fm.get(...)`.
+    Leading newlines are stripped from the returned body.
+    """
+    raw = path.read_text(encoding="utf-8")
+    if not raw.startswith("---"):
+        return {}, raw
+    first_nl = raw.find("\n")
+    if first_nl == -1:
+        return {}, raw
+    # The closing fence must be a line of its own. A bare substring search
+    # would stop at "---" embedded in a YAML value, and a missing fence (-1)
+    # would silently slice garbage.
+    closing = re.compile(r"^---[ \t]*\r?$", re.MULTILINE).search(raw, first_nl + 1)
+    if closing is None:
+        return {}, raw
+    try:
+        fm = yaml.safe_load(raw[first_nl + 1 : closing.start()]) or {}
+    except yaml.YAMLError:
+        return {}, raw
+    if not isinstance(fm, dict):
+        # Frontmatter that parses to a list/scalar is unusable as metadata.
+        return {}, raw
+    body = raw[closing.end() :].lstrip("\n")
+    return fm, body
+
+
+def write_md(path: Path, fm: dict, body: str) -> None:
+    """Serialise frontmatter + body and swap the result into place.
+
+    The text is written to a sibling "<name>.tmp" file first and then renamed
+    over `path`, so `path` is replaced in a single step rather than being
+    rewritten in place.
+    """
+    frontmatter = yaml.dump(fm, allow_unicode=True, sort_keys=False, default_flow_style=False)
+    # Keep a blank line between the closing fence and the body.
+    gap = "\n" if not body.startswith("\n") else ""
+    scratch = path.with_suffix(path.suffix + ".tmp")
+    scratch.write_text("---\n" + frontmatter + "---\n" + gap + body, encoding="utf-8")
+    scratch.replace(path)
+
+
+# Wikilink prefix → entity_class. Built once at import time because
+# parse_wikilink_target runs for every link of every entity.
+_WIKILINK_PREFIX_TO_CLASS = {
+    "people": "person", "person": "person",
+    "org": "organization", "organization": "organization", "organizations": "organization",
+    "loc": "location", "location": "location", "locations": "location",
+    "event": "event", "events": "event",
+    "uap": "uap_object", "uap_object": "uap_object", "uap-objects": "uap_object",
+    "vehicle": "vehicle", "vehicles": "vehicle",
+    "op": "operation", "operation": "operation", "operations": "operation",
+    "concept": "concept", "concepts": "concept",
+}
+
+
+def parse_wikilink_target(s: str) -> tuple[str | None, str | None]:
+    """Resolve "[[prefix/id]]" (or a bare "prefix/id") to (entity_class, id).
+
+    The prefix may be a singular class name, a plural folder name or a short
+    alias ("org", "loc", "uap", "op"); matching is case-insensitive. A
+    "|label" suffix inside the wikilink is ignored. Returns (None, None) for
+    non-string or empty input, targets without "/", and unknown prefixes.
+    """
+    if not s or not isinstance(s, str):
+        return None, None
+    m = WIKILINK_RE.search(s)
+    target = m.group(1).strip() if m else s.strip()
+    prefix, slash, ident = target.partition("/")
+    if not slash:
+        return None, None
+    cls = _WIKILINK_PREFIX_TO_CLASS.get(prefix.strip().lower())
+    return (cls, ident.strip()) if cls else (None, None)
+
+
+def collect_page_refs() -> dict[tuple[str, str], set[str]]:
+    """
+    Build the page → entity reverse index from wiki/pages/.
+
+    Every file matching p*.md under PAGES_BASE is read; its
+    `entities_extracted` mapping (folder name → list of entries) is walked and
+    the page id "<parent-dir>/<file-stem>" is recorded against each entity.
+
+    Returns {(class, id): {page_id, ...}}.
+    """
+    refs: dict[tuple[str, str], set[str]] = defaultdict(set)
+    for page_path in PAGES_BASE.rglob("p*.md"):
+        try:
+            fm, _ = read_md(page_path)
+        except Exception:
+            continue
+        extracted = fm.get("entities_extracted") or {}
+        if not isinstance(extracted, dict):
+            continue
+        # page_id like "doc-abc/p007"
+        page_id = f"{page_path.parent.name}/{page_path.stem}"
+        for folder, entries in extracted.items():
+            cls = FOLDER_TO_CLASS.get(folder)
+            if not cls or not isinstance(entries, list):
+                continue
+            for entry in entries:
+                # An entry is a plain id / wikilink string, or a dict whose id
+                # is taken from `id`, the class-specific id field, or the
+                # canonicalised `name`. Anything else is ignored.
+                if isinstance(entry, str):
+                    _, linked_id = parse_wikilink_target(entry)
+                    eid = linked_id or canonicalize_name(entry)
+                    alias_names = []
+                elif isinstance(entry, dict):
+                    eid = (entry.get("id")
+                           or entry.get(ID_FIELD_BY_CLASS.get(cls, "id"))
+                           or canonicalize_name(entry.get("name", "")))
+                    alias_names = entry.get("aliases") or []
+                else:
+                    continue
+                if not eid:
+                    continue
+                refs[(cls, eid)].add(page_id)
+                # Index the page under each alias id too, so an entity stored
+                # under an alias-derived id still picks up the reference.
+                for alias in alias_names:
+                    alias_id = canonicalize_name(alias)
+                    if alias_id and alias_id != eid:
+                        refs[(cls, alias_id)].add(page_id)
+    return refs
+
+
+def collect_cross_refs() -> dict[tuple[str, str], set[str]]:
+    """
+    Build the entity → entity (and document → entity) backlink index.
+
+    For every entity file, each field listed in CROSS_REF_FIELDS for its class
+    is scanned for wikilinks; a link from X to Y registers "class/X" under Y.
+    Document files under DOCS_BASE are scanned the same way using
+    CROSS_REF_FIELDS["document"], so key_events is honoured alongside
+    key_entities.
+
+    Returns {(target_class, target_id): {"<source_class>/<source_id>", ...}}.
+    """
+    refs: dict[tuple[str, str], set[str]] = defaultdict(set)
+
+    def register(fm: dict, fields: list[str], source: str) -> None:
+        # Record `source` as a backlink of every parseable target in `fields`.
+        for field in fields:
+            val = fm.get(field)
+            # A scalar value is treated as a one-item list, never iterated
+            # character by character.
+            items = val if isinstance(val, list) else ([val] if val else [])
+            for item in items:
+                tgt_cls, tgt_id = parse_wikilink_target(item if isinstance(item, str) else str(item))
+                if tgt_cls and tgt_id:
+                    refs[(tgt_cls, tgt_id)].add(source)
+
+    for folder, cls in FOLDER_TO_CLASS.items():
+        cls_dir = ENTITIES_BASE / folder
+        if not cls_dir.is_dir():
+            continue
+        for ent_path in cls_dir.glob("*.md"):
+            try:
+                fm, _ = read_md(ent_path)
+            except Exception:
+                continue
+            if not isinstance(fm, dict):
+                continue
+            self_id = fm.get(ID_FIELD_BY_CLASS.get(cls)) or ent_path.stem
+            register(fm, CROSS_REF_FIELDS.get(cls, []), f"{cls}/{self_id}")
+
+    # Documents are not entities themselves, but their key_* lists anchor
+    # entities just like entity-to-entity links do.
+    doc_fields = CROSS_REF_FIELDS.get("document", ["key_entities"])
+    for doc_path in DOCS_BASE.glob("*.md"):
+        try:
+            fm, _ = read_md(doc_path)
+        except Exception:
+            continue
+        if not isinstance(fm, dict):
+            continue
+        register(fm, doc_fields, f"document/{doc_path.stem}")
+    return refs
+
+
+def collect_db_mentions(conn) -> dict[tuple[str, str], tuple[int, int]]:
+    """Read per-entity mention totals from the database.
+
+    Runs one aggregate query over public.entities LEFT JOINed to
+    public.entity_mentions and public.chunks, and returns
+    {(entity_class, entity_id): (chunk_count, distinct_doc_count)}.
+    NULL counts are reported as 0.
+    """
+    with conn.cursor() as cur:
+        cur.execute(
+            """
+            SELECT e.entity_class, e.entity_id,
+                   COUNT(em.chunk_pk)::int AS chunks,
+                   COUNT(DISTINCT c.doc_id)::int AS docs
+            FROM public.entities e
+            LEFT JOIN public.entity_mentions em ON em.entity_pk = e.entity_pk
+            LEFT JOIN public.chunks c ON c.chunk_pk = em.chunk_pk
+            GROUP BY e.entity_class, e.entity_id
+            """
+        )
+        rows = cur.fetchall()
+    return {
+        (entity_class, entity_id): (chunk_total or 0, doc_total or 0)
+        for entity_class, entity_id, chunk_total, doc_total in rows
+    }
+
+
+def signal_strength(db_chunks: int, page_refs: int, cross_refs: int) -> str:
+    """Classify an entity from its three signal counts.
+
+    "orphan" — all three counts sum to zero.
+    "strong" — at least 3 DB chunks, or at least 3 page refs, or at least one
+               of each.
+    "weak"   — anything else. Cross refs alone only lift an entity out of
+               "orphan"; they never make it "strong".
+    """
+    if db_chunks + page_refs + cross_refs == 0:
+        return "orphan"
+    plentiful = max(db_chunks, page_refs) >= 3
+    corroborated = db_chunks >= 1 and page_refs >= 1
+    return "strong" if (plentiful or corroborated) else "weak"
+
+
+def archive_entity(path: Path, dry_run: bool, archived_count: list[int]) -> None:
+    """Move an entity file to the same relative location under ARCHIVED_BASE.
+
+    `archived_count` is a one-element list used as a mutable counter; it is
+    incremented even on a dry run, in which case nothing is moved.
+    """
+    destination = ARCHIVED_BASE / path.relative_to(ENTITIES_BASE)
+    archived_count[0] += 1
+    if not dry_run:
+        destination.parent.mkdir(parents=True, exist_ok=True)
+        shutil.move(str(path), str(destination))
+
+
+def main() -> int:
+    """CLI entry point: recompute and persist signal stats for every entity.
+
+    Flow: parse flags, collect page refs, cross refs and DB counts, then walk
+    each entity file, recompute its stats, optionally shorten OBJ names
+    (--fix-obj-names), optionally archive (--archive / --archive-only-junk),
+    and write the frontmatter back unless --dry-run. A summary is printed
+    and, on non-dry runs that touched anything, appended to wiki/log.md.
+
+    Returns 0 on success, 1 when no database URL is configured.
+    """
+    p = argparse.ArgumentParser()
+    p.add_argument("--dry-run", action="store_true")
+    p.add_argument("--archive", action="store_true",
+                   help="actually move orphans to wiki/entities/_archived/. "
+                        "By default we only mark them — data is never lost.")
+    p.add_argument("--archive-only-junk", action="store_true",
+                   help="archive ONLY entities whose canonical_name is <=3 chars, "
+                        "purely numeric, or matches obvious junk patterns")
+    p.add_argument("--fix-obj-names", action="store_true",
+                   help="rewrite OBJ-* canonical_name to '<event> UAP', "
+                        "moving the full shape description to aliases")
+    p.add_argument("--verbose", action="store_true")
+    args = p.parse_args()
+
+    print(f"scanning {ENTITIES_BASE} ...")
+    if not DATABASE_URL:
+        sys.stderr.write("DATABASE_URL not set — cannot read DB mentions\n")
+        return 1
+
+    print("collecting page refs from wiki/pages/ ...")
+    page_refs = collect_page_refs()
+    print(f"  {len(page_refs)} entities referenced from {sum(len(v) for v in page_refs.values())} page rows")
+
+    print("collecting cross-entity refs ...")
+    cross_refs = collect_cross_refs()
+    print(f"  {len(cross_refs)} entities back-linked")
+
+    print("reading DB entity_mentions ...")
+    with psycopg.connect(DATABASE_URL) as conn:
+        db_counts = collect_db_mentions(conn)
+    print(f"  {len(db_counts)} entities in DB")
+
+    # Walk every entity YAML on disk
+    archived_count = [0]
+    stats = {"strong": 0, "weak": 0, "orphan": 0, "updated": 0, "skipped": 0}
+
+    for folder, cls in FOLDER_TO_CLASS.items():
+        cls_dir = ENTITIES_BASE / folder
+        if not cls_dir.is_dir():
+            continue
+        for ent_path in cls_dir.glob("*.md"):
+            try:
+                fm, body = read_md(ent_path)
+            except Exception:
+                stats["skipped"] += 1
+                continue
+            if not fm:
+                stats["skipped"] += 1
+                continue
+            id_field = ID_FIELD_BY_CLASS.get(cls)
+            eid = fm.get(id_field) or ent_path.stem
+            key = (cls, eid)
+
+            db_chunks, db_docs = db_counts.get(key, (0, 0))
+            page_list = sorted(page_refs.get(key, set()))
+            cross_list = sorted(cross_refs.get(key, set()))
+
+            # Also count this entity's OWN outgoing wikilinks as signal —
+            # if an OBJ has observed_in_event pointing to a real event, the
+            # OBJ is anchored even when no one links back to it.
+            own_outgoing: set[str] = set()
+            for field in CROSS_REF_FIELDS.get(cls, []):
+                val = fm.get(field)
+                items = val if isinstance(val, list) else ([val] if val else [])
+                for item in items:
+                    tgt_cls, tgt_id = parse_wikilink_target(
+                        item if isinstance(item, str) else str(item))
+                    if tgt_cls and tgt_id:
+                        own_outgoing.add(f"{tgt_cls}/{tgt_id}")
+
+            all_cross = sorted(set(cross_list) | own_outgoing)
+            strength = signal_strength(db_chunks, len(page_list), len(all_cross))
+
+            stats[strength] += 1
+
+            # Optional: clean up OBJ entities whose canonical_name is a 100-char
+            # shape description plus the ID in parentheses. Move the description
+            # to an alias and pick a short readable name from the linked event.
+            if args.fix_obj_names and cls == "uap_object":
+                cn = str(fm.get("canonical_name") or "")
+                if len(cn) > 80 and "UAP" in cn and "(" in cn and cn.endswith(")"):
+                    obs_event = fm.get("observed_in_event")
+                    event_cls, event_id = parse_wikilink_target(obs_event or "")
+                    if event_cls == "event" and event_id:
+                        # Strip the "EV-YYYY-MM-DD-" prefix to get a slug
+                        slug = re.sub(r"^EV-\d{4}-[\dX]{2}-[\dX]{2}-", "", event_id)
+                        new_name = slug.replace("-", " ").strip() or eid
+                        new_name = new_name[:1].upper() + new_name[1:] + " UAP"
+                        aliases = list(fm.get("aliases") or [])
+                        if cn not in aliases:
+                            aliases.insert(0, cn)
+                        fm["canonical_name"] = new_name
+                        fm["aliases"] = aliases
+
+            # Mutate frontmatter — preserve unrelated keys.
+            fm["mentioned_in"] = [f"[[{p}]]" for p in page_list]
+            fm["total_mentions"] = max(db_chunks, len(page_list))
+            fm["documents_count"] = max(db_docs, len({p.split("/", 1)[0] for p in page_list}))
+            fm["signal_sources"] = {
+                "db_chunks":  int(db_chunks),
+                "page_refs":  len(page_list),
+                "cross_refs": len(all_cross),
+            }
+            # referenced_by is capped at 25 entries; signal_sources.cross_refs
+            # above still carries the full count.
+            if all_cross:
+                fm["referenced_by"] = [f"[[{r}]]" for r in all_cross[:25]]
+            elif "referenced_by" in fm:
+                del fm["referenced_by"]
+            fm["signal_strength"] = strength
+            fm["last_lint"] = utc_iso()
+
+            # Optional archive paths — by default we KEEP everything, only mark.
+            # Archived files are moved as-is; the refreshed frontmatter is not
+            # written to them.
+            if strength == "orphan" and args.archive:
+                archive_entity(ent_path, args.dry_run, archived_count)
+                continue
+            if args.archive_only_junk:
+                cn = str(fm.get("canonical_name") or "").strip()
+                cn_id = cn.lower()
+                is_junk = (
+                    len(cn) <= 3
+                    or re.fullmatch(r"[0-9.()-]+", cn) is not None
+                    or cn_id in {"unknown", "none", "n/a", "na", "-", "—"}
+                )
+                if is_junk and strength == "orphan":
+                    archive_entity(ent_path, args.dry_run, archived_count)
+                    continue
+
+            stats["updated"] += 1
+            if args.verbose:
+                # Report the same cross-ref count that was classified and
+                # stored (incoming + own outgoing), not just incoming links.
+                print(f"  {strength:7}  {cls}/{eid}  db={db_chunks} pages={len(page_list)} cross={len(all_cross)}")
+            if not args.dry_run:
+                write_md(ent_path, fm, body)
+
+    print()
+    print(f"  strong:    {stats['strong']:>6}")
+    print(f"  weak:      {stats['weak']:>6}")
+    print(f"  orphan:    {stats['orphan']:>6}  (archived: {archived_count[0]})")
+    print(f"  updated:   {stats['updated']:>6}")
+    print(f"  skipped:   {stats['skipped']:>6}")
+    print(f"  dry-run:   {args.dry_run}")
+
+    if not args.dry_run and (stats["updated"] > 0 or archived_count[0] > 0):
+        LOG_PATH.parent.mkdir(parents=True, exist_ok=True)
+        with LOG_PATH.open("a", encoding="utf-8") as f:
+            f.write(
+                f"\n## {utc_iso()} · SYNC_ENTITY_STATS\n"
+                f"- script: scripts/maintain/42_sync_entity_stats.py\n"
+                f"- strong: {stats['strong']}\n"
+                f"- weak:   {stats['weak']}\n"
+                f"- orphan: {stats['orphan']} (archived: {archived_count[0]})\n"
+                f"- updated: {stats['updated']}\n"
+            )
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
--- a/web/app/e/[cls]/[id]/page.tsx
+++ b/web/app/e/[cls]/[id]/page.tsx
@ -79,10 +79,10 @@ export default async function EntityPage({

  const entityClassSingular = CLASS_TO_SINGULAR[folder as string] ?? folder;

-  // 1. DB first — live counts
+  // YAML-first: every count comes from the entity's frontmatter (kept in sync
+  // by scripts/maintain/42_sync_entity_stats.py). The DB is consulted ONLY for
+  // chunk previews, not for counts.
  const core = await getEntityCore(entityClassSingular, id).catch(() => null);
-
-  // 2. Wiki fallback — narrative body, aliases (Haiku stub OK)
  const wiki = await readEntity(folder as EntityClass, id);
  if (!core && !wiki) notFound();

@ -91,9 +91,8 @@ export default async function EntityPage({
    (a) => a !== canonical,
  );

-  // 3. Live data per-doc grouping
  const mentionGroups = core
-    ? await getEntityMentionsByDoc(core.entity_pk, 100).catch(() => [])
+    ? await getEntityMentionsByDoc(entityClassSingular, id, 100).catch(() => [])
    : [];
  const sampleChunks = core
    ? await getEntityChunks(core.entity_pk, 12).catch(() => [])
@ -101,6 +100,8 @@ export default async function EntityPage({

  const totalMentions = core?.total_mentions ?? 0;
  const documentsCount = core?.documents_count ?? 0;
+  const strength = core?.signal_strength ?? "unverified";
+  const sigs = core?.signal_sources ?? { db_chunks: 0, page_refs: 0, cross_refs: 0 };

  const classColor = CLASS_COLOR[folder as EntityClass];
  const classBg = CLASS_BG[folder as EntityClass];
@ -164,7 +165,49 @@ export default async function EntityPage({
              <div className="font-mono text-sm text-[#a78bfa] mt-0.5">{core.enrichment_status}</div>
            </div>
          )}
+          <div
+            className={`px-4 py-3 bg-[#0a121e] border rounded ${
+              strength === "strong"
+                ? "border-[#00ff9c]"
+                : strength === "weak"
+                  ? "border-[#ffa500]"
+                  : strength === "orphan"
+                    ? "border-[#ff6b6b]"
+                    : "border-[#5a6678]"
+            }`}
+            title="Cruzamento dos 3 sinais que confirmam esta entidade no corpus."
+          >
+            <div className="font-mono text-[10px] uppercase tracking-widest text-[#5a6678]">
+              força do sinal
+            </div>
+            <div
+              className={`font-mono text-sm mt-0.5 ${
+                strength === "strong"
+                  ? "text-[#00ff9c]"
+                  : strength === "weak"
+                    ? "text-[#ffa500]"
+                    : strength === "orphan"
+                      ? "text-[#ff6b6b]"
+                      : "text-[#8896aa]"
+              }`}
+            >
+              {strength === "strong" && "forte"}
+              {strength === "weak" && "fraca"}
+              {strength === "orphan" && "órfã"}
+              {strength === "unverified" && "não verificada"}
+            </div>
+            <div className="font-mono text-[9px] text-[#5a6678] mt-1 leading-tight">
+              {sigs.db_chunks} chunks · {sigs.page_refs} págs · {sigs.cross_refs} backlinks
+            </div>
+          </div>
        </div>
+
+        {strength === "orphan" && (
+          <p className="mt-4 text-xs text-[#ff6b6b] font-mono">
+            ⚠ entidade não confirmada: nenhuma página, chunk ou outra entidade aponta para
+            ela. Pode ser extração ruidosa do pipeline original.
+          </p>
+        )}
      </header>

      <div className="grid grid-cols-1 lg:grid-cols-[1fr_320px] gap-8">
--- a/web/lib/retrieval/entity-pages.ts
+++ b/web/lib/retrieval/entity-pages.ts
@ -1,19 +1,145 @@
 /**
- * Live entity data queries — replaces stale Haiku-era frontmatter `mentioned_in[]`
- * with real counts from `public.entity_mentions` + `public.chunks`.
+ * Entity page data — SINGLE SOURCE OF TRUTH is the YAML frontmatter on disk.
+ *
+ * Why YAML and not the DB? Because the corpus has TWO independent extraction
+ * layers (Haiku page-level, Sonnet chunk-level) and each catches a different
+ * subset of entities. The DB's entity_mentions table is one of those signals —
+ * useful for chat retrieval but incomplete for the entity catalog itself.
+ *
+ * Reading from disk lets us merge every signal into one stat (`total_mentions`)
+ * via the maintain/42_sync_entity_stats.py pipeline and serve consistent
+ * numbers everywhere in the UI.
+ *
+ * The DB is still queried for ONE thing: the actual chunk text for previews,
+ * because we don't want to re-parse 21k chunk files on every page render.
 */
+import fs from "node:fs/promises";
+import path from "node:path";
+import matter from "gray-matter";
 import { pgQuery } from "./db";
-import { findEntity } from "./graph";
+import { WIKI } from "@/lib/wiki";
+
+// Singular entity class (the value also used for the entity_class DB lookup
+// in this module) → plural folder name under <WIKI>/entities/.
+const FOLDER_BY_CLASS: Record<string, string> = {
+  person: "people",
+  organization: "organizations",
+  location: "locations",
+  event: "events",
+  uap_object: "uap-objects",
+  vehicle: "vehicles",
+  operation: "operations",
+  concept: "concepts",
+};

+/**
+ * Entity card as served to the entity page. Every field except `entity_pk`
+ * is read from the entity's YAML frontmatter by getEntityCore; missing or
+ * malformed values fall back to 0, [], null or "unverified".
+ */
 export interface EntityCore {
-  entity_pk: number;
+  entity_pk: number | null; // db-side primary key; null if entity is wiki-only
   entity_class: string;
   entity_id: string;
   canonical_name: string;
-  aliases: string[] | null;
+  aliases: string[];
   total_mentions: number;
   documents_count: number;
+  // One of the values the sync script writes, or "unverified" when the
+  // frontmatter has no recognised signal_strength.
+  signal_strength: "strong" | "weak" | "orphan" | "unverified";
+  // Per-source breakdown behind signal_strength.
+  signal_sources: {
+    db_chunks: number;
+    page_refs: number;
+    cross_refs: number;
+  };
+  mentioned_in: string[];      // [[doc-id/p007]]
+  referenced_by: string[];     // [[class/id]] cross-links
   enrichment_status: string | null;
+  narrative_summary: string | null;
+  narrative_summary_pt_br: string | null;
+  summary_status: string | null;
+}
+
+// Untyped frontmatter as parsed from disk; every value must be narrowed
+// (see num / arr / strOrNull) before use.
+interface RawFm {
+  [k: string]: unknown;
+}
+
+/**
+ * Coerce a loosely-typed frontmatter value to a finite number.
+ *
+ * Accepts finite numbers and numeric strings. Everything else — including
+ * NaN/Infinity, blank strings (which `Number("")` would turn into 0) and
+ * non-numeric strings — yields `fallback`.
+ */
+function num(v: unknown, fallback = 0): number {
+  if (typeof v === "number") return Number.isFinite(v) ? v : fallback;
+  if (typeof v === "string" && v.trim() !== "") {
+    const parsed = Number(v);
+    if (Number.isFinite(parsed)) return parsed;
+  }
+  return fallback;
+}
+
+/** Keep only the string members of `v`; anything that is not an array yields []. */
+function arr(v: unknown): string[] {
+  return Array.isArray(v)
+    ? v.filter((item): item is string => typeof item === "string")
+    : [];
+}
+
+/** Return `v` unchanged when it is a non-blank string, otherwise null. */
+function strOrNull(v: unknown): string | null {
+  if (typeof v !== "string") return null;
+  return v.trim().length > 0 ? v : null;
+}
+
+/**
+ * Read and parse the frontmatter of <WIKI>/entities/<folder>/<entityId>.md.
+ *
+ * Returns null when the class is unknown, when `entityId` would resolve to a
+ * file outside the class folder, or when the file cannot be read or parsed.
+ */
+async function readEntityYaml(entityClass: string, entityId: string): Promise<RawFm | null> {
+  const folder = FOLDER_BY_CLASS[entityClass];
+  // typeof guard: keys such as "constructor" resolve to inherited
+  // non-string values on a plain object.
+  if (typeof folder !== "string" || !folder) return null;
+  // entityId comes from the request route. Resolve it and require the file
+  // to sit directly inside the class folder so "../" segments cannot escape.
+  const baseDir = path.resolve(WIKI, "entities", folder);
+  const filePath = path.resolve(baseDir, `${entityId}.md`);
+  if (path.dirname(filePath) !== baseDir) return null;
+  try {
+    const raw = await fs.readFile(filePath, "utf-8");
+    return matter(raw).data as RawFm;
+  } catch {
+    return null;
+  }
+}
+
+/**
+ * Build the entity card for one entity from its YAML frontmatter.
+ *
+ * Resolves to null when readEntityYaml finds nothing for this class/id.
+ * The DB is consulted only to look up `entity_pk`; a failed or empty lookup
+ * leaves it null rather than failing the whole card.
+ */
+export async function getEntityCore(
+  entityClass: string,
+  entityId: string,
+): Promise<EntityCore | null> {
+  const fm = await readEntityYaml(entityClass, entityId);
+  if (!fm) return null;
+
+  // Best-effort primary-key lookup so chunk previews can still be queried.
+  let entity_pk: number | null = null;
+  try {
+    const rows = await pgQuery<{ entity_pk: number }>(
+      `SELECT entity_pk FROM public.entities
+       WHERE entity_class = $1 AND entity_id = $2 LIMIT 1`,
+      [entityClass, entityId],
+    );
+    entity_pk = rows[0]?.entity_pk ?? null;
+  } catch {
+    entity_pk = null;
+  }
+
+  // Anything other than a recognised strength value degrades to "unverified".
+  const knownStrengths = ["strong", "weak", "orphan", "unverified"] as const;
+  const rawStrength = fm.signal_strength;
+  const signal_strength: EntityCore["signal_strength"] =
+    knownStrengths.find((candidate) => candidate === rawStrength) ?? "unverified";
+
+  const sigSources = (fm.signal_sources as Record<string, unknown> | undefined) ?? {};
+  const signal_sources = {
+    db_chunks: num(sigSources.db_chunks, 0),
+    page_refs: num(sigSources.page_refs, 0),
+    cross_refs: num(sigSources.cross_refs, 0),
+  };
+
+  const canonical_name =
+    typeof fm.canonical_name === "string" ? fm.canonical_name : entityId;
+
+  return {
+    entity_pk,
+    entity_class: entityClass,
+    entity_id: entityId,
+    canonical_name,
+    aliases: arr(fm.aliases),
+    total_mentions: num(fm.total_mentions, 0),
+    documents_count: num(fm.documents_count, 0),
+    signal_strength,
+    signal_sources,
+    mentioned_in: arr(fm.mentioned_in),
+    referenced_by: arr(fm.referenced_by),
+    enrichment_status: strOrNull(fm.enrichment_status),
+    narrative_summary: strOrNull(fm.narrative_summary),
+    narrative_summary_pt_br: strOrNull(fm.narrative_summary_pt_br),
+    summary_status: strOrNull(fm.summary_status),
+  };
 }

 export interface EntityMentionGroup {
@ -26,55 +152,63 @@ export interface EntityMentionGroup {
  pages: number[];
 }

-export async function getEntityCore(
+/**
+ * Group an entity's page references by document.
+ *
+ * The refs come from the entity's YAML `mentioned_in[]` ("[[doc-id/p007]]",
+ * optionally with a "|label"). Groups are ordered by number of distinct pages
+ * (descending) and cut to `limit`; only the surviving groups are then
+ * enriched, in parallel, with metadata from wiki/documents/<doc-id>.md.
+ * A missing or unreadable document file leaves the metadata fields null.
+ */
+export async function getEntityMentionsByDoc(
   entityClass: string,
   entityId: string,
-): Promise<EntityCore | null> {
-  const rows = await pgQuery<EntityCore>(
-    `SELECT
-       e.entity_pk, e.entity_class, e.entity_id, e.canonical_name, e.aliases,
-       COALESCE(em.mention_count, 0) AS total_mentions,
-       COALESCE(em.doc_count, 0) AS documents_count,
-       e.enrichment_status
-     FROM public.entities e
-     LEFT JOIN (
-       SELECT em.entity_pk,
-              COUNT(*)::INT AS mention_count,
-              COUNT(DISTINCT c.doc_id)::INT AS doc_count
-       FROM public.entity_mentions em
-       JOIN public.chunks c ON c.chunk_pk = em.chunk_pk
-       GROUP BY em.entity_pk
-     ) em ON em.entity_pk = e.entity_pk
-     WHERE e.entity_class = $1 AND e.entity_id = $2
-     LIMIT 1`,
-    [entityClass, entityId],
-  );
-  return rows[0] ?? null;
-}
-
-/** Group mentions per document so the sidebar can list "appears in N docs". */
-export async function getEntityMentionsByDoc(
-  entityPk: number,
-  limit: number = 50,
+  limit = 100,
 ): Promise<EntityMentionGroup[]> {
-  return pgQuery<EntityMentionGroup>(
-    `SELECT
-       c.doc_id,
-       d.canonical_title,
-       d.collection,
-       d.page_count,
-       d.classification,
-       COUNT(*)::INT AS mention_count,
-       array_agg(DISTINCT c.page ORDER BY c.page) AS pages
-     FROM public.entity_mentions em
-     JOIN public.chunks c ON c.chunk_pk = em.chunk_pk
-     LEFT JOIN public.documents d ON d.doc_id = c.doc_id
-     WHERE em.entity_pk = $1
-     GROUP BY c.doc_id, d.canonical_title, d.collection, d.page_count, d.classification
-     ORDER BY mention_count DESC
-     LIMIT $2`,
-    [entityPk, limit],
-  );
+  const fm = await readEntityYaml(entityClass, entityId);
+  if (!fm) return [];
+
+  // Strip the wikilink delimiters and any "|label" suffix.
+  const wikilink = /\[\[([^\]|]+?)(?:\|[^\]]*)?\]\]/;
+  const byDoc = new Map<string, Set<number>>();
+  for (const ref of arr(fm.mentioned_in)) {
+    const m = ref.match(wikilink);
+    const target = (m?.[1] ?? ref).trim();
+    const [docId, pageStr] = target.split("/", 2);
+    if (!docId) continue;
+    let pages = byDoc.get(docId);
+    if (!pages) {
+      pages = new Set();
+      byDoc.set(docId, pages);
+    }
+    const pageNum = pageStr ? parseInt(pageStr.replace(/^p/, ""), 10) : NaN;
+    if (Number.isFinite(pageNum)) pages.add(pageNum);
+  }
+
+  // Rank and truncate BEFORE touching the disk: ordering depends only on the
+  // page counts, so documents that fall outside `limit` are never read.
+  const ranked = Array.from(byDoc, ([docId, pages]) => ({
+    docId,
+    pages: Array.from(pages).sort((a, b) => a - b),
+  }))
+    .sort((a, b) => b.pages.length - a.pages.length)
+    .slice(0, limit);
+
+  // Hydrate each surviving doc's metadata from wiki/documents/<doc-id>.md.
+  // Each task handles its own read error, so Promise.all cannot reject.
+  return Promise.all(
+    ranked.map(async ({ docId, pages }): Promise<EntityMentionGroup> => {
+      let canonical_title: string | null = null;
+      let collection: string | null = null;
+      let page_count: number | null = null;
+      let classification: string | null = null;
+      try {
+        const docRaw = await fs.readFile(
+          path.join(WIKI, "documents", `${docId}.md`),
+          "utf-8",
+        );
+        const dfm = matter(docRaw).data as Record<string, unknown>;
+        canonical_title = strOrNull(dfm.canonical_title);
+        collection = strOrNull(dfm.collection);
+        page_count = num(dfm.page_count, 0) || null;
+        classification = strOrNull(dfm.highest_classification);
+      } catch {
+        /* doc missing — use raw id */
+      }
+      return {
+        doc_id: docId,
+        canonical_title,
+        collection,
+        page_count,
+        classification,
+        mention_count: pages.length,
+        pages,
+      };
+    }),
+  );
 }

 export interface EntityChunkPreview {
@ -91,10 +225,16 @@ export interface EntityChunkPreview {
  ufo_anomaly_type: string | null;
 }

+/**
+ * Top chunks that textually mention this entity. Reads from DB because
+ * chunk content is big (we don't re-parse files at request time). Returns []
+ * if the entity isn't indexed in the DB.
+ */
 export async function getEntityChunks(
-  entityPk: number,
-  limit: number = 30,
+  entityPk: number | null,
+  limit = 30,
 ): Promise<EntityChunkPreview[]> {
+  if (entityPk == null) return [];
  return pgQuery<EntityChunkPreview>(
    `SELECT
       c.chunk_pk, c.doc_id, c.chunk_id, c.page, c.type, c.bbox, c.classification,
@ -108,4 +248,5 @@ export async function getEntityChunks(
  );
 }

-export { findEntity };
+// Backwards-compat for callers that imported findEntity from the old path.
+export { findEntity } from "./graph";