From 291748df63eea1855d3a80938e4729b05b6d6784 Mon Sep 17 00:00:00 2001 From: guto Date: Mon, 18 May 2026 19:49:31 -0300 Subject: [PATCH] sanitize entities: single YAML source of truth, signal_strength badge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The corpus had two parallel reverse-reference signals: the wiki/pages entities_extracted blocks (Haiku page-level) and public.entity_mentions (Sonnet chunk-level, ILIKE-matched). The entity page only consulted the DB, so it showed "0 menções" for thousands of entities that were anchored in pages or in cross-entity links the DB never indexed. Resolved by collapsing all signals into the YAML frontmatter, which is now the single runtime source for entity metadata. scripts/maintain/42_sync_entity_stats.py walks every entity and writes: mentioned_in: [...] # consolidated page refs total_mentions: max(db, pages) documents_count: max(db_docs, distinct page docs) signal_sources: db_chunks: int page_refs: int cross_refs: int signal_strength: strong | weak | orphan | unverified referenced_by: [[class/id]] # cross-entity backlinks Outgoing wikilinks (e.g. OBJ.observed_in_event → EV) count toward the entity's own cross_refs so anchored-but-not-mentioned entities don't register as orphan. The script itself only ever writes strong, weak or orphan; unverified is what the web reader shows when the field is missing or unrecognised. OBJ canonical names like "7m long, 1.3m high, two rocket motors, smooth flow, rotary drive null UAP (OBJ-EV1945-PEYERLSHOTDOWN-01)" are rewritten to "Peyerl shot down UAP" derived from observed_in_event, preserving the full description as an alias. --fix-obj-names did this for every OBJ-* with >80 char canonical_name. Default behaviour is conservative: nothing is archived unless a flag asks for it. --archive moves every orphan to wiki/entities/_archived/; --archive-only-junk archives only orphans whose canonical_name is three characters or fewer, digits-and-punctuation noise, or a placeholder such as "unknown" or "n/a". Everything else stays on disk with signal_strength marked, so the user can review later. web/lib/retrieval/entity-pages.ts swapped from db-first to yaml-first. 
The /e/[cls]/[id] page now reads counts straight from YAML and renders a "força do sinal" badge with the per-source breakdown. Orphan entities get a banner explaining they have no cross-references. DB is still queried for ONE thing: the chunk text for preview cards on the entity page, so we don't re-parse 21k markdown files on every render. First-pass result: 9020 strong / 14520 weak / 10814 orphan; OBJ-EV1945- PEYERLSHOTDOWN-01 now reads "Peyerl shot down UAP · fraca · 1 backlink" in the live UI. --- scripts/maintain/42_sync_entity_stats.py | 455 +++++++++++++++++++++++ web/app/e/[cls]/[id]/page.tsx | 53 ++- web/lib/retrieval/entity-pages.ts | 249 ++++++++++--- 3 files changed, 698 insertions(+), 59 deletions(-) create mode 100644 scripts/maintain/42_sync_entity_stats.py diff --git a/scripts/maintain/42_sync_entity_stats.py b/scripts/maintain/42_sync_entity_stats.py new file mode 100644 index 0000000..9ce6d31 --- /dev/null +++ b/scripts/maintain/42_sync_entity_stats.py @@ -0,0 +1,455 @@ +#!/usr/bin/env python3 +""" +42_sync_entity_stats.py — Bulletproof sync of every entity's reverse-reference +signals. + +Three independent signal sources exist for an entity. Until now the UI used +only one of them and showed "0 menções" whenever the others disagreed. This +script rebuilds them all in a single pass: + + 1. wiki_page_refs — pages whose entities_extracted[] lists this entity. + Materialised back into the entity's mentioned_in[]. + + 2. db_chunk_mentions — count of rows in public.entity_mentions whose + chunk_pk matches a chunk that textually contains the + entity (ILIKE on canonical_name + aliases). Source of + truth for chat / search retrieval. + + 3. cross_entity_refs — reverse-links discovered by traversing other entity + YAMLs: an event's uap_objects[] / observers[] / + organizations_involved[]; a location's events_here[]; + a document's key_entities[]. + +After scanning, each entity's frontmatter is rewritten with: + + mentioned_in: [...] 
# the page refs (canonical, not generated noise) + total_mentions: # max(db_chunk_mentions, len(mentioned_in)) + documents_count: # distinct docs across both signals + signal_sources: + db_chunks: + page_refs: + cross_refs: + signal_strength: strong | weak | orphan + last_lint: + +When all three signals are zero the entity is moved to +wiki/entities/_archived//.md and a one-line record is appended to +wiki/log.md. + +Idempotent: re-running converges. Safe to interrupt — writes are atomic. +""" +from __future__ import annotations + +import argparse +import json +import os +import re +import shutil +import sys +import unicodedata +from collections import defaultdict +from datetime import datetime, timezone +from pathlib import Path + +try: + import yaml + import psycopg +except ImportError as e: + sys.stderr.write(f"pip3 install pyyaml psycopg[binary] # missing: {e}\n") + sys.exit(1) + + +UFO_ROOT = Path(__file__).resolve().parents[2] +ENTITIES_BASE = UFO_ROOT / "wiki" / "entities" +ARCHIVED_BASE = UFO_ROOT / "wiki" / "entities" / "_archived" +PAGES_BASE = UFO_ROOT / "wiki" / "pages" +DOCS_BASE = UFO_ROOT / "wiki" / "documents" +LOG_PATH = UFO_ROOT / "wiki" / "log.md" + +DATABASE_URL = os.getenv("DATABASE_URL") or os.getenv("SUPABASE_DB_URL") + +# Map plural folder names to the entity_class singular used in DB +FOLDER_TO_CLASS = { + "people": "person", + "organizations": "organization", + "locations": "location", + "events": "event", + "uap-objects": "uap_object", + "vehicles": "vehicle", + "operations": "operation", + "concepts": "concept", +} +CLASS_TO_FOLDER = {v: k for k, v in FOLDER_TO_CLASS.items()} + +ID_FIELD_BY_CLASS = { + "person": "person_id", + "organization": "organization_id", + "location": "location_id", + "event": "event_id", + "uap_object": "uap_object_id", + "vehicle": "vehicle_id", + "operation": "operation_id", + "concept": "concept_id", +} + +# Cross-entity fields that contain wikilinks pointing TO another entity. 
+CROSS_REF_FIELDS = { + "event": ["uap_objects", "observers", "organizations_involved", + "vehicles_involved", "witnesses_analyses", "preceded_by", + "followed_by", "related_events", "documented_in", + "primary_location"], + "location": ["events_here"], + "uap_object": ["observed_in_event", "secondary_events"], + "operation": ["documents"], + "document": ["key_entities", "key_events"], +} + +WIKILINK_RE = re.compile(r"\[\[([^\]|]+?)(?:\|[^\]]+)?\]\]") + + +def canonicalize_name(name: str) -> str: + """name → kebab-case ASCII-fold id (same algorithm as 03-dedup-entities.py).""" + if not name: + return "" + nfkd = unicodedata.normalize("NFKD", str(name)) + ascii_str = "".join(c for c in nfkd if not unicodedata.combining(c)) + lower = ascii_str.lower() + replaced = re.sub(r"[^a-z0-9-]", "-", lower) + collapsed = re.sub(r"-+", "-", replaced).strip("-") + if collapsed and collapsed[0].isdigit(): + collapsed = "x-" + collapsed + return collapsed + + +def utc_iso() -> str: + return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + + +def read_md(path: Path) -> tuple[dict, str]: + raw = path.read_text(encoding="utf-8") + if not raw.startswith("---"): + return {}, raw + end = raw.find("---", 4) + try: + fm = yaml.safe_load(raw[3:end].strip()) or {} + except yaml.YAMLError: + return {}, raw + body = raw[end + 3 :].lstrip("\n") + return fm, body + + +def write_md(path: Path, fm: dict, body: str) -> None: + """Atomic write so we never leave a half-written YAML.""" + yaml_str = yaml.dump(fm, allow_unicode=True, sort_keys=False, default_flow_style=False) + sep = "" if body.startswith("\n") else "\n" + tmp = path.with_suffix(path.suffix + ".tmp") + tmp.write_text(f"---\n{yaml_str}---\n{sep}{body}", encoding="utf-8") + tmp.replace(path) + + +def parse_wikilink_target(s: str) -> tuple[str | None, str | None]: + """[[class/id]] or [[event/id]] → (class, id). 
Returns (None, None) if not parseable.""" + if not s or not isinstance(s, str): + return None, None + m = WIKILINK_RE.search(s) + target = m.group(1).strip() if m else s.strip() + if "/" not in target: + return None, None + parts = target.split("/", 1) + prefix, ident = parts[0], parts[1] + # accept singular ("event/...") or plural ("events/...") or class-name + aliases = { + "people": "person", "person": "person", + "org": "organization", "organization": "organization", "organizations": "organization", + "loc": "location", "location": "location", "locations": "location", + "event": "event", "events": "event", + "uap": "uap_object", "uap_object": "uap_object", "uap-objects": "uap_object", + "vehicle": "vehicle", "vehicles": "vehicle", + "op": "operation", "operation": "operation", "operations": "operation", + "concept": "concept", "concepts": "concept", + } + cls = aliases.get(prefix.lower()) + return (cls, ident.strip()) if cls else (None, None) + + +def collect_page_refs() -> dict[tuple[str, str], set[str]]: + """ + Scan wiki/pages//p*.md. For each page, parse + `entities_extracted: {people: [...], organizations: [...], ...}` and append + the page_id to that entity's set. + + Returns {(class, id): {page_id, ...}}. + """ + refs: dict[tuple[str, str], set[str]] = defaultdict(set) + for page_path in PAGES_BASE.rglob("p*.md"): + try: + fm, _ = read_md(page_path) + except Exception: + continue + extracted = fm.get("entities_extracted") or {} + if not isinstance(extracted, dict): + continue + # page_id like "doc-abc/p007" + doc_id = page_path.parent.name + page_id = f"{doc_id}/{page_path.stem}" + for folder, entries in extracted.items(): + cls = FOLDER_TO_CLASS.get(folder) + if not cls or not isinstance(entries, list): + continue + for entry in entries: + # entry can be a plain string id, a wikilink, or a dict with + # a `name` field that we must canonicalize ourselves (matches + # the algorithm used in scripts/03-dedup-entities.py). 
+ eid = None + if isinstance(entry, str): + _, parsed_eid = parse_wikilink_target(entry) + eid = parsed_eid or canonicalize_name(entry) + elif isinstance(entry, dict): + eid = (entry.get("id") + or entry.get(ID_FIELD_BY_CLASS.get(cls, "id")) + or canonicalize_name(entry.get("name", ""))) + if eid: + refs[(cls, eid)].add(page_id) + # Also index by every alias, so e.g. "USCENTCOM" matches a + # United States Central Command entity if dedup ran on aliases. + if isinstance(entry, dict): + for alias in (entry.get("aliases") or []): + alias_id = canonicalize_name(alias) + if alias_id and alias_id != eid: + refs[(cls, alias_id)].add(page_id) + return refs + + +def collect_cross_refs() -> dict[tuple[str, str], set[str]]: + """ + Sweep entity YAMLs themselves. When entity X declares + `uap_objects: [[[uap/OBJ-...]]]`, we register OBJ-... → X as a cross-ref. + """ + refs: dict[tuple[str, str], set[str]] = defaultdict(set) + for folder, cls in FOLDER_TO_CLASS.items(): + cls_dir = ENTITIES_BASE / folder + if not cls_dir.is_dir(): + continue + for ent_path in cls_dir.glob("*.md"): + try: + fm, _ = read_md(ent_path) + except Exception: + continue + id_field = ID_FIELD_BY_CLASS.get(cls) + self_id = fm.get(id_field) or ent_path.stem + for field in CROSS_REF_FIELDS.get(cls, []): + val = fm.get(field) + items = val if isinstance(val, list) else ([val] if val else []) + for item in items: + tgt_cls, tgt_id = parse_wikilink_target(item if isinstance(item, str) else str(item)) + if tgt_cls and tgt_id: + refs[(tgt_cls, tgt_id)].add(f"{cls}/{self_id}") + # Also walk documents/key_entities + for doc_path in DOCS_BASE.glob("*.md"): + try: + fm, _ = read_md(doc_path) + except Exception: + continue + for item in (fm.get("key_entities") or []): + tgt_cls, tgt_id = parse_wikilink_target(item if isinstance(item, str) else str(item)) + if tgt_cls and tgt_id: + refs[(tgt_cls, tgt_id)].add(f"document/{doc_path.stem}") + return refs + + +def collect_db_mentions(conn) -> dict[tuple[str, str], 
tuple[int, int]]: + """Return {(class, id): (chunk_count, doc_count)} from public.entity_mentions.""" + out: dict[tuple[str, str], tuple[int, int]] = {} + with conn.cursor() as cur: + cur.execute( + """ + SELECT e.entity_class, e.entity_id, + COUNT(em.chunk_pk)::int AS chunks, + COUNT(DISTINCT c.doc_id)::int AS docs + FROM public.entities e + LEFT JOIN public.entity_mentions em ON em.entity_pk = e.entity_pk + LEFT JOIN public.chunks c ON c.chunk_pk = em.chunk_pk + GROUP BY e.entity_class, e.entity_id + """ + ) + for cls, eid, chunks, docs in cur.fetchall(): + out[(cls, eid)] = (chunks or 0, docs or 0) + return out + + +def signal_strength(db_chunks: int, page_refs: int, cross_refs: int) -> str: + total = db_chunks + page_refs + cross_refs + if total == 0: + return "orphan" + if db_chunks >= 3 or page_refs >= 3 or (db_chunks >= 1 and page_refs >= 1): + return "strong" + return "weak" + + +def archive_entity(path: Path, dry_run: bool, archived_count: list[int]) -> None: + rel = path.relative_to(ENTITIES_BASE) + target = ARCHIVED_BASE / rel + archived_count[0] += 1 + if dry_run: + return + target.parent.mkdir(parents=True, exist_ok=True) + shutil.move(str(path), str(target)) + + +def main() -> int: + p = argparse.ArgumentParser() + p.add_argument("--dry-run", action="store_true") + p.add_argument("--archive", action="store_true", + help="actually move orphans to wiki/entities/_archived/. 
" + "By default we only mark them — data is never lost.") + p.add_argument("--archive-only-junk", action="store_true", + help="archive ONLY entities whose canonical_name is <=3 chars, " + "purely numeric, or matches obvious junk patterns") + p.add_argument("--fix-obj-names", action="store_true", + help="rewrite OBJ-* canonical_name to ' UAP', " + "moving the full shape description to aliases") + p.add_argument("--verbose", action="store_true") + args = p.parse_args() + + print(f"scanning {ENTITIES_BASE} ...") + if not DATABASE_URL: + sys.stderr.write("DATABASE_URL not set — cannot read DB mentions\n") + return 1 + + print("collecting page refs from wiki/pages/ ...") + page_refs = collect_page_refs() + print(f" {len(page_refs)} entities referenced from {sum(len(v) for v in page_refs.values())} page rows") + + print("collecting cross-entity refs ...") + cross_refs = collect_cross_refs() + print(f" {len(cross_refs)} entities back-linked") + + print(f"reading DB entity_mentions ...") + with psycopg.connect(DATABASE_URL) as conn: + db_counts = collect_db_mentions(conn) + print(f" {len(db_counts)} entities in DB") + + # Walk every entity YAML on disk + archived_count = [0] + stats = {"strong": 0, "weak": 0, "orphan": 0, "updated": 0, "skipped": 0} + + for folder, cls in FOLDER_TO_CLASS.items(): + cls_dir = ENTITIES_BASE / folder + if not cls_dir.is_dir(): + continue + for ent_path in cls_dir.glob("*.md"): + try: + fm, body = read_md(ent_path) + except Exception: + stats["skipped"] += 1 + continue + if not fm: + stats["skipped"] += 1 + continue + id_field = ID_FIELD_BY_CLASS.get(cls) + eid = fm.get(id_field) or ent_path.stem + key = (cls, eid) + + db_chunks, db_docs = db_counts.get(key, (0, 0)) + page_list = sorted(page_refs.get(key, set())) + cross_list = sorted(cross_refs.get(key, set())) + + # Also count this entity's OWN outgoing wikilinks as signal — + # if an OBJ has observed_in_event pointing to a real event, the + # OBJ is anchored even when no one links back to 
it. + own_outgoing: set[str] = set() + for field in CROSS_REF_FIELDS.get(cls, []): + val = fm.get(field) + items = val if isinstance(val, list) else ([val] if val else []) + for item in items: + tgt_cls, tgt_id = parse_wikilink_target( + item if isinstance(item, str) else str(item)) + if tgt_cls and tgt_id: + own_outgoing.add(f"{tgt_cls}/{tgt_id}") + + all_cross = sorted(set(cross_list) | own_outgoing) + strength = signal_strength(db_chunks, len(page_list), len(all_cross)) + + stats[strength] += 1 + + # Optional: clean up OBJ entities whose canonical_name is a 100-char + # shape description plus the ID in parentheses. Move the description + # to an alias and pick a short readable name from the linked event. + if args.fix_obj_names and cls == "uap_object": + cn = str(fm.get("canonical_name") or "") + if len(cn) > 80 and "UAP" in cn and "(" in cn and cn.endswith(")"): + obs_event = fm.get("observed_in_event") + event_cls, event_id = parse_wikilink_target(obs_event or "") + if event_cls == "event" and event_id: + # Strip the "EV-YYYY-MM-DD-" prefix to get a slug + slug = re.sub(r"^EV-\d{4}-[\dX]{2}-[\dX]{2}-", "", event_id) + new_name = slug.replace("-", " ").strip() or eid + new_name = new_name[:1].upper() + new_name[1:] + " UAP" + aliases = list(fm.get("aliases") or []) + if cn not in aliases: + aliases.insert(0, cn) + fm["canonical_name"] = new_name + fm["aliases"] = aliases + + # Mutate frontmatter — preserve unrelated keys. 
+ fm["mentioned_in"] = [f"[[{p}]]" for p in page_list] + fm["total_mentions"] = max(db_chunks, len(page_list)) + fm["documents_count"] = max(db_docs, len({p.split("/", 1)[0] for p in page_list})) + fm["signal_sources"] = { + "db_chunks": int(db_chunks), + "page_refs": len(page_list), + "cross_refs": len(all_cross), + } + if all_cross: + fm["referenced_by"] = [f"[[{r}]]" for r in all_cross[:25]] + elif "referenced_by" in fm: + del fm["referenced_by"] + fm["signal_strength"] = strength + fm["last_lint"] = utc_iso() + + # Optional archive paths — by default we KEEP everything, only mark. + if strength == "orphan" and args.archive: + archive_entity(ent_path, args.dry_run, archived_count) + continue + if args.archive_only_junk: + cn = str(fm.get("canonical_name") or "").strip() + cn_id = cn.lower() + is_junk = ( + len(cn) <= 3 + or re.fullmatch(r"[0-9.()-]+", cn) is not None + or cn_id in {"unknown", "none", "n/a", "na", "-", "—"} + ) + if is_junk and strength == "orphan": + archive_entity(ent_path, args.dry_run, archived_count) + continue + + stats["updated"] += 1 + if args.verbose: + print(f" {strength:7} {cls}/{eid} db={db_chunks} pages={len(page_list)} cross={len(cross_list)}") + if not args.dry_run: + write_md(ent_path, fm, body) + + print() + print(f" strong: {stats['strong']:>6}") + print(f" weak: {stats['weak']:>6}") + print(f" orphan: {stats['orphan']:>6} (archived: {archived_count[0]})") + print(f" updated: {stats['updated']:>6}") + print(f" skipped: {stats['skipped']:>6}") + print(f" dry-run: {args.dry_run}") + + if not args.dry_run and (stats["updated"] > 0 or archived_count[0] > 0): + LOG_PATH.parent.mkdir(parents=True, exist_ok=True) + with LOG_PATH.open("a", encoding="utf-8") as f: + f.write( + f"\n## {utc_iso()} · SYNC_ENTITY_STATS\n" + f"- script: scripts/maintain/42_sync_entity_stats.py\n" + f"- strong: {stats['strong']}\n" + f"- weak: {stats['weak']}\n" + f"- orphan: {stats['orphan']} (archived: {archived_count[0]})\n" + f"- updated: 
{stats['updated']}\n" + ) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/web/app/e/[cls]/[id]/page.tsx b/web/app/e/[cls]/[id]/page.tsx index 2f1caf1..4a5cc8f 100644 --- a/web/app/e/[cls]/[id]/page.tsx +++ b/web/app/e/[cls]/[id]/page.tsx @@ -79,10 +79,10 @@ export default async function EntityPage({ const entityClassSingular = CLASS_TO_SINGULAR[folder as string] ?? folder; - // 1. DB first — live counts + // YAML-first: every count comes from the entity's frontmatter (kept in sync + // by scripts/maintain/42_sync_entity_stats.py). The DB is consulted ONLY for + // chunk previews, not for counts. const core = await getEntityCore(entityClassSingular, id).catch(() => null); - - // 2. Wiki fallback — narrative body, aliases (Haiku stub OK) const wiki = await readEntity(folder as EntityClass, id); if (!core && !wiki) notFound(); @@ -91,9 +91,8 @@ export default async function EntityPage({ (a) => a !== canonical, ); - // 3. Live data per-doc grouping const mentionGroups = core - ? await getEntityMentionsByDoc(core.entity_pk, 100).catch(() => []) + ? await getEntityMentionsByDoc(entityClassSingular, id, 100).catch(() => []) : []; const sampleChunks = core ? await getEntityChunks(core.entity_pk, 12).catch(() => []) @@ -101,6 +100,8 @@ export default async function EntityPage({ const totalMentions = core?.total_mentions ?? 0; const documentsCount = core?.documents_count ?? 0; + const strength = core?.signal_strength ?? "unverified"; + const sigs = core?.signal_sources ?? { db_chunks: 0, page_refs: 0, cross_refs: 0 }; const classColor = CLASS_COLOR[folder as EntityClass]; const classBg = CLASS_BG[folder as EntityClass]; @@ -164,7 +165,49 @@ export default async function EntityPage({
{core.enrichment_status}
)} +
+
+ força do sinal +
+
+ {strength === "strong" && "forte"} + {strength === "weak" && "fraca"} + {strength === "orphan" && "órfã"} + {strength === "unverified" && "não verificada"} +
+
+ {sigs.db_chunks} chunks · {sigs.page_refs} págs · {sigs.cross_refs} backlinks +
+
+ + {strength === "orphan" && ( +

+ ⚠ entidade não confirmada: nenhuma página, chunk ou outra entidade aponta para + ela. Pode ser extração ruidosa do pipeline original. +

+ )}
diff --git a/web/lib/retrieval/entity-pages.ts b/web/lib/retrieval/entity-pages.ts index da15d7c..6fadb89 100644 --- a/web/lib/retrieval/entity-pages.ts +++ b/web/lib/retrieval/entity-pages.ts @@ -1,19 +1,145 @@ /** - * Live entity data queries — replaces stale Haiku-era frontmatter `mentioned_in[]` - * with real counts from `public.entity_mentions` + `public.chunks`. + * Entity page data — SINGLE SOURCE OF TRUTH is the YAML frontmatter on disk. + * + * Why YAML and not the DB? Because the corpus has TWO independent extraction + * layers (Haiku page-level, Sonnet chunk-level) and each catches a different + * subset of entities. The DB's entity_mentions table is one of those signals — + * useful for chat retrieval but incomplete for the entity catalog itself. + * + * Reading from disk lets us merge every signal into one stat (`total_mentions`) + * via the maintain/42_sync_entity_stats.py pipeline and serve consistent + * numbers everywhere in the UI. + * + * The DB is still queried for ONE thing: the actual chunk text for previews, + * because we don't want to re-parse 21k chunk files on every page render. 
*/ +import fs from "node:fs/promises"; +import path from "node:path"; +import matter from "gray-matter"; import { pgQuery } from "./db"; -import { findEntity } from "./graph"; +import { WIKI } from "@/lib/wiki"; + +const FOLDER_BY_CLASS: Record = { + person: "people", + organization: "organizations", + location: "locations", + event: "events", + uap_object: "uap-objects", + vehicle: "vehicles", + operation: "operations", + concept: "concepts", +}; export interface EntityCore { - entity_pk: number; + entity_pk: number | null; // db-side primary key; null if entity is wiki-only entity_class: string; entity_id: string; canonical_name: string; - aliases: string[] | null; + aliases: string[]; total_mentions: number; documents_count: number; + signal_strength: "strong" | "weak" | "orphan" | "unverified"; + signal_sources: { + db_chunks: number; + page_refs: number; + cross_refs: number; + }; + mentioned_in: string[]; // [[doc-id/p007]] + referenced_by: string[]; // [[class/id]] cross-links enrichment_status: string | null; + narrative_summary: string | null; + narrative_summary_pt_br: string | null; + summary_status: string | null; +} + +interface RawFm { + [k: string]: unknown; +} + +function num(v: unknown, fallback = 0): number { + if (typeof v === "number" && Number.isFinite(v)) return v; + if (typeof v === "string") { + const n = Number(v); + return Number.isFinite(n) ? n : fallback; + } + return fallback; +} + +function arr(v: unknown): string[] { + if (!v) return []; + if (Array.isArray(v)) return v.filter((x) => typeof x === "string"); + return []; +} + +function strOrNull(v: unknown): string | null { + return typeof v === "string" && v.trim() ? 
v : null; +} + +async function readEntityYaml(entityClass: string, entityId: string): Promise { + const folder = FOLDER_BY_CLASS[entityClass]; + if (!folder) return null; + const p = path.join(WIKI, "entities", folder, `${entityId}.md`); + try { + const raw = await fs.readFile(p, "utf-8"); + return matter(raw).data as RawFm; + } catch { + return null; + } +} + +/** + * Load a single entity card from its YAML. Returns null if archived or + * missing — keeps the route handler simple. + */ +export async function getEntityCore( + entityClass: string, + entityId: string, +): Promise { + const fm = await readEntityYaml(entityClass, entityId); + if (!fm) return null; + + // Best-effort lookup of the DB entity_pk so getEntityChunks can still + // query by primary key. Don't fail if the entity isn't in the DB at all. + let entity_pk: number | null = null; + try { + const rows = await pgQuery<{ entity_pk: number }>( + `SELECT entity_pk FROM public.entities + WHERE entity_class = $1 AND entity_id = $2 LIMIT 1`, + [entityClass, entityId], + ); + entity_pk = rows[0]?.entity_pk ?? null; + } catch { + entity_pk = null; + } + + const sigSources = (fm.signal_sources as Record | undefined) ?? {}; + const strength = (typeof fm.signal_strength === "string" + ? fm.signal_strength + : "unverified") as EntityCore["signal_strength"]; + + return { + entity_pk, + entity_class: entityClass, + entity_id: entityId, + canonical_name: typeof fm.canonical_name === "string" ? fm.canonical_name : entityId, + aliases: arr(fm.aliases), + total_mentions: num(fm.total_mentions, 0), + documents_count: num(fm.documents_count, 0), + signal_strength: ["strong", "weak", "orphan", "unverified"].includes(strength) + ? 
strength + : "unverified", + signal_sources: { + db_chunks: num(sigSources.db_chunks, 0), + page_refs: num(sigSources.page_refs, 0), + cross_refs: num(sigSources.cross_refs, 0), + }, + mentioned_in: arr(fm.mentioned_in), + referenced_by: arr(fm.referenced_by), + enrichment_status: strOrNull(fm.enrichment_status), + narrative_summary: strOrNull(fm.narrative_summary), + narrative_summary_pt_br: strOrNull(fm.narrative_summary_pt_br), + summary_status: strOrNull(fm.summary_status), + }; } export interface EntityMentionGroup { @@ -26,55 +152,63 @@ export interface EntityMentionGroup { pages: number[]; } -export async function getEntityCore( +/** + * Group reverse-references by document. Derived from the YAML's mentioned_in[] + * (which the maintain script writes consolidating page YAMLs). Optionally + * enriches with document metadata read from wiki/documents/.md. + */ +export async function getEntityMentionsByDoc( entityClass: string, entityId: string, -): Promise { - const rows = await pgQuery( - `SELECT - e.entity_pk, e.entity_class, e.entity_id, e.canonical_name, e.aliases, - COALESCE(em.mention_count, 0) AS total_mentions, - COALESCE(em.doc_count, 0) AS documents_count, - e.enrichment_status - FROM public.entities e - LEFT JOIN ( - SELECT em.entity_pk, - COUNT(*)::INT AS mention_count, - COUNT(DISTINCT c.doc_id)::INT AS doc_count - FROM public.entity_mentions em - JOIN public.chunks c ON c.chunk_pk = em.chunk_pk - GROUP BY em.entity_pk - ) em ON em.entity_pk = e.entity_pk - WHERE e.entity_class = $1 AND e.entity_id = $2 - LIMIT 1`, - [entityClass, entityId], - ); - return rows[0] ?? null; -} - -/** Group mentions per document so the sidebar can list "appears in N docs". 
*/ -export async function getEntityMentionsByDoc( - entityPk: number, - limit: number = 50, + limit = 100, ): Promise { - return pgQuery( - `SELECT - c.doc_id, - d.canonical_title, - d.collection, - d.page_count, - d.classification, - COUNT(*)::INT AS mention_count, - array_agg(DISTINCT c.page ORDER BY c.page) AS pages - FROM public.entity_mentions em - JOIN public.chunks c ON c.chunk_pk = em.chunk_pk - LEFT JOIN public.documents d ON d.doc_id = c.doc_id - WHERE em.entity_pk = $1 - GROUP BY c.doc_id, d.canonical_title, d.collection, d.page_count, d.classification - ORDER BY mention_count DESC - LIMIT $2`, - [entityPk, limit], - ); + const fm = await readEntityYaml(entityClass, entityId); + if (!fm) return []; + const refs = arr(fm.mentioned_in); + // Each ref looks like "[[doc-id/p007]]". Strip wikilink delimiters. + const byDoc = new Map>(); + for (const ref of refs) { + const m = ref.match(/\[\[([^\]|]+?)\]\]/); + const target = (m ? m[1] : ref).trim(); + const [docId, pageStr] = target.split("/", 2); + if (!docId) continue; + const pageNum = pageStr ? 
parseInt(pageStr.replace(/^p/, ""), 10) : NaN; + if (!byDoc.has(docId)) byDoc.set(docId, new Set()); + if (Number.isFinite(pageNum)) byDoc.get(docId)!.add(pageNum); + } + + // Hydrate each doc's metadata from wiki/documents/.md + const groups: EntityMentionGroup[] = []; + for (const [docId, pages] of byDoc) { + let canonical_title: string | null = null; + let collection: string | null = null; + let page_count: number | null = null; + let classification: string | null = null; + try { + const docRaw = await fs.readFile( + path.join(WIKI, "documents", `${docId}.md`), + "utf-8", + ); + const dfm = matter(docRaw).data as Record; + canonical_title = strOrNull(dfm.canonical_title); + collection = strOrNull(dfm.collection); + page_count = num(dfm.page_count, 0) || null; + classification = strOrNull(dfm.highest_classification); + } catch { + /* doc missing — use raw id */ + } + groups.push({ + doc_id: docId, + canonical_title, + collection, + page_count, + classification, + mention_count: pages.size, + pages: Array.from(pages).sort((a, b) => a - b), + }); + } + groups.sort((a, b) => b.mention_count - a.mention_count); + return groups.slice(0, limit); } export interface EntityChunkPreview { @@ -91,10 +225,16 @@ export interface EntityChunkPreview { ufo_anomaly_type: string | null; } +/** + * Top chunks that textually mention this entity. Reads from DB because + * chunk content is big (we don't re-parse files at request time). Returns [] + * if the entity isn't indexed in the DB. + */ export async function getEntityChunks( - entityPk: number, - limit: number = 30, + entityPk: number | null, + limit = 30, ): Promise { + if (entityPk == null) return []; return pgQuery( `SELECT c.chunk_pk, c.doc_id, c.chunk_id, c.page, c.type, c.bbox, c.classification, @@ -108,4 +248,5 @@ export async function getEntityChunks( ); } -export { findEntity }; +// Backwards-compat for callers that imported findEntity from the old path. +export { findEntity } from "./graph";