sanitize entities: single YAML source of truth, signal_strength badge

The corpus had two parallel reverse-reference signals: the wiki/pages
entities_extracted blocks (Haiku page-level) and public.entity_mentions
(Sonnet chunk-level, ILIKE-matched). The entity page only consulted the
DB, so it showed "0 menções" for thousands of entities that were anchored
in pages or in cross-entity links the DB never indexed.

Resolved by collapsing all signals into the YAML frontmatter, which is
now the single runtime source for entity metadata.

scripts/maintain/42_sync_entity_stats.py walks every entity and writes:

  mentioned_in:        [...]        # consolidated page refs
  total_mentions:      max(db, pages)
  documents_count:     max(db_docs, distinct page docs)
  signal_sources:
    db_chunks:         int
    page_refs:         int
    cross_refs:        int
  signal_strength:     strong | weak | orphan | unverified
  referenced_by:       [[class/id]]  # cross-entity backlinks

Outgoing wikilinks (e.g. OBJ.observed_in_event → EV) count toward the
entity's own cross_refs so anchored-but-not-mentioned entities don't
register as orphan.

OBJ canonical names like "7m long, 1.3m high, two rocket motors,
smooth flow, rotary drive null UAP (OBJ-EV1945-PEYERLSHOTDOWN-01)"
are rewritten to "Peyerl shot down UAP" derived from observed_in_event,
preserving the full description as an alias. --fix-obj-names did this
for every OBJ-* with >80 char canonical_name.

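Default behaviour is conservative: nothing is archived, entities are only
marked. The opt-in --archive-only-junk flag archives only orphans whose name
is three characters or fewer, purely numeric/punctuation noise, or a
placeholder such as "unknown". Everything else stays on disk with
signal_strength marked, so the user can review later.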

web/lib/retrieval/entity-pages.ts swapped from db-first to yaml-first.
The /e/[cls]/[id] page now reads counts straight from YAML and renders
a "força do sinal" badge with the per-source breakdown. Orphan entities
get a banner explaining they have no cross-references.

DB is still queried for ONE thing: the chunk text for preview cards on
the entity page, so we don't re-parse 21k markdown files on every render.

First-pass result: 9020 strong / 14520 weak / 10814 orphan; OBJ-EV1945-
PEYERLSHOTDOWN-01 now reads "Peyerl shot down UAP · fraca · 1 backlink"
in the live UI.
This commit is contained in:
guto 2026-05-18 19:49:31 -03:00
parent c0c6652dd5
commit 291748df63
3 changed files with 698 additions and 59 deletions

View file

@ -0,0 +1,455 @@
#!/usr/bin/env python3
"""
42_sync_entity_stats.py Bulletproof sync of every entity's reverse-reference
signals.
Three independent signal sources exist for an entity. Until now the UI used
only one of them and showed "0 menções" whenever the others disagreed. This
script rebuilds them all in a single pass:
1. wiki_page_refs pages whose entities_extracted[] lists this entity.
Materialised back into the entity's mentioned_in[].
2. db_chunk_mentions count of rows in public.entity_mentions whose
chunk_pk matches a chunk that textually contains the
entity (ILIKE on canonical_name + aliases). Source of
truth for chat / search retrieval.
3. cross_entity_refs reverse-links discovered by traversing other entity
YAMLs: an event's uap_objects[] / observers[] /
organizations_involved[]; a location's events_here[];
a document's key_entities[].
After scanning, each entity's frontmatter is rewritten with:
mentioned_in: [...] # the page refs (canonical, not generated noise)
total_mentions: <int> # max(db_chunk_mentions, len(mentioned_in))
documents_count: <int> # distinct docs across both signals
signal_sources:
db_chunks: <int>
page_refs: <int>
cross_refs: <int>
signal_strength: strong | weak | orphan
last_lint: <utc>
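When all three signals are zero the entity is marked signal_strength: orphan.
It is moved under wiki/entities/_archived/ only when --archive is passed (or
--archive-only-junk, for junk-named orphans). A summary record is appended to
wiki/log.md after a non-dry run that updated or archived anything.
Idempotent: re-running converges. Safe to interrupt — writes are atomic.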
"""
from __future__ import annotations
import argparse
import json
import os
import re
import shutil
import sys
import unicodedata
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
try:
import yaml
import psycopg
except ImportError as e:
sys.stderr.write(f"pip3 install pyyaml psycopg[binary] # missing: {e}\n")
sys.exit(1)
# Repository root. This script lives at <root>/scripts/maintain/, so parents[2]
# of the resolved file path is <root>.
UFO_ROOT = Path(__file__).resolve().parents[2]
# Directories and files read or written by this script.
ENTITIES_BASE = UFO_ROOT / "wiki" / "entities"
ARCHIVED_BASE = UFO_ROOT / "wiki" / "entities" / "_archived"
PAGES_BASE = UFO_ROOT / "wiki" / "pages"
DOCS_BASE = UFO_ROOT / "wiki" / "documents"
LOG_PATH = UFO_ROOT / "wiki" / "log.md"
# First of DATABASE_URL / SUPABASE_DB_URL that is set and non-empty, else None.
DATABASE_URL = os.getenv("DATABASE_URL") or os.getenv("SUPABASE_DB_URL")
# Map plural folder names to the entity_class singular used in DB
FOLDER_TO_CLASS = {
    "people": "person",
    "organizations": "organization",
    "locations": "location",
    "events": "event",
    "uap-objects": "uap_object",
    "vehicles": "vehicle",
    "operations": "operation",
    "concepts": "concept",
}
# Inverse lookup: singular class name -> plural folder name.
CLASS_TO_FOLDER = {v: k for k, v in FOLDER_TO_CLASS.items()}
# Frontmatter key that holds an entity's own id, per class. Callers fall back
# to the file stem when the key is absent or empty.
ID_FIELD_BY_CLASS = {
    "person": "person_id",
    "organization": "organization_id",
    "location": "location_id",
    "event": "event_id",
    "uap_object": "uap_object_id",
    "vehicle": "vehicle_id",
    "operation": "operation_id",
    "concept": "concept_id",
}
# Cross-entity fields that contain wikilinks pointing TO another entity.
# NOTE: "document" has no entry in FOLDER_TO_CLASS, so loops that iterate the
# entity folders never reach it; it has to be looked up explicitly.
CROSS_REF_FIELDS = {
    "event": ["uap_objects", "observers", "organizations_involved",
              "vehicles_involved", "witnesses_analyses", "preceded_by",
              "followed_by", "related_events", "documented_in",
              "primary_location"],
    "location": ["events_here"],
    "uap_object": ["observed_in_event", "secondary_events"],
    "operation": ["documents"],
    "document": ["key_entities", "key_events"],
}
# Matches [[target]] or [[target|label]]; group 1 is the target, the label is
# not captured.
WIKILINK_RE = re.compile(r"\[\[([^\]|]+?)(?:\|[^\]]+)?\]\]")
def canonicalize_name(name: str) -> str:
    """Turn a display name into a kebab-case, ASCII-folded identifier.

    Steps: NFKD-decompose, drop combining marks, lowercase, replace every
    character outside ``[a-z0-9-]`` with ``-``, collapse runs of ``-`` and trim
    them from both ends. A result that starts with a digit is prefixed with
    ``x-``. Falsy input yields the empty string.
    (Same algorithm as 03-dedup-entities.py, per the original note.)
    """
    if not name:
        return ""
    decomposed = unicodedata.normalize("NFKD", str(name))
    folded = "".join(ch for ch in decomposed if not unicodedata.combining(ch)).lower()
    slug = re.sub(r"-+", "-", re.sub(r"[^a-z0-9-]", "-", folded)).strip("-")
    # slug[:1] is "" for an empty slug, and "".isdigit() is False.
    return f"x-{slug}" if slug[:1].isdigit() else slug
def utc_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (second precision)."""
    now_utc = datetime.now(tz=timezone.utc)
    return now_utc.strftime("%Y-%m-%dT%H:%M:%SZ")
# Closing frontmatter delimiter: a line consisting only of "---" (optional
# trailing blanks / CR). Anchoring to a whole line keeps a "---" that appears
# inside a YAML value from being mistaken for the end of the frontmatter.
_FRONTMATTER_CLOSE_RE = re.compile(r"^---[ \t]*\r?$", re.MULTILINE)


def read_md(path: Path) -> tuple[dict, str]:
    """Split a markdown file into (frontmatter dict, body).

    Returns ``({}, raw_text)`` when the file has no leading ``---``, when the
    closing ``---`` line is missing, when the YAML fails to parse, or when the
    YAML document is not a mapping — so callers can always use ``fm.get``.
    Otherwise returns the parsed mapping and the body with leading newlines
    stripped.

    Raises whatever ``Path.read_text`` raises (missing file, decode error).
    """
    raw = path.read_text(encoding="utf-8")
    if not raw.startswith("---"):
        return {}, raw
    # Start at 3 so the opening delimiter itself is never matched: with
    # MULTILINE, "^" only matches at the string start or right after a newline.
    closing = _FRONTMATTER_CLOSE_RE.search(raw, 3)
    if closing is None:
        return {}, raw
    try:
        fm = yaml.safe_load(raw[3:closing.start()].strip()) or {}
    except yaml.YAMLError:
        return {}, raw
    if not isinstance(fm, dict):
        # A scalar or list "frontmatter" is not entity metadata.
        return {}, raw
    body = raw[closing.end():].lstrip("\n")
    return fm, body
def write_md(path: Path, fm: dict, body: str) -> None:
    """Serialise ``fm`` + ``body`` back to ``path`` without exposing a partial file.

    The content is first written to a sibling ``<name>.tmp`` file, which is
    then renamed over ``path``. Key order of ``fm`` is preserved and a newline
    is inserted after the closing ``---`` unless ``body`` already starts with one.
    """
    front = yaml.dump(fm, allow_unicode=True, sort_keys=False, default_flow_style=False)
    gap = "\n" if not body.startswith("\n") else ""
    scratch = path.with_suffix(path.suffix + ".tmp")
    scratch.write_text("---\n" + front + "---\n" + gap + body, encoding="utf-8")
    scratch.replace(path)
def parse_wikilink_target(s: str) -> tuple[str | None, str | None]:
    """Resolve ``[[prefix/id]]`` (or a bare ``prefix/id``) to ``(class, id)``.

    The prefix may be the singular class name, the plural folder name or a
    short alias (``org``, ``loc``, ``uap``, ``op``); it is matched
    case-insensitively. An optional ``|label`` inside the wikilink is ignored.
    Returns ``(None, None)`` for empty or non-string input, for a target
    without ``/``, or for an unknown prefix.
    """
    if not s or not isinstance(s, str):
        return None, None
    found = WIKILINK_RE.search(s)
    target = (found.group(1) if found else s).strip()
    if "/" not in target:
        return None, None
    prefix, _, ident = target.partition("/")
    # accept singular ("event/...") or plural ("events/...") or class-name
    prefix_to_class = {
        "people": "person", "person": "person",
        "org": "organization", "organization": "organization", "organizations": "organization",
        "loc": "location", "location": "location", "locations": "location",
        "event": "event", "events": "event",
        "uap": "uap_object", "uap_object": "uap_object", "uap-objects": "uap_object",
        "vehicle": "vehicle", "vehicles": "vehicle",
        "op": "operation", "operation": "operation", "operations": "operation",
        "concept": "concept", "concepts": "concept",
    }
    resolved = prefix_to_class.get(prefix.lower())
    if not resolved:
        return None, None
    return resolved, ident.strip()
def collect_page_refs() -> dict[tuple[str, str], set[str]]:
    """Index which pages list which entities.

    Walks every ``p*.md`` under PAGES_BASE, reads its ``entities_extracted``
    mapping (``{folder: [entries]}``) and records ``"<doc-dir>/<page-stem>"``
    against each entity found. Unreadable pages and folders that are not in
    FOLDER_TO_CLASS are skipped.

    Returns ``{(class, id): {page_id, ...}}``.
    """
    index: dict[tuple[str, str], set[str]] = defaultdict(set)
    for md_file in PAGES_BASE.rglob("p*.md"):
        try:
            front, _body = read_md(md_file)
        except Exception:
            continue
        extracted = front.get("entities_extracted") or {}
        if not isinstance(extracted, dict):
            continue
        # page_id like "doc-abc/p007"
        page_id = f"{md_file.parent.name}/{md_file.stem}"
        for folder, entries in extracted.items():
            cls = FOLDER_TO_CLASS.get(folder)
            if not cls or not isinstance(entries, list):
                continue
            for entry in entries:
                # An entry is a plain id / wikilink string, or a mapping with
                # an id field or a `name` that is canonicalized here.
                is_mapping = isinstance(entry, dict)
                if isinstance(entry, str):
                    eid = parse_wikilink_target(entry)[1] or canonicalize_name(entry)
                elif is_mapping:
                    eid = (
                        entry.get("id")
                        or entry.get(ID_FIELD_BY_CLASS.get(cls, "id"))
                        or canonicalize_name(entry.get("name", ""))
                    )
                else:
                    eid = None
                if eid:
                    index[(cls, eid)].add(page_id)
                if not is_mapping:
                    continue
                # Index the page under every alias too, so an entity whose id
                # was derived from an alias (e.g. "USCENTCOM") still matches.
                for alias in entry.get("aliases") or []:
                    alias_id = canonicalize_name(alias)
                    if alias_id and alias_id != eid:
                        index[(cls, alias_id)].add(page_id)
    return index
def collect_cross_refs() -> dict[tuple[str, str], set[str]]:
    """Build the reverse index of entity-to-entity links.

    Sweeps every entity file under ENTITIES_BASE and every document file under
    DOCS_BASE. For each field listed in CROSS_REF_FIELDS for the source's
    class, every wikilink target found is recorded as
    ``{(target_class, target_id): {"<source_class>/<source_id>", ...}}``.

    Documents use the ``"document"`` entry of CROSS_REF_FIELDS, so
    ``key_events`` is honoured alongside ``key_entities``. Files that cannot
    be read, or whose frontmatter is not a mapping, are skipped.
    """
    refs: dict[tuple[str, str], set[str]] = defaultdict(set)

    def _register(fm: dict, fields: list[str], source: str) -> None:
        # Record `source` as a back-reference on every target linked from `fields`.
        for field in fields:
            val = fm.get(field)
            # A field may hold one link or a list of links.
            items = val if isinstance(val, list) else ([val] if val else [])
            for item in items:
                tgt_cls, tgt_id = parse_wikilink_target(
                    item if isinstance(item, str) else str(item))
                if tgt_cls and tgt_id:
                    refs[(tgt_cls, tgt_id)].add(source)

    for folder, cls in FOLDER_TO_CLASS.items():
        cls_dir = ENTITIES_BASE / folder
        if not cls_dir.is_dir():
            continue
        for ent_path in cls_dir.glob("*.md"):
            try:
                fm, _ = read_md(ent_path)
            except Exception:
                continue
            if not isinstance(fm, dict):
                continue
            self_id = fm.get(ID_FIELD_BY_CLASS.get(cls)) or ent_path.stem
            _register(fm, CROSS_REF_FIELDS.get(cls, []), f"{cls}/{self_id}")

    # Documents are not an entity folder, so they get their own sweep.
    doc_fields = CROSS_REF_FIELDS.get("document", ["key_entities"])
    for doc_path in DOCS_BASE.glob("*.md"):
        try:
            fm, _ = read_md(doc_path)
        except Exception:
            continue
        if not isinstance(fm, dict):
            continue
        _register(fm, doc_fields, f"document/{doc_path.stem}")
    return refs
def collect_db_mentions(conn) -> dict[tuple[str, str], tuple[int, int]]:
    """Read per-entity mention counts from the database.

    Runs one aggregate query over public.entities left-joined to
    public.entity_mentions and public.chunks, and returns
    ``{(entity_class, entity_id): (chunk_count, doc_count)}``. NULL counts are
    normalised to 0.
    """
    query = """
            SELECT e.entity_class, e.entity_id,
                   COUNT(em.chunk_pk)::int AS chunks,
                   COUNT(DISTINCT c.doc_id)::int AS docs
            FROM public.entities e
            LEFT JOIN public.entity_mentions em ON em.entity_pk = e.entity_pk
            LEFT JOIN public.chunks c ON c.chunk_pk = em.chunk_pk
            GROUP BY e.entity_class, e.entity_id
            """
    with conn.cursor() as cur:
        cur.execute(query)
        rows = cur.fetchall()
    return {
        (cls, eid): (chunks or 0, docs or 0)
        for cls, eid, chunks, docs in rows
    }
def signal_strength(db_chunks: int, page_refs: int, cross_refs: int) -> str:
    """Classify an entity as ``"orphan"``, ``"strong"`` or ``"weak"``.

    * orphan — all three counts sum to zero;
    * strong — at least 3 DB chunks, or at least 3 page refs, or at least one
      of each;
    * weak   — anything else. Cross-refs alone never make an entity strong.
    """
    if not (db_chunks + page_refs + cross_refs):
        return "orphan"
    corroborated = db_chunks >= 1 and page_refs >= 1
    if max(db_chunks, page_refs) >= 3 or corroborated:
        return "strong"
    return "weak"
def archive_entity(path: Path, dry_run: bool, archived_count: list[int]) -> None:
    """Move an entity file to the mirrored location under ARCHIVED_BASE.

    ``archived_count`` is a one-element list used as a mutable counter; it is
    incremented even on a dry run, in which case nothing is moved. Raises
    ValueError (from ``relative_to``) if ``path`` is not under ENTITIES_BASE.
    """
    destination = ARCHIVED_BASE / path.relative_to(ENTITIES_BASE)
    archived_count[0] += 1
    if not dry_run:
        destination.parent.mkdir(parents=True, exist_ok=True)
        shutil.move(str(path), str(destination))
def main() -> int:
    """Recompute every entity's signal stats and write them to its frontmatter.

    Collects page refs, cross-entity refs and DB mention counts, then walks
    each entity file under ENTITIES_BASE and rewrites ``mentioned_in``,
    ``total_mentions``, ``documents_count``, ``signal_sources``,
    ``referenced_by``, ``signal_strength`` and ``last_lint``. Entities are only
    moved to the archive when ``--archive`` or ``--archive-only-junk`` is given.
    With ``--dry-run`` nothing is written or moved.

    Returns 0 on success, 1 when no database URL is configured.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--dry-run", action="store_true")
    p.add_argument("--archive", action="store_true",
                   help="actually move orphans to wiki/entities/_archived/. "
                        "By default we only mark them — data is never lost.")
    p.add_argument("--archive-only-junk", action="store_true",
                   help="archive ONLY entities whose canonical_name is <=3 chars, "
                        "purely numeric, or matches obvious junk patterns")
    p.add_argument("--fix-obj-names", action="store_true",
                   help="rewrite OBJ-* canonical_name to '<event> UAP', "
                        "moving the full shape description to aliases")
    p.add_argument("--verbose", action="store_true")
    args = p.parse_args()

    print(f"scanning {ENTITIES_BASE} ...")
    if not DATABASE_URL:
        sys.stderr.write("DATABASE_URL not set — cannot read DB mentions\n")
        return 1

    print("collecting page refs from wiki/pages/ ...")
    page_refs = collect_page_refs()
    print(f" {len(page_refs)} entities referenced from {sum(len(v) for v in page_refs.values())} page rows")
    print("collecting cross-entity refs ...")
    cross_refs = collect_cross_refs()
    print(f" {len(cross_refs)} entities back-linked")
    print("reading DB entity_mentions ...")
    with psycopg.connect(DATABASE_URL) as conn:
        db_counts = collect_db_mentions(conn)
    print(f" {len(db_counts)} entities in DB")

    # Walk every entity YAML on disk
    archived_count = [0]  # one-element list: mutable counter shared with archive_entity
    stats = {"strong": 0, "weak": 0, "orphan": 0, "updated": 0, "skipped": 0}
    for folder, cls in FOLDER_TO_CLASS.items():
        cls_dir = ENTITIES_BASE / folder
        if not cls_dir.is_dir():
            continue
        for ent_path in cls_dir.glob("*.md"):
            try:
                fm, body = read_md(ent_path)
            except Exception:
                stats["skipped"] += 1
                continue
            # Empty or non-mapping frontmatter cannot be updated safely.
            if not fm or not isinstance(fm, dict):
                stats["skipped"] += 1
                continue
            id_field = ID_FIELD_BY_CLASS.get(cls)
            eid = fm.get(id_field) or ent_path.stem
            key = (cls, eid)
            db_chunks, db_docs = db_counts.get(key, (0, 0))
            page_list = sorted(page_refs.get(key, set()))
            cross_list = sorted(cross_refs.get(key, set()))

            # Also count this entity's OWN outgoing wikilinks as signal —
            # if an OBJ has observed_in_event pointing to a real event, the
            # OBJ is anchored even when no one links back to it.
            own_outgoing: set[str] = set()
            for field in CROSS_REF_FIELDS.get(cls, []):
                val = fm.get(field)
                items = val if isinstance(val, list) else ([val] if val else [])
                for item in items:
                    tgt_cls, tgt_id = parse_wikilink_target(
                        item if isinstance(item, str) else str(item))
                    if tgt_cls and tgt_id:
                        own_outgoing.add(f"{tgt_cls}/{tgt_id}")
            all_cross = sorted(set(cross_list) | own_outgoing)
            strength = signal_strength(db_chunks, len(page_list), len(all_cross))
            stats[strength] += 1

            # Optional: clean up OBJ entities whose canonical_name is a 100-char
            # shape description plus the ID in parentheses. Move the description
            # to an alias and pick a short readable name from the linked event.
            if args.fix_obj_names and cls == "uap_object":
                cn = str(fm.get("canonical_name") or "")
                if len(cn) > 80 and "UAP" in cn and "(" in cn and cn.endswith(")"):
                    obs_event = fm.get("observed_in_event")
                    event_cls, event_id = parse_wikilink_target(obs_event or "")
                    if event_cls == "event" and event_id:
                        # Strip the "EV-YYYY-MM-DD-" prefix to get a slug
                        slug = re.sub(r"^EV-\d{4}-[\dX]{2}-[\dX]{2}-", "", event_id)
                        new_name = slug.replace("-", " ").strip() or eid
                        new_name = new_name[:1].upper() + new_name[1:] + " UAP"
                        aliases = list(fm.get("aliases") or [])
                        if cn not in aliases:
                            aliases.insert(0, cn)
                        fm["canonical_name"] = new_name
                        fm["aliases"] = aliases

            # Mutate frontmatter — preserve unrelated keys.
            fm["mentioned_in"] = [f"[[{p}]]" for p in page_list]
            fm["total_mentions"] = max(db_chunks, len(page_list))
            fm["documents_count"] = max(db_docs, len({p.split("/", 1)[0] for p in page_list}))
            fm["signal_sources"] = {
                "db_chunks": int(db_chunks),
                "page_refs": len(page_list),
                "cross_refs": len(all_cross),
            }
            if all_cross:
                # Capped at 25 entries to keep the frontmatter small.
                fm["referenced_by"] = [f"[[{r}]]" for r in all_cross[:25]]
            elif "referenced_by" in fm:
                del fm["referenced_by"]
            fm["signal_strength"] = strength
            fm["last_lint"] = utc_iso()

            # Optional archive paths — by default we KEEP everything, only mark.
            if strength == "orphan" and args.archive:
                archive_entity(ent_path, args.dry_run, archived_count)
                continue
            if args.archive_only_junk:
                cn = str(fm.get("canonical_name") or "").strip()
                cn_id = cn.lower()
                is_junk = (
                    len(cn) <= 3
                    or re.fullmatch(r"[0-9.()-]+", cn) is not None
                    or cn_id in {"unknown", "none", "n/a", "na", "-", ""}
                )
                if is_junk and strength == "orphan":
                    archive_entity(ent_path, args.dry_run, archived_count)
                    continue

            stats["updated"] += 1
            if args.verbose:
                # Report the same cross-ref count that was classified and written.
                print(f" {strength:7} {cls}/{eid} db={db_chunks} pages={len(page_list)} cross={len(all_cross)}")
            if not args.dry_run:
                write_md(ent_path, fm, body)

    print()
    print(f" strong: {stats['strong']:>6}")
    print(f" weak: {stats['weak']:>6}")
    print(f" orphan: {stats['orphan']:>6} (archived: {archived_count[0]})")
    print(f" updated: {stats['updated']:>6}")
    print(f" skipped: {stats['skipped']:>6}")
    print(f" dry-run: {args.dry_run}")

    # Append a run summary to the wiki log, but only when something changed.
    if not args.dry_run and (stats["updated"] > 0 or archived_count[0] > 0):
        LOG_PATH.parent.mkdir(parents=True, exist_ok=True)
        with LOG_PATH.open("a", encoding="utf-8") as f:
            f.write(
                f"\n## {utc_iso()} · SYNC_ENTITY_STATS\n"
                f"- script: scripts/maintain/42_sync_entity_stats.py\n"
                f"- strong: {stats['strong']}\n"
                f"- weak: {stats['weak']}\n"
                f"- orphan: {stats['orphan']} (archived: {archived_count[0]})\n"
                f"- updated: {stats['updated']}\n"
            )
    return 0


if __name__ == "__main__":
    sys.exit(main())

View file

@ -79,10 +79,10 @@ export default async function EntityPage({
const entityClassSingular = CLASS_TO_SINGULAR[folder as string] ?? folder;
// 1. DB first — live counts
// YAML-first: every count comes from the entity's frontmatter (kept in sync
// by scripts/maintain/42_sync_entity_stats.py). The DB is consulted ONLY for
// chunk previews, not for counts.
const core = await getEntityCore(entityClassSingular, id).catch(() => null);
// 2. Wiki fallback — narrative body, aliases (Haiku stub OK)
const wiki = await readEntity(folder as EntityClass, id);
if (!core && !wiki) notFound();
@ -91,9 +91,8 @@ export default async function EntityPage({
(a) => a !== canonical,
);
// 3. Live data per-doc grouping
const mentionGroups = core
? await getEntityMentionsByDoc(core.entity_pk, 100).catch(() => [])
? await getEntityMentionsByDoc(entityClassSingular, id, 100).catch(() => [])
: [];
const sampleChunks = core
? await getEntityChunks(core.entity_pk, 12).catch(() => [])
@ -101,6 +100,8 @@ export default async function EntityPage({
const totalMentions = core?.total_mentions ?? 0;
const documentsCount = core?.documents_count ?? 0;
const strength = core?.signal_strength ?? "unverified";
const sigs = core?.signal_sources ?? { db_chunks: 0, page_refs: 0, cross_refs: 0 };
const classColor = CLASS_COLOR[folder as EntityClass];
const classBg = CLASS_BG[folder as EntityClass];
@ -164,7 +165,49 @@ export default async function EntityPage({
<div className="font-mono text-sm text-[#a78bfa] mt-0.5">{core.enrichment_status}</div>
</div>
)}
<div
className={`px-4 py-3 bg-[#0a121e] border rounded ${
strength === "strong"
? "border-[#00ff9c]"
: strength === "weak"
? "border-[#ffa500]"
: strength === "orphan"
? "border-[#ff6b6b]"
: "border-[#5a6678]"
}`}
title="Cruzamento dos 3 sinais que confirmam esta entidade no corpus."
>
<div className="font-mono text-[10px] uppercase tracking-widest text-[#5a6678]">
força do sinal
</div>
<div
className={`font-mono text-sm mt-0.5 ${
strength === "strong"
? "text-[#00ff9c]"
: strength === "weak"
? "text-[#ffa500]"
: strength === "orphan"
? "text-[#ff6b6b]"
: "text-[#8896aa]"
}`}
>
{strength === "strong" && "forte"}
{strength === "weak" && "fraca"}
{strength === "orphan" && "órfã"}
{strength === "unverified" && "não verificada"}
</div>
<div className="font-mono text-[9px] text-[#5a6678] mt-1 leading-tight">
{sigs.db_chunks} chunks · {sigs.page_refs} págs · {sigs.cross_refs} backlinks
</div>
</div>
</div>
{strength === "orphan" && (
<p className="mt-4 text-xs text-[#ff6b6b] font-mono">
entidade não confirmada: nenhuma página, chunk ou outra entidade aponta para
ela. Pode ser extração ruidosa do pipeline original.
</p>
)}
</header>
<div className="grid grid-cols-1 lg:grid-cols-[1fr_320px] gap-8">

View file

@ -1,19 +1,145 @@
/**
* Live entity data queries replaces stale Haiku-era frontmatter `mentioned_in[]`
* with real counts from `public.entity_mentions` + `public.chunks`.
* Entity page data SINGLE SOURCE OF TRUTH is the YAML frontmatter on disk.
*
* Why YAML and not the DB? Because the corpus has TWO independent extraction
* layers (Haiku page-level, Sonnet chunk-level) and each catches a different
* subset of entities. The DB's entity_mentions table is one of those signals
* useful for chat retrieval but incomplete for the entity catalog itself.
*
* Reading from disk lets us merge every signal into one stat (`total_mentions`)
* via the maintain/42_sync_entity_stats.py pipeline and serve consistent
* numbers everywhere in the UI.
*
* The DB is still queried for ONE thing: the actual chunk text for previews,
* because we don't want to re-parse 21k chunk files on every page render.
*/
import fs from "node:fs/promises";
import path from "node:path";
import matter from "gray-matter";
import { pgQuery } from "./db";
import { findEntity } from "./graph";
import { WIKI } from "@/lib/wiki";
/**
 * Singular entity class name → plural folder name under `<WIKI>/entities/`.
 * A class missing from this table has no on-disk folder to read from.
 */
const FOLDER_BY_CLASS: Record<string, string> = {
  person: "people",
  organization: "organizations",
  location: "locations",
  event: "events",
  uap_object: "uap-objects",
  vehicle: "vehicles",
  operation: "operations",
  concept: "concepts",
};
export interface EntityCore {
entity_pk: number;
entity_pk: number | null; // db-side primary key; null if entity is wiki-only
entity_class: string;
entity_id: string;
canonical_name: string;
aliases: string[] | null;
aliases: string[];
total_mentions: number;
documents_count: number;
signal_strength: "strong" | "weak" | "orphan" | "unverified";
signal_sources: {
db_chunks: number;
page_refs: number;
cross_refs: number;
};
mentioned_in: string[]; // [[doc-id/p007]]
referenced_by: string[]; // [[class/id]] cross-links
enrichment_status: string | null;
narrative_summary: string | null;
narrative_summary_pt_br: string | null;
summary_status: string | null;
}
/**
 * Untyped frontmatter bag as parsed from an entity file. Every value is
 * `unknown` and must be narrowed (see `num`, `arr`, `strOrNull`) before use.
 */
interface RawFm {
  [k: string]: unknown;
}
/**
 * Coerce a frontmatter value to a finite number.
 *
 * Accepts finite numbers and numeric strings. Everything else — including
 * `NaN`, `Infinity`, non-numeric strings and blank strings — yields
 * `fallback`. Blank strings are rejected explicitly because `Number("")` and
 * `Number("  ")` evaluate to `0`, which would otherwise mask a missing value.
 *
 * @param v - Raw value read from YAML.
 * @param fallback - Value returned when `v` is not a usable number.
 * @returns The parsed number, or `fallback`.
 */
function num(v: unknown, fallback = 0): number {
  if (typeof v === "number") return Number.isFinite(v) ? v : fallback;
  if (typeof v === "string" && v.trim() !== "") {
    const parsed = Number(v);
    if (Number.isFinite(parsed)) return parsed;
  }
  return fallback;
}
/**
 * Narrow a frontmatter value to a list of strings.
 *
 * Non-array input (including a lone string) yields `[]`; non-string elements
 * of an array are dropped.
 */
function arr(v: unknown): string[] {
  if (!Array.isArray(v)) return [];
  const strings: string[] = [];
  for (const item of v) {
    if (typeof item === "string") strings.push(item);
  }
  return strings;
}
/**
 * Return `v` unchanged when it is a string with at least one non-whitespace
 * character; otherwise `null`. The returned string is not trimmed.
 */
function strOrNull(v: unknown): string | null {
  if (typeof v !== "string") return null;
  return v.trim().length > 0 ? v : null;
}
/**
 * Read and parse the frontmatter of `<WIKI>/entities/<folder>/<entityId>.md`.
 *
 * Both arguments originate from the request route, so they are treated as
 * untrusted:
 * - `entityClass` must be an own key of `FOLDER_BY_CLASS` (inherited keys such
 *   as `constructor` are rejected instead of producing a non-string folder);
 * - `entityId` must resolve to a file directly inside the class folder, which
 *   rejects path separators and `..` segments.
 *
 * @returns The frontmatter object, or `null` when the class is unknown, the id
 *   is rejected, or the file cannot be read or parsed (best-effort by design).
 */
async function readEntityYaml(entityClass: string, entityId: string): Promise<RawFm | null> {
  if (!Object.prototype.hasOwnProperty.call(FOLDER_BY_CLASS, entityClass)) return null;
  const folder = FOLDER_BY_CLASS[entityClass];
  if (!folder) return null;

  const baseDir = path.resolve(WIKI, "entities", folder);
  const filePath = path.resolve(baseDir, `${entityId}.md`);
  // Containment check: the resolved file must sit directly in the class folder.
  if (path.dirname(filePath) !== baseDir) return null;

  try {
    const raw = await fs.readFile(filePath, "utf-8");
    return matter(raw).data as RawFm;
  } catch {
    return null;
  }
}
/**
 * Build the entity card from the entity's YAML frontmatter.
 *
 * Counts, signal strength and reference lists all come from the frontmatter.
 * The database is consulted only to resolve `entity_pk`; a failed or empty
 * lookup leaves it `null` rather than failing the call.
 *
 * @returns The card, or `null` when the entity file cannot be read (for
 *   example because it is missing or was moved to the archive).
 */
export async function getEntityCore(
  entityClass: string,
  entityId: string,
): Promise<EntityCore | null> {
  const fm = await readEntityYaml(entityClass, entityId);
  if (!fm) return null;

  // Best-effort primary-key lookup so chunk previews can still query by pk.
  const lookupPk = async (): Promise<number | null> => {
    try {
      const [row] = await pgQuery<{ entity_pk: number }>(
        `SELECT entity_pk FROM public.entities
WHERE entity_class = $1 AND entity_id = $2 LIMIT 1`,
        [entityClass, entityId],
      );
      return row?.entity_pk ?? null;
    } catch {
      return null;
    }
  };
  const entity_pk = await lookupPk();

  // Anything that is not one of the known labels is reported as "unverified".
  const knownStrengths: readonly string[] = ["strong", "weak", "orphan", "unverified"];
  const declared = typeof fm.signal_strength === "string" ? fm.signal_strength : "unverified";
  const signal_strength = (
    knownStrengths.includes(declared) ? declared : "unverified"
  ) as EntityCore["signal_strength"];

  const sources = (fm.signal_sources as Record<string, unknown> | undefined) ?? {};
  const canonical_name = typeof fm.canonical_name === "string" ? fm.canonical_name : entityId;

  return {
    entity_pk,
    entity_class: entityClass,
    entity_id: entityId,
    canonical_name,
    aliases: arr(fm.aliases),
    total_mentions: num(fm.total_mentions, 0),
    documents_count: num(fm.documents_count, 0),
    signal_strength,
    signal_sources: {
      db_chunks: num(sources.db_chunks, 0),
      page_refs: num(sources.page_refs, 0),
      cross_refs: num(sources.cross_refs, 0),
    },
    mentioned_in: arr(fm.mentioned_in),
    referenced_by: arr(fm.referenced_by),
    enrichment_status: strOrNull(fm.enrichment_status),
    narrative_summary: strOrNull(fm.narrative_summary),
    narrative_summary_pt_br: strOrNull(fm.narrative_summary_pt_br),
    summary_status: strOrNull(fm.summary_status),
  };
}
export interface EntityMentionGroup {
@ -26,55 +152,63 @@ export interface EntityMentionGroup {
pages: number[];
}
export async function getEntityCore(
/**
* Group reverse-references by document. Derived from the YAML's mentioned_in[]
* (which the maintain script writes consolidating page YAMLs). Optionally
* enriches with document metadata read from wiki/documents/<doc-id>.md.
*/
export async function getEntityMentionsByDoc(
entityClass: string,
entityId: string,
): Promise<EntityCore | null> {
const rows = await pgQuery<EntityCore>(
`SELECT
e.entity_pk, e.entity_class, e.entity_id, e.canonical_name, e.aliases,
COALESCE(em.mention_count, 0) AS total_mentions,
COALESCE(em.doc_count, 0) AS documents_count,
e.enrichment_status
FROM public.entities e
LEFT JOIN (
SELECT em.entity_pk,
COUNT(*)::INT AS mention_count,
COUNT(DISTINCT c.doc_id)::INT AS doc_count
FROM public.entity_mentions em
JOIN public.chunks c ON c.chunk_pk = em.chunk_pk
GROUP BY em.entity_pk
) em ON em.entity_pk = e.entity_pk
WHERE e.entity_class = $1 AND e.entity_id = $2
LIMIT 1`,
[entityClass, entityId],
);
return rows[0] ?? null;
}
/** Group mentions per document so the sidebar can list "appears in N docs". */
export async function getEntityMentionsByDoc(
entityPk: number,
limit: number = 50,
limit = 100,
): Promise<EntityMentionGroup[]> {
return pgQuery<EntityMentionGroup>(
`SELECT
c.doc_id,
d.canonical_title,
d.collection,
d.page_count,
d.classification,
COUNT(*)::INT AS mention_count,
array_agg(DISTINCT c.page ORDER BY c.page) AS pages
FROM public.entity_mentions em
JOIN public.chunks c ON c.chunk_pk = em.chunk_pk
LEFT JOIN public.documents d ON d.doc_id = c.doc_id
WHERE em.entity_pk = $1
GROUP BY c.doc_id, d.canonical_title, d.collection, d.page_count, d.classification
ORDER BY mention_count DESC
LIMIT $2`,
[entityPk, limit],
);
const fm = await readEntityYaml(entityClass, entityId);
if (!fm) return [];
const refs = arr(fm.mentioned_in);
// Each ref looks like "[[doc-id/p007]]". Strip wikilink delimiters.
const byDoc = new Map<string, Set<number>>();
for (const ref of refs) {
const m = ref.match(/\[\[([^\]|]+?)\]\]/);
const target = (m ? m[1] : ref).trim();
const [docId, pageStr] = target.split("/", 2);
if (!docId) continue;
const pageNum = pageStr ? parseInt(pageStr.replace(/^p/, ""), 10) : NaN;
if (!byDoc.has(docId)) byDoc.set(docId, new Set());
if (Number.isFinite(pageNum)) byDoc.get(docId)!.add(pageNum);
}
// Hydrate each doc's metadata from wiki/documents/<doc-id>.md
const groups: EntityMentionGroup[] = [];
for (const [docId, pages] of byDoc) {
let canonical_title: string | null = null;
let collection: string | null = null;
let page_count: number | null = null;
let classification: string | null = null;
try {
const docRaw = await fs.readFile(
path.join(WIKI, "documents", `${docId}.md`),
"utf-8",
);
const dfm = matter(docRaw).data as Record<string, unknown>;
canonical_title = strOrNull(dfm.canonical_title);
collection = strOrNull(dfm.collection);
page_count = num(dfm.page_count, 0) || null;
classification = strOrNull(dfm.highest_classification);
} catch {
/* doc missing — use raw id */
}
groups.push({
doc_id: docId,
canonical_title,
collection,
page_count,
classification,
mention_count: pages.size,
pages: Array.from(pages).sort((a, b) => a - b),
});
}
groups.sort((a, b) => b.mention_count - a.mention_count);
return groups.slice(0, limit);
}
export interface EntityChunkPreview {
@ -91,10 +225,16 @@ export interface EntityChunkPreview {
ufo_anomaly_type: string | null;
}
/**
* Top chunks that textually mention this entity. Reads from DB because
* chunk content is big (we don't re-parse files at request time). Returns []
* if the entity isn't indexed in the DB.
*/
export async function getEntityChunks(
entityPk: number,
limit: number = 30,
entityPk: number | null,
limit = 30,
): Promise<EntityChunkPreview[]> {
if (entityPk == null) return [];
return pgQuery<EntityChunkPreview>(
`SELECT
c.chunk_pk, c.doc_id, c.chunk_id, c.page, c.type, c.bbox, c.classification,
@ -108,4 +248,5 @@ export async function getEntityChunks(
);
}
export { findEntity };
// Backwards-compat for callers that imported findEntity from the old path.
export { findEntity } from "./graph";