disclosure-bureau/web/lib/retrieval/entity-pages.ts
Luiz Gustavo f2b7b116ce
Some checks failed
CI / Web — typecheck + lint + build (push) Failing after 45s
CI / Scripts — Python smoke (push) Failing after 4s
CI / Web — npm audit (push) Failing after 41s
CI / Retrieval — golden set (Recall@5 + MRR) (push) Failing after 3s
W5.3 (Phase 3A): entity summaries — sub-pages get magazine-grade prose
Today /sightings, /witnesses, /objects, /locations and /operations show
a name + mention count and nothing else. After this each row carries a
60-100 word bilingual narrative summary written from the chunks where
the entity actually appears.

Migration 0008 (apply as supabase_admin):
  public.entities  +summary_en TEXT
                   +summary_pt_br TEXT
                   +summary_generated_at TIMESTAMPTZ
                   +summary_model TEXT
                   +summary_status TEXT
                     CHECK ('pending'|'ai_generated'|'curated'|'refused')
  + index on summary_status
  + GRANT UPDATE (summary_*) ON entities TO investigator
  + new policy entities_investigator_update_summary (RLS UPDATE for
    investigator role)

Enrichment script (investigator-runtime/scripts/enrich_entity_summaries.ts):
  - Per-class config (chunk_k, min_mentions, max_per_class)
  - Path A: entity_mentions JOIN chunks (high-precision linker)
  - Path B (fallback): hybridSearch on canonical_name + aliases when
    entity_mentions returns zero. This is what unlocked Kenneth Arnold
    and similar entities — their wiki YAML has high total_mentions
    counted from frontmatter mentioned_in[], but the entity_mentions
    extractor was silent because the matches came from the wiki text,
    not the OCR chunks.
  - Sonnet 4.6 via OAuth Max, ~$0.04 per entity, ~$10 for the full
    260-entity bulk run.
  - INSUFFICIENT skip when chunks can't sustain a 60-word summary —
    refused entries get summary_status='refused' so they're not retried.

UI uplift:
  - lib/retrieval/entity-pages.ts: getEntityCore now prefers the DB
    summary (ai_generated or curated) over wiki YAML narrative.
  - components/entity-list-page.tsx:
    * SELECT now pulls summary_en, summary_pt_br, summary_status
    * Sorted with summary-enriched rows first (so the magazine grid
      lands on quality content immediately)
    * MagazineGrid: 4-line summary preview replaces aliases line
    * CompactGrid: enriched rows render as full editorial cards,
      bare rows fall back to a compact table below

Smoke results:
  - Kenneth Arnold sighting: "On June 24, 1947, pilot Kenneth Arnold
    reported sighting unidentified objects over the Pacific Northwest,
    and the account spread worldwide. It set off a run of similar
    reports: County Commissioner Crankes saw comparable objects after
    Arnold's account reached the press, and United Airlines pilot
    Emil H. Smith spotted flying discs on July 4 during a routine
    flight out of Boise, Idaho..."
  - Roswell Incident: includes Colonel Corso's 1997 book + the 1995
    GAO finding that radio messages from Oct 46–Feb 47 were destroyed
    + Senator Strom Thurmond's foreword. Real magazine-grade content.

Background bulk run kicked off across all 5 classes (event,
uap_object, person, location, organization) — populating live as
the homepage rebuilds.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-24 15:37:01 -03:00

280 lines
9.5 KiB
TypeScript

/**
* Entity page data — SINGLE SOURCE OF TRUTH is the YAML frontmatter on disk.
*
* Why YAML and not the DB? Because the corpus has TWO independent extraction
* layers (Haiku page-level, Sonnet chunk-level) and each catches a different
* subset of entities. The DB's entity_mentions table is one of those signals —
* useful for chat retrieval but incomplete for the entity catalog itself.
*
* Reading from disk lets us merge every signal into one stat (`total_mentions`)
* via the maintain/42_sync_entity_stats.py pipeline and serve consistent
* numbers everywhere in the UI.
*
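* The DB is still queried for two things: the actual chunk text for previews
* (we don't want to re-parse 21k chunk files on every page render) and the
* stored entity summaries (summary_en / summary_pt_br / summary_status), which
* getEntityCore prefers over the YAML narrative when they are present.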
*/
import fs from "node:fs/promises";
import path from "node:path";
import matter from "gray-matter";
import { pgQuery } from "./db";
import { WIKI } from "@/lib/wiki";
/**
 * Maps an entity class name to the sub-folder under `<WIKI>/entities/` that
 * holds the markdown files for that class. readEntityYaml treats a class
 * with no entry here as unknown and returns null for it.
 */
const FOLDER_BY_CLASS: Record<string, string> = {
person: "people",
organization: "organizations",
location: "locations",
event: "events",
uap_object: "uap-objects",
vehicle: "vehicles",
operation: "operations",
concept: "concepts",
};
/**
 * Normalised entity card returned by getEntityCore. Most fields are read
 * from the entity's YAML frontmatter; entity_pk and (when available) the
 * summary fields come from the public.entities table.
 */
export interface EntityCore {
entity_pk: number | null; // db-side primary key; null if entity is wiki-only
// Class and id exactly as passed to getEntityCore.
entity_class: string;
entity_id: string;
// YAML canonical_name; falls back to entity_id when the YAML value is not a string.
canonical_name: string;
aliases: string[];
// Numeric YAML stats; missing or non-numeric values are reported as 0.
total_mentions: number;
documents_count: number;
// YAML signal_strength; any unrecognised value is reported as "unverified".
signal_strength: "strong" | "weak" | "orphan" | "unverified";
// Counters copied from the YAML signal_sources map; each defaults to 0.
signal_sources: {
db_chunks: number;
page_refs: number;
cross_refs: number;
text_refs: number;
};
mentioned_in: string[]; // [[doc-id/p007]] — structured page refs (Haiku)
text_mentioned_in: string[]; // [[doc-id/p007]] — text-only matches (back-fill)
referenced_by: string[]; // [[class/id]] cross-links
enrichment_status: string | null;
// Summaries: the DB columns summary_en / summary_pt_br are preferred, with
// the YAML narrative_summary / narrative_summary_pt_br as fallback.
narrative_summary: string | null;
narrative_summary_pt_br: string | null;
// DB summary_status when a row exists and has one, otherwise the YAML summary_status.
summary_status: string | null;
}
/**
 * Untyped frontmatter bag: arbitrary string keys whose values must be
 * narrowed (see num / arr / strOrNull) before they are used.
 */
type RawFm = Record<string, unknown>;
/**
 * Coerce a loosely-typed frontmatter value to a finite number.
 *
 * @param v - Value to coerce; finite numbers and numeric strings are accepted.
 * @param fallback - Returned when `v` cannot be read as a finite number.
 * @returns The finite numeric value of `v`, or `fallback`.
 */
function num(v: unknown, fallback = 0): number {
  if (typeof v === "number") return Number.isFinite(v) ? v : fallback;
  if (typeof v === "string") {
    const text = v.trim();
    // Number("") and Number("   ") are 0, which would silently masquerade
    // as a real value — a blank string carries no number, so use the fallback.
    if (text === "") return fallback;
    const parsed = Number(text);
    return Number.isFinite(parsed) ? parsed : fallback;
  }
  return fallback;
}
/**
 * Narrow a frontmatter value to a list of strings. Anything that is not an
 * array yields an empty list; non-string elements are dropped.
 */
function arr(v: unknown): string[] {
  if (!Array.isArray(v)) return [];
  const strings: string[] = [];
  for (const item of v) {
    if (typeof item === "string") strings.push(item);
  }
  return strings;
}
/**
 * Return `v` unchanged when it is a string with at least one non-whitespace
 * character; otherwise return null. The string is not trimmed.
 */
function strOrNull(v: unknown): string | null {
  if (typeof v !== "string") return null;
  return v.trim().length > 0 ? v : null;
}
/**
 * Read and parse the YAML frontmatter of one entity's markdown file,
 * `<WIKI>/entities/<folder>/<entityId>.md`.
 *
 * @param entityClass - Entity class; must be an own key of FOLDER_BY_CLASS.
 * @param entityId - Entity id, used as the file stem.
 * @returns The parsed frontmatter, or null when the class is unknown, the id
 *   would resolve outside the class folder, or the file cannot be read/parsed.
 */
async function readEntityYaml(entityClass: string, entityId: string): Promise<RawFm | null> {
  // Own-property check: a plain index lookup would resolve inherited keys such
  // as "constructor" or "__proto__" to non-string values and make the path
  // helpers throw instead of reporting "unknown class".
  if (!Object.prototype.hasOwnProperty.call(FOLDER_BY_CLASS, entityClass)) return null;
  const folder = FOLDER_BY_CLASS[entityClass];
  if (!folder) return null;

  const baseDir = path.resolve(WIKI, "entities", folder);
  const filePath = path.resolve(baseDir, `${entityId}.md`);
  // Containment check: an id containing "../" segments must not be able to
  // read markdown files outside this class's folder.
  const rel = path.relative(baseDir, filePath);
  if (rel === ".." || rel.startsWith(`..${path.sep}`) || path.isAbsolute(rel)) return null;

  try {
    const raw = await fs.readFile(filePath, "utf-8");
    return matter(raw).data as RawFm;
  } catch {
    // Missing or unparsable file: callers treat this the same as "no entity".
    return null;
  }
}
/** Accepted signal_strength values; anything else is reported as "unverified". */
const SIGNAL_STRENGTHS: readonly EntityCore["signal_strength"][] = [
  "strong",
  "weak",
  "orphan",
  "unverified",
];

/**
 * Load a single entity card from its YAML frontmatter, enriched with the
 * DB primary key and stored summaries when the database has a matching row.
 *
 * @param entityClass - Entity class (e.g. "person", "event").
 * @param entityId - Entity id within that class.
 * @returns The entity card, or null when readEntityYaml finds no readable
 *   YAML for the entity — keeps the route handler simple. A failing DB lookup
 *   does not fail the call: the card is built from YAML alone with
 *   `entity_pk` set to null.
 */
export async function getEntityCore(
  entityClass: string,
  entityId: string,
): Promise<EntityCore | null> {
  const fm = await readEntityYaml(entityClass, entityId);
  if (!fm) return null;

  // Best-effort lookup of the DB entity_pk so getEntityChunks can still
  // query by primary key. Also pull the AI-generated narrative summary
  // (W5.3 / migration 0008) for the entity detail page header.
  let entity_pk: number | null = null;
  let dbSummaryEn: string | null = null;
  let dbSummaryPt: string | null = null;
  let dbSummaryStatus: string | null = null;
  try {
    const rows = await pgQuery<{
      entity_pk: number;
      summary_en: string | null;
      summary_pt_br: string | null;
      summary_status: string | null;
    }>(
      `SELECT entity_pk, summary_en, summary_pt_br, summary_status
FROM public.entities
WHERE entity_class = $1 AND entity_id = $2 LIMIT 1`,
      [entityClass, entityId],
    );
    const row = rows[0];
    if (row) {
      entity_pk = row.entity_pk;
      // Normalise blank strings to null so an empty DB column cannot shadow
      // a real YAML narrative in the `??` fallbacks below.
      dbSummaryEn = strOrNull(row.summary_en);
      dbSummaryPt = strOrNull(row.summary_pt_br);
      dbSummaryStatus = strOrNull(row.summary_status);
    }
  } catch (err: unknown) {
    // Deliberately non-fatal: the YAML is enough to render the card. Log so a
    // broken DB connection is visible rather than silently degrading pages.
    console.warn(
      `[entity-pages] DB lookup failed for ${entityClass}/${entityId}; using YAML only`,
      err,
    );
  }

  const rawSources = fm.signal_sources;
  const sigSources: Record<string, unknown> =
    typeof rawSources === "object" && rawSources !== null
      ? (rawSources as Record<string, unknown>)
      : {};

  // Validate against the known values instead of casting first and checking later.
  const rawStrength = fm.signal_strength;
  const signal_strength = SIGNAL_STRENGTHS.find((s) => s === rawStrength) ?? "unverified";

  return {
    entity_pk,
    entity_class: entityClass,
    entity_id: entityId,
    canonical_name: typeof fm.canonical_name === "string" ? fm.canonical_name : entityId,
    aliases: arr(fm.aliases),
    total_mentions: num(fm.total_mentions, 0),
    documents_count: num(fm.documents_count, 0),
    signal_strength,
    signal_sources: {
      db_chunks: num(sigSources.db_chunks, 0),
      page_refs: num(sigSources.page_refs, 0),
      cross_refs: num(sigSources.cross_refs, 0),
      text_refs: num(sigSources.text_refs, 0),
    },
    mentioned_in: arr(fm.mentioned_in),
    text_mentioned_in: arr(fm.text_mentioned_in),
    referenced_by: arr(fm.referenced_by),
    enrichment_status: strOrNull(fm.enrichment_status),
    // Prefer DB-stored AI-generated summaries (the curated layer the bureau
    // writes). Fall back to wiki YAML narrative if the DB is silent.
    narrative_summary: dbSummaryEn ?? strOrNull(fm.narrative_summary),
    narrative_summary_pt_br: dbSummaryPt ?? strOrNull(fm.narrative_summary_pt_br),
    summary_status: dbSummaryStatus ?? strOrNull(fm.summary_status),
  };
}
/**
 * One document's worth of references to an entity, as produced by
 * getEntityMentionsByDoc.
 */
export interface EntityMentionGroup {
// Document id parsed from the "[[doc-id/pNNN]]" refs.
doc_id: string;
// Metadata read from wiki/documents/<doc-id>.md frontmatter; each is null
// when that file is missing or the field is absent/blank. classification
// is taken from the frontmatter key highest_classification.
canonical_title: string | null;
collection: string | null;
page_count: number | null;
classification: string | null;
// Number of distinct pages referenced (structured and text refs merged).
mention_count: number;
// The distinct referenced page numbers, sorted ascending.
pages: number[];
text_only: boolean; // true when all refs came from the text back-fill (no structured Haiku evidence)
}
/**
 * Matches a wikilink and captures its target, ignoring an optional
 * "|display label" part: "[[doc-id/p007]]" and "[[doc-id/p007|Page 7]]"
 * both capture "doc-id/p007".
 */
const WIKILINK_TARGET_RE = /\[\[([^\]|]+)(?:\|[^\]]*)?\]\]/;

/**
 * Group reverse-references by document. Derived from the YAML's mentioned_in[]
 * and text_mentioned_in[] (which the maintain script writes consolidating page
 * YAMLs), then enriched with document metadata read from
 * wiki/documents/<doc-id>.md.
 *
 * @param entityClass - Entity class (e.g. "person", "event").
 * @param entityId - Entity id within that class.
 * @param limit - Maximum number of document groups to return.
 * @returns Groups sorted by distinct-page count, descending; empty when the
 *   entity YAML cannot be read. Metadata fields are null for documents whose
 *   markdown file is missing or unreadable.
 */
export async function getEntityMentionsByDoc(
  entityClass: string,
  entityId: string,
  limit = 100,
): Promise<EntityMentionGroup[]> {
  const fm = await readEntityYaml(entityClass, entityId);
  if (!fm) return [];

  // Each ref looks like "[[doc-id/p007]]". Strip wikilink delimiters.
  const byDoc = new Map<string, { structured: Set<number>; text: Set<number> }>();
  const addRef = (ref: string, source: "structured" | "text"): void => {
    const match = WIKILINK_TARGET_RE.exec(ref);
    const target = (match?.[1] ?? ref).trim();
    const [docId, pageStr] = target.split("/", 2);
    if (!docId) return;
    let sets = byDoc.get(docId);
    if (!sets) {
      sets = { structured: new Set(), text: new Set() };
      byDoc.set(docId, sets);
    }
    const pageNum = pageStr ? parseInt(pageStr.replace(/^p/, ""), 10) : NaN;
    // A ref without a usable page number still registers the document,
    // but contributes no page.
    if (Number.isFinite(pageNum)) sets[source].add(pageNum);
  };
  for (const ref of arr(fm.mentioned_in)) addRef(ref, "structured");
  for (const ref of arr(fm.text_mentioned_in)) addRef(ref, "text");

  // Rank and truncate BEFORE touching the disk: the ordering depends only on
  // page counts, so documents that will be sliced off never need to be read.
  const ranked = Array.from(byDoc, ([docId, sets]) => {
    const merged = new Set<number>([...sets.structured, ...sets.text]);
    return {
      docId,
      pages: Array.from(merged).sort((a, b) => a - b),
      textOnly: sets.structured.size === 0 && sets.text.size > 0,
    };
  })
    .sort((a, b) => b.pages.length - a.pages.length)
    .slice(0, limit);

  // Hydrate each surviving doc's metadata from wiki/documents/<doc-id>.md.
  // The reads are independent, so run them in parallel; Promise.all keeps
  // the ranked order.
  return Promise.all(
    ranked.map(async ({ docId, pages, textOnly }): Promise<EntityMentionGroup> => {
      let canonical_title: string | null = null;
      let collection: string | null = null;
      let page_count: number | null = null;
      let classification: string | null = null;
      try {
        const docRaw = await fs.readFile(path.join(WIKI, "documents", `${docId}.md`), "utf-8");
        const dfm = matter(docRaw).data as Record<string, unknown>;
        canonical_title = strOrNull(dfm.canonical_title);
        collection = strOrNull(dfm.collection);
        page_count = num(dfm.page_count, 0) || null;
        classification = strOrNull(dfm.highest_classification);
      } catch {
        /* doc missing — use raw id */
      }
      return {
        doc_id: docId,
        canonical_title,
        collection,
        page_count,
        classification,
        mention_count: pages.length,
        pages,
        text_only: textOnly,
      };
    }),
  );
}
/**
 * Row shape returned by getEntityChunks: one entry per selected row of
 * public.chunks, with field names matching the selected column names.
 */
export interface EntityChunkPreview {
chunk_pk: number;
doc_id: string;
chunk_id: string;
page: number;
type: string;
// NOTE(review): presumably the chunk's bounding box on the page; units and
// origin are not visible from this file — confirm against the chunk schema.
bbox: { x: number; y: number; w: number; h: number } | null;
classification: string | null;
// Chunk text; the _pt / _en suffixes presumably denote Portuguese and
// English versions — confirm against the chunk schema.
content_pt: string | null;
content_en: string | null;
// getEntityChunks orders rows by ufo_anomaly DESC NULLS LAST.
ufo_anomaly: boolean | null;
ufo_anomaly_type: string | null;
}
/**
 * Fetch preview rows for the chunks linked to an entity through
 * public.entity_mentions. Chunk text lives in the DB, so nothing is parsed
 * from disk at request time.
 *
 * @param entityPk - DB primary key of the entity; null for wiki-only entities.
 * @param limit - Maximum number of rows to return.
 * @returns The matching chunk previews, or [] when `entityPk` is null/undefined.
 */
export async function getEntityChunks(
  entityPk: number | null,
  limit = 30,
): Promise<EntityChunkPreview[]> {
  // No DB row for this entity means there is nothing to join against.
  if (entityPk === null || entityPk === undefined) {
    return [];
  }

  const sql = `SELECT
c.chunk_pk, c.doc_id, c.chunk_id, c.page, c.type, c.bbox, c.classification,
c.content_pt, c.content_en, c.ufo_anomaly, c.ufo_anomaly_type
FROM public.entity_mentions em
JOIN public.chunks c ON c.chunk_pk = em.chunk_pk
WHERE em.entity_pk = $1
ORDER BY c.ufo_anomaly DESC NULLS LAST, c.doc_id, c.order_global
LIMIT $2`;

  const previews = await pgQuery<EntityChunkPreview>(sql, [entityPk, limit]);
  return previews;
}
// Backwards-compat for callers that imported findEntity from the old path:
// the symbol is re-exported from ./graph, so existing imports of this module
// keep resolving.
export { findEntity } from "./graph";