diff --git a/infra/supabase/migrations/0008_entity_summaries.sql b/infra/supabase/migrations/0008_entity_summaries.sql
new file mode 100644
index 0000000..972f9ff
--- /dev/null
+++ b/infra/supabase/migrations/0008_entity_summaries.sql
@@ -0,0 +1,27 @@
+-- 0008_entity_summaries.sql — bilingual prose summary per entity.
+--
+-- The /sightings, /witnesses, /objects, /locations, /operations pages
+-- need real prose to feel like a magazine. Today they show just a name +
+-- mention count. After this migration, each entity carries an ~80-word
+-- bilingual narrative summary written from the chunks where it appears.
+--
+-- The narrator (case-writer voice, house style) writes one summary per
+-- entity. Generation is offline (investigator-runtime/scripts/enrich_entity_summaries.ts)
+-- and idempotent — re-running the script skips rows already enriched.
+--
+-- Apply as supabase_admin (entities table owner).
+
+BEGIN;
+
+ALTER TABLE public.entities
+  ADD COLUMN IF NOT EXISTS summary_en TEXT,
+  ADD COLUMN IF NOT EXISTS summary_pt_br TEXT,
+  ADD COLUMN IF NOT EXISTS summary_generated_at TIMESTAMPTZ,
+  ADD COLUMN IF NOT EXISTS summary_model TEXT,
+  ADD COLUMN IF NOT EXISTS summary_status TEXT
+    CHECK (summary_status IN ('pending', 'ai_generated', 'curated', 'refused'));
+
+CREATE INDEX IF NOT EXISTS entities_summary_status_idx
+  ON public.entities (summary_status) WHERE summary_status IS NOT NULL;
+
+COMMIT;
diff --git a/investigator-runtime/scripts/enrich_entity_summaries.ts b/investigator-runtime/scripts/enrich_entity_summaries.ts
new file mode 100644
index 0000000..1d67006
--- /dev/null
+++ b/investigator-runtime/scripts/enrich_entity_summaries.ts
@@ -0,0 +1,371 @@
+#!/usr/bin/env bun
+/**
+ * enrich_entity_summaries.ts — generate the bilingual narrative summary
+ * each entity needs for the public-facing sub-pages (/sightings,
+ * /witnesses, /objects, /locations, /operations).
+ *
+ * For each entity:
+ *   1. Pull top N chunks where it appears via entity_mentions JOIN chunks
+ *      (falling back to hybridSearch when the linker found none).
+ *   2. Compose a prompt under the house style + the case-writer's voice
+ *      rules (no detective names, no skeptic framing, scene-driven).
+ *   3. Ask the model named by env.CLAUDE_MODEL for a ~80-word bilingual
+ *      JSON: { en, pt_br }.
+ *   4. UPDATE public.entities (summary_en, summary_pt_br,
+ *      summary_generated_at, summary_model, summary_status).
+ *
+ * Idempotent — only rows whose summary_status is NULL or 'pending' are
+ * picked up, so 'ai_generated', 'curated' and 'refused' rows are skipped.
+ * Pass --force to re-generate; hand-curated rows ('curated') are never
+ * overwritten, even with --force.
+ *
+ * Usage:
+ *   bun scripts/enrich_entity_summaries.ts                 # all classes, default limits
+ *   bun scripts/enrich_entity_summaries.ts event 30        # only events, max 30
+ *   bun scripts/enrich_entity_summaries.ts uap_object 50   # only uap_objects, max 50
+ *   bun scripts/enrich_entity_summaries.ts all --force     # re-enrich everything not curated
+ */
+import { audit } from "../src/lib/audit";
+import { callClaude } from "../src/lib/claude";
+import { env } from "../src/lib/env";
+import { query, queryOne } from "../src/lib/pg";
+import { hybridSearch } from "../src/lib/search";
+
+const FORCE = process.argv.includes("--force");
+const args = process.argv.slice(2).filter((a) => !a.startsWith("--"));
+const filterClass = args[0] && args[0] !== "all" ? args[0] : null;
+// Optional per-class cap from the CLI. parseInt() yields NaN for junk input
+// and NaN is not nullish, so it would slip past `??` in main() and reach the
+// SQL LIMIT parameter; reject it (and negative values) up front.
+const maxPerClass = args[1] ? parseInt(args[1], 10) : null;
+if (maxPerClass !== null && (Number.isNaN(maxPerClass) || maxPerClass < 0)) {
+  console.error(`invalid max-per-class: ${args[1]}`);
+  process.exit(1);
+}
+
+/** Per-class settings: mention threshold, default entity cap, chunks per prompt. */
+interface ClassConfig {
+  min_mentions: number;
+  max_per_class: number;
+  chunk_k: number;
+}
+
+// Per-class chunk count required + max entities to enrich.
+const CONFIG: Record<string, ClassConfig> = {
+  event: { min_mentions: 2, max_per_class: 80, chunk_k: 8 },
+  uap_object: { min_mentions: 1, max_per_class: 80, chunk_k: 8 },
+  person: { min_mentions: 5, max_per_class: 120, chunk_k: 8 },
+  location: { min_mentions: 8, max_per_class: 80, chunk_k: 6 },
+  organization: { min_mentions: 5, max_per_class: 80, chunk_k: 6 },
+};
+
+interface EntityRow {
+  entity_pk: number;
+  entity_id: string;
+  entity_class: string;
+  canonical_name: string;
+  aliases: string[] | null;
+  total_mentions: number;
+  documents_count: number;
+}
+
+interface ChunkRow {
+  doc_id: string;
+  chunk_id: string;
+  page: number;
+  type: string;
+  content_en: string | null;
+  content_pt: string | null;
+  surface_form: string | null;
+}
+
+const HOUSE_STYLE = `
+Style rules (mandatory):
+- Plainspoken, scene-driven, factual. Voice: Erik Larson / John McPhee non-fiction.
+- NO em dashes used as commas. NO rule-of-three lists. NO "Moreover/Notably/Em suma".
+- NO promotional adjectives (robust, comprehensive, multifaceted, marco histórico).
+- NO superficial -ing analyses ("marking a shift", "destacando").
+- NO skeptic framing, no detective names, no probability tables.
+- PT-BR is Brazilian Portuguese with UTF-8 accents preserved (ç, ã, á, é, í, ó, ú).
+- Verbatim chunk content stays in source language. Citation idiom [[doc-id/pNNN#cNNNN]] only when quoting.
+`;
+
+/** Render any thrown value as a short (max 80 chars) message for the skip log. */
+function describeError(err: unknown): string {
+  return (err instanceof Error ? err.message : String(err)).slice(0, 80);
+}
+
+/**
+ * Build the prompt for one entity: subject header, the source chunks (each
+ * cut to 800 chars and tagged with its [[doc/page#chunk]] id), the task
+ * instructions and the house style rules.
+ */
+function buildPrompt(e: EntityRow, chunks: ChunkRow[]): string {
+  // Drop only optional lines that are absent (null). Empty strings are
+  // deliberate blank separators; `.filter(Boolean)` used to strip them too.
+  const present = (line: string | null): line is string => line !== null;
+
+  const classLabel = ({
+    event: "incident / sighting",
+    uap_object: "described craft / object",
+    person: "named witness / participant",
+    location: "place where incidents are documented",
+    organization: "agency / program / unit",
+  } as Record<string, string>)[e.entity_class] ?? e.entity_class;
+
+  const block = chunks.map((c, i) => {
+    const text = (c.content_en ?? c.content_pt ?? "").slice(0, 800);
+    const pageStr = String(c.page).padStart(3, "0");
+    return [
+      `--- chunk ${i + 1} ---`,
+      `source: [[${c.doc_id}/p${pageStr}#${c.chunk_id}]]`,
+      c.surface_form ? `surface_form_in_chunk: ${c.surface_form}` : null,
+      "",
+      text,
+    ].filter(present).join("\n");
+  }).join("\n\n");
+
+  return [
+    `# Subject of the summary`,
+    "",
+    `**Class.** ${classLabel}`,
+    `**Canonical name.** ${e.canonical_name}`,
+    e.aliases && e.aliases.length > 0 ? `**Aliases.** ${e.aliases.slice(0, 6).join(", ")}` : null,
+    "",
+    `## Source chunks (${chunks.length})`,
+    "",
+    block,
+    "",
+    "## Your task",
+    "",
+    "Write a single 60-100 word narrative summary in BOTH English and",
+    "Brazilian Portuguese, drawn directly from the source chunks above.",
+    "Open the EN version with a specific concrete fact (date, place, person,",
+    "shape, action) — NOT with the entity's name as the first word. Same for",
+    "PT-BR. The two versions must say the same thing.",
+    "",
+    "Emit a strict JSON object. No prose around it. No code fence.",
+    "",
+    "```json",
+    `{"en": "...60-100 words...", "pt_br": "...60-100 palavras..."}`,
+    "```",
+    "",
+    "If the source chunks are too thin to write something substantive",
+    `(e.g. only one chunk and it's a stamp or address block), emit the`,
+    "literal word INSUFFICIENT and stop.",
+    "",
+    HOUSE_STYLE,
+  ].filter(present).join("\n");
+}
+
+/**
+ * Parse the model reply into its two summaries.
+ *
+ * @returns the trimmed { en, pt_br } pair, or null when the reply is the
+ *   INSUFFICIENT sentinel (bare, back-ticked, or wrapped in a code fence).
+ * @throws Error when no JSON object is found or en/pt_br are missing,
+ *   non-string or blank; SyntaxError (from JSON.parse) on malformed JSON.
+ */
+function extractObject(text: string): { en: string; pt_br: string } | null {
+  const t = text.trim();
+  // Strip an optional ``` / ```json fence first so a fenced INSUFFICIENT is
+  // treated as a refusal instead of surfacing as a parse error.
+  const stripped = t.replace(/^```(?:json)?\s*\n?/i, "").replace(/\n?```\s*$/i, "").trim();
+  if (/^`?INSUFFICIENT`?\b/i.test(stripped)) return null;
+  const first = stripped.indexOf("{");
+  const last = stripped.lastIndexOf("}");
+  // `last < first` also covers last === -1 and a "}" that precedes the "{".
+  if (first === -1 || last < first) throw new Error(`no JSON object: ${t.slice(0, 200)}`);
+  const parsed = JSON.parse(stripped.slice(first, last + 1)) as { en?: unknown; pt_br?: unknown };
+  if (typeof parsed.en !== "string" || typeof parsed.pt_br !== "string") {
+    throw new Error("JSON missing en/pt_br fields");
+  }
+  const en = parsed.en.trim();
+  const pt_br = parsed.pt_br.trim();
+  // A blank summary must not be stored as 'ai_generated'.
+  if (en === "" || pt_br === "") throw new Error("JSON has empty en/pt_br fields");
+  return { en, pt_br };
+}
+
+/**
+ * Generate and store the summary for one entity.
+ *
+ * @param e entity row to enrich
+ * @param k maximum number of source chunks to put in the prompt
+ * @returns { done: true } once a summary has been written; otherwise
+ *   { done: false, skipped } carrying the reason. Only "no_chunks" and
+ *   "INSUFFICIENT" mark the row 'refused'; search, LLM and parse errors
+ *   leave the row untouched so that a later run retries it.
+ */
+async function enrichOne(e: EntityRow, k: number): Promise<{ done: boolean; skipped?: string }> {
+  // Path A — entity_mentions JOIN chunks (the high-precision linker).
+  let chunks = await query<ChunkRow>(
+    `SELECT c.doc_id, c.chunk_id, c.page, c.type,
+            c.content_en, c.content_pt, em.surface_form
+       FROM public.entity_mentions em
+       JOIN public.chunks c ON c.chunk_pk = em.chunk_pk
+      WHERE em.entity_pk = $1
+        AND LENGTH(COALESCE(c.content_en, c.content_pt, '')) > 80
+      ORDER BY c.ufo_anomaly DESC NULLS LAST, c.page ASC, c.order_in_page ASC
+      LIMIT $2`,
+    [e.entity_pk, k],
+  );
+
+  // Path B — hybridSearch fallback. The wiki may know about an entity
+  // (e.g. "Kenneth Arnold sighting") with high total_mentions counted from
+  // the wiki frontmatter, but the entity_mentions extractor missed every
+  // chunk. Search the corpus by canonical_name + aliases instead.
+  if (chunks.length === 0) {
+    const queryStr = [e.canonical_name, ...(e.aliases ?? []).slice(0, 3)].join(" ");
+    let hits: Awaited<ReturnType<typeof hybridSearch>>;
+    try {
+      hits = await hybridSearch({
+        query: queryStr,
+        lang: "en",
+        top_k: k,
+        recall_k: 40,
+        max_dense_dist: 0.5,
+      });
+    } catch (err) {
+      // A failed search says nothing about the corpus: skip without marking
+      // the row 'refused' so that a later run can retry it.
+      return { done: false, skipped: `search_error: ${describeError(err)}` };
+    }
+    chunks = hits.map((h) => ({
+      doc_id: h.doc_id,
+      chunk_id: h.chunk_id,
+      page: h.page,
+      type: h.type,
+      content_en: h.content_en,
+      content_pt: h.content_pt,
+      surface_form: null,
+    }));
+  }
+
+  if (chunks.length === 0) {
+    await query(
+      `UPDATE public.entities SET summary_status = 'refused',
+         summary_generated_at = NOW() WHERE entity_pk = $1`,
+      [e.entity_pk],
+    );
+    return { done: false, skipped: "no_chunks" };
+  }
+
+  let llm: Awaited<ReturnType<typeof callClaude>>;
+  try {
+    llm = await callClaude({
+      prompt: buildPrompt(e, chunks),
+      model: env.CLAUDE_MODEL,
+      allowedTools: [],
+      timeoutMs: 90_000,
+      budgetCapUsd: 0.05,
+    });
+  } catch (err) {
+    return { done: false, skipped: `llm_error: ${describeError(err)}` };
+  }
+
+  // Audit is best-effort: the completion is already paid for, so a failed
+  // audit write must not discard it (it used to be reported as llm_error).
+  try {
+    await audit({
+      event: "entity_summary_generated",
+      entity_pk: e.entity_pk,
+      entity_id: e.entity_id,
+      entity_class: e.entity_class,
+      cost_usd: llm.costUsd,
+      tokens_in: llm.tokensIn,
+      tokens_out: llm.tokensOut,
+    });
+  } catch (err) {
+    console.warn(`  ! audit failed for ${e.entity_id}: ${describeError(err)}`);
+  }
+
+  let obj: { en: string; pt_br: string } | null;
+  try {
+    obj = extractObject(llm.text);
+  } catch (err) {
+    return { done: false, skipped: `parse_error: ${describeError(err)}` };
+  }
+  if (obj === null) {
+    await query(
+      `UPDATE public.entities SET summary_status = 'refused',
+         summary_generated_at = NOW(), summary_model = $1 WHERE entity_pk = $2`,
+      [env.CLAUDE_MODEL, e.entity_pk],
+    );
+    return { done: false, skipped: "INSUFFICIENT" };
+  }
+
+  await query(
+    `UPDATE public.entities
+        SET summary_en = $1, summary_pt_br = $2,
+            summary_generated_at = NOW(),
+            summary_model = $3,
+            summary_status = 'ai_generated'
+      WHERE entity_pk = $4`,
+    [obj.en, obj.pt_br, env.CLAUDE_MODEL, e.entity_pk],
+  );
+  return { done: true };
+}
+
+/**
+ * Walk the selected entity classes, enrich each candidate in turn, then
+ * print the run totals and the most recently generated summary.
+ */
+async function main() {
+  const classes = filterClass ? [filterClass] : Object.keys(CONFIG);
+  let totalOk = 0, totalSkip = 0;
+
+  for (const klass of classes) {
+    const cfg = CONFIG[klass];
+    if (!cfg) {
+      console.error(`unknown class: ${klass}`);
+      continue;
+    }
+    const limit = maxPerClass ?? cfg.max_per_class;
+    // 'curated' rows are hand-written: never select them, even with --force.
+    const where = FORCE
+      ? `WHERE entity_class = $1 AND total_mentions >= $2 AND summary_status IS DISTINCT FROM 'curated'`
+      : `WHERE entity_class = $1 AND total_mentions >= $2 AND (summary_status IS NULL OR summary_status = 'pending')`;
+    const rows = await query<EntityRow>(
+      `SELECT entity_pk, entity_id, entity_class, canonical_name, aliases,
+              total_mentions, documents_count
+         FROM public.entities
+         ${where}
+        ORDER BY total_mentions DESC, entity_id ASC
+        LIMIT $3`,
+      [klass, cfg.min_mentions, limit],
+    );
+
+    console.log(`[${klass}] ${rows.length} candidates`);
+    for (const e of rows) {
+      const r = await enrichOne(e, cfg.chunk_k);
+      if (r.done) {
+        totalOk += 1;
+        console.log(`  ✓ ${e.entity_id} (${e.canonical_name})`);
+      } else {
+        totalSkip += 1;
+        console.log(`  · ${e.entity_id} skip: ${r.skipped}`);
+      }
+    }
+  }
+
+  console.log(`\nDone. ok=${totalOk} skip=${totalSkip}`);
+
+  // Print the most recently generated summary as an example
+  const sample = await queryOne<{ canonical_name: string; summary_en: string; summary_pt_br: string }>(
+    `SELECT canonical_name, summary_en, summary_pt_br
+       FROM public.entities
+      WHERE summary_status = 'ai_generated'
+      ORDER BY summary_generated_at DESC LIMIT 1`,
+  );
+  if (sample) {
+    console.log(`\n=== latest example: ${sample.canonical_name} ===`);
+    console.log(`EN: ${sample.summary_en}`);
+    console.log(`PT: ${sample.summary_pt_br}`);
+  }
+}
+
+main().catch((e) => {
+  console.error("fatal:", e);
+  process.exit(1);
+});
diff --git a/web/components/entity-list-page.tsx b/web/components/entity-list-page.tsx
index 9cf52b0..85d5501 100644
--- a/web/components/entity-list-page.tsx
+++ b/web/components/entity-list-page.tsx
@@ -19,6 +19,9 @@ interface EntityRow {
   aliases: string[] | null;
   total_mentions: number;
   documents_count: number;
+  summary_en: string | null;
+  summary_pt_br: string | null;
+  summary_status: string | null;
 }
 
 export interface EntityListPageProps {
@@ -44,12 +47,14 @@ export async function EntityListPage(props: EntityListPageProps) {
   const locale = (await getLocale()) === "en" ? "en" : "pt-br";
   const rows = await pgQuery<EntityRow>(
     `SELECT entity_class, entity_id, canonical_name, aliases,
-            total_mentions, documents_count
+            total_mentions, documents_count,
+            summary_en, summary_pt_br, summary_status
        FROM public.entities
       WHERE entity_class = $1
         AND total_mentions >= $2
         AND canonical_name !~ '^(unspecified|unknown|n/a|—|UNKNOWN)$'
-      ORDER BY total_mentions DESC, canonical_name ASC
+      ORDER BY COALESCE(summary_status IN ('ai_generated', 'curated'), FALSE) DESC,
+               total_mentions DESC, canonical_name ASC
       LIMIT 200`,
     [props.entityClass, props.min_mentions ?? 1],
   ).catch(() => [] as EntityRow[]);
@@ -121,6 +126,7 @@ function MagazineGrid({
{rows.map((r, i) => { const year = entityClass === "event" ? parseEventId(r.entity_id).year : null; + const summary = locale === "pt-br" ? (r.summary_pt_br ?? r.summary_en) : (r.summary_en ?? r.summary_pt_br); return ( {r.canonical_name} - {r.aliases && r.aliases.length > 0 && ( + {summary ? ( +

+ {summary} +

+ ) : r.aliases && r.aliases.length > 0 ? (
{r.aliases.slice(0, 3).join(" · ")}
- )} + ) : null} ); })} @@ -153,30 +163,71 @@ function MagazineGrid({ function CompactGrid({ rows, folder, locale, }: { rows: EntityRow[]; folder: string; locale: "pt-br" | "en" }) { + // Split rows: with-summary appear as larger editorial cards, no-summary + // fall back to a compact table below. + const enriched = rows.filter((r) => (r.summary_en ?? r.summary_pt_br) != null); + const bare = rows.filter((r) => (r.summary_en ?? r.summary_pt_br) == null); + return ( -
- - - - - - - - - - {rows.map((r) => ( - - - - - - ))} - -
{locale === "en" ? "name" : "nome"}{locale === "en" ? "mentions" : "menções"}{locale === "en" ? "documents" : "docs"}
- - {r.canonical_name} - - {r.total_mentions.toLocaleString("pt-BR")}{r.documents_count}
-
+ <> + {enriched.length > 0 && ( +
+ {enriched.map((r) => { + const summary = locale === "pt-br" ? (r.summary_pt_br ?? r.summary_en) : (r.summary_en ?? r.summary_pt_br); + return ( + +
+

+ {r.canonical_name} +

+ + {r.total_mentions.toLocaleString("pt-BR")} + +
+

+ {summary} +

+ + ); + })} +
+ )} + + {bare.length > 0 && ( +
+ {enriched.length > 0 && ( +
+ {locale === "en" ? "// more entries" : "// outras entradas"} +
+ )} + + + + + + + + + + {bare.map((r) => ( + + + + + + ))} + +
{locale === "en" ? "name" : "nome"}{locale === "en" ? "mentions" : "menções"}{locale === "en" ? "documents" : "docs"}
+ + {r.canonical_name} + + {r.total_mentions.toLocaleString("pt-BR")}{r.documents_count}
+
+ )} + ); }
diff --git a/web/lib/retrieval/entity-pages.ts b/web/lib/retrieval/entity-pages.ts
index fa378c7..4eb4e32 100644
--- a/web/lib/retrieval/entity-pages.ts
+++ b/web/lib/retrieval/entity-pages.ts
@@ -101,15 +101,31 @@ export async function getEntityCore(
   if (!fm) return null;
 
   // Best-effort lookup of the DB entity_pk so getEntityChunks can still
-  // query by primary key. Don't fail if the entity isn't in the DB at all.
+  // query by primary key, plus the stored narrative summary (migration 0008).
+  // A missing row or a failed query leaves all four values null.
   let entity_pk: number | null = null;
+  let dbSummaryEn: string | null = null;
+  let dbSummaryPt: string | null = null;
+  let dbSummaryStatus: string | null = null;
   try {
-    const rows = await pgQuery<{ entity_pk: number }>(
-      `SELECT entity_pk FROM public.entities
-        WHERE entity_class = $1 AND entity_id = $2 LIMIT 1`,
+    const rows = await pgQuery<{
+      entity_pk: number;
+      summary_en: string | null;
+      summary_pt_br: string | null;
+      summary_status: string | null;
+    }>(
+      `SELECT entity_pk, summary_en, summary_pt_br, summary_status
+         FROM public.entities
+        WHERE entity_class = $1 AND entity_id = $2 LIMIT 1`,
       [entityClass, entityId],
     );
-    entity_pk = rows[0]?.entity_pk ?? null;
+    const r = rows[0];
+    if (r) {
+      entity_pk = r.entity_pk;
+      dbSummaryEn = r.summary_en;
+      dbSummaryPt = r.summary_pt_br;
+      dbSummaryStatus = r.summary_status;
+    }
   } catch {
     entity_pk = null;
   }
@@ -140,9 +156,11 @@ export async function getEntityCore(
     text_mentioned_in: arr(fm.text_mentioned_in),
     referenced_by: arr(fm.referenced_by),
     enrichment_status: strOrNull(fm.enrichment_status),
-    narrative_summary: strOrNull(fm.narrative_summary),
-    narrative_summary_pt_br: strOrNull(fm.narrative_summary_pt_br),
-    summary_status: strOrNull(fm.summary_status),
+    // Prefer DB-stored summaries; fall back to the wiki YAML narrative if the
+    // DB is silent. The status follows whichever source supplied the text.
+    narrative_summary: dbSummaryEn ?? strOrNull(fm.narrative_summary),
+    narrative_summary_pt_br: dbSummaryPt ?? strOrNull(fm.narrative_summary_pt_br),
+    summary_status: ((dbSummaryEn ?? dbSummaryPt) !== null ? dbSummaryStatus : null) ?? strOrNull(fm.summary_status),
   };
 }
 