#!/usr/bin/env bun
/**
 * enrich_entity_summaries.ts — generate the bilingual narrative summary
 * each entity needs for the public-facing sub-pages (/sightings,
 * /witnesses, /objects, /locations, /operations).
 *
 * For each entity:
 *   1. Pull top N chunks where it appears via entity_mentions JOIN chunks
 *      (falling back to hybridSearch when no mention rows exist).
 *   2. Compose a prompt under the house style + the case-writer's voice
 *      rules (no detective names, no skeptic framing, scene-driven).
 *   3. Ask the model (env.CLAUDE_MODEL) for a ~80-word bilingual JSON:
 *      { en, pt_br }.
 *   4. UPDATE public.entities (summary_en, summary_pt_br,
 *      summary_generated_at, summary_model, summary_status).
 *
 * Idempotent — only entities whose summary_status is NULL or 'pending' are
 * picked up, so rows already marked 'ai_generated', 'curated' or 'refused'
 * are skipped. Pass --force to drop that filter and re-generate.
 * NOTE(review): --force applies no status filter at all, so it also
 * overwrites 'curated' rows — confirm that is intended.
 *
 * Usage:
 *   bun scripts/enrich_entity_summaries.ts                 # all classes, default limits
 *   bun scripts/enrich_entity_summaries.ts event 30        # only events, max 30
 *   bun scripts/enrich_entity_summaries.ts uap_object 50   # only uap_objects, max 50
 *   bun scripts/enrich_entity_summaries.ts all --force     # re-enrich everything
 */

import { audit } from "../src/lib/audit";
import { callClaude } from "../src/lib/claude";
import { env } from "../src/lib/env";
import { query, queryOne } from "../src/lib/pg";
import { hybridSearch } from "../src/lib/search";

/**
 * Parse the optional "max entities per class" CLI argument.
 *
 * Returns null when the argument is absent. Exits the process with a
 * usage error when the value is not a non-negative integer, because the
 * number is passed straight to SQL `LIMIT` and NaN / negative values
 * would only fail later with an opaque database error.
 */
function parseMaxPerClass(raw: string | undefined): number | null {
  if (!raw) return null;
  const n = parseInt(raw, 10);
  if (!Number.isInteger(n) || n < 0) {
    console.error(`invalid max-per-class "${raw}": expected a non-negative integer`);
    process.exit(1);
  }
  return n;
}

const FORCE = process.argv.includes("--force");
// Positional args only; flags such as --force are handled separately.
const args = process.argv.slice(2).filter((a) => !a.startsWith("--"));
const filterClass = args[0] && args[0] !== "all" ? args[0] : null;
const maxPerClass = parseMaxPerClass(args[1]);

/** Per-class selection thresholds. */
interface ClassConfig {
  /** Minimum entities.total_mentions for an entity to be a candidate. */
  min_mentions: number;
  /** Default cap on entities processed per run (overridable via CLI). */
  max_per_class: number;
  /** Number of source chunks handed to the model. */
  chunk_k: number;
}

// Per-class chunk count required + max entities to enrich.
const CONFIG: Record<string, ClassConfig> = {
  event: { min_mentions: 2, max_per_class: 80, chunk_k: 8 },
  uap_object: { min_mentions: 1, max_per_class: 80, chunk_k: 8 },
  person: { min_mentions: 5, max_per_class: 120, chunk_k: 8 },
  location: { min_mentions: 8, max_per_class: 80, chunk_k: 6 },
  organization: { min_mentions: 5, max_per_class: 80, chunk_k: 6 },
};

/** Columns selected from public.entities for each candidate. */
interface EntityRow {
  entity_pk: number;
  entity_id: string;
  entity_class: string;
  canonical_name: string;
  aliases: string[] | null;
  total_mentions: number;
  documents_count: number;
}

/** A source chunk fed into the prompt (from the mention join or search). */
interface ChunkRow {
  doc_id: string;
  chunk_id: string;
  page: number;
  type: string;
  content_en: string | null;
  content_pt: string | null;
  surface_form: string | null;
}

const HOUSE_STYLE = `
Style rules (mandatory):
- Plainspoken, scene-driven, factual. Voice: Erik Larson / John McPhee non-fiction.
- NO em dashes used as commas. NO rule-of-three lists. NO "Moreover/Notably/Em suma".
- NO promotional adjectives (robust, comprehensive, multifaceted, marco histórico).
- NO superficial -ing analyses ("marking a shift", "destacando").
- NO skeptic framing, no detective names, no probability tables.
- PT-BR is Brazilian Portuguese with UTF-8 accents preserved (ç, ã, á, é, í, ó, ú).
- Verbatim chunk content stays in source language. Citation idiom [[doc-id/pNNN#cNNNN]] only when quoting.
`;

/** Matches a reply that is the INSUFFICIENT sentinel (optionally back-ticked). */
const INSUFFICIENT_RE = /^`?INSUFFICIENT`?\b/i;

/** Best-effort message extraction from an arbitrary thrown value. */
function errorMessage(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/** Join prompt lines, dropping only `null` placeholders so "" stays a blank line. */
function joinLines(lines: Array<string | null>): string {
  return lines.filter((l): l is string => l !== null).join("\n");
}

/**
 * Build the model prompt for one entity.
 *
 * @param e      The entity being summarised.
 * @param chunks Source chunks; each is truncated to 800 characters and
 *               labelled with its `[[doc/pNNN#chunk]]` citation.
 * @returns The full prompt text, ending with the house style rules.
 */
function buildPrompt(e: EntityRow, chunks: ChunkRow[]): string {
  const classLabels: Record<string, string> = {
    event: "incident / sighting",
    uap_object: "described craft / object",
    person: "named witness / participant",
    location: "place where incidents are documented",
    organization: "agency / program / unit",
  };
  // Unknown classes fall back to the raw class string.
  const classLabel = classLabels[e.entity_class] ?? e.entity_class;

  const block = chunks
    .map((c, i) => {
      // Prefer the English text; fall back to Portuguese, then to empty.
      const text = (c.content_en ?? c.content_pt ?? "").slice(0, 800);
      const pageStr = String(c.page).padStart(3, "0");
      return joinLines([
        `--- chunk ${i + 1} ---`,
        `source: [[${c.doc_id}/p${pageStr}#${c.chunk_id}]]`,
        c.surface_form ? `surface_form_in_chunk: ${c.surface_form}` : null,
        "",
        text,
      ]);
    })
    .join("\n\n");

  return joinLines([
    `# Subject of the summary`,
    "",
    `**Class.** ${classLabel}`,
    `**Canonical name.** ${e.canonical_name}`,
    // At most six aliases are shown; the line is omitted when there are none.
    e.aliases && e.aliases.length > 0
      ? `**Aliases.** ${e.aliases.slice(0, 6).join(", ")}`
      : null,
    "",
    `## Source chunks (${chunks.length})`,
    "",
    block,
    "",
    "## Your task",
    "",
    "Write a single 60-100 word narrative summary in BOTH English and",
    "Brazilian Portuguese, drawn directly from the source chunks above.",
    "Open the EN version with a specific concrete fact (date, place, person,",
    "shape, action) — NOT with the entity's name as the first word. Same for",
    "PT-BR. The two versions must say the same thing.",
    "",
    "Emit a strict JSON object. No prose around it. No code fence.",
    "",
    "```json",
    `{"en": "...60-100 words...", "pt_br": "...60-100 palavras..."}`,
    "```",
    "",
    "If the source chunks are too thin to write something substantive",
    `(e.g. only one chunk and it's a stamp or address block), emit the`,
    "literal word INSUFFICIENT and stop.",
    "",
    HOUSE_STYLE,
  ]);
}

/**
 * Extract the `{ en, pt_br }` object from the raw model reply.
 *
 * @returns The trimmed summaries, or `null` when the model answered with
 *          the INSUFFICIENT sentinel (bare, back-ticked, or code-fenced).
 * @throws Error when no JSON object is present, the JSON is malformed, or
 *         either field is missing, non-string, or empty.
 */
function extractObject(text: string): { en: string; pt_br: string } | null {
  const t = text.trim();
  // Remove an optional leading/trailing ``` fence before inspecting content.
  const stripped = t
    .replace(/^```(?:json)?\s*\n?/i, "")
    .replace(/\n?```\s*$/i, "")
    .trim();
  if (INSUFFICIENT_RE.test(t) || INSUFFICIENT_RE.test(stripped)) return null;

  const first = stripped.indexOf("{");
  const last = stripped.lastIndexOf("}");
  if (first === -1 || last === -1 || last < first) {
    throw new Error(`no JSON object: ${t.slice(0, 200)}`);
  }

  const parsed: unknown = JSON.parse(stripped.slice(first, last + 1));
  if (typeof parsed !== "object" || parsed === null) {
    throw new Error("JSON missing en/pt_br fields");
  }
  const { en, pt_br } = parsed as { en?: unknown; pt_br?: unknown };
  if (typeof en !== "string" || typeof pt_br !== "string") {
    throw new Error("JSON missing en/pt_br fields");
  }
  const result = { en: en.trim(), pt_br: pt_br.trim() };
  // An empty summary must not be persisted as a successful generation.
  if (result.en === "" || result.pt_br === "") {
    throw new Error("JSON has empty en/pt_br fields");
  }
  return result;
}

/**
 * Generate and persist the summary for a single entity.
 *
 * @param e The entity to enrich.
 * @param k Maximum number of source chunks to include in the prompt.
 * @returns `{ done: true }` once the summary is stored; otherwise
 *          `{ done: false, skipped }` with a short reason. Rows are marked
 *          'refused' when no chunks exist or the model answers
 *          INSUFFICIENT; LLM and parse errors leave the row untouched.
 */
async function enrichOne(e: EntityRow, k: number): Promise<{ done: boolean; skipped?: string }> {
  // Path A — entity_mentions JOIN chunks (the high-precision linker).
  let chunks = await query<ChunkRow>(
    `SELECT c.doc_id, c.chunk_id, c.page, c.type,
            c.content_en, c.content_pt, em.surface_form
       FROM public.entity_mentions em
       JOIN public.chunks c ON c.chunk_pk = em.chunk_pk
      WHERE em.entity_pk = $1
        AND LENGTH(COALESCE(c.content_en, c.content_pt, '')) > 80
      ORDER BY c.ufo_anomaly DESC NULLS LAST, c.page ASC, c.order_in_page ASC
      LIMIT $2`,
    [e.entity_pk, k],
  );

  // Path B — hybridSearch fallback. The wiki may know about an entity
  // (e.g. "Kenneth Arnold sighting") with high total_mentions counted from
  // the wiki frontmatter, but the entity_mentions extractor missed every
  // chunk. Search the corpus by canonical_name + aliases instead.
  if (chunks.length === 0) {
    const queryStr = [e.canonical_name, ...(e.aliases ?? []).slice(0, 3)].join(" ");
    const hits = await hybridSearch({
      query: queryStr,
      lang: "en",
      top_k: k,
      recall_k: 40,
      max_dense_dist: 0.5,
    }).catch((err: unknown) => {
      // Best-effort: a search failure is treated as "no hits", but logged.
      console.warn(`  ! hybridSearch failed for ${e.entity_id}: ${errorMessage(err).slice(0, 120)}`);
      return [];
    });
    chunks = hits.map((h) => ({
      doc_id: h.doc_id,
      chunk_id: h.chunk_id,
      page: h.page,
      type: h.type,
      content_en: h.content_en,
      content_pt: h.content_pt,
      surface_form: null,
    }));
  }

  if (chunks.length === 0) {
    await query(
      `UPDATE public.entities
          SET summary_status = 'refused', summary_generated_at = NOW()
        WHERE entity_pk = $1`,
      [e.entity_pk],
    );
    return { done: false, skipped: "no_chunks" };
  }

  let llmText: string;
  try {
    const llm = await callClaude({
      prompt: buildPrompt(e, chunks),
      model: env.CLAUDE_MODEL,
      allowedTools: [],
      timeoutMs: 90_000,
      budgetCapUsd: 0.05,
    });
    llmText = llm.text;
    // NOTE(review): audit() sits inside this try, so an audit failure is
    // also reported as llm_error and the generated text is discarded.
    await audit({
      event: "entity_summary_generated",
      entity_pk: e.entity_pk,
      entity_id: e.entity_id,
      entity_class: e.entity_class,
      cost_usd: llm.costUsd,
      tokens_in: llm.tokensIn,
      tokens_out: llm.tokensOut,
    });
  } catch (err) {
    return { done: false, skipped: `llm_error: ${errorMessage(err).slice(0, 80)}` };
  }

  let obj: { en: string; pt_br: string } | null;
  try {
    obj = extractObject(llmText);
  } catch (err) {
    return { done: false, skipped: `parse_error: ${errorMessage(err).slice(0, 80)}` };
  }

  if (obj === null) {
    await query(
      `UPDATE public.entities
          SET summary_status = 'refused', summary_generated_at = NOW(), summary_model = $1
        WHERE entity_pk = $2`,
      [env.CLAUDE_MODEL, e.entity_pk],
    );
    return { done: false, skipped: "INSUFFICIENT" };
  }

  await query(
    `UPDATE public.entities
        SET summary_en = $1,
            summary_pt_br = $2,
            summary_generated_at = NOW(),
            summary_model = $3,
            summary_status = 'ai_generated'
      WHERE entity_pk = $4`,
    [obj.en, obj.pt_br, env.CLAUDE_MODEL, e.entity_pk],
  );
  return { done: true };
}

/**
 * Entry point: walk the selected classes, enrich each candidate entity
 * sequentially, then print totals and the most recently generated summary.
 */
async function main() {
  const classes = filterClass ? [filterClass] : Object.keys(CONFIG);
  let totalOk = 0;
  let totalSkip = 0;

  for (const klass of classes) {
    const cfg = CONFIG[klass];
    if (!cfg) {
      console.error(`unknown class: ${klass}`);
      continue;
    }
    const limit = maxPerClass ?? cfg.max_per_class;
    // Without --force only never-attempted / pending rows are selected.
    const where = FORCE
      ? `WHERE entity_class = $1 AND total_mentions >= $2`
      : `WHERE entity_class = $1 AND total_mentions >= $2 AND (summary_status IS NULL OR summary_status = 'pending')`;
    const rows = await query<EntityRow>(
      `SELECT entity_pk, entity_id, entity_class, canonical_name, aliases,
              total_mentions, documents_count
         FROM public.entities
         ${where}
        ORDER BY total_mentions DESC, entity_id ASC
        LIMIT $3`,
      [klass, cfg.min_mentions, limit],
    );
    console.log(`[${klass}] ${rows.length} candidates`);

    for (const e of rows) {
      const r = await enrichOne(e, cfg.chunk_k);
      if (r.done) {
        totalOk += 1;
        console.log(` ✓ ${e.entity_id} (${e.canonical_name})`);
      } else {
        totalSkip += 1;
        console.log(` · ${e.entity_id} skip: ${r.skipped}`);
      }
    }
  }

  console.log(`\nDone. ok=${totalOk} skip=${totalSkip}`);

  // Print the most recently generated summary as a spot check.
  const sample = await queryOne<{ canonical_name: string; summary_en: string; summary_pt_br: string }>(
    `SELECT canonical_name, summary_en, summary_pt_br
       FROM public.entities
      WHERE summary_status = 'ai_generated'
      ORDER BY summary_generated_at DESC
      LIMIT 1`,
  );
  if (sample) {
    console.log(`\n=== latest example: ${sample.canonical_name} ===`);
    console.log(`EN: ${sample.summary_en}`);
    console.log(`PT: ${sample.summary_pt_br}`);
  }
}

main().catch((e) => {
  console.error("fatal:", e);
  process.exit(1);
});