297 lines
10 KiB
TypeScript
297 lines
10 KiB
TypeScript
|
|
#!/usr/bin/env bun
|
||
|
|
/**
|
||
|
|
* enrich_entity_summaries.ts — generate the bilingual narrative summary
|
||
|
|
* each entity needs for the public-facing sub-pages (/sightings,
|
||
|
|
* /witnesses, /objects, /locations, /operations).
|
||
|
|
*
|
||
|
|
* For each entity:
|
||
|
|
* 1. Pull top N chunks where it appears via entity_mentions JOIN chunks.
|
||
|
|
* 2. Compose a prompt under the house style + the case-writer's voice
|
||
|
|
* rules (no detective names, no skeptic framing, scene-driven).
|
||
|
|
* 3. Ask the configured Claude model (env.CLAUDE_MODEL) for a ~80-word bilingual JSON: { en, pt_br }.
|
||
|
|
* 4. UPDATE public.entities (summary_en, summary_pt_br,
|
||
|
|
* summary_generated_at, summary_model, summary_status).
|
||
|
|
*
|
||
|
|
* Idempotent — by default only entities whose summary_status is NULL or
* 'pending' are processed, so rows already marked 'ai_generated',
* 'curated' or 'refused' are skipped. Pass --force to re-generate.
|
||
|
|
*
|
||
|
|
* Usage:
|
||
|
|
* bun scripts/enrich_entity_summaries.ts # all classes, default limits
|
||
|
|
* bun scripts/enrich_entity_summaries.ts event 30 # only events, max 30
|
||
|
|
* bun scripts/enrich_entity_summaries.ts uap_object 50 # only uap_objects, max 50
|
||
|
|
* bun scripts/enrich_entity_summaries.ts all --force # re-enrich everything
|
||
|
|
*/
|
||
|
|
import { audit } from "../src/lib/audit";
|
||
|
|
import { callClaude } from "../src/lib/claude";
|
||
|
|
import { env } from "../src/lib/env";
|
||
|
|
import { query, queryOne } from "../src/lib/pg";
|
||
|
|
import { hybridSearch } from "../src/lib/search";
|
||
|
|
|
||
|
|
/**
 * Parse the optional per-class cap given as the second positional argument.
 *
 * @param raw - the raw CLI token, or undefined when the argument was omitted
 * @returns the cap as a non-negative integer, or null when no cap was given
 * @throws Error when the token is not a plain base-10 non-negative integer.
 *   Failing here keeps NaN, negative numbers and silently truncated values
 *   such as "30abc" from reaching the SQL LIMIT parameter.
 */
function parseMaxPerClass(raw: string | undefined): number | null {
  if (raw === undefined || raw === "") return null;
  if (!/^\d+$/.test(raw)) {
    throw new Error(`invalid max-per-class "${raw}": expected a non-negative integer`);
  }
  const value = Number.parseInt(raw, 10);
  if (!Number.isSafeInteger(value)) {
    throw new Error(`invalid max-per-class "${raw}": value is too large`);
  }
  return value;
}

// --force re-selects entities regardless of their current summary_status.
const FORCE = process.argv.includes("--force");
// Positional arguments are everything after the script path that is not a --flag.
const args = process.argv.slice(2).filter((a) => !a.startsWith("--"));
// First positional: a single entity class, or "all"/absent for every configured class.
const filterClass = args[0] && args[0] !== "all" ? args[0] : null;
// Second positional: optional override of the per-class entity cap.
const maxPerClass = parseMaxPerClass(args[1]);
|
||
|
|
|
||
|
|
/** Tuning knobs applied to one entity class. */
interface ClassConfig {
  /** Lower bound compared against entities.total_mentions when selecting candidates. */
  min_mentions: number;
  /** Default cap on candidates per run; a CLI-supplied cap takes precedence. */
  max_per_class: number;
  /** Number of source chunks requested for each entity. */
  chunk_k: number;
}

/** Build one ClassConfig from positional values, keeping the table below compact. */
const classConfig = (
  min_mentions: number,
  max_per_class: number,
  chunk_k: number,
): ClassConfig => ({ min_mentions, max_per_class, chunk_k });

// Per-class settings. Key order is the order in which classes are processed
// when no class filter is given.
const CONFIG: Record<string, ClassConfig> = {
  event: classConfig(2, 80, 8),
  uap_object: classConfig(1, 80, 8),
  person: classConfig(5, 120, 8),
  location: classConfig(8, 80, 6),
  organization: classConfig(5, 80, 6),
};
|
||
|
|
|
||
|
|
/** One row selected from public.entities as a candidate for enrichment. */
interface EntityRow {
  /** Key used in every `WHERE entity_pk = ...` lookup and update. */
  entity_pk: number;
  /** Identifier printed in the progress log and sent to the audit trail. */
  entity_id: string;
  /** Class name; matched against the CONFIG keys and the prompt's class labels. */
  entity_class: string;
  /** Primary display name, placed in the prompt and the fallback search query. */
  canonical_name: string;
  /** Alternative names, or null when the row has none. */
  aliases: string[] | null;
  /** Mention count; candidates are filtered and ordered by this column. */
  total_mentions: number;
  /** Selected alongside the other columns; not read by the code in this file. */
  documents_count: number;
}
|
||
|
|
|
||
|
|
/** One source chunk handed to the prompt builder. */
interface ChunkRow {
  /** Document identifier; first part of the `[[doc/pNNN#chunk]]` citation. */
  doc_id: string;
  /** Chunk identifier; last part of the citation. */
  chunk_id: string;
  /** Page number; zero-padded to three digits in the citation. */
  page: number;
  /** Chunk type as stored; carried through but not read by the prompt builder. */
  type: string;
  /** English text, preferred when present. */
  content_en: string | null;
  /** Portuguese text, used when content_en is null. */
  content_pt: string | null;
  /** How the entity was written in this chunk; null for search-fallback chunks. */
  surface_form: string | null;
}
|
||
|
|
|
||
|
|
/**
 * Style rules appended to the end of every prompt. The resulting string
 * begins and ends with a newline, so the rules read as their own paragraph.
 */
const HOUSE_STYLE = [
  "",
  "Style rules (mandatory):",
  "- Plainspoken, scene-driven, factual. Voice: Erik Larson / John McPhee non-fiction.",
  '- NO em dashes used as commas. NO rule-of-three lists. NO "Moreover/Notably/Em suma".',
  "- NO promotional adjectives (robust, comprehensive, multifaceted, marco histórico).",
  '- NO superficial -ing analyses ("marking a shift", "destacando").',
  "- NO skeptic framing, no detective names, no probability tables.",
  "- PT-BR is Brazilian Portuguese with UTF-8 accents preserved (ç, ã, á, é, í, ó, ú).",
  "- Verbatim chunk content stays in source language. Citation idiom [[doc-id/pNNN#cNNNN]] only when quoting.",
  "",
].join("\n");
|
||
|
|
|
||
|
|
/**
 * Compose the LLM prompt for one entity.
 *
 * Layout: a subject header (class label, canonical name, up to six aliases),
 * the source chunks with their citation markers, the task description, and
 * finally the shared HOUSE_STYLE rules.
 *
 * Optional lines are represented as `null` and removed; empty strings are
 * kept on purpose because they are the blank-line separators between
 * sections. (Filtering with `Boolean` would drop those separators too.)
 *
 * @param e - the entity being summarised
 * @param chunks - source chunks; each contributes at most 800 characters of text
 * @returns the full prompt text
 */
function buildPrompt(e: EntityRow, chunks: ChunkRow[]): string {
  const classLabels: Record<string, string> = {
    event: "incident / sighting",
    uap_object: "described craft / object",
    person: "named witness / participant",
    location: "place where incidents are documented",
    organization: "agency / program / unit",
  };
  // Unknown classes fall back to the raw class name.
  const classLabel = classLabels[e.entity_class] ?? e.entity_class;

  const isLine = (line: string | null): line is string => line !== null;

  const block = chunks
    .map((c, i) => {
      // Prefer English text; cap each chunk so the prompt stays bounded.
      const text = (c.content_en ?? c.content_pt ?? "").slice(0, 800);
      const pageStr = String(c.page).padStart(3, "0");
      const lines: Array<string | null> = [
        `--- chunk ${i + 1} ---`,
        `source: [[${c.doc_id}/p${pageStr}#${c.chunk_id}]]`,
        c.surface_form ? `surface_form_in_chunk: ${c.surface_form}` : null,
        "",
        text,
      ];
      return lines.filter(isLine).join("\n");
    })
    .join("\n\n");

  const sections: Array<string | null> = [
    `# Subject of the summary`,
    "",
    `**Class.** ${classLabel}`,
    `**Canonical name.** ${e.canonical_name}`,
    e.aliases && e.aliases.length > 0 ? `**Aliases.** ${e.aliases.slice(0, 6).join(", ")}` : null,
    "",
    `## Source chunks (${chunks.length})`,
    "",
    block,
    "",
    "## Your task",
    "",
    "Write a single 60-100 word narrative summary in BOTH English and",
    "Brazilian Portuguese, drawn directly from the source chunks above.",
    "Open the EN version with a specific concrete fact (date, place, person,",
    "shape, action) — NOT with the entity's name as the first word. Same for",
    "PT-BR. The two versions must say the same thing.",
    "",
    "Emit a strict JSON object. No prose around it. No code fence.",
    "",
    // The example is shown bare: wrapping it in a ```json fence would
    // contradict the "No code fence" instruction directly above it.
    `{"en": "...60-100 words...", "pt_br": "...60-100 palavras..."}`,
    "",
    "If the source chunks are too thin to write something substantive",
    `(e.g. only one chunk and it's a stamp or address block), emit the`,
    "literal word INSUFFICIENT and stop.",
    "",
    HOUSE_STYLE,
  ];
  return sections.filter(isLine).join("\n");
}
|
||
|
|
|
||
|
|
/**
 * Parse the model's reply into the bilingual summary pair.
 *
 * An optional Markdown code fence is removed first, so that both a fenced
 * JSON object and a fenced INSUFFICIENT sentinel are recognised.
 *
 * @param text - raw model output
 * @returns the trimmed `{ en, pt_br }` pair, or null when the model answered
 *   with the INSUFFICIENT sentinel
 * @throws Error when no JSON object is present, when `en` / `pt_br` are not
 *   both strings, or when either is empty after trimming
 * @throws SyntaxError when the text between the braces is not valid JSON
 */
function extractObject(text: string): { en: string; pt_br: string } | null {
  const t = text.trim();
  const stripped = t
    .replace(/^```(?:json)?\s*\n?/i, "")
    .replace(/\n?```\s*$/i, "")
    .trim();

  // Sentinel check runs on the unfenced text so "```\nINSUFFICIENT\n```" counts.
  if (/^`?INSUFFICIENT`?\b/i.test(stripped)) return null;

  // Take the outermost brace pair; tolerates stray prose around the object.
  const first = stripped.indexOf("{");
  const last = stripped.lastIndexOf("}");
  if (first === -1 || last < first) {
    throw new Error(`no JSON object: ${t.slice(0, 200)}`);
  }

  const parsed: unknown = JSON.parse(stripped.slice(first, last + 1));
  if (typeof parsed !== "object" || parsed === null) {
    throw new Error("JSON missing en/pt_br fields");
  }
  const { en, pt_br } = parsed as { en?: unknown; pt_br?: unknown };
  if (typeof en !== "string" || typeof pt_br !== "string") {
    throw new Error("JSON missing en/pt_br fields");
  }

  const result = { en: en.trim(), pt_br: pt_br.trim() };
  // A blank summary must not be stored as a successful generation.
  if (result.en === "" || result.pt_br === "") {
    throw new Error("JSON has empty en/pt_br fields");
  }
  return result;
}
|
||
|
|
|
||
|
|
/** Short, log-friendly description of any thrown value (at most 80 characters). */
function describeError(err: unknown): string {
  const message = err instanceof Error ? err.message : String(err);
  return message.slice(0, 80);
}

/**
 * Generate and store the bilingual summary for one entity.
 *
 * Steps: collect source chunks (entity_mentions first, hybrid search as a
 * fallback), ask the model, parse the reply, then update public.entities.
 *
 * Outcomes:
 * - `{ done: true }`: summary written with status 'ai_generated'.
 * - `skipped: "no_chunks"` / `"INSUFFICIENT"`: row marked 'refused'.
 * - `skipped: "search_error: …"`, `"llm_error: …"` or `"parse_error: …"`:
 *   nothing is written, so the entity is picked up again on the next run.
 *
 * @param e - the entity row to enrich
 * @param k - maximum number of source chunks to use
 * @returns whether a summary was written, plus the skip reason when it was not
 */
async function enrichOne(e: EntityRow, k: number): Promise<{ done: boolean; skipped?: string }> {
  // Path A — entity_mentions JOIN chunks (the high-precision linker).
  let chunks = await query<ChunkRow>(
    `SELECT c.doc_id, c.chunk_id, c.page, c.type,
            c.content_en, c.content_pt, em.surface_form
       FROM public.entity_mentions em
       JOIN public.chunks c ON c.chunk_pk = em.chunk_pk
      WHERE em.entity_pk = $1
        AND LENGTH(COALESCE(c.content_en, c.content_pt, '')) > 80
      ORDER BY c.ufo_anomaly DESC NULLS LAST, c.page ASC, c.order_in_page ASC
      LIMIT $2`,
    [e.entity_pk, k],
  );

  // Path B — hybridSearch fallback. The wiki may know about an entity
  // (e.g. "Kenneth Arnold sighting") with high total_mentions counted from
  // the wiki frontmatter, but the entity_mentions extractor missed every
  // chunk. Search the corpus by canonical_name + aliases instead.
  if (chunks.length === 0) {
    const queryStr = [e.canonical_name, ...(e.aliases ?? []).slice(0, 3)].join(" ");
    try {
      const hits = await hybridSearch({
        query: queryStr,
        lang: "en",
        top_k: k,
        recall_k: 40,
        max_dense_dist: 0.5,
      });
      chunks = hits.map((h) => ({
        doc_id: h.doc_id,
        chunk_id: h.chunk_id,
        page: h.page,
        type: h.type,
        content_en: h.content_en,
        content_pt: h.content_pt,
        surface_form: null,
      }));
    } catch (err) {
      // A failed search is not the same as "no chunks exist". Bail out
      // without marking the row 'refused', which would exclude it from
      // every later non --force run.
      return { done: false, skipped: `search_error: ${describeError(err)}` };
    }
  }

  // Both paths came back empty: record the refusal so default runs skip it.
  if (chunks.length === 0) {
    await query(
      `UPDATE public.entities SET summary_status = 'refused',
              summary_generated_at = NOW() WHERE entity_pk = $1`,
      [e.entity_pk],
    );
    return { done: false, skipped: "no_chunks" };
  }

  let llmText: string;
  try {
    const llm = await callClaude({
      prompt: buildPrompt(e, chunks),
      model: env.CLAUDE_MODEL,
      allowedTools: [],
      timeoutMs: 90_000,
      budgetCapUsd: 0.05,
    });
    llmText = llm.text;
    // audit() shares this try block, so an audit failure also skips the
    // entity (reported as llm_error) and no un-audited summary is stored.
    await audit({
      event: "entity_summary_generated",
      entity_pk: e.entity_pk,
      entity_id: e.entity_id,
      entity_class: e.entity_class,
      cost_usd: llm.costUsd,
      tokens_in: llm.tokensIn,
      tokens_out: llm.tokensOut,
    });
  } catch (err) {
    return { done: false, skipped: `llm_error: ${describeError(err)}` };
  }

  let obj: { en: string; pt_br: string } | null;
  try {
    obj = extractObject(llmText);
  } catch (err) {
    return { done: false, skipped: `parse_error: ${describeError(err)}` };
  }

  // null means the model answered INSUFFICIENT: record the refusal and the model used.
  if (obj === null) {
    await query(
      `UPDATE public.entities SET summary_status = 'refused',
              summary_generated_at = NOW(), summary_model = $1 WHERE entity_pk = $2`,
      [env.CLAUDE_MODEL, e.entity_pk],
    );
    return { done: false, skipped: "INSUFFICIENT" };
  }

  await query(
    `UPDATE public.entities
        SET summary_en = $1, summary_pt_br = $2,
            summary_generated_at = NOW(),
            summary_model = $3,
            summary_status = 'ai_generated'
      WHERE entity_pk = $4`,
    [obj.en, obj.pt_br, env.CLAUDE_MODEL, e.entity_pk],
  );
  return { done: true };
}
|
||
|
|
|
||
|
|
/**
 * Entry point: enrich every selected entity class, one entity at a time.
 *
 * For each class it selects candidates from public.entities (at least
 * `min_mentions` mentions, most-mentioned first, capped by the CLI limit or
 * the class default), calls enrichOne on each, and logs the outcome. It ends
 * by printing totals and the most recently generated summary as a spot check.
 *
 * A class name that has no CONFIG entry is reported, skipped, and makes the
 * process exit with a non-zero status once the run finishes.
 */
async function main(): Promise<void> {
  const classes = filterClass ? [filterClass] : Object.keys(CONFIG);
  let totalOk = 0;
  let totalSkip = 0;

  for (const klass of classes) {
    const cfg = CONFIG[klass];
    if (!cfg) {
      console.error(`unknown class: ${klass}`);
      // Surface the bad argument to the caller instead of exiting 0.
      process.exitCode = 1;
      continue;
    }

    const limit = maxPerClass ?? cfg.max_per_class;
    // Default runs only touch rows never attempted (NULL) or 'pending'.
    // NOTE(review): with --force every status is selected, including
    // 'curated' — confirm that overwriting curated summaries is intended.
    const where = FORCE
      ? `WHERE entity_class = $1 AND total_mentions >= $2`
      : `WHERE entity_class = $1 AND total_mentions >= $2 AND (summary_status IS NULL OR summary_status = 'pending')`;
    const rows = await query<EntityRow>(
      `SELECT entity_pk, entity_id, entity_class, canonical_name, aliases,
              total_mentions, documents_count
         FROM public.entities
         ${where}
        ORDER BY total_mentions DESC, entity_id ASC
        LIMIT $3`,
      [klass, cfg.min_mentions, limit],
    );

    console.log(`[${klass}] ${rows.length} candidates`);
    // Entities are processed sequentially: one model call in flight at a time.
    for (const e of rows) {
      const r = await enrichOne(e, cfg.chunk_k);
      if (r.done) {
        totalOk += 1;
        console.log(`  ✓ ${e.entity_id} (${e.canonical_name})`);
      } else {
        totalSkip += 1;
        console.log(`  · ${e.entity_id} skip: ${r.skipped}`);
      }
    }
  }

  console.log(`\nDone. ok=${totalOk} skip=${totalSkip}`);

  // Print the single most recently generated summary. It is the latest
  // 'ai_generated' row in the table, which may predate this run.
  const sample = await queryOne<{ canonical_name: string; summary_en: string; summary_pt_br: string }>(
    `SELECT canonical_name, summary_en, summary_pt_br
       FROM public.entities
      WHERE summary_status = 'ai_generated'
      ORDER BY summary_generated_at DESC LIMIT 1`,
  );
  if (sample) {
    console.log(`\n=== latest example: ${sample.canonical_name} ===`);
    console.log(`EN: ${sample.summary_en}`);
    console.log(`PT: ${sample.summary_pt_br}`);
  }
}
|
||
|
|
|
||
|
|
/** Last-resort handler: log whatever escaped main() and exit with status 1. */
function reportFatal(err: unknown): never {
  console.error("fatal:", err);
  process.exit(1);
}

// Kick off the run; any rejection from main() ends the process with a failure code.
main().catch(reportFatal);
|