disclosure-bureau/investigator-runtime/scripts/enrich_entity_summaries.ts

#!/usr/bin/env bun
/**
 * enrich_entity_summaries.ts — generate the bilingual narrative summary
 * each entity needs for the public-facing sub-pages (/sightings,
 * /witnesses, /objects, /locations, /operations).
 *
 * For each entity:
 *   1. Pull top N chunks where it appears via entity_mentions JOIN chunks,
 *      falling back to hybridSearch on the canonical name + aliases when
 *      no mention rows come back.
 *   2. Compose a prompt under the house style + the case-writer's voice
 *      rules (no detective names, no skeptic framing, scene-driven).
 *   3. Ask the model named by env.CLAUDE_MODEL for a 60-100 word bilingual
 *      JSON: { en, pt_br }.
 *   4. UPDATE public.entities (summary_en, summary_pt_br,
 *      summary_generated_at, summary_model, summary_status).
 *
 * Idempotent — only entities whose summary_status is NULL or 'pending'
 * are selected, so 'ai_generated', 'curated' and 'refused' rows are all
 * skipped. Pass --force to drop that status filter and re-generate them too.
 *
 * Usage:
 *   bun scripts/enrich_entity_summaries.ts                # all classes, default limits
 *   bun scripts/enrich_entity_summaries.ts event 30       # only events, max 30
 *   bun scripts/enrich_entity_summaries.ts uap_object 50  # only uap_objects, max 50
 *   bun scripts/enrich_entity_summaries.ts all --force    # re-enrich everything
 */
import { audit } from "../src/lib/audit";
import { callClaude } from "../src/lib/claude";
import { env } from "../src/lib/env";
import { query, queryOne } from "../src/lib/pg";
import { hybridSearch } from "../src/lib/search";

/**
 * Parse the optional positional "max entities per class" argument.
 *
 * @param raw The second positional CLI argument, if any.
 * @returns `null` when the argument is absent or empty (callers then fall
 *   back to the per-class default), otherwise the parsed integer.
 * @throws Error when the argument is not a non-negative integer. Without
 *   this check `parseInt` would hand `NaN` (or a negative number) to the
 *   SQL `LIMIT` placeholder and the run would die inside the database
 *   driver with a far less helpful message.
 */
function parseMaxPerClass(raw: string | undefined): number | null {
  if (!raw) return null;
  const parsed = parseInt(raw, 10);
  if (Number.isNaN(parsed) || parsed < 0) {
    throw new Error(
      `invalid max-per-class argument: ${JSON.stringify(raw)} (expected a non-negative integer)`,
    );
  }
  return parsed;
}

// `--force` is the only flag read here; every other `--…` token is ignored
// and the remaining positionals are [class, max-per-class].
const FORCE = process.argv.includes("--force");
const args = process.argv.slice(2).filter((a) => !a.startsWith("--"));
// "all" (or no argument) means: iterate every class listed in CONFIG.
const filterClass = args[0] && args[0] !== "all" ? args[0] : null;
const maxPerClass = parseMaxPerClass(args[1]);

/** Tuning knobs applied to one entity class. */
interface ClassConfig {
  /** Lower bound on `entities.total_mentions` for a row to be a candidate. */
  min_mentions: number;
  /** Default cap on candidates per run (a positional CLI argument overrides it). */
  max_per_class: number;
  /** How many source chunks are fetched for each entity's prompt. */
  chunk_k: number;
}

// Key order matters: when no class filter is given, classes are processed
// in the order they are listed here.
const CONFIG: Record<string, ClassConfig> = {
  event: {
    min_mentions: 2,
    max_per_class: 80,
    chunk_k: 8,
  },
  uap_object: {
    min_mentions: 1,
    max_per_class: 80,
    chunk_k: 8,
  },
  person: {
    min_mentions: 5,
    max_per_class: 120,
    chunk_k: 8,
  },
  location: {
    min_mentions: 8,
    max_per_class: 80,
    chunk_k: 6,
  },
  organization: {
    min_mentions: 5,
    max_per_class: 80,
    chunk_k: 6,
  },
};

/**
 * One candidate row selected from `public.entities` by `main()`.
 * Field names mirror the selected column names.
 */
interface EntityRow {
  /** Key used to join `entity_mentions` and to target the UPDATE statements. */
  entity_pk: number;
  /** Identifier printed in progress logs and recorded in the audit event. */
  entity_id: string;
  /** Class name; matched against the CONFIG keys and the prompt's label table. */
  entity_class: string;
  /** Display name; shown in the prompt and used as the fallback search query. */
  canonical_name: string;
  /** Alternate names, possibly absent; the prompt lists up to 6, the fallback search uses up to 3. */
  aliases: string[] | null;
  /** Compared against the class's `min_mentions` and used to order candidates. */
  total_mentions: number;
  /** Selected alongside the other columns; not otherwise read in this file. */
  documents_count: number;
}

/**
 * One source chunk fed into the prompt, coming either from the
 * `entity_mentions JOIN chunks` query or from a `hybridSearch` hit.
 */
interface ChunkRow {
  /** Document part of the `[[doc_id/pNNN#chunk_id]]` source marker. */
  doc_id: string;
  /** Chunk part of the source marker. */
  chunk_id: string;
  /** Page number; zero-padded to three digits in the source marker. */
  page: number;
  /** Carried through from the row/hit; not read by the prompt builder. */
  type: string;
  /** English text; preferred over `content_pt` when present. */
  content_en: string | null;
  /** Portuguese text; used when `content_en` is null. */
  content_pt: string | null;
  /** Surface form from `entity_mentions`; null for search-fallback chunks. */
  surface_form: string | null;
}

/**
 * Writing rules appended verbatim to the end of every prompt.
 * The text both starts and ends with a newline, which is why the line
 * list below is bracketed by empty entries.
 */
const HOUSE_STYLE = [
  "",
  "Style rules (mandatory):",
  "- Plainspoken, scene-driven, factual. Voice: Erik Larson / John McPhee non-fiction.",
  `- NO em dashes used as commas. NO rule-of-three lists. NO "Moreover/Notably/Em suma".`,
  "- NO promotional adjectives (robust, comprehensive, multifaceted, marco histórico).",
  `- NO superficial -ing analyses ("marking a shift", "destacando").`,
  "- NO skeptic framing, no detective names, no probability tables.",
  "- PT-BR is Brazilian Portuguese with UTF-8 accents preserved (ç, ã, á, é, í, ó, ú).",
  "- Verbatim chunk content stays in source language. Citation idiom [[doc-id/pNNN#cNNNN]] only when quoting.",
  "",
].join("\n");

/**
 * Assemble the Markdown prompt sent to the model for one entity.
 *
 * Layout: a subject header (class label, canonical name, up to 6 aliases),
 * one section per source chunk (source marker, optional surface form, up to
 * 800 characters of text), the task instructions, then HOUSE_STYLE.
 *
 * @param e      The entity being summarised.
 * @param chunks Source chunks to quote from, in the order they should appear.
 * @returns The full prompt text.
 */
function buildPrompt(e: EntityRow, chunks: ChunkRow[]): string {
  // Unknown classes fall back to the raw class name.
  const classLabel = ({
    event: "incident / sighting",
    uap_object: "described craft / object",
    person: "named witness / participant",
    location: "place where incidents are documented",
    organization: "agency / program / unit",
  } as Record<string, string>)[e.entity_class] ?? e.entity_class;

  // Optional lines are represented as null and removed; "" entries are the
  // deliberate blank-line separators and must survive. (A truthiness filter
  // would strip both and collapse the Markdown into one dense paragraph.)
  const isPresent = (line: string | null): line is string => line !== null;

  const block = chunks.map((c, i) => {
    // Prefer English, but fall through to Portuguese when the English text
    // is missing or blank rather than emitting an empty chunk.
    const english = c.content_en !== null && c.content_en.trim() !== "" ? c.content_en : null;
    const text = (english ?? c.content_pt ?? "").slice(0, 800);
    const pageStr = String(c.page).padStart(3, "0");
    const lines: Array<string | null> = [
      `--- chunk ${i + 1} ---`,
      `source: [[${c.doc_id}/p${pageStr}#${c.chunk_id}]]`,
      c.surface_form ? `surface_form_in_chunk: ${c.surface_form}` : null,
      "",
      text,
    ];
    return lines.filter(isPresent).join("\n");
  }).join("\n\n");

  const sections: Array<string | null> = [
    `# Subject of the summary`,
    "",
    `**Class.** ${classLabel}`,
    `**Canonical name.** ${e.canonical_name}`,
    e.aliases && e.aliases.length > 0 ? `**Aliases.** ${e.aliases.slice(0, 6).join(", ")}` : null,
    "",
    `## Source chunks (${chunks.length})`,
    "",
    block,
    "",
    "## Your task",
    "",
    "Write a single 60-100 word narrative summary in BOTH English and",
    "Brazilian Portuguese, drawn directly from the source chunks above.",
    "Open the EN version with a specific concrete fact (date, place, person,",
    "shape, action) — NOT with the entity's name as the first word. Same for",
    "PT-BR. The two versions must say the same thing.",
    "",
    "Emit a strict JSON object. No prose around it. No code fence.",
    "",
    "```json",
    `{"en": "...60-100 words...", "pt_br": "...60-100 palavras..."}`,
    "```",
    "",
    "If the source chunks are too thin to write something substantive",
    `(e.g. only one chunk and it's a stamp or address block), emit the`,
    "literal word INSUFFICIENT and stop.",
    "",
    HOUSE_STYLE,
  ];
  return sections.filter(isPresent).join("\n");
}

/**
 * Parse the model's reply into the bilingual summary pair.
 *
 * Accepts a bare JSON object, a JSON object wrapped in a Markdown code
 * fence, or JSON with stray prose around it (the outermost `{ … }` span is
 * parsed).
 *
 * @param text Raw model output.
 * @returns The trimmed `{ en, pt_br }` pair, or `null` when the model
 *   answered with the INSUFFICIENT sentinel (bare, back-ticked or fenced).
 * @throws Error when no `{ … }` span exists, when `en`/`pt_br` are missing,
 *   not strings, or blank; SyntaxError when the span is not valid JSON.
 */
function extractObject(text: string): { en: string; pt_br: string } | null {
  const t = text.trim();
  // Strip a surrounding code fence first so a fenced refusal
  // ("```\nINSUFFICIENT\n```") is recognised just like a bare one.
  const stripped = t
    .replace(/^```(?:json)?\s*\n?/i, "")
    .replace(/\n?```\s*$/i, "")
    .trim();
  if (/^`?INSUFFICIENT`?\b/i.test(stripped)) return null;

  const first = stripped.indexOf("{");
  const last = stripped.lastIndexOf("}");
  // `last < first` also covers "no closing brace" (-1) and a "}" that only
  // appears before the first "{", which would otherwise slice to "".
  if (first === -1 || last < first) throw new Error(`no JSON object: ${t.slice(0, 200)}`);

  const parsed: unknown = JSON.parse(stripped.slice(first, last + 1));
  if (typeof parsed !== "object" || parsed === null) {
    throw new Error("JSON missing en/pt_br fields");
  }
  const { en, pt_br } = parsed as { en?: unknown; pt_br?: unknown };
  if (typeof en !== "string" || typeof pt_br !== "string") {
    throw new Error("JSON missing en/pt_br fields");
  }

  const summary = { en: en.trim(), pt_br: pt_br.trim() };
  // A blank summary must not be treated as a successful generation.
  if (summary.en === "" || summary.pt_br === "") {
    throw new Error("JSON has empty en/pt_br fields");
  }
  return summary;
}

/**
 * Reduce an arbitrary thrown value to a short message for the skip log.
 * Thrown values are not guaranteed to be Error instances, so fall back to
 * String(err) instead of reading `.message` off an unknown shape.
 */
function describeError(err: unknown): string {
  const message = err instanceof Error ? err.message : String(err);
  return message.slice(0, 80);
}

/**
 * Generate and store the bilingual summary for a single entity.
 *
 * @param e The entity row to enrich.
 * @param k Maximum number of source chunks to put in the prompt.
 * @returns `{ done: true }` once the summary has been written, otherwise
 *   `{ done: false, skipped }` where `skipped` is one of:
 *   - `"no_chunks"`      — neither lookup returned text; row marked 'refused'.
 *   - `"INSUFFICIENT"`   — the model declined; row marked 'refused'.
 *   - `"search_error: …"`, `"llm_error: …"`, `"audit_error: …"`,
 *     `"parse_error: …"` — a failure along the way; the row is left untouched.
 *
 * Errors thrown by `query` itself are not caught here and propagate to the
 * caller.
 */
async function enrichOne(e: EntityRow, k: number): Promise<{ done: boolean; skipped?: string }> {
  // Path A — entity_mentions JOIN chunks (the high-precision linker).
  let chunks = await query<ChunkRow>(
    `SELECT c.doc_id, c.chunk_id, c.page, c.type,
            c.content_en, c.content_pt, em.surface_form
       FROM public.entity_mentions em
       JOIN public.chunks c ON c.chunk_pk = em.chunk_pk
      WHERE em.entity_pk = $1
        AND LENGTH(COALESCE(c.content_en, c.content_pt, '')) > 80
      ORDER BY c.ufo_anomaly DESC NULLS LAST, c.page ASC, c.order_in_page ASC
      LIMIT $2`,
    [e.entity_pk, k],
  );

  // Path B — hybridSearch fallback. The wiki may know about an entity
  // (e.g. "Kenneth Arnold sighting") with high total_mentions counted from
  // the wiki frontmatter, but the entity_mentions extractor missed every
  // chunk. Search the corpus by canonical_name + aliases instead.
  if (chunks.length === 0) {
    const queryStr = [e.canonical_name, ...(e.aliases ?? []).slice(0, 3)].join(" ");
    try {
      const hits = await hybridSearch({
        query: queryStr,
        lang: "en",
        top_k: k,
        recall_k: 40,
        max_dense_dist: 0.5,
      });
      chunks = hits.map((h) => ({
        doc_id: h.doc_id,
        chunk_id: h.chunk_id,
        page: h.page,
        type: h.type,
        content_en: h.content_en,
        content_pt: h.content_pt,
        surface_form: null,
      }));
    } catch (err) {
      // A failed search is not the same as "no chunks exist": report it and
      // leave the row alone instead of marking the entity 'refused'.
      return { done: false, skipped: `search_error: ${describeError(err)}` };
    }
  }

  if (chunks.length === 0) {
    await query(
      `UPDATE public.entities SET summary_status = 'refused',
              summary_generated_at = NOW() WHERE entity_pk = $1`,
      [e.entity_pk],
    );
    return { done: false, skipped: "no_chunks" };
  }

  let llm: Awaited<ReturnType<typeof callClaude>>;
  try {
    llm = await callClaude({
      prompt: buildPrompt(e, chunks),
      model: env.CLAUDE_MODEL,
      allowedTools: [],
      timeoutMs: 90_000,
      budgetCapUsd: 0.05,
    });
  } catch (err) {
    return { done: false, skipped: `llm_error: ${describeError(err)}` };
  }

  // Audit in its own try so a failure here is reported as what it is. The
  // summary is still not written when the audit record cannot be stored.
  try {
    await audit({
      event: "entity_summary_generated",
      entity_pk: e.entity_pk,
      entity_id: e.entity_id,
      entity_class: e.entity_class,
      cost_usd: llm.costUsd,
      tokens_in: llm.tokensIn,
      tokens_out: llm.tokensOut,
    });
  } catch (err) {
    return { done: false, skipped: `audit_error: ${describeError(err)}` };
  }

  let obj: { en: string; pt_br: string } | null;
  try {
    obj = extractObject(llm.text);
  } catch (err) {
    return { done: false, skipped: `parse_error: ${describeError(err)}` };
  }
  if (obj === null) {
    // The model judged the chunks too thin; record the refusal and the model.
    await query(
      `UPDATE public.entities SET summary_status = 'refused',
              summary_generated_at = NOW(), summary_model = $1 WHERE entity_pk = $2`,
      [env.CLAUDE_MODEL, e.entity_pk],
    );
    return { done: false, skipped: "INSUFFICIENT" };
  }

  await query(
    `UPDATE public.entities
        SET summary_en = $1, summary_pt_br = $2,
            summary_generated_at = NOW(),
            summary_model = $3,
            summary_status = 'ai_generated'
      WHERE entity_pk = $4`,
    [obj.en, obj.pt_br, env.CLAUDE_MODEL, e.entity_pk],
  );
  return { done: true };
}

/**
 * Drive one enrichment run.
 *
 * For each requested class (the CLI filter, or every CONFIG key), select
 * candidate entities ordered by mention count, enrich them one at a time,
 * and log a line per entity. Finishes with totals and the most recently
 * generated summary as a spot check.
 *
 * Without --force only rows whose summary_status is NULL or 'pending' are
 * selected; with --force the status filter is dropped entirely.
 * A class name that is not in CONFIG is reported and skipped.
 */
async function main() {
  const targetClasses = filterClass ? [filterClass] : Object.keys(CONFIG);
  const tally = { ok: 0, skip: 0 };

  const baseWhere = `WHERE entity_class = $1 AND total_mentions >= $2`;
  const whereClause = FORCE
    ? baseWhere
    : `${baseWhere} AND (summary_status IS NULL OR summary_status = 'pending')`;

  for (const klass of targetClasses) {
    const classCfg = CONFIG[klass];
    if (!classCfg) {
      console.error(`unknown class: ${klass}`);
      continue;
    }

    // A positional CLI limit, when given, wins over the class default.
    const candidates = await query<EntityRow>(
      `SELECT entity_pk, entity_id, entity_class, canonical_name, aliases,
              total_mentions, documents_count
         FROM public.entities
         ${whereClause}
        ORDER BY total_mentions DESC, entity_id ASC
        LIMIT $3`,
      [klass, classCfg.min_mentions, maxPerClass ?? classCfg.max_per_class],
    );
    console.log(`[${klass}] ${candidates.length} candidates`);

    // Entities are processed strictly one after another.
    for (const entity of candidates) {
      const outcome = await enrichOne(entity, classCfg.chunk_k);
      if (!outcome.done) {
        tally.skip += 1;
        console.log(`  · ${entity.entity_id} skip: ${outcome.skipped}`);
        continue;
      }
      tally.ok += 1;
      console.log(`  ✓ ${entity.entity_id} (${entity.canonical_name})`);
    }
  }

  console.log(`\nDone. ok=${tally.ok} skip=${tally.skip}`);

  // Show the single most recently generated summary as a spot check.
  const latest = await queryOne<{ canonical_name: string; summary_en: string; summary_pt_br: string }>(
    `SELECT canonical_name, summary_en, summary_pt_br
       FROM public.entities
      WHERE summary_status = 'ai_generated'
      ORDER BY summary_generated_at DESC LIMIT 1`,
  );
  if (!latest) return;

  console.log(`\n=== latest example: ${latest.canonical_name} ===`);
  console.log(`EN: ${latest.summary_en}`);
  console.log(`PT: ${latest.summary_pt_br}`);
}

/** Last-resort handler: log whatever escaped main() and exit non-zero. */
function reportFatal(err: unknown): never {
  console.error("fatal:", err);
  process.exit(1);
}

// Script entry point.
main().catch(reportFatal);