disclosure-bureau/web/lib/doc-summary.ts

124 lines
4.2 KiB
TypeScript
Raw Permalink Normal View History

/**
 * Extract a short summary from a wiki/documents/<id>.md body.
 *
 * Priority:
 * 1. `enthusiast_pitch_pt_br` / `enthusiast_pitch_en` in frontmatter (Johnny Harris-style,
 *    generated by scripts/34-generate-doc-pitches.py) — preferred when present.
 *    Read via pickPitch(); summarize() only looks at the body.
 * 2. `## Sumário Executivo (PT-BR)` (5 docs have this, synthesized by Sonnet 4.6)
 * 3. `## Executive Summary (EN)` (same 5 docs)
 * 4. `## Narrative Arc (EN)`
 * 5. First substantial paragraph in the body (skipping headings, blockquotes, callouts)
 *
 * The order above is for lang "pt". For lang "en", summarize() tries the EN sections
 * (3, then 4) before the PT-BR section (2).
 *
 * Strips markdown formatting (asterisks, backticks, wiki-links) and returns
 * a plain-text snippet capped at ~280 chars (3 lines of card width).
 *
 * pickPitch(): returns the Johnny Harris-style pitch directly (preserves markdown
 * for rich rendering in cards/wikis).
 */
// Longest summary (in string length units) that summarize() returns untruncated;
// anything longer is cut to at most this many characters and suffixed with "…".
const MAX_CHARS = 280;
/**
 * Reduce a Markdown fragment to single-line plain text.
 *
 * Rules are applied in this order:
 * wiki-links/embeds → alias (or target), images → alt text, inline links → link text,
 * emphasis/code markers dropped, per-line heading/list/blockquote markers dropped,
 * then every whitespace run (including newlines) collapsed to one space.
 *
 * @param s Markdown source; may span multiple lines.
 * @returns Trimmed plain text containing no newlines.
 */
function stripMd(s: string): string {
  return (
    s
      // wiki-links and embeds: [[target]], [[target|alias]], ![[target]] → display text.
      // The optional "!" is consumed so embeds do not leave a stray exclamation mark.
      .replace(/!?\[\[([^\]|]+?)(?:\|([^\]]+))?\]\]/g, (_full, target: string, alias?: string) =>
        (alias ?? target).trim(),
      )
      // images ![alt](url) → alt; must run before the link rule, which would keep the "!"
      .replace(/!\[([^\]]*?)\]\([^)]*?\)/g, "$1")
      // markdown links [text](url) → text
      .replace(/\[([^\]]+?)\]\([^)]*?\)/g, "$1")
      // emphasis, strikethrough & code markers
      .replace(/[*`~]+/g, "")
      // underscore emphasis only: a run of "_" is removed when it touches a non-word
      // boundary on either side, so snake_case words and ids keep their underscores
      .replace(/(?<![\p{L}\p{N}_])_+|_+(?![\p{L}\p{N}_])/gu, "")
      // heading hash
      .replace(/^#+\s*/gm, "")
      // bullets
      .replace(/^\s*[-*+]\s+/gm, "")
      // ordered list markers
      .replace(/^\s*\d+\.\s+/gm, "")
      // blockquote markers
      .replace(/^>\s*/gm, "")
      // collapse whitespace
      .replace(/\s+/g, " ")
      .trim()
  );
}
/**
 * Pull the plain-text content of one headed section out of a Markdown body.
 *
 * The section starts on the line after the first line matching `headingRe` and runs
 * until the next level-1 to level-3 heading (or end of input). Later lines that match
 * `headingRe` again are skipped rather than treated as a boundary.
 *
 * @param body Full Markdown text.
 * @param headingRe Pattern tested against each line to locate the section heading.
 * @returns The section text after stripMd(), or null when the heading is missing or
 *          the stripped text is shorter than 40 characters.
 */
function findSection(body: string, headingRe: RegExp): string | null {
  const rows = body.split("\n");
  const headingAt = rows.findIndex((row) => headingRe.test(row));
  // No heading → nothing captured → the stripped text would be empty, i.e. too short.
  if (headingAt === -1) return null;

  const sectionRows: string[] = [];
  for (const row of rows.slice(headingAt + 1)) {
    // A repeat of the target heading is dropped without closing the section.
    if (headingRe.test(row)) continue;
    // Any other #, ## or ### heading closes the section.
    if (/^#{1,3}\s/.test(row)) break;
    sectionRows.push(row);
  }

  const plain = stripMd(sectionRows.join("\n"));
  return plain.length < 40 ? null : plain;
}
/**
 * Pick a fallback summary paragraph from a Markdown body.
 *
 * A paragraph is a run of consecutive plain-text lines. It ends at a blank line, at the
 * end of input, or at a structural line (heading, blockquote/callout, table row, bullet
 * start); structural lines themselves are never part of a paragraph.
 *
 * Selection:
 * 1. the first paragraph whose stripped text is at least 80 characters;
 * 2. otherwise the last paragraph of the body, if its stripped text is at least 40
 *    characters;
 * 3. otherwise "".
 *
 * Rule 2 no longer depends on whether the body happens to end with a newline or blank
 * lines: previously a trailing newline silently disabled the 40-character fallback.
 *
 * @param body Markdown text (frontmatter is expected to be removed by the caller —
 *             this function does not strip it).
 * @returns Plain-text paragraph, or "" when nothing qualifies.
 */
function firstParagraph(body: string): string {
  const PREFERRED_MIN = 80;
  const FALLBACK_MIN = 40;

  const buffer: string[] = [];
  let lastParagraph = "";

  // Close the paragraph being accumulated; returns it only if it meets the preferred length.
  const flush = (): string | null => {
    if (buffer.length === 0) return null;
    const text = stripMd(buffer.join(" "));
    buffer.length = 0;
    lastParagraph = text;
    return text.length >= PREFERRED_MIN ? text : null;
  };

  for (const line of body.split("\n")) {
    const t = line.trim();
    const structural =
      /^#{1,6}\s/.test(t) || // headings
      t.startsWith(">") || // blockquotes / callouts
      t.startsWith("|") || // tables
      /^[-*+]\s/.test(t); // bullet starts

    if (!t || structural) {
      // Blank and structural lines both terminate the current paragraph, so text on
      // either side of a heading or table is never merged into one snippet.
      const hit = flush();
      if (hit !== null) return hit;
      continue;
    }
    buffer.push(t);
  }

  // Paragraph still open at end of input (body without trailing newline).
  const hit = flush();
  if (hit !== null) return hit;

  // Nothing reached the preferred length: accept the final paragraph at the lower bar.
  return lastParagraph.length >= FALLBACK_MIN ? lastParagraph : "";
}
/**
 * Return the Johnny Harris pitch from frontmatter if present, else null.
 *
 * The pitch for the requested language is tried first; if it is missing, not a string,
 * or 20 characters or fewer after trimming, the other language's pitch is tried under
 * the same rule. The returned string is trimmed but otherwise untouched.
 *
 * @param fm Parsed frontmatter, or undefined when the document has none.
 * @param lang Preferred language; anything other than "en" is treated as "pt".
 * @returns The trimmed pitch, or null when neither field qualifies.
 */
export function pickPitch(
  fm: Record<string, unknown> | undefined,
  lang: "pt" | "en" = "pt",
): string | null {
  if (!fm) return null;

  // Preferred language first, then the cross-language fallback:
  // any pitch (pt or en) beats a heuristic summary.
  const fieldOrder =
    lang === "en"
      ? ["enthusiast_pitch_en", "enthusiast_pitch_pt_br"]
      : ["enthusiast_pitch_pt_br", "enthusiast_pitch_en"];

  for (const field of fieldOrder) {
    const raw = fm[field];
    if (typeof raw !== "string") continue;
    const pitch = raw.trim();
    if (pitch.length > 20) return pitch;
  }
  return null;
}
/**
 * Build a plain-text card summary from a document body.
 *
 * Source order for lang "pt": `## Sumário Executivo (PT-BR)`, `## Executive Summary (EN)`,
 * `## Narrative Arc (EN)`, then firstParagraph(). For lang "en" the two EN sections are
 * tried before the PT-BR one. Sections are scanned lazily, so later candidates are only
 * searched when earlier ones are absent.
 *
 * Text longer than MAX_CHARS is cut at the last space inside the limit when that space
 * lies past index 200 (otherwise cut hard at the limit), and "…" is appended.
 *
 * @param body Markdown body of the document.
 * @param lang Preferred summary language; defaults to "pt".
 * @returns The summary, or "" when the body is empty or nothing suitable is found.
 */
export function summarize(body: string, lang: "pt" | "en" = "pt"): string {
  if (!body) return "";

  const ptSection = (): string | null =>
    findSection(body, /^##\s+Sum[áa]rio Executivo\s*\(PT-BR\)/i);
  const enSection = (): string | null => findSection(body, /^##\s+Executive Summary\s*\(EN\)/i);
  const narrativeSection = (): string | null =>
    findSection(body, /^##\s+Narrative Arc\s*\(EN\)/i);

  const chosen =
    lang === "pt"
      ? (ptSection() ?? enSection() ?? narrativeSection() ?? firstParagraph(body))
      : (enSection() ?? narrativeSection() ?? ptSection() ?? firstParagraph(body));

  if (!chosen) return "";
  if (chosen.length <= MAX_CHARS) return chosen;

  // truncate at last word boundary before MAX_CHARS; a boundary at or before index 200
  // would throw away too much text, so fall back to a hard cut in that case
  const cut = chosen.slice(0, MAX_CHARS);
  const lastSpace = cut.lastIndexOf(" ");
  let snippet = lastSpace > 200 ? cut.slice(0, lastSpace) : cut;

  // A hard cut can land between the two halves of a surrogate pair (emoji, rare CJK);
  // drop the dangling high surrogate so the output stays well-formed text.
  const tail = snippet.charCodeAt(snippet.length - 1);
  if (tail >= 0xd800 && tail <= 0xdbff) snippet = snippet.slice(0, -1);

  return snippet + "…";
}