disclosure-bureau/web/lib/retrieval/entity-pages.ts
Luiz Gustavo a7e9dce6d2 rebuild entity layer from Sonnet-vision reextract pipeline
Add reextract pipeline (scripts/reextract/) that rebuilds doc-level entity
JSON from Sonnet-vision chunks via Opus, replacing the noisy per-page
extraction. Add synthesize scripts to regenerate wiki/entities from the 116
_reextract.json (30), aggregate missing page.md from chunks (31), and reprocess
805 pages the doc-rebuilder agent dropped on context overflow (32). Add
maintain scripts 43-56 for chunk-page sync, dedup, generic-entity marking, and
typed relation extraction.

Web: wire relations API + entity-relations component; entity/timeline/doc
pages consume the rebuilt layer.

Note: raw/, processing/, wiki/ remain gitignored (bulk data managed
separately); the 116 reextract JSONs and 7,798 rebuilt entity files live on
disk only. The 27 curated anchor events under wiki/entities/events/ are
preserved.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-21 12:20:24 -03:00

262 lines
8.8 KiB
TypeScript

/**
* Entity page data — SINGLE SOURCE OF TRUTH is the YAML frontmatter on disk.
*
* Why YAML and not the DB? Because the corpus has TWO independent extraction
* layers (Haiku page-level, Sonnet chunk-level) and each catches a different
* subset of entities. The DB's entity_mentions table is one of those signals —
* useful for chat retrieval but incomplete for the entity catalog itself.
*
* Reading from disk lets us merge every signal into one stat (`total_mentions`)
* via the maintain/42_sync_entity_stats.py pipeline and serve consistent
* numbers everywhere in the UI.
*
* The DB is still queried for ONE thing: the actual chunk text for previews,
* because we don't want to re-parse 21k chunk files on every page render.
*/
import fs from "node:fs/promises";
import path from "node:path";
import matter from "gray-matter";
import { pgQuery } from "./db";
import { WIKI } from "@/lib/wiki";
/**
 * Maps an `entity_class` value to the folder name used for that class under
 * `<WIKI>/entities/`. Note the folder names are pluralised and hyphenated
 * (`uap_object` → `uap-objects`), so the class string cannot be used as a
 * path segment directly. `readEntityYaml` treats a class with no entry here
 * as unknown and returns null for it.
 */
const FOLDER_BY_CLASS: Record<string, string> = {
person: "people",
organization: "organizations",
location: "locations",
event: "events",
uap_object: "uap-objects",
vehicle: "vehicles",
operation: "operations",
concept: "concepts",
};
/**
 * Entity card as returned by `getEntityCore`.
 *
 * Every field except `entity_pk` is read from the entity's YAML frontmatter;
 * `entity_pk` comes from a best-effort lookup in `public.entities`.
 */
export interface EntityCore {
entity_pk: number | null; // db-side primary key; null if entity is wiki-only
// Class and id exactly as passed to getEntityCore (not re-read from the YAML).
entity_class: string;
entity_id: string;
// Falls back to entity_id when the frontmatter has no string canonical_name.
canonical_name: string;
aliases: string[];
// Numeric stats default to 0 when absent or non-numeric in the frontmatter.
total_mentions: number;
documents_count: number;
// Any value outside this union (or a missing value) is reported as "unverified".
signal_strength: "strong" | "weak" | "orphan" | "unverified";
// Per-source counters read from the frontmatter's signal_sources map; each defaults to 0.
signal_sources: {
db_chunks: number;
page_refs: number;
cross_refs: number;
text_refs: number;
};
mentioned_in: string[]; // [[doc-id/p007]] — structured page refs (Haiku)
text_mentioned_in: string[]; // [[doc-id/p007]] — text-only matches (back-fill)
referenced_by: string[]; // [[class/id]] cross-links
// The remaining fields are null when missing, non-string, or blank.
enrichment_status: string | null;
narrative_summary: string | null;
narrative_summary_pt_br: string | null;
summary_status: string | null;
}
/**
 * Untyped frontmatter bag, as produced by gray-matter's `.data` in
 * `readEntityYaml`. Values are `unknown` on purpose: callers narrow each
 * field with `num`, `arr`, `strOrNull` or a `typeof` check before use.
 */
interface RawFm {
[k: string]: unknown;
}
/**
 * Coerce a loosely-typed frontmatter value to a finite number.
 *
 * Finite numbers pass through unchanged and numeric strings (e.g. `"12"`,
 * `" 8 "`) are converted with `Number()`. Everything else — `NaN`,
 * `±Infinity`, non-numeric strings, blank strings, and non-string/non-number
 * values — yields `fallback`.
 *
 * @param v - Value to coerce.
 * @param fallback - Returned when `v` cannot be read as a finite number.
 * @returns The finite number represented by `v`, or `fallback`.
 */
function num(v: unknown, fallback = 0): number {
  if (typeof v === "number") return Number.isFinite(v) ? v : fallback;
  if (typeof v === "string") {
    // Number("") and Number("   ") evaluate to 0, which would silently bypass
    // the caller's fallback; treat blank strings as "no value" instead.
    if (v.trim() === "") return fallback;
    const parsed = Number(v);
    return Number.isFinite(parsed) ? parsed : fallback;
  }
  return fallback;
}
/**
 * Narrow an unknown frontmatter value to a list of strings.
 *
 * @param v - Value to inspect.
 * @returns A new array holding only the string elements of `v`, in order;
 *   an empty array when `v` is not an array at all.
 */
function arr(v: unknown): string[] {
  if (!Array.isArray(v)) return [];
  const strings: string[] = [];
  for (const item of v) {
    if (typeof item === "string") strings.push(item);
  }
  return strings;
}
/**
 * Return `v` when it is a string with at least one non-whitespace character.
 *
 * The string is returned as-is (not trimmed); trimming is only used to decide
 * whether it counts as blank.
 *
 * @param v - Value to inspect.
 * @returns The original string, or null for non-strings and blank strings.
 */
function strOrNull(v: unknown): string | null {
  if (typeof v !== "string") return null;
  const isBlank = v.trim().length === 0;
  return isBlank ? null : v;
}
/**
 * Read and parse the YAML frontmatter of one entity file,
 * `<WIKI>/entities/<folder>/<entityId>.md`.
 *
 * @param entityClass - Entity class; must be an own key of `FOLDER_BY_CLASS`.
 * @param entityId - Entity id, used as the file's base name.
 * @returns The parsed frontmatter, or null when the class is unknown, the id
 *   resolves outside the class folder, or the file cannot be read or parsed.
 */
async function readEntityYaml(entityClass: string, entityId: string): Promise<RawFm | null> {
  // Own-property lookup only: a bare FOLDER_BY_CLASS[entityClass] resolves
  // inherited keys such as "constructor" or "toString" to truthy non-string
  // values, which would make the path call below throw outside the try block.
  if (!Object.prototype.hasOwnProperty.call(FOLDER_BY_CLASS, entityClass)) return null;
  const folder = FOLDER_BY_CLASS[entityClass];
  if (!folder) return null;

  // Both arguments originate from the request route, so refuse ids that
  // resolve outside the class folder (e.g. "../../secrets").
  const baseDir = path.resolve(WIKI, "entities", folder);
  const filePath = path.resolve(baseDir, `${entityId}.md`);
  if (!filePath.startsWith(baseDir + path.sep)) return null;

  try {
    const raw = await fs.readFile(filePath, "utf-8");
    return matter(raw).data as RawFm;
  } catch {
    // Missing file, unreadable file, or malformed frontmatter: callers treat
    // all of these as "no such entity".
    return null;
  }
}
/**
 * Build the entity card for one entity from its YAML frontmatter.
 *
 * Resolves to null whenever `readEntityYaml` yields nothing, which keeps the
 * route handler down to a single null check.
 * NOTE(review): the previous comment said archived entities also yield null.
 * Nothing here inspects an archived flag, so that presumably relies on
 * archived files no longer being present in the class folder — confirm.
 *
 * @param entityClass - Entity class, echoed back as `entity_class`.
 * @param entityId - Entity id, echoed back as `entity_id` and used as the
 *   `canonical_name` fallback.
 * @returns The populated card, or null when no frontmatter could be loaded.
 */
export async function getEntityCore(
  entityClass: string,
  entityId: string,
): Promise<EntityCore | null> {
  const frontmatter = await readEntityYaml(entityClass, entityId);
  if (!frontmatter) return null;

  // The DB primary key is optional extra data that getEntityChunks needs.
  // A wiki-only entity, or any DB failure, simply leaves it null.
  const lookupEntityPk = async (): Promise<number | null> => {
    try {
      const rows = await pgQuery<{ entity_pk: number }>(
        `SELECT entity_pk FROM public.entities
WHERE entity_class = $1 AND entity_id = $2 LIMIT 1`,
        [entityClass, entityId],
      );
      return rows[0]?.entity_pk ?? null;
    } catch {
      return null;
    }
  };
  const entity_pk = await lookupEntityPk();

  // Accept only the four known strength labels; everything else is "unverified".
  const knownStrengths: readonly string[] = ["strong", "weak", "orphan", "unverified"];
  const rawStrength = frontmatter.signal_strength;
  const signal_strength: EntityCore["signal_strength"] =
    typeof rawStrength === "string" && knownStrengths.includes(rawStrength)
      ? (rawStrength as EntityCore["signal_strength"])
      : "unverified";

  const sources = (frontmatter.signal_sources as Record<string, unknown> | undefined) ?? {};
  const sourceCount = (key: string): number => num(sources[key], 0);

  const rawName = frontmatter.canonical_name;
  const canonical_name = typeof rawName === "string" ? rawName : entityId;

  return {
    entity_pk,
    entity_class: entityClass,
    entity_id: entityId,
    canonical_name,
    aliases: arr(frontmatter.aliases),
    total_mentions: num(frontmatter.total_mentions, 0),
    documents_count: num(frontmatter.documents_count, 0),
    signal_strength,
    signal_sources: {
      db_chunks: sourceCount("db_chunks"),
      page_refs: sourceCount("page_refs"),
      cross_refs: sourceCount("cross_refs"),
      text_refs: sourceCount("text_refs"),
    },
    mentioned_in: arr(frontmatter.mentioned_in),
    text_mentioned_in: arr(frontmatter.text_mentioned_in),
    referenced_by: arr(frontmatter.referenced_by),
    enrichment_status: strOrNull(frontmatter.enrichment_status),
    narrative_summary: strOrNull(frontmatter.narrative_summary),
    narrative_summary_pt_br: strOrNull(frontmatter.narrative_summary_pt_br),
    summary_status: strOrNull(frontmatter.summary_status),
  };
}
/**
 * One document's worth of reverse-references to an entity, as returned by
 * `getEntityMentionsByDoc`.
 */
export interface EntityMentionGroup {
doc_id: string;
// Document metadata read from wiki/documents/<doc-id>.md; all four stay null
// when that file is missing or cannot be parsed.
canonical_title: string | null;
collection: string | null;
page_count: number | null; // also null when the frontmatter value is 0 or non-numeric
classification: string | null; // taken from the document's highest_classification field
mention_count: number; // number of distinct pages referencing the entity (= pages.length)
pages: number[]; // distinct page numbers, ascending; structured and text refs merged
text_only: boolean; // true when all refs came from the text back-fill (no structured Haiku evidence)
}
/**
 * Group an entity's reverse-references by document.
 *
 * References come from the entity YAML's `mentioned_in[]` (structured) and
 * `text_mentioned_in[]` (text back-fill) lists. Each entry is a wikilink such
 * as `[[doc-id/p007]]` or `[[doc-id/p007|label]]`; entries without wikilink
 * delimiters are parsed as a bare `doc-id/p007` target. Groups are ranked by
 * distinct page count (descending, ties keep first-seen order), truncated to
 * `limit`, and only the surviving groups are enriched with metadata from
 * `wiki/documents/<doc-id>.md`.
 *
 * @param entityClass - Entity class, forwarded to `readEntityYaml`.
 * @param entityId - Entity id, forwarded to `readEntityYaml`.
 * @param limit - Maximum number of document groups to return.
 * @returns Document groups, most-referenced first; empty when the entity has
 *   no readable frontmatter.
 */
export async function getEntityMentionsByDoc(
  entityClass: string,
  entityId: string,
  limit = 100,
): Promise<EntityMentionGroup[]> {
  const fm = await readEntityYaml(entityClass, entityId);
  if (!fm) return [];

  // Matches "[[target]]" and "[[target|label]]"; group 1 is the target only.
  // Without the optional label part an aliased link would not match at all
  // and the raw string (brackets included) would be mistaken for a doc id.
  const wikilink = /\[\[([^\]|]+)(?:\|[^\]]*)?\]\]/;
  const byDoc = new Map<string, { structured: Set<number>; text: Set<number> }>();

  const addRef = (ref: string, source: "structured" | "text"): void => {
    const match = wikilink.exec(ref);
    const target = (match?.[1] ?? ref).trim();
    const [docId, pageStr] = target.split("/", 2);
    if (!docId) return;
    // "p007" -> 7. A ref without a page part still registers the document.
    const pageNum = pageStr ? parseInt(pageStr.replace(/^p/, ""), 10) : NaN;
    let sets = byDoc.get(docId);
    if (!sets) {
      sets = { structured: new Set(), text: new Set() };
      byDoc.set(docId, sets);
    }
    if (Number.isFinite(pageNum)) sets[source].add(pageNum);
  };
  for (const ref of arr(fm.mentioned_in)) addRef(ref, "structured");
  for (const ref of arr(fm.text_mentioned_in)) addRef(ref, "text");

  // Read the four metadata fields of one document; all null on any failure.
  const readDocMeta = async (
    docId: string,
  ): Promise<
    Pick<EntityMentionGroup, "canonical_title" | "collection" | "page_count" | "classification">
  > => {
    try {
      const docRaw = await fs.readFile(path.join(WIKI, "documents", `${docId}.md`), "utf-8");
      const dfm = matter(docRaw).data as Record<string, unknown>;
      return {
        canonical_title: strOrNull(dfm.canonical_title),
        collection: strOrNull(dfm.collection),
        page_count: num(dfm.page_count, 0) || null,
        classification: strOrNull(dfm.highest_classification),
      };
    } catch {
      // Doc file missing or unparsable — the caller still has the raw doc id.
      return { canonical_title: null, collection: null, page_count: null, classification: null };
    }
  };

  // Rank and truncate BEFORE touching the filesystem so that at most `limit`
  // document files are read, instead of one per referenced document.
  const ranked = Array.from(byDoc, ([docId, sets]) => {
    const merged = new Set<number>([...sets.structured, ...sets.text]);
    return {
      docId,
      pages: Array.from(merged).sort((a, b) => a - b),
      textOnly: sets.structured.size === 0 && sets.text.size > 0,
    };
  });
  ranked.sort((a, b) => b.pages.length - a.pages.length);

  // The per-document reads are independent, so run them concurrently;
  // Promise.all preserves the ranked order in its result.
  return Promise.all(
    ranked.slice(0, limit).map(async ({ docId, pages, textOnly }) => {
      const meta = await readDocMeta(docId);
      return {
        doc_id: docId,
        canonical_title: meta.canonical_title,
        collection: meta.collection,
        page_count: meta.page_count,
        classification: meta.classification,
        mention_count: pages.length,
        pages,
        text_only: textOnly,
      };
    }),
  );
}
/**
 * Row shape returned by `getEntityChunks`: one entry per chunk linked to the
 * entity through `public.entity_mentions`. Field names mirror the columns
 * selected from `public.chunks`.
 */
export interface EntityChunkPreview {
chunk_pk: number;
doc_id: string;
chunk_id: string;
page: number;
type: string;
// NOTE(review): bbox units and origin are not established in this file — confirm with the chunk producer.
bbox: { x: number; y: number; w: number; h: number } | null;
classification: string | null;
// Chunk text in two languages (presumably Portuguese and English, per the column suffixes).
content_pt: string | null;
content_en: string | null;
ufo_anomaly: boolean | null;
ufo_anomaly_type: string | null;
}
/**
 * Fetch preview rows for the chunks linked to an entity in the DB.
 *
 * Chunk text is served from the database rather than re-parsed from chunk
 * files at request time. Rows are ordered with `ufo_anomaly` descending
 * (NULLs last), then by document id and global chunk order.
 *
 * @param entityPk - DB primary key of the entity; null/undefined means the
 *   entity is not indexed in the DB.
 * @param limit - Maximum number of rows to return.
 * @returns The matching chunk previews, or an empty array when `entityPk`
 *   is null/undefined (no query is issued in that case).
 */
export async function getEntityChunks(
  entityPk: number | null,
  limit = 30,
): Promise<EntityChunkPreview[]> {
  // Loose `!= null` on purpose: it also rejects undefined.
  const isIndexed = entityPk != null;
  if (!isIndexed) return [];

  const chunkPreviewSql = `SELECT
c.chunk_pk, c.doc_id, c.chunk_id, c.page, c.type, c.bbox, c.classification,
c.content_pt, c.content_en, c.ufo_anomaly, c.ufo_anomaly_type
FROM public.entity_mentions em
JOIN public.chunks c ON c.chunk_pk = em.chunk_pk
WHERE em.entity_pk = $1
ORDER BY c.ufo_anomaly DESC NULLS LAST, c.doc_id, c.order_global
LIMIT $2`;
  return pgQuery<EntityChunkPreview>(chunkPreviewSql, [entityPk, limit]);
}
// Backwards-compat re-export: findEntity is implemented in ./graph, but is
// still exposed from this module for callers that imported it from the old path.
export { findEntity } from "./graph";