Add reextract pipeline (scripts/reextract/) that rebuilds doc-level entity JSON from Sonnet-vision chunks via Opus, replacing the noisy per-page extraction. Add synthesize scripts to regenerate wiki/entities from the 116 _reextract.json (30), aggregate missing page.md from chunks (31), and reprocess 805 pages the doc-rebuilder agent dropped on context overflow (32). Add maintain scripts 43-56 for chunk-page sync, dedup, generic-entity marking, and typed relation extraction. Web: wire relations API + entity-relations component; entity/timeline/doc pages consume the rebuilt layer. Note: raw/, processing/, wiki/ remain gitignored (bulk data managed separately); the 116 reextract JSONs and 7,798 rebuilt entity files live on disk only. The 27 curated anchor events under wiki/entities/events/ are preserved. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
262 lines
8.8 KiB
TypeScript
262 lines
8.8 KiB
TypeScript
/**
 * Entity page data — the SINGLE SOURCE OF TRUTH is the YAML frontmatter on disk.
 *
 * Why YAML and not the DB? Because the corpus has TWO independent extraction
 * layers (Haiku page-level, Sonnet chunk-level) and each catches a different
 * subset of entities. The DB's entity_mentions table is one of those signals —
 * useful for chat retrieval but incomplete for the entity catalog itself.
 *
 * Reading from disk lets us merge every signal into one stat (`total_mentions`)
 * via the maintain/42_sync_entity_stats.py pipeline and serve consistent
 * numbers everywhere in the UI.
 *
 * The DB is still queried for ONE thing: the actual chunk text for previews
 * (plus the entity_pk needed to look those chunks up), because we don't want
 * to re-parse 21k chunk files on every page render.
 */
|
|
import fs from "node:fs/promises";
|
|
import path from "node:path";
|
|
import matter from "gray-matter";
|
|
import { pgQuery } from "./db";
|
|
import { WIKI } from "@/lib/wiki";
|
|
|
|
/**
 * Maps an entity class to the sub-folder of `entities/` that holds its pages.
 *
 * The table is a null-prototype object so that looking up an arbitrary string
 * (for example "constructor" or "toString") yields `undefined` instead of an
 * inherited Object.prototype member, and it is frozen because it is a
 * constant lookup table.
 */
const FOLDER_BY_CLASS: Readonly<Record<string, string>> = Object.freeze(
  Object.assign(Object.create(null) as Record<string, string>, {
    person: "people",
    organization: "organizations",
    location: "locations",
    event: "events",
    uap_object: "uap-objects",
    vehicle: "vehicles",
    operation: "operations",
    concept: "concepts",
  }),
);
|
|
|
|
/**
 * Normalised entity card as returned by `getEntityCore`: frontmatter fields
 * coerced to safe types, plus the database key when one could be found.
 */
export interface EntityCore {
  /** DB-side primary key; null if the entity is wiki-only. */
  entity_pk: number | null;
  entity_class: string;
  entity_id: string;
  canonical_name: string;
  aliases: string[];

  total_mentions: number;
  documents_count: number;

  signal_strength: "strong" | "weak" | "orphan" | "unverified";
  /** Per-source counters read from the frontmatter's `signal_sources` map. */
  signal_sources: {
    db_chunks: number;
    page_refs: number;
    cross_refs: number;
    text_refs: number;
  };

  /** `[[doc-id/p007]]` — structured page refs (Haiku). */
  mentioned_in: string[];
  /** `[[doc-id/p007]]` — text-only matches (back-fill). */
  text_mentioned_in: string[];
  /** `[[class/id]]` cross-links. */
  referenced_by: string[];

  enrichment_status: string | null;
  narrative_summary: string | null;
  narrative_summary_pt_br: string | null;
  summary_status: string | null;
}
|
|
|
|
/**
 * Untyped frontmatter as parsed from an entity page. Every value is `unknown`
 * and must be narrowed before use.
 */
interface RawFm {
  [key: string]: unknown;
}
|
|
|
|
/**
 * Coerce a loosely-typed frontmatter value to a finite number.
 *
 * @param v - Value to coerce. Finite numbers pass through; strings are parsed
 *   with `Number()`.
 * @param fallback - Returned when `v` cannot be read as a finite number
 *   (defaults to 0).
 * @returns The finite number represented by `v`, or `fallback`.
 */
function num(v: unknown, fallback = 0): number {
  if (typeof v === "number") return Number.isFinite(v) ? v : fallback;
  if (typeof v === "string") {
    const text = v.trim();
    // Number("") and Number("   ") evaluate to 0, which would pass a blank
    // value off as a real zero instead of honouring the fallback.
    if (text === "") return fallback;
    const parsed = Number(text);
    return Number.isFinite(parsed) ? parsed : fallback;
  }
  return fallback;
}
|
|
|
|
/**
 * Read a frontmatter value as a list of strings.
 *
 * @param v - Candidate value; anything that is not an array yields `[]`.
 * @returns The string members of `v`, in their original order. Non-string
 *   members are dropped.
 */
function arr(v: unknown): string[] {
  return Array.isArray(v)
    ? v.filter((item): item is string => typeof item === "string")
    : [];
}
|
|
|
|
/**
 * Read a frontmatter value as an optional string.
 *
 * @param v - Candidate value.
 * @returns `v` unchanged (not trimmed) when it is a string containing at
 *   least one non-whitespace character; otherwise `null`.
 */
function strOrNull(v: unknown): string | null {
  if (typeof v !== "string") return null;
  return v.trim().length > 0 ? v : null;
}
|
|
|
|
/**
 * Read and parse the frontmatter of one entity page,
 * `<WIKI>/entities/<folder>/<entityId>.md`.
 *
 * @param entityClass - Entity class; must be an own key of FOLDER_BY_CLASS.
 * @param entityId - Entity id, used as the file name (without `.md`).
 * @returns The parsed frontmatter, or `null` when the class is unknown, the id
 *   would resolve outside the class folder, or the file cannot be read or
 *   parsed.
 */
async function readEntityYaml(entityClass: string, entityId: string): Promise<RawFm | null> {
  // Own-property check: inherited keys such as "constructor" must not be
  // treated as a folder name (path.join would throw on the non-string value).
  if (!Object.prototype.hasOwnProperty.call(FOLDER_BY_CLASS, entityClass)) return null;
  const folder = FOLDER_BY_CLASS[entityClass];
  if (!folder) return null;

  // Both arguments can originate from a request, so refuse ids containing
  // "../" (or an absolute path) that would escape the class folder.
  const baseDir = path.resolve(WIKI, "entities", folder);
  const filePath = path.resolve(baseDir, `${entityId}.md`);
  if (!filePath.startsWith(baseDir + path.sep)) return null;

  try {
    const raw = await fs.readFile(filePath, "utf-8");
    return matter(raw).data as RawFm;
  } catch {
    // Missing or unreadable file, or frontmatter that failed to parse.
    return null;
  }
}
|
|
|
|
/** Values accepted for `signal_strength`. */
const SIGNAL_STRENGTHS: ReadonlyArray<EntityCore["signal_strength"]> = [
  "strong",
  "weak",
  "orphan",
  "unverified",
];

/** Narrow a raw frontmatter value to a known signal strength, defaulting to "unverified". */
function toSignalStrength(v: unknown): EntityCore["signal_strength"] {
  return SIGNAL_STRENGTHS.find((known) => known === v) ?? "unverified";
}

/**
 * Load a single entity card from its YAML frontmatter.
 *
 * Every frontmatter field is coerced defensively: missing or malformed values
 * become 0, `[]`, `null`, or "unverified" rather than failing the request. A
 * blank or missing `canonical_name` falls back to the entity id.
 *
 * NOTE(review): no archived-status check happens here. If archived entities
 * must be hidden, confirm that the maintain pipeline or the caller does it.
 *
 * @param entityClass - Entity class (e.g. "person").
 * @param entityId - Entity id within that class.
 * @returns The entity card, or `null` when `readEntityYaml` finds no readable
 *   page — this keeps the route handler simple.
 */
export async function getEntityCore(
  entityClass: string,
  entityId: string,
): Promise<EntityCore | null> {
  const fm = await readEntityYaml(entityClass, entityId);
  if (!fm) return null;

  // Best-effort lookup of the DB entity_pk so getEntityChunks can still
  // query by primary key. A DB error, or an entity that is not in the DB at
  // all, deliberately leaves the key as null instead of failing.
  let entity_pk: number | null = null;
  try {
    const rows = await pgQuery<{ entity_pk: number }>(
      `SELECT entity_pk FROM public.entities
        WHERE entity_class = $1 AND entity_id = $2 LIMIT 1`,
      [entityClass, entityId],
    );
    entity_pk = rows[0]?.entity_pk ?? null;
  } catch {
    entity_pk = null;
  }

  const rawSources = fm.signal_sources;
  const sigSources: Record<string, unknown> =
    typeof rawSources === "object" && rawSources !== null
      ? (rawSources as Record<string, unknown>)
      : {};

  return {
    entity_pk,
    entity_class: entityClass,
    entity_id: entityId,
    canonical_name: strOrNull(fm.canonical_name) ?? entityId,
    aliases: arr(fm.aliases),
    total_mentions: num(fm.total_mentions, 0),
    documents_count: num(fm.documents_count, 0),
    signal_strength: toSignalStrength(fm.signal_strength),
    signal_sources: {
      db_chunks: num(sigSources.db_chunks, 0),
      page_refs: num(sigSources.page_refs, 0),
      cross_refs: num(sigSources.cross_refs, 0),
      text_refs: num(sigSources.text_refs, 0),
    },
    mentioned_in: arr(fm.mentioned_in),
    text_mentioned_in: arr(fm.text_mentioned_in),
    referenced_by: arr(fm.referenced_by),
    enrichment_status: strOrNull(fm.enrichment_status),
    narrative_summary: strOrNull(fm.narrative_summary),
    narrative_summary_pt_br: strOrNull(fm.narrative_summary_pt_br),
    summary_status: strOrNull(fm.summary_status),
  };
}
|
|
|
|
/**
 * All references from one entity to one document, as returned by
 * `getEntityMentionsByDoc`. The metadata fields are null when the document's
 * own page could not be read.
 */
export interface EntityMentionGroup {
  doc_id: string;

  canonical_title: string | null;
  collection: string | null;
  page_count: number | null;
  classification: string | null;

  /** Number of distinct pages in `pages`. */
  mention_count: number;
  /** Distinct page numbers, ascending. */
  pages: number[];
  /** True when all refs came from the text back-fill (no structured Haiku evidence). */
  text_only: boolean;
}
|
|
|
|
/** Page numbers seen for one document, split by the frontmatter list that supplied them. */
interface DocPageRefs {
  structured: Set<number>;
  text: Set<number>;
}

/** Captures the target of a wikilink such as "[[doc-id/p007]]". */
const WIKILINK_TARGET = /\[\[([^\]|]+?)\]\]/;

/** Split one reference into its document id and (optional) page number; null when no doc id is present. */
function parseMentionRef(ref: string): { docId: string; page: number | null } | null {
  const link = WIKILINK_TARGET.exec(ref);
  // Values without wikilink delimiters are used as-is.
  const target = (link?.[1] ?? ref).trim();
  const [docId, pageStr] = target.split("/", 2);
  if (!docId) return null;
  const page = pageStr ? Number.parseInt(pageStr.replace(/^p/, ""), 10) : NaN;
  return { docId, page: Number.isFinite(page) ? page : null };
}

/** Read display metadata from wiki/documents/<doc-id>.md; every field is null when the page is missing or unreadable. */
async function readDocMeta(
  docId: string,
): Promise<Pick<EntityMentionGroup, "canonical_title" | "collection" | "page_count" | "classification">> {
  try {
    const raw = await fs.readFile(path.join(WIKI, "documents", `${docId}.md`), "utf-8");
    const dfm = matter(raw).data as Record<string, unknown>;
    return {
      canonical_title: strOrNull(dfm.canonical_title),
      collection: strOrNull(dfm.collection),
      // A page_count of 0 (or a missing/invalid one) is reported as null.
      page_count: num(dfm.page_count, 0) || null,
      classification: strOrNull(dfm.highest_classification),
    };
  } catch {
    // Doc missing — the caller still has the raw id to display.
    return { canonical_title: null, collection: null, page_count: null, classification: null };
  }
}

/**
 * Group an entity's reverse-references by document.
 *
 * References come from the entity frontmatter's `mentioned_in[]` (structured)
 * and `text_mentioned_in[]` (text back-fill) lists. Documents are ranked by
 * their number of distinct referenced pages (descending; ties keep first-seen
 * order) and cut to `limit` BEFORE their metadata is read, so only the
 * documents actually returned touch the disk. Those reads run in parallel.
 *
 * @param entityClass - Entity class (e.g. "person").
 * @param entityId - Entity id within that class.
 * @param limit - Maximum number of documents to return (default 100).
 * @returns One group per document, or `[]` when the entity page is unreadable.
 */
export async function getEntityMentionsByDoc(
  entityClass: string,
  entityId: string,
  limit = 100,
): Promise<EntityMentionGroup[]> {
  const fm = await readEntityYaml(entityClass, entityId);
  if (!fm) return [];

  const byDoc = new Map<string, DocPageRefs>();
  const collect = (refs: string[], source: keyof DocPageRefs): void => {
    for (const ref of refs) {
      const parsed = parseMentionRef(ref);
      if (!parsed) continue;
      let entry = byDoc.get(parsed.docId);
      if (!entry) {
        entry = { structured: new Set<number>(), text: new Set<number>() };
        byDoc.set(parsed.docId, entry);
      }
      // A ref without a usable page number still registers the document.
      if (parsed.page !== null) entry[source].add(parsed.page);
    }
  };
  collect(arr(fm.mentioned_in), "structured");
  collect(arr(fm.text_mentioned_in), "text");

  const ranked = Array.from(byDoc, ([docId, sets]) => ({
    docId,
    pages: Array.from(new Set<number>([...sets.structured, ...sets.text])).sort((a, b) => a - b),
    textOnly: sets.structured.size === 0 && sets.text.size > 0,
  }))
    .sort((a, b) => b.pages.length - a.pages.length)
    .slice(0, limit);

  return Promise.all(
    ranked.map(async ({ docId, pages, textOnly }): Promise<EntityMentionGroup> => {
      const meta = await readDocMeta(docId);
      return {
        doc_id: docId,
        canonical_title: meta.canonical_title,
        collection: meta.collection,
        page_count: meta.page_count,
        classification: meta.classification,
        mention_count: pages.length,
        pages,
        text_only: textOnly,
      };
    }),
  );
}
|
|
|
|
/**
 * One row returned by `getEntityChunks`: a chunk from `public.chunks` that is
 * linked to the entity through `public.entity_mentions`.
 */
export interface EntityChunkPreview {
  chunk_pk: number;
  doc_id: string;
  chunk_id: string;
  page: number;
  type: string;

  /** Bounding box of the chunk, or null when none is stored. */
  bbox: {
    x: number;
    y: number;
    w: number;
    h: number;
  } | null;
  classification: string | null;

  content_pt: string | null;
  content_en: string | null;

  ufo_anomaly: boolean | null;
  ufo_anomaly_type: string | null;
}
|
|
|
|
/**
 * Fetch preview chunks that mention an entity.
 *
 * This is served from the DB rather than from disk because chunk content is
 * big and files are not re-parsed at request time. Rows are ordered with
 * anomaly-flagged chunks first (unflagged next, NULL last), then by document
 * and by the chunk's global order.
 *
 * @param entityPk - DB primary key of the entity; a null key (entity not
 *   indexed in the DB) short-circuits to `[]` without querying.
 * @param limit - Maximum number of rows to return (default 30).
 * @returns The matching chunk previews.
 */
export async function getEntityChunks(
  entityPk: number | null,
  limit = 30,
): Promise<EntityChunkPreview[]> {
  if (entityPk === null || entityPk === undefined) {
    return [];
  }

  const sql = `SELECT
       c.chunk_pk, c.doc_id, c.chunk_id, c.page, c.type, c.bbox, c.classification,
       c.content_pt, c.content_en, c.ufo_anomaly, c.ufo_anomaly_type
     FROM public.entity_mentions em
     JOIN public.chunks c ON c.chunk_pk = em.chunk_pk
     WHERE em.entity_pk = $1
     ORDER BY c.ufo_anomaly DESC NULLS LAST, c.doc_id, c.order_global
     LIMIT $2`;

  return pgQuery<EntityChunkPreview>(sql, [entityPk, limit]);
}
|
|
|
|
// Backwards-compat for callers that imported findEntity from the old path.
|
|
export { findEntity } from "./graph";
|