/**
 * Entity match index — maps OCR text spans to canonical entity IDs.
 *
 * The pre-process script `scripts/build_entity_index.py` generates one
 * `wiki/pages/<docId>/<pageStem>.matches.json` per page, containing
 * [{ entity_id, class, alias_matched, start, end }] sorted by start.
 *
 * At runtime we just load it and slice the OCR text.
 */
import fs from "node:fs/promises";
import path from "node:path";
import { WIKI, type EntityClass } from "./wiki";

/**
 * One entry of a page's `.matches.json` file.
 *
 * NOTE(review): `start`/`end` are applied with `String.prototype.slice`, i.e.
 * as UTF-16 code-unit offsets — confirm the Python generator emits the same
 * units for text outside the Basic Multilingual Plane.
 */
export interface EntityMatch {
  entity_id: string;
  class: EntityClass;
  alias_matched: string;
  /** Offset of the first character of the span (inclusive). */
  start: number;
  /** Offset just past the last character of the span (exclusive). */
  end: number;
}

/**
 * Structural check for a parsed JSON value before it is treated as a match.
 *
 * Only the fields needed to locate and identify a span are verified;
 * `class` and `alias_matched` are trusted to the generator.
 */
function isEntityMatch(value: unknown): value is EntityMatch {
  if (typeof value !== "object" || value === null) return false;
  const candidate = value as Record<string, unknown>;
  return (
    typeof candidate.entity_id === "string" &&
    typeof candidate.start === "number" &&
    typeof candidate.end === "number"
  );
}

/**
 * Loads the entity matches for one page.
 *
 * Best-effort by design: a missing, unreadable, or unparsable file yields an
 * empty list rather than an error, so a page without an index renders as
 * plain text.
 *
 * @param docId - Directory name of the document under `pages/`.
 * @param pageStem - File name of the page without the `.matches.json` suffix.
 * @returns The well-formed matches found in the file, or `[]` on any failure.
 */
export async function readMatches(docId: string, pageStem: string): Promise<EntityMatch[]> {
  const p = path.join(WIKI, "pages", docId, `${pageStem}.matches.json`);
  try {
    const raw = await fs.readFile(p, "utf-8");
    const parsed: unknown = JSON.parse(raw);
    // Valid JSON is not necessarily the expected array; drop anything malformed.
    return Array.isArray(parsed) ? parsed.filter(isEntityMatch) : [];
  } catch {
    return [];
  }
}

/**
 * One piece of a segmented text: either plain text or a matched entity span.
 * `match` is set exactly when `isMatch` is true.
 */
export interface TextSegment {
  isMatch: boolean;
  text: string;
  match?: EntityMatch;
}

/**
 * Splits text into alternating string + match segments.
 * Useful for React rendering: map(seg => seg.isMatch ? ... : seg.text)
 *
 * Matches are processed in order of `start`. A match that begins inside an
 * already accepted match is dropped, as are matches that cannot address any
 * text: negative or non-numeric offsets, empty or inverted spans, and spans
 * starting at or past the end of `text`. The input array is not mutated.
 *
 * @param text - The full text the match offsets refer to.
 * @param matches - Entity matches to mark up; need not be sorted.
 * @returns Segments whose `text` fields concatenate back to `text`. When no
 *   match is usable, a single non-match segment holding the whole text.
 */
export function segmentText(text: string, matches: readonly EntityMatch[]): TextSegment[] {
  // Comparisons with NaN are false, so non-numeric offsets are filtered out
  // here as well and never reach the sort comparator.
  const usable = matches
    .filter((m) => m.start >= 0 && m.end > m.start && m.start < text.length)
    .sort((a, b) => a.start - b.start);
  if (usable.length === 0) return [{ isMatch: false, text }];

  const segs: TextSegment[] = [];
  let cursor = 0;
  for (const m of usable) {
    if (m.start < cursor) continue; // overlaps the previously accepted match — skip
    if (m.start > cursor) {
      segs.push({ isMatch: false, text: text.slice(cursor, m.start) });
    }
    // slice clamps an `end` beyond the text, so the segment never over-reads.
    segs.push({ isMatch: true, text: text.slice(m.start, m.end), match: m });
    cursor = m.end;
  }
  if (cursor < text.length) {
    segs.push({ isMatch: false, text: text.slice(cursor) });
  }
  return segs;
}