disclosure-bureau/web/lib/entity-index.ts

60 lines
1.8 KiB
TypeScript
Raw Normal View History

/**
* Entity match index maps OCR text spans to canonical entity IDs.
*
* The pre-process script `scripts/build_entity_index.py` generates one
* `wiki/pages/<doc-id>/p<NNN>.matches.json` per page, containing
* [{ entity_id, class, alias_matched, start, end }] sorted by start.
*
* At runtime we just load it and slice the OCR text.
*/
import fs from "node:fs/promises";
import path from "node:path";
import { WIKI, type EntityClass } from "./wiki";
/**
* One entry of a per-page `.matches.json` file: a span of the page text
* linked to an entity.
*/
export interface EntityMatch {
/** ID of the entity this span refers to. */
entity_id: string;
/** Entity class, as defined in `./wiki` (presumably the class of `entity_id` — confirm against the build script). */
class: EntityClass;
/** Presumably the alias string that matched in the text — confirm against `scripts/build_entity_index.py`. */
alias_matched: string;
/** Offset into the page text where the span begins; used as the first argument of `text.slice`. */
start: number;
/** Offset into the page text where the span stops; used as the (exclusive) second argument of `text.slice`. */
end: number;
}
/**
 * Structural check for one parsed entry of a `.matches.json` file.
 *
 * Only the fields this module relies on structurally are verified
 * (`entity_id` and the numeric `start`/`end` offsets); `class` and
 * `alias_matched` are trusted as written by the build script.
 */
function hasMatchShape(value: unknown): boolean {
  if (typeof value !== "object" || value === null) return false;
  const candidate = value as Record<string, unknown>;
  return (
    typeof candidate.entity_id === "string" &&
    typeof candidate.start === "number" &&
    Number.isFinite(candidate.start) &&
    typeof candidate.end === "number" &&
    Number.isFinite(candidate.end)
  );
}

/**
 * Loads the entity matches for one page.
 *
 * Reads `<WIKI>/pages/<docId>/<pageStem>.matches.json` and returns its entries.
 * This is best-effort: a missing, unreadable, or unparsable file yields an
 * empty list instead of an error, so a page without an index simply renders
 * without entity links.
 *
 * @param docId - Directory name of the document under `<WIKI>/pages`.
 * @param pageStem - File name stem of the page (the part before `.matches.json`).
 * @returns The well-formed matches found in the file; `[]` when there are none.
 */
export async function readMatches(docId: string, pageStem: string): Promise<EntityMatch[]> {
  const p = path.join(WIKI, "pages", docId, `${pageStem}.matches.json`);
  try {
    const raw = await fs.readFile(p, "utf-8");
    const parsed: unknown = JSON.parse(raw);
    // Valid JSON is not necessarily an array of matches; never hand callers
    // something that is not iterable or that lacks usable offsets.
    if (!Array.isArray(parsed)) return [];
    return parsed.filter(hasMatchShape) as EntityMatch[];
  } catch {
    // Deliberate best-effort: treat any read/parse failure as "no matches".
    return [];
  }
}
/**
* One piece of the output of `segmentText`, which splits text into
* alternating plain-text and match segments.
* Useful for React rendering: segs.map(seg => seg.isMatch ? <span>...</span> : seg.text)
*/
export interface TextSegment {
/** True when this segment is the text of an entity match, false for plain text. */
isMatch: boolean;
/** The slice of the original text covered by this segment. */
text: string;
/** The match that produced this segment; set by `segmentText` only when `isMatch` is true. */
match?: EntityMatch;
}
/**
 * Splits `text` into an ordered list of plain-text and entity-match segments.
 *
 * Matches are processed in ascending `start` order (the input array is not
 * mutated). A match that begins before the end of a previously accepted match
 * is dropped, so the first match at a given position wins. Concatenating the
 * `text` of all returned segments reproduces the input `text`.
 *
 * Unusable spans are ignored rather than rendered: non-integer offsets,
 * negative `start`, `start` at or beyond the end of `text`, and empty or
 * inverted spans (`end <= start`). A span that runs past the end of `text`
 * is clamped to the text length.
 *
 * @param text - The page text the match offsets refer to.
 * @param matches - Entity matches with offsets into `text`, in any order.
 * @returns Segments covering `text` from start to end; a single plain segment
 *   when no usable match exists.
 */
export function segmentText(text: string, matches: EntityMatch[]): TextSegment[] {
  // `filter` returns a fresh array, so the in-place sort never touches the caller's input.
  const usable = matches
    .filter(
      (m) =>
        Number.isInteger(m.start) &&
        Number.isInteger(m.end) &&
        m.start >= 0 &&
        m.start < text.length &&
        m.end > m.start,
    )
    .sort((a, b) => a.start - b.start);

  if (usable.length === 0) return [{ isMatch: false, text }];

  const segs: TextSegment[] = [];
  let cursor = 0;
  for (const m of usable) {
    if (m.start < cursor) continue; // overlaps an already-emitted match — skip
    if (m.start > cursor) {
      segs.push({ isMatch: false, text: text.slice(cursor, m.start) });
    }
    // Clamp so the cursor never runs past the text, even with a stale index.
    const end = Math.min(m.end, text.length);
    segs.push({ isMatch: true, text: text.slice(m.start, end), match: m });
    cursor = end;
  }
  if (cursor < text.length) {
    segs.push({ isMatch: false, text: text.slice(cursor) });
  }
  return segs;
}