/**
 * Entity match index — maps OCR text spans to canonical entity IDs.
 *
 * The pre-process script `scripts/build_entity_index.py` generates one
 * `wiki/pages/<doc-id>/p<NNN>.matches.json` per page, containing
 * [{ entity_id, class, alias_matched, start, end }] sorted by start.
 *
 * At runtime we just load it and slice the OCR text.
 */
import fs from "node:fs/promises";
import path from "node:path";
import { WIKI, type EntityClass } from "./wiki";

/**
 * One entry of a page's `.matches.json` file: a span of the page's OCR text
 * together with the entity it was matched to.
 */
export interface EntityMatch {
  /** Identifier of the matched entity. */
  entity_id: string;
  /** Class of the matched entity. */
  class: EntityClass;
  /** The alias that matched. */
  alias_matched: string;
  /** Span start; passed as the first argument of `text.slice(start, end)`. */
  start: number;
  /** Span end; passed as the (exclusive) second argument of `text.slice(start, end)`. */
  end: number;
}

/**
 * Loads the pre-computed entity matches for one page.
 *
 * Reads `<WIKI>/pages/<docId>/<pageStem>.matches.json` and returns its parsed
 * content. This is deliberately best-effort: when the file cannot be read,
 * is not valid JSON, or does not contain a JSON array at the top level, an
 * empty list is returned instead of throwing.
 *
 * @param docId - Name of the document directory under `pages/`.
 * @param pageStem - Page file name without the `.matches.json` suffix.
 * @returns The matches stored for the page, or `[]` when none could be loaded.
 */
export async function readMatches(docId: string, pageStem: string): Promise<EntityMatch[]> {
  const file = path.join(WIKI, "pages", docId, `${pageStem}.matches.json`);
  try {
    const parsed: unknown = JSON.parse(await fs.readFile(file, "utf-8"));
    // Only the top-level shape is checked; individual entries are not
    // validated here. A non-array payload (e.g. `null` or `{}`) would
    // otherwise reach callers typed as an array.
    return Array.isArray(parsed) ? (parsed as EntityMatch[]) : [];
  } catch {
    // Missing/unreadable file or malformed JSON: treat as "no matches".
    return [];
  }
}

/**
 * One piece of a text that has been split into alternating plain-text and
 * match runs (the element type returned by `segmentText`).
 *
 * Useful for React rendering: map(seg => seg.isMatch ? <span>...</span> : seg.text)
 */
export interface TextSegment {
  /** `true` when this segment is an entity match, `false` for plain text. */
  isMatch: boolean;
  /** The slice of the source text covered by this segment. */
  text: string;
  /** The match behind this segment; `segmentText` sets it only on match segments. */
  match?: EntityMatch;
}

/**
 * Splits `text` into alternating plain-text and match segments.
 *
 * Matches are processed in ascending `start` order (the input array is not
 * mutated). When two matches overlap, the one that starts first wins and the
 * later one is skipped. Spans that cannot be rendered are ignored: non-finite
 * or negative offsets, empty or inverted spans (`end <= start`), and spans
 * that begin at or beyond the end of `text`. A span whose `end` runs past the
 * text is kept and simply truncated by `slice`.
 *
 * Concatenating the `text` of all returned segments always yields the
 * original `text`.
 *
 * @param text - The text the match offsets refer to.
 * @param matches - Match spans, in any order.
 * @returns The segments in document order; a single plain segment holding
 *   the whole text when no match is usable.
 */
export function segmentText(text: string, matches: EntityMatch[]): TextSegment[] {
  if (matches.length === 0) return [{ isMatch: false, text }];

  // Drop unusable spans before sorting: an inverted span would move the
  // cursor backwards (duplicating text), and NaN offsets would make the
  // comparator inconsistent. `filter` also gives us a copy to sort in place.
  const usable = matches
    .filter(
      (m) =>
        Number.isFinite(m.start) &&
        Number.isFinite(m.end) &&
        m.start >= 0 &&
        m.start < m.end &&
        m.start < text.length,
    )
    .sort((a, b) => a.start - b.start);

  const segs: TextSegment[] = [];
  let cursor = 0;
  for (const m of usable) {
    if (m.start < cursor) continue; // overlaps the previous match — skip
    if (m.start > cursor) {
      segs.push({ isMatch: false, text: text.slice(cursor, m.start) });
    }
    // `slice` clamps an `end` beyond the text length.
    segs.push({ isMatch: true, text: text.slice(m.start, m.end), match: m });
    cursor = m.end;
  }
  if (cursor < text.length) {
    segs.push({ isMatch: false, text: text.slice(cursor) });
  }

  // Nothing emitted (empty text, no usable match): mirror the no-match result.
  return segs.length > 0 ? segs : [{ isMatch: false, text }];
}