// disclosure-bureau/web/lib/entity-index.ts

/**
 * Entity match index — maps OCR text spans to canonical entity IDs.
 *
 * The pre-process script `scripts/build_entity_index.py` generates one
 * `wiki/pages/<doc-id>/p<NNN>.matches.json` per page, containing
 * [{ entity_id, class, alias_matched, start, end }] sorted by start.
 *
 * At runtime we just load it and slice the OCR text.
 */
import fs from "node:fs/promises";
import path from "node:path";
import { WIKI, type EntityClass } from "./wiki";

/**
 * One entry of a page's `.matches.json` file: a span of the page's OCR text
 * that was matched to an entity.
 */
export interface EntityMatch {
  /** Canonical ID of the matched entity. */
  entity_id: string;
  /** Entity class, as declared by `EntityClass` in `./wiki`. */
  class: EntityClass;
  /**
   * Presumably the alias string that produced this match — confirm against
   * `scripts/build_entity_index.py`.
   */
  alias_matched: string;
  /** Start offset into the text; `segmentText` uses it as the inclusive `slice` start. */
  start: number;
  /** End offset into the text; `segmentText` uses it as the exclusive `slice` end. */
  end: number;
}

/**
 * Runtime shape check for one parsed `.matches.json` entry.
 *
 * Only the primitive-typed fields are verified; `class` is left unchecked
 * because the set of valid `EntityClass` values is declared in `./wiki`.
 */
function isEntityMatch(value: unknown): value is EntityMatch {
  if (typeof value !== "object" || value === null) return false;
  const v = value as Record<string, unknown>;
  return (
    typeof v.entity_id === "string" &&
    typeof v.alias_matched === "string" &&
    typeof v.start === "number" &&
    typeof v.end === "number"
  );
}

/**
 * Loads the entity matches for one page.
 *
 * Reads `<WIKI>/pages/<docId>/<pageStem>.matches.json` and returns its entries.
 * This is best-effort: if the file cannot be read, is not valid JSON, or does
 * not contain an array, an empty list is returned instead of throwing. Array
 * entries that do not look like an `EntityMatch` are dropped so callers can
 * rely on the declared field types.
 *
 * @param docId Directory name of the document under `<WIKI>/pages`.
 * @param pageStem File stem of the page (the part before `.matches.json`).
 * @returns The page's matches, or `[]` when none could be loaded.
 */
export async function readMatches(docId: string, pageStem: string): Promise<EntityMatch[]> {
  // NOTE(review): docId/pageStem are joined into the path unchecked — confirm
  // callers never pass user-controlled values containing path separators.
  const p = path.join(WIKI, "pages", docId, `${pageStem}.matches.json`);
  try {
    const raw = await fs.readFile(p, "utf-8");
    const parsed: unknown = JSON.parse(raw);
    // JSON.parse can yield any JSON value (null, object, number, ...); only an
    // array is a usable match list.
    if (!Array.isArray(parsed)) return [];
    return parsed.filter(isEntityMatch);
  } catch {
    // Unreadable file or malformed JSON: treat the page as having no matches.
    return [];
  }
}

/**
 * One piece of a text split by `segmentText`: either a plain run of text or
 * a span covered by an entity match.
 * Useful for React rendering: map(seg => seg.isMatch ? <span>...</span> : seg.text)
 */
export interface TextSegment {
  /** True when this segment is a matched span; false for plain text. */
  isMatch: boolean;
  /** The slice of the original text covered by this segment. */
  text: string;
  /** The match behind this segment; `segmentText` sets it only on match segments. */
  match?: EntityMatch;
}

/**
 * Splits `text` into an ordered list of plain-text and entity-match segments.
 *
 * Matches are processed in ascending `start` order (the input array is not
 * mutated). A match that starts before the end of a previously accepted match
 * is skipped, so the first of two overlapping matches wins. Matches that do
 * not describe a real span of `text` — negative or NaN offsets, `end <= start`,
 * or a start at/after the end of the text — are ignored, and an `end` past the
 * end of the text is clamped to the text length.
 *
 * Offsets are interpreted the way `String.prototype.slice` interprets them
 * (UTF-16 code unit indices).
 *
 * @param text The full text the match offsets refer to.
 * @param matches Entity matches for that text, in any order.
 * @returns Segments whose `text` fields concatenate back to `text`. When no
 *   usable match exists, a single plain segment holding all of `text`.
 */
export function segmentText(text: string, matches: EntityMatch[]): TextSegment[] {
  // `filter` yields a fresh array, so the in-place sort never touches the input.
  // Every comparison is false for NaN, which rejects NaN offsets as well.
  const usable = matches
    .filter((m) => m.start >= 0 && m.start < m.end && m.start < text.length)
    .sort((a, b) => a.start - b.start);
  if (usable.length === 0) return [{ isMatch: false, text }];

  const segs: TextSegment[] = [];
  let cursor = 0;
  for (const m of usable) {
    if (m.start < cursor) continue; // overlaps an already emitted match — skip
    if (m.start > cursor) {
      segs.push({ isMatch: false, text: text.slice(cursor, m.start) });
    }
    // Clamp so the cursor never runs past the end of the text.
    const end = Math.min(m.end, text.length);
    segs.push({ isMatch: true, text: text.slice(m.start, end), match: m });
    cursor = end;
  }
  if (cursor < text.length) {
    segs.push({ isMatch: false, text: text.slice(cursor) });
  }
  return segs;
}