// disclosure-bureau/web/lib/entity-index.ts

/**
 * Entity match index — maps OCR text spans to canonical entity IDs.
 *
 * The pre-process script `scripts/build_entity_index.py` generates one
 * `wiki/pages/<doc-id>/p<NNN>.matches.json` per page, containing
 * [{ entity_id, class, alias_matched, start, end }] sorted by start.
 *
 * At runtime we just load it and slice the OCR text.
 */
import fs from "node:fs/promises";
import path from "node:path";
import { WIKI, type EntityClass } from "./wiki";

/**
 * One entity mention located inside a page's OCR text.
 *
 * This is the element type of the array `readMatches` returns after parsing a
 * `<pageStem>.matches.json` file, and the input `segmentText` uses to cut the
 * text into segments.
 *
 * NOTE(review): the index is produced by a Python script while `segmentText`
 * applies `start`/`end` with `String.prototype.slice`, which counts UTF-16
 * code units. If the generator counts code points, the offsets diverge on
 * text containing astral characters — confirm against the generator.
 */
export interface EntityMatch {
  /** Canonical ID of the entity this span refers to. */
  entity_id: string;
  /** Entity class, using the `EntityClass` type imported from `./wiki`. */
  class: EntityClass;
  /** Presumably the alias text that produced this match — confirm against the index builder. */
  alias_matched: string;
  /** Offset into the text where the span begins (passed to `text.slice` as the start index). */
  start: number;
  /** Offset into the text where the span stops (passed to `text.slice` as the end index, so exclusive). */
  end: number;
}

/**
 * Runtime shape check for a single entry of a `*.matches.json` file.
 *
 * `class` is only verified to be a string; membership in `EntityClass` is not
 * checked here because that type's values are declared in `./wiki`.
 */
function isEntityMatch(value: unknown): value is EntityMatch {
  if (typeof value !== "object" || value === null) return false;
  const candidate = value as Record<string, unknown>;
  return (
    typeof candidate.entity_id === "string" &&
    typeof candidate.class === "string" &&
    typeof candidate.alias_matched === "string" &&
    typeof candidate.start === "number" &&
    typeof candidate.end === "number"
  );
}

/**
 * Loads the entity matches for one page from
 * `<WIKI>/pages/<docId>/<pageStem>.matches.json`.
 *
 * This is a best-effort read: a missing or unreadable file, invalid JSON, or
 * a top-level value that is not an array all yield an empty list instead of
 * an error. Array entries that do not have the `EntityMatch` shape are
 * dropped so that callers never receive spans with non-numeric offsets.
 *
 * NOTE(review): `docId` and `pageStem` are joined into the path unchecked. If
 * either can originate from a request, make sure the caller rejects values
 * containing path separators or `..`.
 *
 * @param docId Directory name of the document under `<WIKI>/pages`.
 * @param pageStem File stem of the page; `.matches.json` is appended.
 * @returns The well-formed matches found in the file, or `[]` on any failure.
 */
export async function readMatches(docId: string, pageStem: string): Promise<EntityMatch[]> {
  const p = path.join(WIKI, "pages", docId, `${pageStem}.matches.json`);
  try {
    const raw = await fs.readFile(p, "utf-8");
    // Keep the parse result as `unknown` until it has been checked at runtime.
    const parsed: unknown = JSON.parse(raw);
    if (!Array.isArray(parsed)) return [];
    return parsed.filter(isEntityMatch);
  } catch {
    // Deliberate best-effort: pages without an index simply render unannotated.
    return [];
  }
}

/**
 * One piece of the output of `segmentText`: either a plain run of text or a
 * span that matched an entity.
 *
 * Useful for React rendering: map(seg => seg.isMatch ? <span>...</span> : seg.text)
 */
export interface TextSegment {
  /** True when this segment is an entity match; false for the plain text around matches. */
  isMatch: boolean;
  /** The slice of the source text covered by this segment. */
  text: string;
  /** The match record behind this segment; `segmentText` sets it only on segments with `isMatch: true`. */
  match?: EntityMatch;
}

/**
 * Splits `text` into an ordered list of plain and matched segments.
 *
 * Matches are applied left to right by `start`. A match that begins before
 * the end of an already accepted match (an overlap) is skipped; among matches
 * with the same `start`, the one that comes first in `matches` wins because
 * the sort is stable. Offsets are interpreted by `String.prototype.slice`,
 * i.e. as UTF-16 code unit indices with `end` exclusive.
 *
 * Matches that cannot cover any text are ignored up front: a negative or NaN
 * `start`, an `end` that is not greater than `start`, or a `start` at or past
 * the end of `text`. Without this, an inverted span would move the cursor
 * backwards and the text between `end` and `start` would be emitted twice.
 *
 * The input array is not mutated.
 *
 * @param text The full text the match offsets refer to.
 * @param matches Entity matches for `text`, in any order.
 * @returns Segments whose `text` fields concatenate back to `text`. When no
 *   match is usable, a single non-match segment holding all of `text`.
 */
export function segmentText(text: string, matches: EntityMatch[]): TextSegment[] {
  // `filter` returns a fresh array, so the in-place sort never touches the caller's data.
  // NaN offsets fail every comparison below and are therefore dropped as well.
  const usable = matches
    .filter((m) => m.start >= 0 && m.end > m.start && m.start < text.length)
    .sort((a, b) => a.start - b.start);
  if (usable.length === 0) return [{ isMatch: false, text }];

  const segs: TextSegment[] = [];
  let cursor = 0;
  for (const m of usable) {
    if (m.start < cursor) continue; // overlaps the previously accepted match — skip
    if (m.start > cursor) {
      segs.push({ isMatch: false, text: text.slice(cursor, m.start) });
    }
    segs.push({ isMatch: true, text: text.slice(m.start, m.end), match: m });
    // `m.end > m.start >= cursor` is guaranteed by the filter, so the cursor only moves forward.
    cursor = m.end;
  }
  if (cursor < text.length) {
    segs.push({ isMatch: false, text: text.slice(cursor) });
  }
  return segs;
}
baseline: Disclosure Bureau pipeline + Next.js UI + Supabase stack 2026-05-18 01:44:36 +00:00			`/**`
			`* Entity match index — maps OCR text spans to canonical entity IDs.`
			`*`
			* The pre-process script `scripts/build_entity_index.py` generates one
			* `wiki/pages/<doc-id>/p<NNN>.matches.json` per page, containing
			`* [{ entity_id, class, alias_matched, start, end }] sorted by start.`
			`*`
			`* At runtime we just load it and slice the OCR text.`
			`*/`
			`import fs from "node:fs/promises";`
			`import path from "node:path";`
			`import { WIKI, type EntityClass } from "./wiki";`

			`export interface EntityMatch {`
			`entity_id: string;`
			`class: EntityClass;`
			`alias_matched: string;`
			`start: number;`
			`end: number;`
			`}`

			`export async function readMatches(docId: string, pageStem: string): Promise<EntityMatch[]> {`
			const p = path.join(WIKI, "pages", docId, `${pageStem}.matches.json`);
			`try {`
			`const raw = await fs.readFile(p, "utf-8");`
			`return JSON.parse(raw) as EntityMatch[];`
			`} catch {`
			`return [];`
			`}`
			`}`

			`/**`
			`* Splits text into alternating string + match segments.`
			`* Useful for React rendering: map(seg => seg.isMatch ? <span>...</span> : seg.text)`
			`*/`
			`export interface TextSegment {`
			`isMatch: boolean;`
			`text: string;`
			`match?: EntityMatch;`
			`}`

			`export function segmentText(text: string, matches: EntityMatch[]): TextSegment[] {`
			`if (matches.length === 0) return [{ isMatch: false, text }];`
			`const sorted = [...matches].sort((a, b) => a.start - b.start);`
			`const segs: TextSegment[] = [];`
			`let cursor = 0;`
			`for (const m of sorted) {`
			`if (m.start < cursor) continue; // overlap — skip`
			`if (m.start > cursor) {`
			`segs.push({ isMatch: false, text: text.slice(cursor, m.start) });`
			`}`
			`segs.push({ isMatch: true, text: text.slice(m.start, m.end), match: m });`
			`cursor = m.end;`
			`}`
			`if (cursor < text.length) {`
			`segs.push({ isMatch: false, text: text.slice(cursor) });`
			`}`
			`return segs;`
			`}`