disclosure-bureau/web/lib/doc-renderer.ts

/**
 * Renders a full document — concatenates per-page OCR + interleaves images,
 * tables, and entity highlights based on each page's frontmatter.
 *
 * Returns a structured representation that the React view renders directly.
 */
import fs from "node:fs/promises";
import path from "node:path";
import { PROCESSING, readPage, readOcr, readTable, type MdFile } from "./wiki";
import { readMatches, type EntityMatch } from "./entity-index";
import type {
  AnyFrontmatter, BBox, ImageDetected, TableDetected,
  Redaction, SignatureObserved,
} from "./fm-types";

/**
 * An image region listed in a page's `images_detected` frontmatter, ready to be
 * placed inline with the page's OCR text.
 */
export interface InlineImage {
  /** Discriminant of the {@link Inline} union. */
  kind: "image";
  /** `bbox.y` (0 when the frontmatter has no `y`); the key inline blocks are sorted by. */
  bboxY: number;
  /** Bounding box exactly as recorded in the page frontmatter. */
  bbox: BBox;
  /**
   * URL of the picture to show. `loadFullDocument` sets this to the whole-page
   * PNG; the region itself is expected to be cropped by the consumer via `bbox`.
   */
  src: string;
  /** Caption text taken from the frontmatter's `caption_ocr`, when present. */
  caption?: string;
  /** Value of the frontmatter's `image_type`, when present. */
  imageType?: string;
}
/**
 * A table region listed in a page's `tables_detected` frontmatter, optionally
 * carrying the extracted cell data.
 */
export interface InlineTable {
  /** Discriminant of the {@link Inline} union. */
  kind: "table";
  /** `bbox.y` (0 when the frontmatter has no `y`); the key inline blocks are sorted by. */
  bboxY: number;
  /** Bounding box exactly as recorded in the page frontmatter. */
  bbox: BBox;
  /** Frontmatter `table_id`; absent when the table was detected but never given an id. */
  tableId?: string;
  /** Rows of cells returned by `readTable(tableId)`; undefined when there is no id or no CSV. */
  csv?: string[][];
  /** Frontmatter `row_count_estimate`. */
  rowEstimate?: number;
  /** Frontmatter `col_count_estimate`. */
  colEstimate?: number;
  /** Frontmatter `headers_summary`. */
  headersSummary?: string;
  fallbackCropY: number;     // y to derive crop from page PNG if no CSV
  /** Id of the document the table belongs to. */
  docId: string;
  /** Page number parsed from the page stem (e.g. 7 for "p007"). */
  pageNum: number;
}
/** A redacted region listed in a page's `redactions` frontmatter. */
export interface InlineRedaction {
  /** Discriminant of the {@link Inline} union. */
  kind: "redaction";
  /** `bbox.y` (0 when the frontmatter has no `y`); the key inline blocks are sorted by. */
  bboxY: number;
  /** Bounding box exactly as recorded in the page frontmatter. */
  bbox: BBox;
  /** Frontmatter `code` of the redaction, when present. */
  code?: string;
  /** Frontmatter `description` of the redaction, when present. */
  description?: string;
  /** Id of the document the redaction belongs to. */
  docId: string;
  /** Page number parsed from the page stem (e.g. 7 for "p007"). */
  pageNum: number;
}
/** A signature region listed in a page's `signatures_observed` frontmatter. */
export interface InlineSignature {
  /** Discriminant of the {@link Inline} union. */
  kind: "signature";
  /** `bbox.y` (0 when the frontmatter has no `y`); the key inline blocks are sorted by. */
  bboxY: number;
  /** Bounding box exactly as recorded in the page frontmatter. */
  bbox: BBox;
  /** Frontmatter `signer_inferred`; a null/missing value is normalised to undefined. */
  signer?: string;
  /** Id of the document the signature belongs to. */
  docId: string;
  /** Page number parsed from the page stem (e.g. 7 for "p007"). */
  pageNum: number;
}
/** Any block that can be interleaved with a page's OCR text; discriminated by `kind`. */
export type Inline = InlineImage | InlineTable | InlineRedaction | InlineSignature;

/** Everything the view needs to render a single page of a document. */
export interface RenderedPage {
  pageStem: string;            // "p007"
  /** Numeric form of `pageStem` (7 for "p007"). */
  pageNum: number;
  pageId: string;              // "doc-id/p007"
  /** URL of the full-page PNG (`/api/static/processing/png/<docId>/p-NNN.png`). */
  pngUrl: string;
  ocr: string;                 // raw layout-preserved text
  /** Entity matches returned by `readMatches` for this page. */
  matches: EntityMatch[];
  inline: Inline[];            // images + tables + redactions + signatures, sorted by bboxY
  /** Frontmatter `vision_description`, only when it is a string. */
  visionEn?: string;
  /** Frontmatter `vision_description_pt_br`, only when it is a string. */
  visionPt?: string;
  /** Frontmatter `page_type`, only when it is a string. */
  pageType?: string;
  /** Frontmatter `highest_classification`, only when it is a string. */
  classification?: string;
  /** Frontmatter `content_classification`, only when it is an array. */
  contentClassification?: string[];
  /**
   * Length of the frontmatter `redactions` array (0 when absent). Counts every
   * entry, including ones without a bbox that are left out of `inline`.
   */
  redactionsCount: number;
  /**
   * Length of the frontmatter `signatures_observed` array (0 when absent).
   * Counts every entry, including ones without a bbox that are left out of `inline`.
   */
  signaturesCount: number;
}

/** A whole document: its own frontmatter/body plus every page that could be loaded. */
export interface RenderedDoc {
  /** Identifier the document was loaded by. */
  docId: string;
  /** Frontmatter `canonical_title`; `loadFullDocument` falls back to `docId` when it is missing. */
  canonicalTitle: string;
  /**
   * Number of `pNNN.md` files found in the document's pages directory. Can be
   * larger than `pages.length`, because pages that fail to load are dropped
   * from `pages` but still counted here.
   */
  pageCount: number;
  /** Successfully loaded pages, in ascending page-stem order. */
  pages: RenderedPage[];
  /** Parsed frontmatter of `documents/<id>.md`. */
  frontmatter: AnyFrontmatter;
  documentBody: string;        // markdown body from documents/<id>.md
}

/**
 * Loads one document and all of its pages into a {@link RenderedDoc}.
 *
 * Reads `wiki/documents/<docId>.md` (frontmatter + markdown body) and every
 * `pNNN.md` file in `wiki/pages/<docId>/`, both resolved relative to the parent
 * directory of `PROCESSING`. Pages are loaded concurrently; a page whose
 * markdown cannot be read is dropped from `pages` but is still included in
 * `pageCount`.
 *
 * @param docId - Document identifier. It is used verbatim as a file/directory
 *   name and as a URL path segment.
 * @returns The assembled document, or `null` when the document file cannot be
 *   read or parsed, or when the pages directory cannot be listed.
 */
export async function loadFullDocument(docId: string): Promise<RenderedDoc | null> {
  // NOTE(review): docId is joined into filesystem paths without validation —
  // confirm that callers never pass user-controlled values containing "..".
  const wikiRoot = path.join(PROCESSING, "..", "wiki");

  // Document-level markdown: frontmatter + body.
  let docMd: MdFile;
  try {
    const raw = await fs.readFile(path.join(wikiRoot, "documents", `${docId}.md`), "utf-8");
    const matter = (await import("gray-matter")).default(raw);
    docMd = { fm: matter.data as AnyFrontmatter, body: matter.content };
  } catch {
    // Missing or unparseable document: reported to the caller as "not found".
    return null;
  }

  // Page stems ("p001", "p002", …). Only three-digit names are accepted, so the
  // default lexicographic sort is also the numeric page order.
  let pageStems: string[];
  try {
    const files = await fs.readdir(path.join(wikiRoot, "pages", docId));
    pageStems = files
      .filter((f) => /^p\d{3}\.md$/.test(f))
      .map((f) => f.replace(/\.md$/, ""))
      .sort();
  } catch {
    return null;
  }

  const pages = await Promise.all(
    pageStems.map(async (stem): Promise<RenderedPage | null> => {
      const md = await readPage(docId, stem);
      if (!md) return null;
      const fm = md.fm as AnyFrontmatter;
      const pageNum = parseInt(stem.slice(1), 10);
      const padded = String(pageNum).padStart(3, "0");
      const pngUrl = `/api/static/processing/png/${docId}/p-${padded}.png`;

      // OCR text and entity matches are independent reads, so fetch them together.
      const [ocrText, matches] = await Promise.all([
        readOcr(docId, pageNum),
        readMatches(docId, stem),
      ]);
      const inline = await buildPageInlineBlocks(fm, docId, pageNum, pngUrl);

      return {
        pageStem: stem,
        pageNum,
        pageId: `${docId}/${stem}`,
        pngUrl,
        ocr: ocrText ?? "",
        matches,
        inline,
        visionEn: typeof fm.vision_description === "string" ? fm.vision_description : undefined,
        visionPt: typeof fm.vision_description_pt_br === "string" ? fm.vision_description_pt_br : undefined,
        pageType: typeof fm.page_type === "string" ? fm.page_type : undefined,
        classification: typeof fm.highest_classification === "string" ? fm.highest_classification : undefined,
        contentClassification: Array.isArray(fm.content_classification) ? (fm.content_classification as string[]) : undefined,
        // Counts cover every frontmatter entry, including ones without a bbox
        // that never make it into `inline`.
        redactionsCount: Array.isArray(fm.redactions) ? fm.redactions.length : 0,
        signaturesCount: Array.isArray(fm.signatures_observed) ? fm.signatures_observed.length : 0,
      };
    }),
  );

  // Frontmatter is untyped YAML: only trust the title when it really is a
  // string, mirroring the typeof guards used for the page-level fields.
  const title: unknown = docMd.fm.canonical_title;

  return {
    docId,
    canonicalTitle: typeof title === "string" ? title : docId,
    pageCount: pageStems.length,
    pages: pages.filter((p): p is RenderedPage => p !== null),
    frontmatter: docMd.fm,
    documentBody: docMd.body,
  };
}

/**
 * Collects the inline blocks (images, tables, redactions, signatures) declared
 * in one page's frontmatter, sorted by ascending `bboxY`.
 *
 * Entries without a `bbox` are skipped because they cannot be positioned.
 *
 * @param fm - Frontmatter of the page.
 * @param docId - Id of the owning document, copied onto each block that carries it.
 * @param pageNum - Numeric page number, copied onto each block that carries it.
 * @param pageImageUrl - URL of the whole-page PNG, used as `src` for image blocks.
 */
async function buildPageInlineBlocks(
  fm: AnyFrontmatter,
  docId: string,
  pageNum: number,
  pageImageUrl: string,
): Promise<Inline[]> {
  const inline: Inline[] = [];

  for (const im of (fm.images_detected ?? []) as ImageDetected[]) {
    if (!im.bbox) continue;
    inline.push({
      kind: "image",
      bboxY: im.bbox.y ?? 0,
      bbox: im.bbox,
      // Always the full page image; local_image_index is not used as a URL key
      // (FmBboxThumb uses CSS crop).
      src: pageImageUrl,
      caption: im.caption_ocr,
      imageType: im.image_type,
    });
  }

  for (const t of (fm.tables_detected ?? []) as TableDetected[]) {
    if (!t.bbox) continue;
    let csv: string[][] | undefined;
    if (t.table_id) {
      const { csv: loaded } = await readTable(t.table_id);
      if (loaded) csv = loaded;
    }
    inline.push({
      kind: "table",
      bboxY: t.bbox.y ?? 0,
      bbox: t.bbox,
      tableId: t.table_id,
      csv,
      rowEstimate: t.row_count_estimate,
      colEstimate: t.col_count_estimate,
      headersSummary: t.headers_summary,
      fallbackCropY: t.bbox.y ?? 0,
      docId,
      pageNum,
    });
  }

  for (const r of (fm.redactions ?? []) as Redaction[]) {
    if (!r.bbox) continue;
    inline.push({
      kind: "redaction",
      bboxY: r.bbox.y ?? 0,
      bbox: r.bbox,
      code: r.code,
      description: r.description,
      docId,
      pageNum,
    });
  }

  for (const s of (fm.signatures_observed ?? []) as SignatureObserved[]) {
    if (!s.bbox) continue;
    inline.push({
      kind: "signature",
      bboxY: s.bbox.y ?? 0,
      bbox: s.bbox,
      signer: s.signer_inferred ?? undefined,
      docId,
      pageNum,
    });
  }

  // Array.prototype.sort is stable, so blocks sharing the same y keep the
  // image → table → redaction → signature order they were pushed in.
  inline.sort((a, b) => a.bboxY - b.bboxY);
  return inline;
}

/**
 * Cuts OCR text into `nBlocks` consecutive chunks of lines.
 *
 * Line position stands in for the vertical (Y) position on the page, so the
 * chunks can be interleaved with inline blocks ordered by bbox.y. Every chunk
 * except the last holds `ceil(lineCount / nBlocks)` lines (at least one); the
 * last chunk takes whatever remains, so trailing chunks may be empty strings
 * when there are fewer lines than blocks.
 *
 * @param ocr - Page text with lines separated by "\n".
 * @param nBlocks - Number of chunks wanted; values of 1 or less return the
 *   text unsplit as a single-element array.
 * @returns The chunks in original order, each re-joined with "\n".
 */
export function splitOcrIntoSegments(ocr: string, nBlocks: number): string[] {
  if (nBlocks <= 1) {
    return [ocr];
  }

  const allLines = ocr.split("\n");
  if (allLines.length === 0) {
    return [ocr];
  }

  const chunkSize = Math.max(1, Math.ceil(allLines.length / nBlocks));
  const finalBlock = nBlocks - 1;
  const chunks: string[] = [];

  let cursor = 0;
  for (let block = 0; block < nBlocks; block += 1) {
    // The final chunk absorbs every remaining line instead of a fixed-size window.
    const stop = block === finalBlock ? allLines.length : cursor + chunkSize;
    chunks.push(allLines.slice(cursor, stop).join("\n"));
    cursor += chunkSize;
  }
  return chunks;
}
baseline: Disclosure Bureau pipeline + Next.js UI + Supabase stack 2026-05-18 01:44:36 +00:00			`/**`
			`* Renders a full document — concatenates per-page OCR + interleaves images,`
			`* tables, and entity highlights based on each page's frontmatter.`
			`*`
			`* Returns a structured representation that the React view renders directly.`
			`*/`
			`import fs from "node:fs/promises";`
			`import path from "node:path";`
			`import { PROCESSING, readPage, readOcr, readTable, type MdFile } from "./wiki";`
			`import { readMatches, type EntityMatch } from "./entity-index";`
			`import type {`
			`AnyFrontmatter, BBox, ImageDetected, TableDetected,`
			`Redaction, SignatureObserved,`
			`} from "./fm-types";`

			`export interface InlineImage {`
			`kind: "image";`
			`bboxY: number;`
			`bbox: BBox;`
			`src: string;`
			`caption?: string;`
			`imageType?: string;`
			`}`
			`export interface InlineTable {`
			`kind: "table";`
			`bboxY: number;`
			`bbox: BBox;`
			`tableId?: string;`
			`csv?: string[][];`
			`rowEstimate?: number;`
			`colEstimate?: number;`
			`headersSummary?: string;`
			`fallbackCropY: number; // y to derive crop from page PNG if no CSV`
			`docId: string;`
			`pageNum: number;`
			`}`
			`export interface InlineRedaction {`
			`kind: "redaction";`
			`bboxY: number;`
			`bbox: BBox;`
			`code?: string;`
			`description?: string;`
			`docId: string;`
			`pageNum: number;`
			`}`
			`export interface InlineSignature {`
			`kind: "signature";`
			`bboxY: number;`
			`bbox: BBox;`
			`signer?: string;`
			`docId: string;`
			`pageNum: number;`
			`}`
			`export type Inline = InlineImage \| InlineTable \| InlineRedaction \| InlineSignature;`

			`export interface RenderedPage {`
			`pageStem: string; // "p007"`
			`pageNum: number;`
			`pageId: string; // "doc-id/p007"`
			`pngUrl: string;`
			`ocr: string; // raw layout-preserved text`
			`matches: EntityMatch[];`
			`inline: Inline[]; // images + tables + redactions + signatures, sorted by bboxY`
			`visionEn?: string;`
			`visionPt?: string;`
			`pageType?: string;`
			`classification?: string;`
			`contentClassification?: string[];`
			`redactionsCount: number;`
			`signaturesCount: number;`
			`}`

			`export interface RenderedDoc {`
			`docId: string;`
			`canonicalTitle: string;`
			`pageCount: number;`
			`pages: RenderedPage[];`
			`frontmatter: AnyFrontmatter;`
			`documentBody: string; // markdown body from documents/<id>.md`
			`}`

			`/** Find all the OCR pages for a doc + assemble into a RenderedDoc. */`
			`export async function loadFullDocument(docId: string): Promise<RenderedDoc \| null> {`
			`// Read document.md`
			const docPath = path.join(PROCESSING, "..", "wiki", "documents", `${docId}.md`);
			`let docMd: MdFile;`
			`try {`
			`const raw = await fs.readFile(docPath, "utf-8");`
			`const matter = (await import("gray-matter")).default(raw);`
			`docMd = { fm: matter.data as AnyFrontmatter, body: matter.content };`
			`} catch {`
			`return null;`
			`}`

			`const pagesDir = path.join(PROCESSING, "..", "wiki", "pages", docId);`
			`let pageStems: string[];`
			`try {`
			`const files = await fs.readdir(pagesDir);`
			`pageStems = files`
			`.filter((f) => /^p\d{3}\.md$/.test(f))`
			`.map((f) => f.replace(/\.md$/, ""))`
			`.sort();`
			`} catch {`
			`return null;`
			`}`

			`const pages = await Promise.all(`
			`pageStems.map(async (stem): Promise<RenderedPage \| null> => {`
			`const md = await readPage(docId, stem);`
			`if (!md) return null;`
			`const fm = md.fm as AnyFrontmatter;`
			`const pageNum = parseInt(stem.replace("p", ""), 10);`
			`const padded = String(pageNum).padStart(3, "0");`
			`const ocr = (await readOcr(docId, pageNum)) ?? "";`
			`const matches = await readMatches(docId, stem);`

			`const inline: Inline[] = [];`

			`for (const im of (fm.images_detected ?? []) as ImageDetected[]) {`
			`if (!im.bbox) continue;`
			`const idx = im.local_image_index ?? 1;`
			`inline.push({`
			`kind: "image",`
			`bboxY: im.bbox.y ?? 0,`
			`bbox: im.bbox,`
			src: `/api/static/processing/png/${docId}/p-${padded}.png`,
			`caption: im.caption_ocr,`
			`imageType: im.image_type,`
			`});`
			`// local_image_index recorded but not used as URL key (FmBboxThumb uses CSS crop)`
			`void idx;`
			`}`
			`for (const t of (fm.tables_detected ?? []) as TableDetected[]) {`
			`if (!t.bbox) continue;`
			`let csv: string[][] \| undefined;`
			`if (t.table_id) {`
			`const { csv: c } = await readTable(t.table_id);`
			`if (c) csv = c;`
			`}`
			`inline.push({`
			`kind: "table",`
			`bboxY: t.bbox.y ?? 0,`
			`bbox: t.bbox,`
			`tableId: t.table_id,`
			`csv,`
			`rowEstimate: t.row_count_estimate,`
			`colEstimate: t.col_count_estimate,`
			`headersSummary: t.headers_summary,`
			`fallbackCropY: t.bbox.y ?? 0,`
			`docId,`
			`pageNum,`
			`});`
			`}`
			`for (const r of (fm.redactions ?? []) as Redaction[]) {`
			`if (!r.bbox) continue;`
			`inline.push({`
			`kind: "redaction",`
			`bboxY: r.bbox.y ?? 0,`
			`bbox: r.bbox,`
			`code: r.code,`
			`description: r.description,`
			`docId,`
			`pageNum,`
			`});`
			`}`
			`for (const s of (fm.signatures_observed ?? []) as SignatureObserved[]) {`
			`if (!s.bbox) continue;`
			`inline.push({`
			`kind: "signature",`
			`bboxY: s.bbox.y ?? 0,`
			`bbox: s.bbox,`
			`signer: s.signer_inferred ?? undefined,`
			`docId,`
			`pageNum,`
			`});`
			`}`
			`inline.sort((a, b) => a.bboxY - b.bboxY);`

			`return {`
			`pageStem: stem,`
			`pageNum,`
			pageId: `${docId}/${stem}`,
			pngUrl: `/api/static/processing/png/${docId}/p-${padded}.png`,
			`ocr,`
			`matches,`
			`inline,`
			`visionEn: typeof fm.vision_description === "string" ? fm.vision_description : undefined,`
			`visionPt: typeof fm.vision_description_pt_br === "string" ? fm.vision_description_pt_br : undefined,`
			`pageType: typeof fm.page_type === "string" ? fm.page_type : undefined,`
			`classification: typeof fm.highest_classification === "string" ? fm.highest_classification : undefined,`
			`contentClassification: Array.isArray(fm.content_classification) ? (fm.content_classification as string[]) : undefined,`
			`redactionsCount: Array.isArray(fm.redactions) ? fm.redactions.length : 0,`
			`signaturesCount: Array.isArray(fm.signatures_observed) ? fm.signatures_observed.length : 0,`
			`};`
			`}),`
			`);`

			`return {`
			`docId,`
			`canonicalTitle: (docMd.fm.canonical_title as string) ?? docId,`
			`pageCount: pageStems.length,`
			`pages: pages.filter((p): p is RenderedPage => p !== null),`
			`frontmatter: docMd.fm,`
			`documentBody: docMd.body,`
			`};`
			`}`

			`/**`
			`* Splits OCR text into N segments (by approximate Y coordinate based on`
			`* line count). Used to interleave inline blocks at their bbox.y.`
			`*/`
			`export function splitOcrIntoSegments(ocr: string, nBlocks: number): string[] {`
			`if (nBlocks <= 1) return [ocr];`
			`const lines = ocr.split("\n");`
			`if (lines.length === 0) return [ocr];`
			`const segments: string[] = [];`
			`const linesPerSeg = Math.max(1, Math.ceil(lines.length / nBlocks));`
			`for (let i = 0; i < nBlocks; i++) {`
			`const start = i * linesPerSeg;`
			`const end = i === nBlocks - 1 ? lines.length : (i + 1) * linesPerSeg;`
			`segments.push(lines.slice(start, end).join("\n"));`
			`}`
			`return segments;`
			`}`