/**
 * Renders a full document — concatenates per-page OCR + interleaves images,
 * tables, and entity highlights based on each page's frontmatter.
 *
 * Returns a structured representation that the React view renders directly.
 */
import fs from "node:fs/promises";
import path from "node:path";
import { PROCESSING, readPage, readOcr, readTable, type MdFile } from "./wiki";
import { readMatches, type EntityMatch } from "./entity-index";
import type {
  AnyFrontmatter,
  BBox,
  ImageDetected,
  TableDetected,
  Redaction,
  SignatureObserved,
} from "./fm-types";

/** An image region detected on a page. `src` is the full-page PNG URL. */
export interface InlineImage {
  kind: "image";
  /** Copy of `bbox.y` (0 when absent); the key inline blocks are sorted by. */
  bboxY: number;
  bbox: BBox;
  src: string;
  caption?: string;
  imageType?: string;
}

/** A table region detected on a page, with its CSV rows when one could be read. */
export interface InlineTable {
  kind: "table";
  /** Copy of `bbox.y` (0 when absent); the key inline blocks are sorted by. */
  bboxY: number;
  bbox: BBox;
  tableId?: string;
  csv?: string[][];
  rowEstimate?: number;
  colEstimate?: number;
  headersSummary?: string;
  fallbackCropY: number; // y to derive crop from page PNG if no CSV
  docId: string;
  pageNum: number;
}

/** A redacted region recorded in a page's frontmatter. */
export interface InlineRedaction {
  kind: "redaction";
  /** Copy of `bbox.y` (0 when absent); the key inline blocks are sorted by. */
  bboxY: number;
  bbox: BBox;
  code?: string;
  description?: string;
  docId: string;
  pageNum: number;
}

/** A signature region recorded in a page's frontmatter. */
export interface InlineSignature {
  kind: "signature";
  /** Copy of `bbox.y` (0 when absent); the key inline blocks are sorted by. */
  bboxY: number;
  bbox: BBox;
  signer?: string;
  docId: string;
  pageNum: number;
}

/** Any block placed inline on a page; discriminated by `kind`. */
export type Inline = InlineImage | InlineTable | InlineRedaction | InlineSignature;

/** Everything needed to render one page of a document. */
export interface RenderedPage {
  pageStem: string; // "p007"
  pageNum: number;
  pageId: string; // "doc-id/p007"
  pngUrl: string;
  ocr: string; // raw layout-preserved text
  matches: EntityMatch[];
  inline: Inline[]; // images + tables + redactions + signatures, sorted by bboxY
  visionEn?: string;
  visionPt?: string;
  pageType?: string;
  classification?: string;
  contentClassification?: string[];
  /** Length of the frontmatter `redactions` array, including entries without a bbox. */
  redactionsCount: number;
  /** Length of the frontmatter `signatures_observed` array, including entries without a bbox. */
  signaturesCount: number;
}

/** A whole document: its frontmatter, markdown body, and rendered pages. */
export interface RenderedDoc {
  docId: string;
  canonicalTitle: string;
  /**
   * Number of page files found on disk. May exceed `pages.length` when a
   * listed page could not be read.
   */
  pageCount: number;
  pages: RenderedPage[];
  frontmatter: AnyFrontmatter;
  documentBody: string; // markdown body from documents/<docId>.md
}

/** Builds the URL of the rendered PNG for one page (`paddedPageNum` is zero-padded). */
function pagePngUrl(docId: string, paddedPageNum: string): string {
  return `/api/static/processing/png/${docId}/p-${paddedPageNum}.png`;
}

/**
 * Collects the inline blocks (images, tables, redactions, signatures) declared
 * in a page's frontmatter, sorted top-to-bottom by `bboxY`.
 *
 * Entries without a `bbox` are skipped. Blocks are gathered in the order
 * images → tables → redactions → signatures; because `Array.prototype.sort`
 * is stable, blocks sharing the same `bboxY` keep that relative order.
 */
async function collectInlineBlocks(
  fm: AnyFrontmatter,
  docId: string,
  pageNum: number,
  pngUrl: string,
): Promise<Inline[]> {
  const inline: Inline[] = [];

  for (const im of (fm.images_detected ?? []) as ImageDetected[]) {
    if (!im.bbox) continue;
    // `local_image_index` is recorded in the frontmatter but is not used as a
    // URL key: the image points at the full page PNG (per the original note,
    // FmBboxThumb crops it with CSS).
    inline.push({
      kind: "image",
      bboxY: im.bbox.y ?? 0,
      bbox: im.bbox,
      src: pngUrl,
      caption: im.caption_ocr,
      imageType: im.image_type,
    });
  }

  // Table CSVs are independent reads, so fetch them concurrently.
  // Promise.all preserves the frontmatter order of the results.
  const tables = await Promise.all(
    ((fm.tables_detected ?? []) as TableDetected[]).map(
      async (t): Promise<InlineTable | null> => {
        if (!t.bbox) return null;
        let csv: string[][] | undefined;
        if (t.table_id) {
          const { csv: rows } = await readTable(t.table_id);
          if (rows) csv = rows;
        }
        return {
          kind: "table",
          bboxY: t.bbox.y ?? 0,
          bbox: t.bbox,
          tableId: t.table_id,
          csv,
          rowEstimate: t.row_count_estimate,
          colEstimate: t.col_count_estimate,
          headersSummary: t.headers_summary,
          fallbackCropY: t.bbox.y ?? 0,
          docId,
          pageNum,
        };
      },
    ),
  );
  for (const table of tables) {
    if (table !== null) inline.push(table);
  }

  for (const r of (fm.redactions ?? []) as Redaction[]) {
    if (!r.bbox) continue;
    inline.push({
      kind: "redaction",
      bboxY: r.bbox.y ?? 0,
      bbox: r.bbox,
      code: r.code,
      description: r.description,
      docId,
      pageNum,
    });
  }

  for (const s of (fm.signatures_observed ?? []) as SignatureObserved[]) {
    if (!s.bbox) continue;
    inline.push({
      kind: "signature",
      bboxY: s.bbox.y ?? 0,
      bbox: s.bbox,
      signer: s.signer_inferred ?? undefined,
      docId,
      pageNum,
    });
  }

  inline.sort((a, b) => a.bboxY - b.bboxY);
  return inline;
}

/**
 * Find all the OCR pages for a doc + assemble into a RenderedDoc.
 *
 * Reads `wiki/documents/<docId>.md` for the document frontmatter and body,
 * then every `pNNN.md` file (exactly three digits) under `wiki/pages/<docId>/`.
 *
 * @param docId - Document identifier; used verbatim as a path segment.
 * @returns The assembled document, or `null` when the document file cannot be
 *   read or parsed, or when its pages directory cannot be listed. Pages whose
 *   markdown cannot be loaded are omitted from `pages`, but still counted in
 *   `pageCount`.
 */
export async function loadFullDocument(docId: string): Promise<RenderedDoc | null> {
  const wikiRoot = path.join(PROCESSING, "..", "wiki");

  // Read document.md
  let docMd: MdFile;
  try {
    const raw = await fs.readFile(path.join(wikiRoot, "documents", `${docId}.md`), "utf-8");
    const matter = (await import("gray-matter")).default(raw);
    docMd = { fm: matter.data as AnyFrontmatter, body: matter.content };
  } catch {
    // Best-effort: any read or parse failure is reported as "no such document".
    return null;
  }

  let pageStems: string[];
  try {
    const files = await fs.readdir(path.join(wikiRoot, "pages", docId));
    pageStems = files
      .filter((f) => /^p\d{3}\.md$/.test(f))
      .map((f) => f.replace(/\.md$/, ""))
      .sort(); // fixed-width stems, so lexicographic order is page order
  } catch {
    return null;
  }

  const pages = await Promise.all(
    pageStems.map(async (stem): Promise<RenderedPage | null> => {
      const md = await readPage(docId, stem);
      if (!md) return null;

      const fm = md.fm as AnyFrontmatter;
      const pageNum = parseInt(stem.slice(1), 10); // drop the leading "p"
      const pngUrl = pagePngUrl(docId, String(pageNum).padStart(3, "0"));

      // OCR text, entity matches, and inline blocks do not depend on each
      // other, so load them concurrently.
      const [ocrText, matches, inline] = await Promise.all([
        readOcr(docId, pageNum),
        readMatches(docId, stem),
        collectInlineBlocks(fm, docId, pageNum, pngUrl),
      ]);

      return {
        pageStem: stem,
        pageNum,
        pageId: `${docId}/${stem}`,
        pngUrl,
        ocr: ocrText ?? "",
        matches,
        inline,
        visionEn: typeof fm.vision_description === "string" ? fm.vision_description : undefined,
        visionPt:
          typeof fm.vision_description_pt_br === "string"
            ? fm.vision_description_pt_br
            : undefined,
        pageType: typeof fm.page_type === "string" ? fm.page_type : undefined,
        classification:
          typeof fm.highest_classification === "string" ? fm.highest_classification : undefined,
        contentClassification: Array.isArray(fm.content_classification)
          ? (fm.content_classification as string[])
          : undefined,
        redactionsCount: Array.isArray(fm.redactions) ? fm.redactions.length : 0,
        signaturesCount: Array.isArray(fm.signatures_observed)
          ? fm.signatures_observed.length
          : 0,
      };
    }),
  );

  // Validate rather than cast, so a non-string title falls back to the doc id.
  const title = docMd.fm.canonical_title;

  return {
    docId,
    canonicalTitle: typeof title === "string" ? title : docId,
    pageCount: pageStems.length,
    pages: pages.filter((p): p is RenderedPage => p !== null),
    frontmatter: docMd.fm,
    documentBody: docMd.body,
  };
}

/**
 * Splits OCR text into N segments (by approximate Y coordinate based on
 * line count). Used to interleave inline blocks at their bbox.y.
 *
 * Each segment holds `ceil(lineCount / nBlocks)` consecutive lines, and the
 * last segment takes whatever remains. When there are fewer lines than the
 * split needs, trailing segments are empty strings. Joining the segments with
 * `"\n"` does not always reproduce the input exactly, because empty trailing
 * segments add extra newlines.
 *
 * @param ocr - Raw OCR text; lines are separated by `"\n"`.
 * @param nBlocks - Number of segments wanted.
 * @returns `nBlocks` segments, or `[ocr]` unchanged when `nBlocks` is `<= 1`
 *   or not a finite number.
 */
export function splitOcrIntoSegments(ocr: string, nBlocks: number): string[] {
  // A NaN count would produce no segments at all (dropping the text) and an
  // infinite count would never terminate, so treat both as "do not split".
  if (!Number.isFinite(nBlocks) || nBlocks <= 1) return [ocr];

  // `split` always yields at least one element, so `lines` is never empty.
  const lines = ocr.split("\n");
  const linesPerSeg = Math.max(1, Math.ceil(lines.length / nBlocks));

  const segments: string[] = [];
  for (let i = 0; i < nBlocks; i++) {
    const start = i * linesPerSeg;
    const end = i === nBlocks - 1 ? lines.length : start + linesPerSeg;
    segments.push(lines.slice(start, end).join("\n"));
  }
  return segments;
}