disclosure-bureau/web/lib/doc-renderer.ts

225 lines
7 KiB
TypeScript
Raw Normal View History

/**
* Renders a full document: concatenates per-page OCR and interleaves images,
* tables, and entity highlights based on each page's frontmatter.
*
* Returns a structured representation that the React view renders directly.
*/
import fs from "node:fs/promises";
import path from "node:path";
import { PROCESSING, readPage, readOcr, readTable, type MdFile } from "./wiki";
import { readMatches, type EntityMatch } from "./entity-index";
import type {
AnyFrontmatter, BBox, ImageDetected, TableDetected,
Redaction, SignatureObserved,
} from "./fm-types";
/**
* Inline block for an image region detected on a page.
* Built from a frontmatter `images_detected` entry that carries a bbox.
*/
export interface InlineImage {
kind: "image";
// bbox.y of the region, or 0 when y is missing; inline blocks are sorted by this value.
bboxY: number;
bbox: BBox;
// URL of the full page PNG (not a pre-cropped image).
src: string;
// Copied from the entry's `caption_ocr`.
caption?: string;
// Copied from the entry's `image_type`.
imageType?: string;
}
/**
* Inline block for a table region detected on a page.
* Built from a frontmatter `tables_detected` entry that carries a bbox.
*/
export interface InlineTable {
kind: "table";
// bbox.y of the region, or 0 when y is missing; inline blocks are sorted by this value.
bboxY: number;
bbox: BBox;
// Copied from the entry's `table_id`.
tableId?: string;
// Rows returned by readTable(table_id); left undefined when there is no id or no CSV.
csv?: string[][];
// Copied from the entry's `row_count_estimate`.
rowEstimate?: number;
// Copied from the entry's `col_count_estimate`.
colEstimate?: number;
// Copied from the entry's `headers_summary`.
headersSummary?: string;
fallbackCropY: number; // y to derive crop from page PNG if no CSV
docId: string;
pageNum: number;
}
/**
* Inline block for a redacted region on a page.
* Built from a frontmatter `redactions` entry that carries a bbox.
*/
export interface InlineRedaction {
kind: "redaction";
// bbox.y of the region, or 0 when y is missing; inline blocks are sorted by this value.
bboxY: number;
bbox: BBox;
// Copied from the entry's `code`.
code?: string;
// Copied from the entry's `description`.
description?: string;
docId: string;
pageNum: number;
}
/**
* Inline block for a signature observed on a page.
* Built from a frontmatter `signatures_observed` entry that carries a bbox.
*/
export interface InlineSignature {
kind: "signature";
// bbox.y of the region, or 0 when y is missing; inline blocks are sorted by this value.
bboxY: number;
bbox: BBox;
// Copied from the entry's `signer_inferred`; a null value becomes undefined.
signer?: string;
docId: string;
pageNum: number;
}
/** Any block positioned inside a page, discriminated by its `kind` field. */
export type Inline = InlineImage | InlineTable | InlineRedaction | InlineSignature;
/**
* Everything needed to display one page: OCR text, entity matches, inline
* blocks, and a few scalar fields lifted from the page frontmatter.
*/
export interface RenderedPage {
pageStem: string; // "p007"
// Integer parsed from pageStem with the leading "p" removed.
pageNum: number;
pageId: string; // "doc-id/p007"
// URL of the full page PNG.
pngUrl: string;
ocr: string; // raw layout-preserved text
matches: EntityMatch[];
inline: Inline[]; // images + tables + redactions + signatures, sorted by bboxY
// Frontmatter `vision_description`, kept only when it is a string.
visionEn?: string;
// Frontmatter `vision_description_pt_br`, kept only when it is a string.
visionPt?: string;
// Frontmatter `page_type`, kept only when it is a string.
pageType?: string;
// Frontmatter `highest_classification`, kept only when it is a string.
classification?: string;
// Frontmatter `content_classification`, kept only when it is an array.
contentClassification?: string[];
// Length of frontmatter `redactions`; counts entries without a bbox too, unlike `inline`.
redactionsCount: number;
// Length of frontmatter `signatures_observed`; counts entries without a bbox too, unlike `inline`.
signaturesCount: number;
}
/** A whole document assembled for rendering: document metadata plus its pages. */
export interface RenderedDoc {
docId: string;
// Frontmatter `canonical_title` of the document, falling back to docId.
canonicalTitle: string;
// Number of page markdown files found; may exceed pages.length if a page could not be read.
pageCount: number;
pages: RenderedPage[];
frontmatter: AnyFrontmatter;
documentBody: string; // markdown body from documents/<id>.md
}
/**
 * Collects the bbox-carrying entries of a page's frontmatter (images, tables,
 * redactions, signatures) as inline blocks, ordered by ascending `bboxY`.
 * Entries without a `bbox` are skipped.
 */
async function collectInlineBlocks(
  fm: AnyFrontmatter,
  docId: string,
  pageNum: number,
  pngUrl: string,
): Promise<Inline[]> {
  const inline: Inline[] = [];

  for (const im of (fm.images_detected ?? []) as ImageDetected[]) {
    if (!im.bbox) continue;
    inline.push({
      kind: "image",
      bboxY: im.bbox.y ?? 0,
      bbox: im.bbox,
      // Whole-page PNG: local_image_index is deliberately not used as a URL
      // key (FmBboxThumb uses CSS crop).
      src: pngUrl,
      caption: im.caption_ocr,
      imageType: im.image_type,
    });
  }

  for (const t of (fm.tables_detected ?? []) as TableDetected[]) {
    if (!t.bbox) continue;
    let csv: string[][] | undefined;
    if (t.table_id) {
      const { csv: c } = await readTable(t.table_id);
      if (c) csv = c;
    }
    inline.push({
      kind: "table",
      bboxY: t.bbox.y ?? 0,
      bbox: t.bbox,
      tableId: t.table_id,
      csv,
      rowEstimate: t.row_count_estimate,
      colEstimate: t.col_count_estimate,
      headersSummary: t.headers_summary,
      fallbackCropY: t.bbox.y ?? 0,
      docId,
      pageNum,
    });
  }

  for (const r of (fm.redactions ?? []) as Redaction[]) {
    if (!r.bbox) continue;
    inline.push({
      kind: "redaction",
      bboxY: r.bbox.y ?? 0,
      bbox: r.bbox,
      code: r.code,
      description: r.description,
      docId,
      pageNum,
    });
  }

  for (const s of (fm.signatures_observed ?? []) as SignatureObserved[]) {
    if (!s.bbox) continue;
    inline.push({
      kind: "signature",
      bboxY: s.bbox.y ?? 0,
      bbox: s.bbox,
      signer: s.signer_inferred ?? undefined,
      docId,
      pageNum,
    });
  }

  inline.sort((a, b) => a.bboxY - b.bboxY);
  return inline;
}

/**
 * Finds all the pages of a document and assembles them into a RenderedDoc.
 *
 * Reads `wiki/documents/<docId>.md` (frontmatter + body), lists the `pNNN.md`
 * files under `wiki/pages/<docId>/`, and builds one RenderedPage per file in
 * parallel. Pages whose markdown cannot be read are dropped from `pages`, but
 * `pageCount` still reflects the number of page files found.
 *
 * @param docId - Document identifier; used verbatim as a path and URL segment.
 * @returns The assembled document, or `null` when the document markdown or
 *   the pages directory cannot be read.
 */
export async function loadFullDocument(docId: string): Promise<RenderedDoc | null> {
  // NOTE(review): docId is joined into filesystem paths without validation —
  // confirm callers never pass untrusted values containing ".." or separators.
  const wikiRoot = path.join(PROCESSING, "..", "wiki");

  // Read document.md
  let docMd: MdFile;
  try {
    const raw = await fs.readFile(path.join(wikiRoot, "documents", `${docId}.md`), "utf-8");
    const matter = (await import("gray-matter")).default(raw);
    docMd = { fm: matter.data as AnyFrontmatter, body: matter.content };
  } catch {
    // Any read or parse failure is reported to the caller as "no document".
    return null;
  }

  let pageStems: string[];
  try {
    const files = await fs.readdir(path.join(wikiRoot, "pages", docId));
    pageStems = files
      .filter((f) => /^p\d{3}\.md$/.test(f))
      .map((f) => f.replace(/\.md$/, ""))
      .sort();
  } catch {
    return null;
  }

  const pages = await Promise.all(
    pageStems.map(async (stem): Promise<RenderedPage | null> => {
      const md = await readPage(docId, stem);
      if (!md) return null;
      const fm = md.fm as AnyFrontmatter;
      const pageNum = parseInt(stem.replace("p", ""), 10);
      const padded = String(pageNum).padStart(3, "0");
      // Built once; shared by the page itself and every inline image on it.
      const pngUrl = `/api/static/processing/png/${docId}/p-${padded}.png`;
      const ocr = (await readOcr(docId, pageNum)) ?? "";
      const matches = await readMatches(docId, stem);
      const inline = await collectInlineBlocks(fm, docId, pageNum, pngUrl);
      return {
        pageStem: stem,
        pageNum,
        pageId: `${docId}/${stem}`,
        pngUrl,
        ocr,
        matches,
        inline,
        visionEn: typeof fm.vision_description === "string" ? fm.vision_description : undefined,
        visionPt: typeof fm.vision_description_pt_br === "string" ? fm.vision_description_pt_br : undefined,
        pageType: typeof fm.page_type === "string" ? fm.page_type : undefined,
        classification: typeof fm.highest_classification === "string" ? fm.highest_classification : undefined,
        contentClassification: Array.isArray(fm.content_classification) ? (fm.content_classification as string[]) : undefined,
        redactionsCount: Array.isArray(fm.redactions) ? fm.redactions.length : 0,
        signaturesCount: Array.isArray(fm.signatures_observed) ? fm.signatures_observed.length : 0,
      };
    }),
  );

  // Checked at runtime (like the page scalars above) so a non-string title in
  // the frontmatter cannot leak out typed as `string`.
  const title = docMd.fm.canonical_title;
  return {
    docId,
    canonicalTitle: typeof title === "string" ? title : docId,
    pageCount: pageStems.length,
    pages: pages.filter((p): p is RenderedPage => p !== null),
    frontmatter: docMd.fm,
    documentBody: docMd.body,
  };
}
/**
 * Cuts OCR text into `nBlocks` consecutive runs of lines so inline blocks can
 * be interleaved between them (line position stands in for the Y coordinate).
 *
 * Every run but the last holds `ceil(lineCount / nBlocks)` lines (at least
 * one); the last run takes whatever remains. Runs past the end of the text
 * are empty strings, so the result always has `nBlocks` entries when
 * `nBlocks` is an integer greater than 1.
 *
 * @param ocr - Text to split on "\n".
 * @param nBlocks - Number of segments wanted; values <= 1 return `[ocr]`.
 * @returns The segments, in order, each re-joined with "\n".
 */
export function splitOcrIntoSegments(ocr: string, nBlocks: number): string[] {
  if (nBlocks <= 1) return [ocr];

  // split() always yields at least one element, so there is no empty case.
  const rows = ocr.split("\n");
  const chunkSize = Math.max(1, Math.ceil(rows.length / nBlocks));

  const out: string[] = [];
  let idx = 0;
  while (idx < nBlocks) {
    const from = idx * chunkSize;
    const isLast = idx === nBlocks - 1;
    // The final segment absorbs the remainder instead of a fixed-size window.
    const to = isLast ? rows.length : from + chunkSize;
    out.push(rows.slice(from, to).join("\n"));
    idx += 1;
  }
  return out;
}