225 lines
7 KiB
TypeScript
225 lines
7 KiB
TypeScript
|
|
/**
 * Renders a full document — concatenates per-page OCR + interleaves images,
 * tables, and entity highlights based on each page's frontmatter.
 *
 * Returns a structured representation that the React view renders directly.
 */
|
||
|
|
import fs from "node:fs/promises";
|
||
|
|
import path from "node:path";
|
||
|
|
import { PROCESSING, readPage, readOcr, readTable, type MdFile } from "./wiki";
|
||
|
|
import { readMatches, type EntityMatch } from "./entity-index";
|
||
|
|
import type {
|
||
|
|
AnyFrontmatter, BBox, ImageDetected, TableDetected,
|
||
|
|
Redaction, SignatureObserved,
|
||
|
|
} from "./fm-types";
|
||
|
|
|
||
|
|
/**
 * An image region detected on a page, positioned by its bounding box.
 *
 * `loadFullDocument` emits one of these per `images_detected` frontmatter
 * entry that carries a bbox.
 */
export interface InlineImage {
  /** Discriminant for the `Inline` union. */
  kind: "image";
  /** Bounding box of the image as recorded in the page frontmatter. */
  bbox: BBox;
  /** Sort key: the bbox's `y`, or 0 when the frontmatter has no `y`. */
  bboxY: number;
  /** URL the view loads; `loadFullDocument` sets this to the page PNG URL. */
  src: string;
  /** Caption text from the frontmatter (`caption_ocr`), when present. */
  caption?: string;
  /** Image category from the frontmatter (`image_type`), when present. */
  imageType?: string;
}
|
||
|
|
/**
 * A table region detected on a page, optionally accompanied by its
 * extracted cell data.
 */
export interface InlineTable {
  /** Discriminant for the `Inline` union. */
  kind: "table";
  /** Document the table belongs to. */
  docId: string;
  /** Page number the table was found on. */
  pageNum: number;
  /** Bounding box of the table as recorded in the page frontmatter. */
  bbox: BBox;
  /** Sort key: the bbox's `y`, or 0 when the frontmatter has no `y`. */
  bboxY: number;
  /** Y coordinate used to derive a crop from the page PNG when no CSV exists. */
  fallbackCropY: number;
  /** Identifier of the extracted table (`table_id`), when present. */
  tableId?: string;
  /** Extracted rows of cells; absent when there is no table id or no CSV data. */
  csv?: string[][];
  /** Estimated row count from the frontmatter (`row_count_estimate`). */
  rowEstimate?: number;
  /** Estimated column count from the frontmatter (`col_count_estimate`). */
  colEstimate?: number;
  /** Header summary from the frontmatter (`headers_summary`). */
  headersSummary?: string;
}
|
||
|
|
/** A redacted region on a page, positioned by its bounding box. */
export interface InlineRedaction {
  /** Discriminant for the `Inline` union. */
  kind: "redaction";
  /** Document the redaction belongs to. */
  docId: string;
  /** Page number the redaction was found on. */
  pageNum: number;
  /** Bounding box of the redaction as recorded in the page frontmatter. */
  bbox: BBox;
  /** Sort key: the bbox's `y`, or 0 when the frontmatter has no `y`. */
  bboxY: number;
  /** Redaction code from the frontmatter (`code`), when present. */
  code?: string;
  /** Description from the frontmatter (`description`), when present. */
  description?: string;
}
|
||
|
|
/** A signature observed on a page, positioned by its bounding box. */
export interface InlineSignature {
  /** Discriminant for the `Inline` union. */
  kind: "signature";
  /** Document the signature belongs to. */
  docId: string;
  /** Page number the signature was found on. */
  pageNum: number;
  /** Bounding box of the signature as recorded in the page frontmatter. */
  bbox: BBox;
  /** Sort key: the bbox's `y`, or 0 when the frontmatter has no `y`. */
  bboxY: number;
  /** Inferred signer from the frontmatter (`signer_inferred`), when present. */
  signer?: string;
}
|
||
|
|
/** Any block interleaved with a page's OCR text; discriminated by `kind`. */
export type Inline =
  | InlineImage
  | InlineTable
  | InlineRedaction
  | InlineSignature;
|
||
|
|
|
||
|
|
/** Everything the view needs to render a single page of a document. */
export interface RenderedPage {
  /** Page file stem, e.g. "p007". */
  pageStem: string;
  /** Page number parsed from the stem. */
  pageNum: number;
  /** Document-qualified page id, e.g. "doc-id/p007". */
  pageId: string;
  /** URL of the page's PNG. */
  pngUrl: string;
  /** Raw layout-preserved OCR text; empty string when no OCR was read. */
  ocr: string;
  /** Entity matches recorded for this page. */
  matches: EntityMatch[];
  /** Images, tables, redactions and signatures, sorted by `bboxY`. */
  inline: Inline[];
  /** Frontmatter `vision_description`, when it is a string. */
  visionEn?: string;
  /** Frontmatter `vision_description_pt_br`, when it is a string. */
  visionPt?: string;
  /** Frontmatter `page_type`, when it is a string. */
  pageType?: string;
  /** Frontmatter `highest_classification`, when it is a string. */
  classification?: string;
  /** Frontmatter `content_classification`, when it is an array. */
  contentClassification?: string[];
  /** Number of frontmatter `redactions` entries (0 when not an array). */
  redactionsCount: number;
  /** Number of frontmatter `signatures_observed` entries (0 when not an array). */
  signaturesCount: number;
}
|
||
|
|
|
||
|
|
/** A whole document assembled for rendering: metadata plus its pages. */
export interface RenderedDoc {
  /** Identifier the document was loaded by. */
  docId: string;
  /** Display title; `loadFullDocument` falls back to `docId` when none is set. */
  canonicalTitle: string;
  /** Parsed frontmatter of the document's markdown file. */
  frontmatter: AnyFrontmatter;
  /** Markdown body from documents/<id>.md. */
  documentBody: string;
  /** Number of page files found for the document. */
  pageCount: number;
  /** Successfully loaded pages, in page-stem order. */
  pages: RenderedPage[];
}
|
||
|
|
|
||
|
|
/** Builds the URL of a page's rendered PNG from its zero-padded page number. */
function pagePngUrl(docId: string, paddedPageNum: string): string {
  return `/api/static/processing/png/${docId}/p-${paddedPageNum}.png`;
}

/**
 * Collects the inline blocks (images, tables, redactions, signatures) declared
 * in a page's frontmatter and returns them sorted by `bboxY`.
 *
 * Entries without a `bbox` are skipped. Blocks are pushed in the fixed order
 * images → tables → redactions → signatures before sorting, so entries that
 * share a `bboxY` keep that relative order (Array.prototype.sort is stable).
 */
async function collectInlineBlocks(
  fm: AnyFrontmatter,
  docId: string,
  pageNum: number,
  pngUrl: string,
): Promise<Inline[]> {
  const inline: Inline[] = [];

  for (const im of (fm.images_detected ?? []) as ImageDetected[]) {
    if (!im.bbox) continue;
    inline.push({
      kind: "image",
      bboxY: im.bbox.y ?? 0,
      bbox: im.bbox,
      // The source is the page PNG itself; local_image_index is not used as a
      // URL key (FmBboxThumb uses CSS crop).
      src: pngUrl,
      caption: im.caption_ocr,
      imageType: im.image_type,
    });
  }

  for (const t of (fm.tables_detected ?? []) as TableDetected[]) {
    if (!t.bbox) continue;
    let csv: string[][] | undefined;
    if (t.table_id) {
      const { csv: loaded } = await readTable(t.table_id);
      if (loaded) csv = loaded;
    }
    inline.push({
      kind: "table",
      bboxY: t.bbox.y ?? 0,
      bbox: t.bbox,
      tableId: t.table_id,
      csv,
      rowEstimate: t.row_count_estimate,
      colEstimate: t.col_count_estimate,
      headersSummary: t.headers_summary,
      fallbackCropY: t.bbox.y ?? 0,
      docId,
      pageNum,
    });
  }

  for (const r of (fm.redactions ?? []) as Redaction[]) {
    if (!r.bbox) continue;
    inline.push({
      kind: "redaction",
      bboxY: r.bbox.y ?? 0,
      bbox: r.bbox,
      code: r.code,
      description: r.description,
      docId,
      pageNum,
    });
  }

  for (const s of (fm.signatures_observed ?? []) as SignatureObserved[]) {
    if (!s.bbox) continue;
    inline.push({
      kind: "signature",
      bboxY: s.bbox.y ?? 0,
      bbox: s.bbox,
      signer: s.signer_inferred ?? undefined,
      docId,
      pageNum,
    });
  }

  inline.sort((a, b) => a.bboxY - b.bboxY);
  return inline;
}

/**
 * Finds all the OCR pages for a document and assembles them into a
 * `RenderedDoc`.
 *
 * @param docId Identifier of the document; used as the file name under
 *   `wiki/documents` and as the directory name under `wiki/pages`.
 * @returns The assembled document, or `null` when the document markdown
 *   cannot be read or parsed, or when its pages directory cannot be listed.
 *   Pages for which `readPage` returns nothing are omitted from `pages`, while
 *   `pageCount` still reports the number of page files found. Errors thrown
 *   while loading an individual page (OCR, matches, tables) are not caught
 *   here and reject the returned promise.
 */
export async function loadFullDocument(docId: string): Promise<RenderedDoc | null> {
  // NOTE(review): docId is joined into filesystem paths unvalidated — confirm
  // that callers never pass untrusted values containing path separators.
  const wikiRoot = path.join(PROCESSING, "..", "wiki");

  // Read documents/<docId>.md and split it into frontmatter + body.
  const docPath = path.join(wikiRoot, "documents", `${docId}.md`);
  let docMd: MdFile;
  try {
    const raw = await fs.readFile(docPath, "utf-8");
    const matter = (await import("gray-matter")).default(raw);
    docMd = { fm: matter.data as AnyFrontmatter, body: matter.content };
  } catch {
    // Best effort: an unreadable or unparsable document is reported as absent.
    return null;
  }

  // List the page files (pNNN.md); three-digit stems sort correctly as text.
  const pagesDir = path.join(wikiRoot, "pages", docId);
  let pageStems: string[];
  try {
    const files = await fs.readdir(pagesDir);
    pageStems = files
      .filter((f) => /^p\d{3}\.md$/.test(f))
      .map((f) => f.replace(/\.md$/, ""))
      .sort();
  } catch {
    return null;
  }

  const pages = await Promise.all(
    pageStems.map(async (stem): Promise<RenderedPage | null> => {
      const md = await readPage(docId, stem);
      if (!md) return null;
      const fm = md.fm as AnyFrontmatter;
      const pageNum = parseInt(stem.replace("p", ""), 10);
      const pngUrl = pagePngUrl(docId, String(pageNum).padStart(3, "0"));

      // OCR text and entity matches are independent reads; fetch them together.
      const [ocrText, matches] = await Promise.all([
        readOcr(docId, pageNum),
        readMatches(docId, stem),
      ]);
      const inline = await collectInlineBlocks(fm, docId, pageNum, pngUrl);

      return {
        pageStem: stem,
        pageNum,
        pageId: `${docId}/${stem}`,
        pngUrl,
        ocr: ocrText ?? "",
        matches,
        inline,
        visionEn: typeof fm.vision_description === "string" ? fm.vision_description : undefined,
        visionPt: typeof fm.vision_description_pt_br === "string" ? fm.vision_description_pt_br : undefined,
        pageType: typeof fm.page_type === "string" ? fm.page_type : undefined,
        classification: typeof fm.highest_classification === "string" ? fm.highest_classification : undefined,
        contentClassification: Array.isArray(fm.content_classification) ? (fm.content_classification as string[]) : undefined,
        redactionsCount: Array.isArray(fm.redactions) ? fm.redactions.length : 0,
        signaturesCount: Array.isArray(fm.signatures_observed) ? fm.signatures_observed.length : 0,
      };
    }),
  );

  // Frontmatter is untyped at runtime: only accept a real string as the title.
  const title = docMd.fm.canonical_title;

  return {
    docId,
    canonicalTitle: typeof title === "string" ? title : docId,
    pageCount: pageStems.length,
    pages: pages.filter((p): p is RenderedPage => p !== null),
    frontmatter: docMd.fm,
    documentBody: docMd.body,
  };
}
|
||
|
|
|
||
|
|
/**
 * Cuts OCR text into `nBlocks` consecutive runs of lines so that inline
 * blocks can be interleaved between them at roughly their bbox.y position
 * (line count stands in for the Y coordinate).
 *
 * Every segment except the last holds `ceil(lineCount / nBlocks)` lines; the
 * last takes whatever remains, so trailing segments may be empty strings when
 * there are fewer lines than blocks.
 *
 * @param ocr Text to split on "\n".
 * @param nBlocks Number of segments wanted; values <= 1 return the text whole.
 * @returns The segments in order, each with its lines re-joined by "\n".
 */
export function splitOcrIntoSegments(ocr: string, nBlocks: number): string[] {
  if (nBlocks <= 1) {
    return [ocr];
  }

  // String.prototype.split always yields at least one element, so the
  // chunk size below is computed from a non-empty list.
  const rows = ocr.split("\n");
  const chunk = Math.max(1, Math.ceil(rows.length / nBlocks));

  const out: string[] = [];
  let cursor = 0;
  let index = 0;
  while (index < nBlocks) {
    const isLast = index === nBlocks - 1;
    const stop = isLast ? rows.length : cursor + chunk;
    out.push(rows.slice(cursor, stop).join("\n"));
    cursor += chunk;
    index += 1;
  }
  return out;
}
|