From ebc6fa41e9c26522d5c0bbe1e355a10109a23353 Mon Sep 17 00:00:00 2001 From: Luiz Gustavo Date: Thu, 21 May 2026 14:32:55 -0300 Subject: [PATCH] fix: keep _index.json total_pages in sync after recovering pages The reprocess pass added chunks for pages beyond the original total_pages but never updated the field, so doc-page navigation thought docs ended early (jumped to next document mid-doc) and the page counter was wrong. Now bump total_pages to the real max chunk page on each integration. Co-Authored-By: Claude Opus 4.7 --- scripts/synthesize/32_reprocess_missing_pages.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/synthesize/32_reprocess_missing_pages.py b/scripts/synthesize/32_reprocess_missing_pages.py index 37ad996..7c26ab7 100644 --- a/scripts/synthesize/32_reprocess_missing_pages.py +++ b/scripts/synthesize/32_reprocess_missing_pages.py @@ -321,6 +321,10 @@ def process_one_page(doc_id: str, page_num: int) -> tuple[bool, int]: except Exception as e: print(f" [ERR ] {doc_id} p{page_num:03d} — integrate: {e}", flush=True) return (False, 0) + # Keep total_pages in sync with the real max page (recovered pages extend it) + max_page = max((c.get("page", 0) for c in idx.get("chunks") or []), default=0) + if max_page > idx.get("total_pages", 0): + idx["total_pages"] = max_page idx_path.write_text(json.dumps(idx, indent=2, ensure_ascii=False), encoding="utf-8") print(f" [OK ] {doc_id} p{page_num:03d} — {n} chunks", flush=True) return (True, n)