Add reextract pipeline (scripts/reextract/) that rebuilds doc-level entity JSON from Sonnet-vision chunks via Opus, replacing the noisy per-page extraction. Add synthesize scripts to regenerate wiki/entities from the 116 _reextract.json (30), aggregate missing page.md from chunks (31), and reprocess 805 pages the doc-rebuilder agent dropped on context overflow (32). Add maintain scripts 43-56 for chunk-page sync, dedup, generic-entity marking, and typed relation extraction. Web: wire relations API + entity-relations component; entity/timeline/doc pages consume the rebuilt layer. Note: raw/, processing/, wiki/ remain gitignored (bulk data managed separately); the 116 reextract JSONs and 7,798 rebuilt entity files live on disk only. The 27 curated anchor events under wiki/entities/events/ are preserved. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
83 lines
2.9 KiB
Python
83 lines
2.9 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Normalize each chunk's `page:` field to match the actual PNG it was rendered
|
|
against (`source_png`).
|
|
|
|
Background: the chunker (Sonnet) populated `page:` with the page-number it
|
|
INFERRED from the document's printed footer/header — which often diverges from
|
|
the PNG index after the PDF→PNG conversion (cover sheets, blank pages, FBI
|
|
section markers, etc.).
|
|
|
|
The UI routes `/d/<doc>/<pNNN>` by PNG index, so the chunk `page` field MUST
|
|
match the PNG index for the page view to show the right chunks alongside the
|
|
right scan.
|
|
|
|
This script rewrites `page:` IN PLACE in every raw chunk markdown where the
|
|
field disagrees with the number embedded in `source_png:`. It is idempotent —
|
|
re-running it on a clean tree is a no-op.
|
|
|
|
Run:
|
|
python3 scripts/maintain/43_fix_chunk_page_from_source_png.py [--dry-run]
|
|
"""
|
|
from __future__ import annotations
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
from collections import defaultdict
|
|
|
|
# Default root that is scanned for `<doc_id>--subagent/chunks/*.md` files.
# NOTE(review): this is an absolute path on the author's machine; pass
# `chunks_root` to main() to run against a different tree.
CHUNKS_ROOT = Path("/Users/guto/ufo/raw")

# Front-matter line that declares the chunk's page number, e.g. `page: 12`.
# Because of re.M and the trailing `\s*`, a match may extend over the newline
# that follows the number -- never substitute the whole match (see
# _plan_page_fix).
PAGE_RE = re.compile(r"^page:\s*(\d+)\s*$", re.M)

# Digits embedded in the PNG file name referenced by `source_png:`,
# e.g. `source_png: "png/p-012.png"` captures `012`.
SRC_RE = re.compile(r"source_png:\s*\"?[^\"\n]*?p-?(\d+)\.png", re.M)

# Directory-name suffix that marks a per-document working directory.
_DOC_DIR_SUFFIX = "--subagent"


def _plan_page_fix(content: str) -> tuple[int, int, str] | None:
    """Work out the corrected text for one chunk file.

    Returns ``None`` when the chunk cannot be checked (no leading ``---``
    front matter, or the front matter lacks a ``page:`` line or a
    ``source_png:`` entry with an embedded number). Otherwise returns
    ``(declared, real, new_content)`` where ``declared`` is the current
    ``page:`` value, ``real`` is the number taken from ``source_png:`` and
    ``new_content`` is the full file text with ``page:`` set to ``real``
    (identical to ``content`` when the two already agree).
    """
    if not content.startswith("---"):
        return None
    parts = content.split("---", 2)
    if len(parts) < 3:
        return None
    _, front_matter, body = parts

    page_m = PAGE_RE.search(front_matter)
    src_m = SRC_RE.search(front_matter)
    if not (page_m and src_m):
        return None

    declared = int(page_m.group(1))
    real = int(src_m.group(1))
    if declared == real:
        return declared, real, content

    # Replace only `page: <digits>`. The regex match itself may include the
    # newline after the number (greedy `\s*` before `$`), so substituting the
    # whole match would glue the next line -- or the closing `---` -- onto
    # the page line and corrupt the front matter.
    start, end = page_m.start(), page_m.end(1)
    new_front_matter = f"{front_matter[:start]}page: {real}{front_matter[end:]}"
    return declared, real, f"---{new_front_matter}---{body}"


def main(argv: list[str] | None = None, chunks_root: Path | None = None) -> int:
    """Rewrite each chunk's ``page:`` field to the number in ``source_png:``.

    Args:
        argv: Command-line arguments; defaults to ``sys.argv[1:]``. The only
            recognised flag is ``--dry-run``, which reports what would change
            without writing any file.
        chunks_root: Directory containing the ``*--subagent/chunks`` folders;
            defaults to ``CHUNKS_ROOT``.

    Returns:
        Process exit status, always ``0``.
    """
    args = sys.argv[1:] if argv is None else argv
    root = CHUNKS_ROOT if chunks_root is None else Path(chunks_root)
    dry = "--dry-run" in args

    fixed = 0
    scanned = 0
    by_doc: dict[str, int] = defaultdict(int)
    samples: list[tuple[str, int, int]] = []

    for chunks_dir in sorted(root.glob("*--subagent/chunks")):
        # The glob guarantees the directory name ends with the suffix.
        doc_id = chunks_dir.parent.name[: -len(_DOC_DIR_SUFFIX)]
        # Sorted so the "Sample fixes" listing is stable between runs.
        for f in sorted(chunks_dir.glob("*.md")):
            plan = _plan_page_fix(f.read_text(encoding="utf-8"))
            if plan is None:
                continue
            scanned += 1
            declared, real, new_content = plan
            if declared == real:
                continue
            if not dry:
                f.write_text(new_content, encoding="utf-8")
            fixed += 1
            by_doc[doc_id] += 1
            if len(samples) < 5:
                samples.append((f"{doc_id}/{f.name}", declared, real))

    print(f"Scanned: {scanned} chunks")
    print(f"Fixed: {fixed} chunks ({'dry-run' if dry else 'written'})")
    print(f"Docs touched: {len(by_doc)}")
    if by_doc:
        print("\nTop docs by fix count:")
        for doc, n in sorted(by_doc.items(), key=lambda x: -x[1])[:15]:
            print(f" {n:>5} {doc}")
    if samples:
        print("\nSample fixes:")
        for path, d, r in samples:
            print(f" {path}: page {d} -> {r}")
    return 0
|
|
|
|
|
|
# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|