52 lines
1.6 KiB
Python
52 lines
1.6 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""
Resync each `_index.json` so its embedded chunks[].page reflects the corrected
markdown frontmatter (after script 43).

Idempotent: an index file is rewritten only when at least one page value
actually changes.
"""
|
||
|
|
from __future__ import annotations
|
||
|
|
import json
|
||
|
|
import re
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
# Root directory that main() scans for `*--subagent/chunks` folders (hard-coded absolute path).
CHUNKS_ROOT = Path("/Users/guto/ufo/raw")
|
||
|
|
# Matches a whole line of the form `page: <digits>` (multiline mode); group 1 is the page number.
PAGE_RE = re.compile(r"^page:\s*(\d+)\s*$", re.M)
|
||
|
|
# Matches a whole line of the form `chunk_id: <token>` (multiline mode); group 1 is the
# non-whitespace identifier.
CID_RE = re.compile(r"^chunk_id:\s*(\S+)\s*$", re.M)
|
||
|
|
|
||
|
|
|
||
|
|
def main() -> None:
    """Align the `page` of every index entry with the value stored in its chunk file.

    For each `*--subagent/chunks` directory under ``CHUNKS_ROOT`` (visited in
    sorted order), the sibling `_index.json` is loaded. The `chunk_id` and
    `page` values are then read from the first 2000 characters of every
    `*.md` file in the chunks directory, and each index entry whose `page`
    differs from the on-disk value is overwritten with it.

    The index file is written back only when at least one entry changed.
    One line is printed per rewritten document, followed by a final count
    of rewritten documents.
    """
    docs_updated = 0

    for chunk_folder in sorted(CHUNKS_ROOT.glob("*--subagent/chunks")):
        index_file = chunk_folder.parent / "_index.json"
        if not index_file.is_file():
            continue

        index_data = json.loads(index_file.read_text(encoding="utf-8"))
        entries = index_data.get("chunks") or []
        if not entries:
            continue

        # chunk_id -> page, as recorded in the chunk files themselves.
        pages_on_disk: dict[str, int] = {}
        for md_file in chunk_folder.glob("*.md"):
            # Only the leading 2000 characters are searched for the two fields.
            header = md_file.read_text(encoding="utf-8")[:2000]
            id_match = CID_RE.search(header)
            page_match = PAGE_RE.search(header)
            if id_match is None or page_match is None:
                continue
            pages_on_disk[id_match.group(1)] = int(page_match.group(1))

        fixed = 0
        for item in entries:
            disk_page = pages_on_disk.get(item.get("chunk_id"))
            # Skip entries with no matching chunk file or an already-correct page.
            if disk_page is None or item.get("page") == disk_page:
                continue
            item["page"] = disk_page
            fixed += 1

        if not fixed:
            # Nothing differed: leave the file on disk untouched.
            continue

        serialized = json.dumps(index_data, indent=2, ensure_ascii=False)
        index_file.write_text(serialized, encoding="utf-8")
        print(f" {index_data.get('doc_id')}: updated {fixed} entries")
        docs_updated += 1

    print(f"\nDocs touched: {docs_updated}")
|
||
|
|
|
||
|
|
|
||
|
|
# Run the resync only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|