Add reextract pipeline (scripts/reextract/) that rebuilds doc-level entity JSON from Sonnet-vision chunks via Opus, replacing the noisy per-page extraction. Add synthesize scripts to regenerate wiki/entities from the 116 _reextract.json (30), aggregate missing page.md from chunks (31), and reprocess 805 pages the doc-rebuilder agent dropped on context overflow (32). Add maintain scripts 43-56 for chunk-page sync, dedup, generic-entity marking, and typed relation extraction. Web: wire relations API + entity-relations component; entity/timeline/doc pages consume the rebuilt layer. Note: raw/, processing/, wiki/ remain gitignored (bulk data managed separately); the 116 reextract JSONs and 7,798 rebuilt entity files live on disk only. The 27 curated anchor events under wiki/entities/events/ are preserved. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
51 lines
1.6 KiB
Python
51 lines
1.6 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
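Resync each document's `_index.json` so that every embedded `chunks[].page`
matches the `page:` value in the corresponding chunk markdown frontmatter
(run after script 43 has corrected that frontmatter).

Idempotent: an index is rewritten only when at least one entry differs, so
re-running after a successful pass changes nothing.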
|
|
"""
|
|
from __future__ import annotations
|
|
import json
|
|
import re
|
|
from pathlib import Path
|
|
|
|
# Root directory that is scanned for "<doc>--subagent/chunks" folders.
CHUNKS_ROOT = Path("/Users/guto/ufo/raw")
# A whole line of the form "page: <digits>"; group 1 is the page number.
PAGE_RE = re.compile(r"^page:\s*(\d+)\s*$", re.M)
# A whole line of the form "chunk_id: <token>"; group 1 is the id (no spaces).
CID_RE = re.compile(r"^chunk_id:\s*(\S+)\s*$", re.M)
|
|
|
|
|
|
def _chunk_pages_on_disk(chunks_dir: Path) -> dict[str, int]:
    """Map each chunk_id to the page declared at the top of its chunk file."""
    pages: dict[str, int] = {}
    # Sorted so that, should two files declare the same chunk_id, the same one
    # wins on every run (raw glob order depends on the filesystem).
    for md_path in sorted(chunks_dir.glob("*.md")):
        # Only the first 2000 characters are inspected, so read just that much
        # instead of loading the whole chunk into memory.
        with md_path.open(encoding="utf-8") as fh:
            head = fh.read(2000)
        cid_match = CID_RE.search(head)
        page_match = PAGE_RE.search(head)
        # Files lacking either field are ignored rather than guessed at.
        if cid_match and page_match:
            pages[cid_match.group(1)] = int(page_match.group(1))
    return pages


def _apply_pages(entries: list, pages: dict[str, int]) -> int:
    """Overwrite stale ``entry["page"]`` values in place; return the count."""
    changed = 0
    for entry in entries:
        real = pages.get(entry.get("chunk_id"))
        # Entries whose chunk_id has no file on disk are left untouched.
        if real is not None and entry.get("page") != real:
            entry["page"] = real
            changed += 1
    return changed


def _write_json_atomically(path: Path, payload: object) -> None:
    """Serialize *payload* to *path* via a sibling temp file and a rename."""
    # Writing next to the target keeps the rename on one filesystem; an
    # interrupted run then leaves the previous index intact, not truncated.
    tmp_path = path.with_name(path.name + ".tmp")
    tmp_path.write_text(
        json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8"
    )
    tmp_path.replace(path)


def main(root: Path | None = None) -> None:
    """Sync ``chunks[].page`` in every ``_index.json`` with the chunk files.

    For each ``*--subagent/chunks`` directory under the root, the ``page:``
    and ``chunk_id:`` lines found at the top of every ``*.md`` chunk are
    treated as the source of truth, and index entries whose ``page`` differs
    are corrected. An index is rewritten only if at least one entry changed.
    One line is printed per updated document, followed by a final total.

    Args:
        root: Directory to scan. Defaults to the module-level ``CHUNKS_ROOT``.

    Raises:
        json.JSONDecodeError: If an ``_index.json`` is not valid JSON.
    """
    chunks_root = CHUNKS_ROOT if root is None else Path(root)
    touched = 0
    for chunks_dir in sorted(chunks_root.glob("*--subagent/chunks")):
        idx_path = chunks_dir.parent / "_index.json"
        if not idx_path.is_file():
            continue
        idx = json.loads(idx_path.read_text(encoding="utf-8"))
        chunks = idx.get("chunks") or []
        if not chunks:
            continue
        changed = _apply_pages(chunks, _chunk_pages_on_disk(chunks_dir))
        if changed:
            _write_json_atomically(idx_path, idx)
            print(f" {idx.get('doc_id')}: updated {changed} entries")
            touched += 1
    print(f"\nDocs touched: {touched}")
|
|
|
|
|
|
# Run the resync only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|