Add reextract pipeline (scripts/reextract/) that rebuilds doc-level entity JSON from Sonnet-vision chunks via Opus, replacing the noisy per-page extraction. Add synthesize scripts to regenerate wiki/entities from the 116 _reextract.json (30), aggregate missing page.md from chunks (31), and reprocess 805 pages the doc-rebuilder agent dropped on context overflow (32). Add maintain scripts 43-56 for chunk-page sync, dedup, generic-entity marking, and typed relation extraction. Web: wire relations API + entity-relations component; entity/timeline/doc pages consume the rebuilt layer. Note: raw/, processing/, wiki/ remain gitignored (bulk data managed separately); the 116 reextract JSONs and 7,798 rebuilt entity files live on disk only. The 27 curated anchor events under wiki/entities/events/ are preserved. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
212 lines
8.1 KiB
Python
212 lines
8.1 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
31_aggregate_pages_from_chunks.py — Generate thin wiki/pages/<doc>/p<NNN>.md
|
|
files for pages where the chunks/ already have content but the per-page vision
|
|
pipeline (02-vision-page.py) never produced an aggregator file.
|
|
|
|
Source of truth: raw/<doc>--subagent/_index.json + chunks/c*.md (Sonnet-extracted)
|
|
Output: wiki/pages/<doc>/p<NNN>.md (thin aggregator, tagged source:chunk-aggregator)
|
|
|
|
Skips pages that already have a wiki/pages/<doc>/p<NNN>.md file (idempotent).
|
|
|
|
Run:
|
|
python3 scripts/synthesize/31_aggregate_pages_from_chunks.py
|
|
python3 scripts/synthesize/31_aggregate_pages_from_chunks.py --doc-id <id> # one doc
|
|
python3 scripts/synthesize/31_aggregate_pages_from_chunks.py --dry-run
|
|
"""
|
|
from __future__ import annotations

import argparse
import json
import os
import re
import sys
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
|
|
|
|
# Root of the archive working tree. Defaults to the original author's checkout;
# set the UFO_ROOT environment variable to run the script against another
# location (an empty value falls back to the default).
UFO = Path(os.environ.get("UFO_ROOT") or "/Users/guto/ufo")
RAW = UFO / "raw"                        # <doc>--subagent/ dirs (_index.json + chunks/)
PNG_BASE = UFO / "processing" / "png"    # <doc>/p-<NNN>.png page renders
PAGES_BASE = UFO / "wiki" / "pages"      # <doc>/p<NNN>.md aggregator output

# Version stamps written into every generated page's frontmatter.
SCHEMA_VERSION = "0.1.0"
WIKI_VERSION = "0.1.0"
# UTC timestamp captured once at import, so every page written by a single run
# carries the same `last_ingest` value.
NOW = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
|
|
|
|
|
# One `key: value` line of the front matter; keys are letters/underscores only.
_FRONTMATTER_KEY_RE = re.compile(r"^([a-zA-Z_]+):\s*(.*)$")


def split_frontmatter(text: str) -> tuple[dict, str]:
    """Split a markdown document into (front-matter dict, body).

    The front matter is the region between a leading ``---`` and the next
    ``---`` that starts a line. Only flat ``key: value`` lines are parsed;
    values are returned as stripped raw strings (quotes are kept, no YAML
    typing is applied) and lines that do not look like ``key: value`` are
    ignored.

    Returns ``({}, text)`` unchanged when *text* does not start with ``---``
    or has no closing delimiter. The returned body begins immediately after
    the closing ``---`` (so it normally starts with a newline).
    """
    if not text.startswith("---"):
        return {}, text

    # Look for the closing delimiter only at the start of a line. Splitting on
    # every "---" would cut the header short whenever a value itself contains
    # "---" (e.g. `title: a --- b`) and leak the rest of the header into the body.
    closing = text.find("\n---", 3)
    if closing == -1:
        return {}, text

    header = text[3:closing]
    body = text[closing + len("\n---"):]

    fm: dict = {}
    for line in header.splitlines():
        m = _FRONTMATTER_KEY_RE.match(line)
        if m:
            fm[m.group(1)] = m.group(2).strip()
    return fm, body
|
|
|
|
|
|
# English text runs from the **EN:** marker up to a following **PT-BR:** marker
# on a later line (or to the end of the body); Portuguese text runs from the
# **PT-BR:** marker to the end of the body.
_EN_SECTION_RE = re.compile(r"\*\*EN:\*\*\s*(.*?)(?:\n\s*\*\*PT-BR:\*\*|\Z)", re.S)
_PT_SECTION_RE = re.compile(r"\*\*PT-BR:\*\*\s*(.*?)\Z", re.S)


def extract_bilingual(body: str) -> tuple[str, str]:
    """Pull the English and Brazilian-Portuguese text out of a chunk body.

    Returns a ``(en, pt_br)`` pair of whitespace-stripped strings; a side whose
    marker is absent from *body* comes back as the empty string.
    """
    english = _EN_SECTION_RE.search(body)
    portuguese = _PT_SECTION_RE.search(body)
    en_text = english.group(1).strip() if english is not None else ""
    pt_text = portuguese.group(1).strip() if portuguese is not None else ""
    return en_text, pt_text
|
|
|
|
|
|
def find_missing_pages() -> dict[str, list[int]]:
    """Inventory pages that have a rendered PNG but no aggregator markdown.

    Scans ``PNG_BASE/<doc>/p-<N>.png`` and, for each page number found, checks
    for ``PAGES_BASE/<doc>/p<NNN>.md`` (page number zero-padded to three
    digits). Returns ``{doc_id: [page numbers, ascending]}`` containing only
    documents with at least one missing page.
    """
    png_name = re.compile(r"p-(\d+)\.png$")
    pending: dict[str, list[int]] = defaultdict(list)

    for png_file in PNG_BASE.glob("*/p-*.png"):
        hit = png_name.match(png_file.name)
        if hit is None:
            # The glob also matches names like p-cover.png; only numeric pages count.
            continue
        page_no = int(hit.group(1))
        doc_name = png_file.parent.name
        if (PAGES_BASE / doc_name / f"p{page_no:03d}.md").is_file():
            continue
        pending[doc_name].append(page_no)

    return {doc_name: sorted(numbers) for doc_name, numbers in pending.items()}
|
|
|
|
|
|
# Substrings of a chunk type that mark the chunk as pictorial content.
_IMAGE_TYPE_HINTS = ("image", "photo", "diagram", "sketch", "map")


def _infer_page_type(types_seen: set[str], has_table: bool) -> str:
    """Best-effort page-level type derived from the chunk types on the page."""
    if "classification_banner" in types_seen and len(types_seen) <= 3:
        return "cover"
    if "header" in types_seen and "transcript_block" in types_seen:
        return "transcript"
    # A page is table-only when every chunk type on it is some kind of table.
    # (The previous test, `has_table and not body_blocks`, could never be true:
    # a table chunk always contributes a body block.)
    if has_table and all("table" in t for t in types_seen):
        return "table_only"
    if "letterhead" in types_seen:
        return "memo"
    return "mixed"


def build_page_md(doc_id: str, page_num: int) -> str | None:
    """Assemble one page markdown file from the doc's _index.json + chunks/.

    Reads ``RAW/<doc_id>--subagent/_index.json``, selects the entries of its
    ``chunks`` list whose ``page`` equals *page_num*, orders them by
    ``order_in_page`` and concatenates the EN / PT-BR text of each chunk file
    (``chunks/<chunk_id>.md``) under a per-chunk heading. Chunk files that do
    not exist are still listed in ``chunks_on_page`` but contribute no body
    block and no content flags.

    Returns the full markdown text (YAML front matter + body), or ``None`` when
    the index file is missing or lists no chunk for the page.

    Raises whatever ``json.loads`` raises on a malformed index, and
    ``ImportError`` if PyYAML is not installed.
    """
    sub = RAW / f"{doc_id}--subagent"
    idx_path = sub / "_index.json"
    if not idx_path.is_file():
        return None
    idx = json.loads(idx_path.read_text(encoding="utf-8"))
    chunks_for_page = [c for c in (idx.get("chunks") or []) if c.get("page") == page_num]
    if not chunks_for_page:
        return None  # no chunk data → can't aggregate
    # `or 0` (rather than a .get default) also covers an explicit null, which
    # would otherwise make the sort fail comparing None with int.
    chunks_for_page.sort(key=lambda c: c.get("order_in_page") or 0)

    total_pages = idx.get("total_pages")
    # Relative to wiki/pages/<doc>/, so three levels up reaches the archive root.
    rel_png = f"../../../processing/png/{doc_id}/p-{page_num:03d}.png"

    # Aggregate per-chunk EN/PT/metadata
    body_blocks: list[str] = []
    types_seen: set[str] = set()
    chunk_ids: list[str] = []
    has_redaction = has_image = has_table = has_stamp = has_signature = False
    classifications: set[str] = set()

    for chunk in chunks_for_page:
        cid = chunk.get("chunk_id")
        chunk_ids.append(cid)
        ctype = chunk.get("type") or "?"
        types_seen.add(ctype)
        chunk_path = sub / "chunks" / f"{cid}.md"
        if not chunk_path.is_file():
            continue
        chunk_fm, chunk_body = split_frontmatter(chunk_path.read_text(encoding="utf-8"))
        en, pt = extract_bilingual(chunk_body)

        if not en and not pt:
            # Fall back to the extracted_text / image_description front-matter
            # fields; values are raw strings, so surrounding quotes are trimmed.
            en = (chunk_fm.get("image_description_en") or chunk_fm.get("extracted_text") or "").strip().strip('"\'')
            pt = (chunk_fm.get("image_description_pt_br") or "").strip().strip('"\'')

        # Heuristic flags, keyed off the chunk type recorded in the index
        if ctype in ("redaction", "redacted_block"):
            has_redaction = True
        if any(hint in ctype for hint in _IMAGE_TYPE_HINTS):
            has_image = True
        if "table" in ctype:
            has_table = True
        if "stamp" in ctype:
            has_stamp = True
        if "signature" in ctype:
            has_signature = True
        cls = chunk_fm.get("classification")
        if cls and cls != "null":  # the literal string "null" means "unclassified"
            classifications.add(cls)

        # Body block
        block = f"### Chunk `{cid}` — type: {ctype}\n"
        bbox = chunk.get("bbox") or {}
        if bbox:
            block += f"_bbox_: x={bbox.get('x')}, y={bbox.get('y')}, w={bbox.get('w')}, h={bbox.get('h')}\n\n"
        if en:
            block += f"**EN:** {en}\n\n"
        if pt:
            block += f"**PT-BR:** {pt}\n"
        body_blocks.append(block.rstrip())

    # Content classification (order of the tags is part of the output)
    content_class = [
        tag
        for flag, tag in (
            (has_image, "contains-photos"),
            (has_table, "contains-tables"),
            (has_stamp, "contains-stamps"),
            (has_signature, "contains-signatures"),
            (has_redaction, "redaction-heavy"),
        )
        if flag
    ]
    if not content_class:
        content_class.append("text-only")

    # Page-level inferred type (best-effort)
    page_type = _infer_page_type(types_seen, has_table)

    # Frontmatter (key order is preserved in the emitted YAML)
    page_fm = {
        "schema_version": SCHEMA_VERSION,
        "type": "page",
        "page_id": f"{doc_id}/p{page_num:03d}",
        "doc_id": doc_id,
        "page_number": page_num,
        "total_pages": total_pages,
        "png_path": rel_png,
        "page_type": page_type,
        "content_classification": content_class,
        "classification_markings": [{"level": level} for level in sorted(classifications)],
        "chunks_on_page": chunk_ids,
        "chunk_count": len(chunk_ids),
        "source": "chunk-aggregator",
        "source_note": "Page-md generated from chunks built by Sonnet vision (raw/<doc>--subagent/chunks/). Per-page vision Haiku pipeline (02-vision-page.py) never produced an output for this page.",
        "last_ingest": NOW,
        "wiki_version": WIKI_VERSION,
    }
    # Imported here, as in the original, so PyYAML is only required once a
    # page is actually being rendered.
    import yaml

    yaml_block = yaml.safe_dump(
        page_fm,
        sort_keys=False,
        allow_unicode=True,
        default_flow_style=False,
        width=10_000,  # effectively disables line wrapping of long values
    ).rstrip()
    body = "\n\n".join(body_blocks) if body_blocks else "_(no extractable text — see chunk files directly)_"
    return f"---\n{yaml_block}\n---\n\n# Page {page_num} of {doc_id}\n\n{body}\n"
|
|
|
|
|
|
def main():
    """Command-line entry point: inventory missing pages, then generate them.

    Options:
      --doc-id <id>  restrict the run to a single document
      --dry-run      print the per-document inventory and write nothing

    Returns the process exit status (always 0).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--doc-id", default=None)
    parser.add_argument("--dry-run", action="store_true")
    opts = parser.parse_args()

    todo = find_missing_pages()
    if opts.doc_id:
        # Keep only the requested doc; an unknown id yields an empty page list.
        todo = {opts.doc_id: todo.get(opts.doc_id, [])}

    docs_with_gaps = sum(1 for page_list in todo.values() if page_list)
    total_missing = sum(len(page_list) for page_list in todo.values())
    print(f"[1/2] Inventory: {docs_with_gaps} docs, {total_missing} missing pages")

    if opts.dry_run:
        # Largest backlog first.
        by_backlog = sorted(todo.items(), key=lambda item: -len(item[1]))
        for doc_name, page_list in by_backlog:
            if page_list:
                print(f" {doc_name}: {len(page_list)}")
        return 0

    print(f"\n[2/2] Generating thin aggregator page.md files ...")
    written = 0
    skipped_no_chunks = 0
    for doc_name, page_list in todo.items():
        for page_no in page_list:
            rendered = build_page_md(doc_name, page_no)
            if rendered is None:
                skipped_no_chunks += 1
                continue
            target = PAGES_BASE / doc_name / f"p{page_no:03d}.md"
            target.parent.mkdir(parents=True, exist_ok=True)
            target.write_text(rendered, encoding="utf-8")
            written += 1

    print(f" written: {written}")
    print(f" skipped (no chunk data): {skipped_no_chunks}")
    print(f"\n✓ done.")
    return 0
|
|
|
|
|
|
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|