Add reextract pipeline (scripts/reextract/) that rebuilds doc-level entity JSON from Sonnet-vision chunks via Opus, replacing the noisy per-page extraction. Add synthesize scripts to regenerate wiki/entities from the 116 _reextract.json (30), aggregate missing page.md from chunks (31), and reprocess 805 pages the doc-rebuilder agent dropped on context overflow (32). Add maintain scripts 43-56 for chunk-page sync, dedup, generic-entity marking, and typed relation extraction. Web: wire relations API + entity-relations component; entity/timeline/doc pages consume the rebuilt layer. Note: raw/, processing/, wiki/ remain gitignored (bulk data managed separately); the 116 reextract JSONs and 7,798 rebuilt entity files live on disk only. The 27 curated anchor events under wiki/entities/events/ are preserved. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
340 lines
13 KiB
Python
340 lines
13 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Text-based backfill of entity → page references.
|
||
|
||
The structured pipelines (Sonnet chunks, Haiku page-level events/entities)
|
||
miss many entities the corpus actually discusses — they extract only what
|
||
they confidently structure into the schema. The vision_description and
|
||
narrative_summary fields routinely *talk about* an event/person/place
|
||
without listing it in the structured arrays.
|
||
|
||
This script does a fuzzy back-fill: scans the narrative body of every page
|
||
YAML for textual matches of every entity's canonical_name + aliases, and
|
||
records the hits as a new signal source `text_refs`. Aho-Corasick is used
|
||
so the whole 3k-pages × 34k-entities cross-product collapses to a single
|
||
linear scan per page.
|
||
|
||
Conservative filtering keeps the noise floor low:
|
||
- minimum 5 chars for a single-word alias; the minimum is waived when the alias contains a digit, a space or a hyphen (e.g. "USS Nimitz" is kept)
|
||
- blacklist of common stopwords / generic terms
|
||
- word-boundary enforcement (manual alphanumeric/underscore check on both sides of each Aho-Corasick hit)
|
||
- skip aliases with no letters (purely numeric / punctuation); the ASCII-fold-identical-to-id filter is not implemented in this script
|
||
|
||
YAML output (added in-place on each entity file):
|
||
text_mentioned_in: ['[[doc-id/pNNN]]', ...] # only refs NOT already in mentioned_in
|
||
signal_sources.text_refs: N
|
||
total_mentions = db_chunks + page_refs + cross_refs + text_refs
|
||
signal_strength recomputed using text_refs as a weak signal
|
||
|
||
Run:
|
||
python3 scripts/maintain/46_text_backfill_mentions.py [--dry-run]
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import re
|
||
import sys
|
||
from collections import defaultdict
|
||
from pathlib import Path
|
||
from typing import Iterable
|
||
|
||
import ahocorasick
|
||
import yaml
|
||
|
||
# Corpus root and the two sub-trees this script reads.
# NOTE: the root is an absolute, machine-specific path; both sub-trees are
# derived from it.
WIKI = Path("/Users/guto/ufo/wiki")
ENTITIES_BASE = WIKI.joinpath("entities")
PAGES_BASE = WIKI.joinpath("pages")
|
||
|
||
# Terms that are never accepted as match patterns, even when an entity lists
# them as an alias. Everything is stored lower-cased. The groups below only
# organise the source; BLACKLIST is their union.

# English function words and generic corpus vocabulary.
_EN_GENERIC = (
    "the", "and", "for", "with", "from", "this", "that", "these", "those",
    "report", "reports", "object", "objects", "unknown", "unidentified",
    "anomalous", "aerial", "phenomenon", "phenomena", "sighting", "sightings",
    "case", "cases", "incident", "incidents", "event", "events", "encounter",
    "encounters", "observation", "observations", "document", "documents",
    "memo", "memos", "letter", "letters", "table", "tables", "image", "images",
    "general", "section", "agent", "agents", "subject", "subjects",
    "office", "offices", "field", "fields", "summary", "summaries",
    "true", "false", "type", "types", "data", "name", "names",
    "person", "people", "place", "places", "location", "locations",
    "vehicle", "vehicles", "operation", "operations", "concept", "concepts",
    "page", "pages", "chunk", "chunks", "scan", "scans",
)

# English compass points, military ranks and calendar words.
_EN_DIRECTIONS_RANKS_DATES = (
    "north", "south", "east", "west", "central",
    "captain", "major", "colonel", "general", "lieutenant", "sergeant",
    "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday",
    "january", "february", "march", "april", "june", "july", "august",
    "september", "october", "november", "december",
)

# Brazilian-Portuguese function words and generic corpus vocabulary.
_PT_BR_GENERIC = (
    "para", "como", "este", "esta", "esse", "essa", "isso", "aquele",
    "ainda", "outro", "outra", "outros", "outras", "todos", "todas",
    "relatorio", "relatório", "objeto", "objetos", "documento", "documentos",
    "página", "paginas", "páginas", "evento", "eventos", "incidente",
    "incidentes", "pessoa", "pessoas", "lugar", "lugares", "local", "locais",
    "operação", "operacao", "geral", "agente", "agentes", "campo", "campos",
)

# Brazilian-Portuguese compass points and month names.
_PT_BR_DIRECTIONS_DATES = (
    "norte", "sul", "leste", "oeste",
    "janeiro", "fevereiro", "março", "marco", "abril", "junho", "julho",
    "agosto", "setembro", "outubro", "novembro", "dezembro",
)

# Generic acronyms (and "atom*") widely embedded in unrelated text.
_GENERIC_ACRONYMS = (
    "uap", "ufo", "usaaf", "usaf", "usa", "fbi", "dod", "nasa",
    "atom", "atoms", "atomic",
)

BLACKLIST: set[str] = {
    *_EN_GENERIC,
    *_EN_DIRECTIONS_RANKS_DATES,
    *_PT_BR_GENERIC,
    *_PT_BR_DIRECTIONS_DATES,
    *_GENERIC_ACRONYMS,
}
|
||
|
||
|
||
def is_acceptable_alias(name: str) -> bool:
    """Decide whether *name* is specific enough to be used as a search pattern.

    The name is judged after stripping surrounding whitespace. It is rejected
    when it is empty, when its lower-cased form is in ``BLACKLIST``, when it
    contains no letter, when it consists only of digits / whitespace /
    ``- _ . ,``, or when it is a single token (no space, no hyphen) shorter
    than five characters that contains no digit.
    """
    candidate = name.strip()
    if not candidate or candidate.lower() in BLACKLIST:
        return False

    has_letter = re.search(r"[a-zA-ZÀ-ÿ]", candidate) is not None
    numeric_or_punct_only = re.fullmatch(r"[\d\s\-_.,]+", candidate) is not None
    if not has_letter or numeric_or_punct_only:
        return False

    # The length floor applies to single tokens only; a digit anywhere in the
    # token waives it.
    single_token = " " not in candidate and "-" not in candidate
    short_without_digit = len(candidate) < 5 and re.search(r"\d", candidate) is None
    return not (single_token and short_without_digit)
|
||
|
||
|
||
# Closing front-matter fence: a line made up solely of "---" (trailing blanks
# or a carriage return are tolerated).
_FRONTMATTER_CLOSE = re.compile(r"^---[ \t]*\r?$", re.MULTILINE)


def parse_frontmatter(text: str) -> tuple[dict, str]:
    """Split *text* into its YAML front matter and the remaining body.

    Returns ``(fm, body)``. ``body`` is everything that follows the three
    dashes of the closing fence, verbatim, so that
    ``"---\\n" + <yaml> + "---" + body`` reassembles a file with the same body.

    ``({}, text)`` is returned when *text* does not start with ``---`` or has
    no closing fence. ``fm`` is ``{}`` when the YAML is invalid, empty, or is
    not a mapping.
    """
    if not text.startswith("---"):
        return {}, text

    # Prefer a fence that sits alone on its own line: splitting on the first
    # "---" anywhere would cut the front matter short whenever a value (for
    # example a narrative sentence) happens to contain three dashes.
    closer = _FRONTMATTER_CLOSE.search(text, 3)
    if closer is not None:
        raw_yaml = text[3:closer.start()]
        body = text[closer.start() + 3:]
    else:
        # Legacy behaviour for files whose closing fence shares its line with
        # other text: fall back to the first "---" after the opener.
        parts = text.split("---", 2)
        if len(parts) < 3:
            return {}, text
        raw_yaml, body = parts[1], parts[2]

    try:
        fm = yaml.safe_load(raw_yaml)
    except yaml.YAMLError:
        fm = None
    # Callers use ``fm.get``; a scalar or list document is treated as "no
    # front matter" instead of being handed back as-is.
    if not isinstance(fm, dict):
        fm = {}
    return fm, body
|
||
|
||
|
||
def dump_frontmatter_preserving_body(fm: dict, body: str) -> str:
    """Render *fm* as a YAML front-matter block and append *body* unchanged.

    Keys keep their insertion order, non-ASCII characters are written as-is,
    and the dump is given a line width of 1000. *body* is concatenated directly
    after the closing ``---``.
    """
    rendered = yaml.safe_dump(fm, sort_keys=False, allow_unicode=True, width=1000)
    return f"---\n{rendered}---{body}"
|
||
|
||
|
||
def extract_searchable_text(page_fm: dict, page_body: str) -> str:
    """Collect the narrative text of a page into one newline-joined string.

    Only the listed narrative front-matter fields are used, in the order given
    below, and only when their value is a string. The page body is always
    appended last. Other keys (IDs, enums, structured arrays) are ignored.
    """
    narrative_keys = (
        "vision_description",
        "vision_description_pt_br",
        "narrative_summary",
        "narrative_summary_pt_br",
        "extracted_text",
    )
    values = (page_fm.get(key) for key in narrative_keys)
    pieces = [value for value in values if isinstance(value, str)]
    return "\n".join([*pieces, page_body])
|
||
|
||
|
||
# entity_class value -> name of the folder associated with that class.
FOLDER_BY_CLASS = dict(
    person="people",
    organization="organizations",
    location="locations",
    event="events",
    uap_object="uap-objects",
    vehicle="vehicles",
    operation="operations",
    concept="concepts",
)
|
||
|
||
|
||
# Entity classes probed, in this order, by the legacy ``<class>_id`` fallback.
_LEGACY_ID_CLASSES = (
    "person", "organization", "location", "event",
    "uap_object", "vehicle", "operation", "concept",
)


def entity_id_from_fm(fm: dict) -> tuple[str, str] | None:
    """Return ``(entity_class, entity_id)`` for an entity front matter.

    When ``entity_class`` is set, the id is read from ``<entity_class>_id`` and
    then from ``entity_id``. If that yields nothing, the legacy layout is
    tried: the first non-empty ``<class>_id`` key, in the order of
    ``_LEGACY_ID_CLASSES``, determines both the class and the id.

    Returns ``None`` when no non-empty id can be found.
    """
    cls = fm.get("entity_class")
    if cls:
        eid = fm.get(f"{cls}_id") or fm.get("entity_id")
        if eid:
            return cls, eid

    # Legacy fallback. An id key that is present but null/empty is skipped so
    # it can neither produce a (class, None) result nor hide a valid id stored
    # under a later key.
    for legacy_cls in _LEGACY_ID_CLASSES:
        eid = fm.get(f"{legacy_cls}_id")
        if eid:
            return legacy_cls, eid
    return None
|
||
|
||
|
||
def signal_strength(db_chunks: int, page_refs: int, cross_refs: int, text_refs: int) -> str:
    """Classify an entity as ``"orphan"``, ``"strong"`` or ``"weak"``.

    ``"orphan"`` means the four counters sum to zero. ``"strong"`` needs one
    of: at least 3 db chunks, at least 3 page refs, at least one of each, or
    at least 5 text refs. Anything else is ``"weak"``; cross refs only count
    towards not being an orphan.
    """
    if db_chunks + page_refs + cross_refs + text_refs == 0:
        return "orphan"

    strong_evidence = (
        db_chunks >= 3,
        page_refs >= 3,
        db_chunks >= 1 and page_refs >= 1,
        text_refs >= 5,
    )
    return "strong" if any(strong_evidence) else "weak"
|
||
|
||
|
||
def _is_word_char(ch: str) -> bool:
    """Return True when *ch* would extend a word (alphanumeric or underscore)."""
    return ch.isalnum() or ch == "_"


def _as_count(value: object, default: int = 0) -> int:
    """Coerce a YAML counter to ``int``; use *default* when it is missing or malformed."""
    try:
        return int(value)  # type: ignore[call-overload]
    except (TypeError, ValueError):
        return default


def _load_entities(verbose: bool) -> tuple[dict[Path, dict], dict[str, list[str]], int]:
    """Read every non-archived entity file and index its acceptable aliases.

    Returns ``(entities, alias_owners, alias_count)``:

    - ``entities`` maps an entity file to ``{"fm", "body", "accepted"}``
      (parsed front matter, verbatim body, accepted names);
    - ``alias_owners`` maps a stripped, lower-cased alias to every entity file
      (as ``str``) that lists it;
    - ``alias_count`` is the number of (entity, alias) pairs accepted.
    """
    entities: dict[Path, dict] = {}
    alias_owners: dict[str, list[str]] = defaultdict(list)
    alias_count = 0

    for ent_file in ENTITIES_BASE.rglob("*.md"):
        if "_archived" in ent_file.parts:
            continue
        try:
            text = ent_file.read_text(encoding="utf-8")
        except (OSError, UnicodeError) as exc:
            # Best effort: an unreadable file is skipped, not fatal.
            if verbose:
                print(f"  skipped unreadable entity {ent_file}: {exc}", file=sys.stderr)
            continue

        fm, body = parse_frontmatter(text)
        if not fm or not isinstance(fm, dict) or entity_id_from_fm(fm) is None:
            continue

        canonical = fm.get("canonical_name") or fm.get("canonical_title")
        aliases = fm.get("aliases") or []
        if isinstance(aliases, str):
            # A single alias written as a YAML scalar instead of a list.
            aliases = [aliases]
        elif not isinstance(aliases, (list, tuple, set)):
            aliases = []

        accepted = [
            name
            for name in (canonical, *aliases)
            if isinstance(name, str) and is_acceptable_alias(name)
        ]
        if not accepted:
            continue

        entities[ent_file] = {"fm": fm, "body": body, "accepted": accepted}
        owner = str(ent_file)
        for name in accepted:
            alias_count += 1
            # is_acceptable_alias validates the stripped form, so the stripped
            # form is also what gets searched for.
            owners = alias_owners[name.strip().lower()]
            if owner not in owners:
                owners.append(owner)

    return entities, alias_owners, alias_count


def _scan_pages(automaton, verbose: bool) -> tuple[dict[str, set[str]], int, int]:
    """Scan every page file for alias hits.

    *automaton* must yield ``(end_index, (pattern, owners))`` from ``iter``.
    Returns ``(hits_by_entity, pages_scanned, total_hits)`` where
    ``hits_by_entity`` maps an entity file path (``str``) to the set of
    ``[[page_id]]`` references whose narrative text mentions it. An entity is
    counted at most once per page.
    """
    hits_by_entity: dict[str, set[str]] = defaultdict(set)
    pages_scanned = 0
    total_hits = 0

    for page_file in PAGES_BASE.rglob("p*.md"):
        try:
            text = page_file.read_text(encoding="utf-8")
        except (OSError, UnicodeError) as exc:
            if verbose:
                print(f"  skipped unreadable page {page_file}: {exc}", file=sys.stderr)
            continue

        fm, body = parse_frontmatter(text)
        if not fm or not isinstance(fm, dict):
            continue

        page_id = fm.get("page_id")
        if not page_id:
            # Derive from filesystem: <doc-id>/p<NNN>
            try:
                rel = page_file.relative_to(PAGES_BASE)
            except ValueError:
                continue
            page_id = f"{rel.parent}/{rel.stem}"
        ref = f"[[{page_id}]]"

        searchable = extract_searchable_text(fm, body).lower()
        text_len = len(searchable)
        pages_scanned += 1
        seen_this_page: set[str] = set()

        for end_idx, (pattern, owners) in automaton.iter(searchable):
            start_idx = end_idx - len(pattern) + 1
            # Word-boundary check: the hit must not be glued to a word
            # character on either side.
            if start_idx > 0 and _is_word_char(searchable[start_idx - 1]):
                continue
            after = end_idx + 1
            if after < text_len and _is_word_char(searchable[after]):
                continue
            for owner in owners:
                if owner in seen_this_page:
                    continue
                seen_this_page.add(owner)
                hits_by_entity[owner].add(ref)
                total_hits += 1

    return hits_by_entity, pages_scanned, total_hits


def _apply_text_refs(fm: dict, refs: set[str]) -> tuple[str, str]:
    """Fold the text hits *refs* into the entity front matter *fm*, in place.

    Sets ``text_mentioned_in`` (only refs not already in ``mentioned_in``),
    the four ``signal_sources`` counters, ``total_mentions`` and
    ``signal_strength``. Returns ``(old_strength, new_strength)``.
    """
    mentioned = fm.get("mentioned_in") or []
    if isinstance(mentioned, str):
        mentioned = [mentioned]
    elif not isinstance(mentioned, (list, tuple, set)):
        mentioned = []
    # Don't double-count refs already in mentioned_in (structured page_refs).
    existing_mentioned = {r for r in mentioned if isinstance(r, str)}
    new_text_refs = sorted(refs - existing_mentioned)

    raw_sources = fm.get("signal_sources")
    sources = dict(raw_sources) if isinstance(raw_sources, dict) else {}
    db_chunks = _as_count(sources.get("db_chunks"))
    page_refs = _as_count(sources.get("page_refs"), len(existing_mentioned))
    cross_refs = _as_count(sources.get("cross_refs"))
    text_refs = len(new_text_refs)

    old_strength = fm.get("signal_strength", "unverified")
    new_strength = signal_strength(db_chunks, page_refs, cross_refs, text_refs)

    sources.update(
        db_chunks=db_chunks,
        page_refs=page_refs,
        cross_refs=cross_refs,
        text_refs=text_refs,
    )
    fm["text_mentioned_in"] = new_text_refs
    fm["signal_sources"] = sources
    fm["total_mentions"] = db_chunks + page_refs + cross_refs + text_refs
    fm["signal_strength"] = new_strength
    return old_strength, new_strength


def main() -> int:
    """Back-fill ``text_refs`` on every entity file from page narrative text.

    Three phases: load the entities and their aliases, scan every page with an
    Aho-Corasick automaton, then write the updated front matter back to each
    entity file. With ``--dry-run`` nothing is written but the same counters
    are reported. ``--verbose`` reports unreadable files on stderr.

    Returns 0.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--dry-run", action="store_true",
                   help="compute and report changes without writing any file")
    p.add_argument("--verbose", action="store_true",
                   help="report unreadable files on stderr")
    args = p.parse_args()

    # 1. Load entities + collect alias_lower -> list of owning entity files
    print("Loading entities ...")
    entities, alias_owners, alias_count = _load_entities(args.verbose)
    print(f"  entities considered: {len(entities)}")
    print(f"  searchable aliases: {alias_count}")

    # 2. Scan every page YAML
    print("Scanning pages ...")
    if alias_owners:
        automaton = ahocorasick.Automaton()
        # The automaton stores a single value per key, so each pattern is
        # registered once together with every entity that claims it; adding
        # the same alias once per entity would leave only one owner credited.
        for pattern, owners in alias_owners.items():
            automaton.add_word(pattern, (pattern, owners))
        automaton.make_automaton()
        hits_by_entity, pages_scanned, total_hits = _scan_pages(automaton, args.verbose)
    else:
        # Nothing to search for; the automaton is not built or scanned.
        hits_by_entity, pages_scanned, total_hits = {}, 0, 0

    print(f"  pages scanned: {pages_scanned}")
    print(f"  total hits: {total_hits}")
    print(f"  entities matched: {len(hits_by_entity)}")

    # 3. Write back to entity YAML
    print("Writing back ...")
    promoted = 0
    upgraded = 0
    updated = 0
    for ent_path_str, refs in hits_by_entity.items():
        ent_file = Path(ent_path_str)
        rec = entities.get(ent_file)
        if not rec:
            continue
        old_strength, new_strength = _apply_text_refs(rec["fm"], refs)

        if old_strength == "orphan" and new_strength != "orphan":
            promoted += 1
        if old_strength == "weak" and new_strength == "strong":
            upgraded += 1
        updated += 1

        if not args.dry_run:
            # Body captured at load time is written back verbatim.
            new_text = dump_frontmatter_preserving_body(rec["fm"], rec["body"])
            ent_file.write_text(new_text, encoding="utf-8")

    # Entities not matched at all keep their existing state, except that
    # signal_sources.text_refs is set to 0 when missing so the YAML schema is
    # consistent. The counter is computed in dry-run mode too; only the write
    # is skipped.
    backfill_zeros = 0
    for ent_file, rec in entities.items():
        if str(ent_file) in hits_by_entity:
            continue
        fm = rec["fm"]
        sources = fm.get("signal_sources")
        if not isinstance(sources, dict):
            sources = {}
        if "text_refs" in sources:
            continue
        backfill_zeros += 1
        if args.dry_run:
            continue
        sources["text_refs"] = 0
        fm["signal_sources"] = sources
        ent_file.write_text(
            dump_frontmatter_preserving_body(fm, rec["body"]), encoding="utf-8"
        )

    print()
    print(f"  entities updated: {updated}")
    print(f"  promoted orphan → weak: {promoted}")
    print(f"  upgraded weak → strong: {upgraded}")
    print(f"  zero-text-ref backfills: {backfill_zeros}")
    print(f"  dry-run: {args.dry_run}")
    return 0
|
||
|
||
|
||
# Script entry point: the return value of main() becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|