disclosure-bureau/scripts/maintain/46_text_backfill_mentions.py
Luiz Gustavo a7e9dce6d2 rebuild entity layer from Sonnet-vision reextract pipeline
Add reextract pipeline (scripts/reextract/) that rebuilds doc-level entity
JSON from Sonnet-vision chunks via Opus, replacing the noisy per-page
extraction. Add synthesize scripts to regenerate wiki/entities from the 116
_reextract.json (30), aggregate missing page.md from chunks (31), and reprocess
805 pages the doc-rebuilder agent dropped on context overflow (32). Add
maintain scripts 43-56 for chunk-page sync, dedup, generic-entity marking, and
typed relation extraction.

Web: wire relations API + entity-relations component; entity/timeline/doc
pages consume the rebuilt layer.

Note: raw/, processing/, wiki/ remain gitignored (bulk data managed
separately); the 116 reextract JSONs and 7,798 rebuilt entity files live on
disk only. The 27 curated anchor events under wiki/entities/events/ are
preserved.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-21 12:20:24 -03:00

340 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Text-based backfill of entity → page references.
The structured pipelines (Sonnet chunks, Haiku page-level events/entities)
miss many entities the corpus actually discusses — they extract only what
they confidently structure into the schema. The vision_description and
narrative_summary fields routinely *talk about* an event/person/place
without listing it in the structured arrays.
This script does a fuzzy back-fill: scans the narrative body of every page
YAML for textual matches of every entity's canonical_name + aliases, and
records the hits as a new signal source `text_refs`. Aho-Corasick is used
so the whole 3k-pages × 34k-entities cross-product collapses to a single
linear scan per page.
Conservative filtering keeps the noise floor low:
- minimum 5 chars for single-word aliases (shorter ones are kept only if they contain a digit, e.g. "U2"); multi-word and hyphenated aliases have no minimum
- blacklist of common stopwords / generic terms
- word-boundary enforcement (\b in regex, manual check after AC scan)
- skip purely numeric and ASCII-fold-identical-to-id aliases
YAML output (added in-place on each entity file):
text_mentioned_in: ['[[doc-id/pNNN]]', ...] # only refs NOT already in mentioned_in
signal_sources.text_refs: N
total_mentions = db_chunks + page_refs + cross_refs + text_refs
signal_strength recomputed using text_refs as a weak signal
Run:
python3 scripts/maintain/46_text_backfill_mentions.py [--dry-run]
"""
from __future__ import annotations
import argparse
import re
import sys
from collections import defaultdict
from pathlib import Path
from typing import Iterable
import ahocorasick
import yaml
# Root of the wiki tree.
# NOTE(review): hard-coded absolute path — looks machine-specific; confirm or
# adjust before running this script anywhere else.
WIKI = Path("/Users/guto/ufo/wiki")
# Entity Markdown files are discovered recursively under this directory.
ENTITIES_BASE = WIKI / "entities"
# Page Markdown files (p*.md) are discovered recursively under this directory.
PAGES_BASE = WIKI / "pages"
# Generic / stop-words: never accept these as match patterns even if listed
# as an alias. Lowercased (candidates are lowercased before the lookup).
# PT-BR + EN + universally vague terms. PT-BR words that carry diacritics are
# listed in both accented and unaccented spellings.
BLACKLIST: set[str] = {
    # english stopwords / common
    "the", "and", "for", "with", "from", "this", "that", "these", "those",
    "report", "reports", "object", "objects", "unknown", "unidentified",
    "anomalous", "aerial", "phenomenon", "phenomena", "sighting", "sightings",
    "case", "cases", "incident", "incidents", "event", "events", "encounter",
    "encounters", "observation", "observations", "document", "documents",
    "memo", "memos", "letter", "letters", "table", "tables", "image", "images",
    "general", "section", "agent", "agents", "subject", "subjects",
    "office", "offices", "field", "fields", "summary", "summaries",
    "true", "false", "type", "types", "data", "name", "names",
    "person", "people", "place", "places", "location", "locations",
    "vehicle", "vehicles", "operation", "operations", "concept", "concepts",
    "page", "pages", "chunk", "chunks", "scan", "scans",
    "north", "south", "east", "west", "central",
    # military ranks ("general" is already listed above)
    "captain", "major", "colonel", "lieutenant", "sergeant",
    "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday",
    "january", "february", "march", "april", "june", "july", "august",
    "september", "october", "november", "december",
    # pt-br stopwords / common
    "para", "como", "este", "esta", "esse", "essa", "isso", "aquele",
    "ainda", "outro", "outra", "outros", "outras", "todos", "todas",
    "relatorio", "relatório", "objeto", "objetos", "documento", "documentos",
    "pagina", "página", "paginas", "páginas", "evento", "eventos", "incidente",
    "incidentes", "pessoa", "pessoas", "lugar", "lugares", "local", "locais",
    "operação", "operacao", "geral", "agente", "agentes", "campo", "campos",
    "norte", "sul", "leste", "oeste",
    "janeiro", "fevereiro", "março", "marco", "abril", "junho", "julho",
    "agosto", "setembro", "outubro", "novembro", "dezembro",
    # generic acronyms widely embedded in unrelated text
    "uap", "ufo", "usaaf", "usaf", "usa", "fbi", "dod", "nasa",
    "atom", "atoms", "atomic",
}
def is_acceptable_alias(name: str) -> bool:
    """Decide whether *name* is specific enough to be used as a search pattern.

    The name is stripped first. It is rejected when it is empty, when its
    lowercase form is in ``BLACKLIST``, when it contains no letter, when it
    consists only of digits/whitespace/``-_.,``, or when it is a single token
    (no space, no hyphen) shorter than 5 characters that contains no digit.
    """
    candidate = name.strip()
    if not candidate or candidate.lower() in BLACKLIST:
        return False

    # Letters are ASCII or the Latin-1 accented range used by PT-BR text.
    has_letter = re.search(r"[a-zA-ZÀ-ÿ]", candidate) is not None
    only_digits_or_punct = re.fullmatch(r"[\d\s\-_.,]+", candidate) is not None
    if not has_letter or only_digits_or_punct:
        return False

    # Short single tokens are too ambiguous unless a digit makes them specific.
    single_token = " " not in candidate and "-" not in candidate
    short_without_digit = len(candidate) < 5 and re.search(r"\d", candidate) is None
    return not (single_token and short_without_digit)
def parse_frontmatter(text: str) -> tuple[dict, str]:
    """Split a Markdown document into its YAML front matter and its body.

    The document must start with ``---``; the front matter ends at the next
    ``---`` that begins a line. A ``---`` embedded in the middle of a line
    (for example inside a free-text value) is *not* treated as a delimiter,
    so such values can no longer truncate the header.

    Args:
        text: Full file content.

    Returns:
        ``(front_matter, body)``. ``body`` is everything after the closing
        ``---`` (including the newline that follows it), so that
        ``"---\\n" + dumped_yaml + "---" + body`` reassembles the file.
        When there is no opening or no closing delimiter the result is
        ``({}, text)``. When the header is not valid YAML, or does not parse
        to a mapping, ``front_matter`` is ``{}``.
    """
    if not text.startswith("---"):
        return {}, text

    # Search from index 3 so the opening delimiter itself is skipped.
    closing = text.find("\n---", 3)
    if closing == -1:
        return {}, text

    header = text[3:closing]
    body = text[closing + 4:]  # skip the "\n---" that closes the header

    try:
        fm = yaml.safe_load(header)
    except yaml.YAMLError:
        fm = None
    # safe_load can yield None, a scalar or a list; callers rely on a dict.
    if not isinstance(fm, dict):
        fm = {}
    return fm, body
def dump_frontmatter_preserving_body(fm: dict, body: str) -> str:
    """Serialize *fm* as YAML front matter and re-attach *body* unchanged.

    Key order is kept as-is, non-ASCII characters are written literally, and
    a wide line width is used for the YAML dump. *body* is appended directly
    after the closing ``---`` without any modification.
    """
    header = yaml.safe_dump(fm, sort_keys=False, allow_unicode=True, width=1000)
    return f"---\n{header}---{body}"
def extract_searchable_text(page_fm: dict, page_body: str) -> str:
    """Collect the narrative text of a page into one newline-joined string.

    Only a fixed set of front-matter fields is read, and only when the value
    is a string; every other key is ignored. The page body is always appended
    last.
    """
    narrative_keys = (
        "vision_description",
        "vision_description_pt_br",
        "narrative_summary",
        "narrative_summary_pt_br",
        "extracted_text",
    )
    candidates = (page_fm.get(field) for field in narrative_keys)
    chunks = [value for value in candidates if isinstance(value, str)]
    chunks.append(page_body)
    return "\n".join(chunks)
# Lookup table: entity_class value -> name of the folder for that class.
FOLDER_BY_CLASS = dict(
    person="people",
    organization="organizations",
    location="locations",
    event="events",
    uap_object="uap-objects",
    vehicle="vehicles",
    operation="operations",
    concept="concepts",
)
def entity_id_from_fm(fm: dict) -> tuple[str, str] | None:
    """Return ``(entity_class, entity_id)`` read from entity front matter.

    When ``entity_class`` is set, the id is taken from ``<entity_class>_id``
    or, failing that, from ``entity_id``. If that yields nothing, the first
    legacy ``*_id`` key present in *fm* decides both the class (the key with
    ``_id`` removed) and the id. Returns ``None`` when no id can be found.
    """
    entity_class = fm.get("entity_class")
    if entity_class:
        entity_id = fm.get(f"{entity_class}_id") or fm.get("entity_id")
        if entity_id:
            return entity_class, entity_id

    # Legacy fallback: files without a usable entity_class/id pair.
    legacy_keys = (
        "person_id",
        "organization_id",
        "location_id",
        "event_id",
        "uap_object_id",
        "vehicle_id",
        "operation_id",
        "concept_id",
    )
    first_present = next((key for key in legacy_keys if key in fm), None)
    if first_present is None:
        return None
    return first_present.replace("_id", ""), fm[first_present]
def signal_strength(db_chunks: int, page_refs: int, cross_refs: int, text_refs: int) -> str:
    """Classify an entity as ``"orphan"``, ``"weak"`` or ``"strong"``.

    ``"orphan"`` means all four counters sum to zero. ``"strong"`` requires at
    least one of: 3+ db chunks, 3+ page refs, one of each, or 5+ text refs.
    Anything else is ``"weak"``; cross refs only keep an entity from being an
    orphan.
    """
    if db_chunks + page_refs + cross_refs + text_refs == 0:
        return "orphan"

    strong_evidence = (
        db_chunks >= 3,
        page_refs >= 3,
        db_chunks >= 1 and page_refs >= 1,
        text_refs >= 5,
    )
    return "strong" if any(strong_evidence) else "weak"
def _int_or_default(value, default: int = 0) -> int:
    """Coerce a YAML-loaded counter to ``int``; a missing/null value yields *default*."""
    return default if value is None else int(value)


def _is_word_char(ch: str) -> bool:
    """Return True when *ch* would extend an adjacent word (alphanumeric or ``_``)."""
    return ch.isalnum() or ch == "_"


def main() -> int:
    """Back-fill text-derived page references onto entity files.

    Steps:
      1. Load every non-archived entity file under ``ENTITIES_BASE`` and
         collect its acceptable names (canonical name + aliases).
      2. Scan the narrative text of every ``p*.md`` under ``PAGES_BASE`` for
         those names (case-insensitive, whole-word) with one Aho-Corasick pass
         per page.
      3. For every entity that was hit, write ``text_mentioned_in``,
         ``signal_sources``, ``total_mentions`` and ``signal_strength`` back
         into its front matter; for the others, make sure
         ``signal_sources.text_refs`` exists.

    Command-line flags: ``--dry-run`` (compute and report, write nothing) and
    ``--verbose`` (report unreadable files that were skipped, on stderr).

    Returns:
        Process exit code, always ``0``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--verbose", action="store_true")
    args = parser.parse_args()

    # 1. Load entities + collect alias_lower -> list of owning entity files.
    print("Loading entities ...")
    entities: dict[Path, dict] = {}
    # Several entities may share one alias; every owner must receive the hit.
    # (Adding the same key to the automaton twice would keep only the last value.)
    alias_owners: dict[str, list[str]] = defaultdict(list)
    alias_per_entity_count = 0
    accepted_entities = 0
    for ent_file in ENTITIES_BASE.rglob("*.md"):
        if "_archived" in ent_file.parts:
            continue
        try:
            text = ent_file.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as exc:
            if args.verbose:
                print(f" skipping unreadable entity {ent_file}: {exc}", file=sys.stderr)
            continue
        fm, body = parse_frontmatter(text)
        if not isinstance(fm, dict) or not fm:
            continue
        if entity_id_from_fm(fm) is None:
            continue
        canonical = fm.get("canonical_name") or fm.get("canonical_title")
        aliases = fm.get("aliases") or []
        if isinstance(aliases, str):
            # A scalar `aliases: Foo` is a single alias, not a sequence of characters.
            aliases = [aliases]
        elif not isinstance(aliases, (list, tuple)):
            aliases = []
        names = [canonical] if isinstance(canonical, str) else []
        names.extend(a for a in aliases if isinstance(a, str))
        accepted = [n for n in names if is_acceptable_alias(n)]
        if not accepted:
            continue
        # Keep the body so the write-back does not have to re-parse the file.
        entities[ent_file] = {"fm": fm, "raw_text": text, "body": body, "accepted": accepted}
        accepted_entities += 1
        ent_key = str(ent_file)
        for n in accepted:
            # Acceptance is decided on the stripped name, so match the stripped name too.
            owners = alias_owners[n.strip().lower()]
            if ent_key not in owners:
                owners.append(ent_key)
            alias_per_entity_count += 1
    print(f" entities considered: {accepted_entities}")
    print(f" searchable aliases: {alias_per_entity_count}")
    if not alias_owners:
        # Nothing to search for (and an automaton without words cannot be iterated).
        print("No searchable aliases found; nothing to do.")
        return 0

    automaton = ahocorasick.Automaton()
    for pattern, owners in alias_owners.items():
        # Store the pattern length so the match start can be derived from the end index.
        automaton.add_word(pattern, (len(pattern), tuple(owners)))
    automaton.make_automaton()

    # 2. Scan every page YAML
    print("Scanning pages ...")
    hits_by_entity: dict[str, set[str]] = defaultdict(set)
    pages_scanned = 0
    total_hits = 0
    for page_file in PAGES_BASE.rglob("p*.md"):
        try:
            text = page_file.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as exc:
            if args.verbose:
                print(f" skipping unreadable page {page_file}: {exc}", file=sys.stderr)
            continue
        fm, body = parse_frontmatter(text)
        if not isinstance(fm, dict) or not fm:
            continue
        page_id = fm.get("page_id")
        if not page_id:
            # Derive from filesystem: <doc-id>/p<NNN>
            try:
                rel = page_file.relative_to(PAGES_BASE)
            except ValueError:
                continue
            page_id = f"{rel.parent.as_posix()}/{rel.stem}"
        ref = f"[[{page_id}]]"
        searchable = extract_searchable_text(fm, body).lower()
        pages_scanned += 1
        seen_this_page: set[str] = set()
        for end_idx, (pattern_len, owners) in automaton.iter(searchable):
            start_idx = end_idx - pattern_len + 1
            # Word boundary check: reject matches embedded in a longer word.
            if start_idx > 0 and _is_word_char(searchable[start_idx - 1]):
                continue
            after = end_idx + 1
            if after < len(searchable) and _is_word_char(searchable[after]):
                continue
            for ent_key in owners:
                # Count each entity at most once per page.
                if ent_key in seen_this_page:
                    continue
                seen_this_page.add(ent_key)
                hits_by_entity[ent_key].add(ref)
                total_hits += 1
    print(f" pages scanned: {pages_scanned}")
    print(f" total hits: {total_hits}")
    print(f" entities matched: {len(hits_by_entity)}")

    # 3. Write back to entity YAML
    print("Writing back ...")
    promoted = 0
    upgraded = 0
    updated = 0
    for ent_key, refs in hits_by_entity.items():
        ent_file = Path(ent_key)
        rec = entities.get(ent_file)
        if not rec:
            continue
        fm = rec["fm"]
        # Don't double-count refs already in mentioned_in (structured page_refs)
        existing_mentioned = set(fm.get("mentioned_in") or [])
        new_text_refs = sorted(refs - existing_mentioned)
        raw_sources = fm.get("signal_sources")
        sources = dict(raw_sources) if isinstance(raw_sources, dict) else {}
        db_chunks = _int_or_default(sources.get("db_chunks"))
        # Without a recorded page_refs count, fall back to the size of mentioned_in.
        page_refs = _int_or_default(sources.get("page_refs"), len(existing_mentioned))
        cross_refs = _int_or_default(sources.get("cross_refs"))
        text_refs = len(new_text_refs)
        old_strength = fm.get("signal_strength", "unverified")
        new_strength = signal_strength(db_chunks, page_refs, cross_refs, text_refs)
        fm["text_mentioned_in"] = new_text_refs
        sources["db_chunks"] = db_chunks
        sources["page_refs"] = page_refs
        sources["cross_refs"] = cross_refs
        sources["text_refs"] = text_refs
        fm["signal_sources"] = sources
        fm["total_mentions"] = db_chunks + page_refs + cross_refs + text_refs
        fm["signal_strength"] = new_strength
        if old_strength == "orphan" and new_strength != "orphan":
            promoted += 1
        if old_strength == "weak" and new_strength == "strong":
            upgraded += 1
        updated += 1
        if not args.dry_run:
            # Preserve body verbatim
            new_text = dump_frontmatter_preserving_body(fm, rec["body"])
            ent_file.write_text(new_text, encoding="utf-8")

    # Also: entities not matched at all — they keep their existing state.
    # But ensure their signal_sources.text_refs is at least set to 0 if missing
    # (so the YAML schema is consistent).
    backfill_zeros = 0
    if not args.dry_run:
        for ent_file, rec in entities.items():
            if str(ent_file) in hits_by_entity:
                continue
            fm = rec["fm"]
            raw_sources = fm.get("signal_sources")
            sources = raw_sources if isinstance(raw_sources, dict) else {}
            if "text_refs" in sources:
                continue
            sources["text_refs"] = 0
            fm["signal_sources"] = sources
            ent_file.write_text(dump_frontmatter_preserving_body(fm, rec["body"]), encoding="utf-8")
            backfill_zeros += 1

    print()
    print(f" entities updated: {updated}")
    print(f" promoted orphan → weak: {promoted}")
    print(f" upgraded weak → strong: {upgraded}")
    print(f" zero-text-ref backfills: {backfill_zeros}")
    print(f" dry-run: {args.dry_run}")
    return 0
# Script entry point: the return value of main() becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())