#!/usr/bin/env python3
"""
Text-based backfill of entity → page references.

The structured pipelines (Sonnet chunks, Haiku page-level events/entities) miss
many entities the corpus actually discusses — they extract only what they
confidently structure into the schema. The vision_description and
narrative_summary fields routinely *talk about* an event/person/place without
listing it in the structured arrays.

This script does a fuzzy back-fill: scans the narrative body of every page YAML
for textual matches of every entity's canonical_name + aliases, and records the
hits as a new signal source `text_refs`.

Aho-Corasick is used so the whole 3k-pages × 34k-entities cross-product
collapses to a single linear scan per page.

Conservative filtering keeps the noise floor low:
  - minimum 5 chars per alias (4 if alias has a digit, to keep e.g. "USS")
  - blacklist of common stopwords / generic terms
  - word-boundary enforcement (\b in regex, manual check after AC scan)
  - skip purely numeric and ASCII-fold-identical-to-id aliases

YAML output (added in-place on each entity file):
    text_mentioned_in: ['[[doc-id/pNNN]]', ...]   # only refs NOT already in mentioned_in
    signal_sources.text_refs: N
    total_mentions = db_chunks + page_refs + cross_refs + text_refs
    signal_strength recomputed using text_refs as a weak signal

Run:
    python3 scripts/maintain/46_text_backfill_mentions.py [--dry-run]
"""
from __future__ import annotations

import argparse
import re
import sys
from collections import defaultdict
from pathlib import Path
from typing import Any, Iterable

import ahocorasick
import yaml

WIKI = Path("/Users/guto/ufo/wiki")
ENTITIES_BASE = WIKI / "entities"
PAGES_BASE = WIKI / "pages"

# Generic / stop-words: never accept these as match patterns even if listed
# as an alias. Lowercased. PT-BR + EN + universally vague terms.
BLACKLIST: set[str] = { # english stopwords / common "the", "and", "for", "with", "from", "this", "that", "these", "those", "report", "reports", "object", "objects", "unknown", "unidentified", "anomalous", "aerial", "phenomenon", "phenomena", "sighting", "sightings", "case", "cases", "incident", "incidents", "event", "events", "encounter", "encounters", "observation", "observations", "document", "documents", "memo", "memos", "letter", "letters", "table", "tables", "image", "images", "general", "section", "agent", "agents", "subject", "subjects", "office", "offices", "field", "fields", "summary", "summaries", "true", "false", "type", "types", "data", "name", "names", "person", "people", "place", "places", "location", "locations", "vehicle", "vehicles", "operation", "operations", "concept", "concepts", "page", "pages", "chunk", "chunks", "scan", "scans", "north", "south", "east", "west", "central", "captain", "major", "colonel", "general", "lieutenant", "sergeant", "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "january", "february", "march", "april", "june", "july", "august", "september", "october", "november", "december", # pt-br stopwords / common "para", "como", "este", "esta", "esse", "essa", "isso", "aquele", "ainda", "outro", "outra", "outros", "outras", "todos", "todas", "relatorio", "relatório", "objeto", "objetos", "documento", "documentos", "página", "paginas", "páginas", "evento", "eventos", "incidente", "incidentes", "pessoa", "pessoas", "lugar", "lugares", "local", "locais", "operação", "operacao", "geral", "agente", "agentes", "campo", "campos", "norte", "sul", "leste", "oeste", "janeiro", "fevereiro", "março", "marco", "abril", "junho", "julho", "agosto", "setembro", "outubro", "novembro", "dezembro", # generic acronyms widely embedded in unrelated text "uap", "ufo", "usaaf", "usaf", "usa", "fbi", "dod", "nasa", "atom", "atoms", "atomic", } def is_acceptable_alias(name: str) -> bool: n = name.strip() if not n: return 
False nl = n.lower() if nl in BLACKLIST: return False # Must contain at least one letter if not re.search(r"[a-zA-ZÀ-ÿ]", n): return False # Purely numeric or punctuation if re.fullmatch(r"[\d\s\-_.,]+", n): return False # Single-word too short (5 char min unless contains a digit) if " " not in n and "-" not in n and len(n) < 5 and not re.search(r"\d", n): return False return True def parse_frontmatter(text: str) -> tuple[dict, str]: if not text.startswith("---"): return {}, text parts = text.split("---", 2) if len(parts) < 3: return {}, text try: fm = yaml.safe_load(parts[1]) or {} except yaml.YAMLError: fm = {} return fm, parts[2] def dump_frontmatter_preserving_body(fm: dict, body: str) -> str: return "---\n" + yaml.safe_dump(fm, sort_keys=False, allow_unicode=True, width=1000) + "---" + body def extract_searchable_text(page_fm: dict, page_body: str) -> str: """Pick narrative-only fields from a page YAML — avoid YAML keys, IDs, enums.""" parts: list[str] = [] for key in ( "vision_description", "vision_description_pt_br", "narrative_summary", "narrative_summary_pt_br", "extracted_text", ): v = page_fm.get(key) if isinstance(v, str): parts.append(v) parts.append(page_body) return "\n".join(parts) # Map entity_class -> folder name FOLDER_BY_CLASS = { "person": "people", "organization": "organizations", "location": "locations", "event": "events", "uap_object": "uap-objects", "vehicle": "vehicles", "operation": "operations", "concept": "concepts", } def entity_id_from_fm(fm: dict) -> tuple[str, str] | None: cls = fm.get("entity_class") if cls: eid_key = f"{cls}_id" eid = fm.get(eid_key) or fm.get("entity_id") if eid: return cls, eid # legacy fallback for k in ("person_id", "organization_id", "location_id", "event_id", "uap_object_id", "vehicle_id", "operation_id", "concept_id"): if k in fm: return k.replace("_id", ""), fm[k] return None def signal_strength(db_chunks: int, page_refs: int, cross_refs: int, text_refs: int) -> str: total = db_chunks + page_refs + 
cross_refs + text_refs if total == 0: return "orphan" if db_chunks >= 3 or page_refs >= 3 or (db_chunks >= 1 and page_refs >= 1) or text_refs >= 5: return "strong" return "weak" def main() -> int: p = argparse.ArgumentParser() p.add_argument("--dry-run", action="store_true") p.add_argument("--verbose", action="store_true") args = p.parse_args() # 1. Load entities + collect (alias_lower → list of (entity_file_path, original_alias)) print("Loading entities ...") automaton = ahocorasick.Automaton() entities: dict[Path, dict] = {} alias_per_entity_count = 0 accepted_entities = 0 for ent_file in ENTITIES_BASE.rglob("*.md"): if "_archived" in ent_file.parts: continue try: text = ent_file.read_text(encoding="utf-8") except Exception: continue fm, _body = parse_frontmatter(text) if not fm: continue if entity_id_from_fm(fm) is None: continue canonical = fm.get("canonical_name") or fm.get("canonical_title") aliases = fm.get("aliases") or [] names = [] if isinstance(canonical, str): names.append(canonical) for a in aliases: if isinstance(a, str): names.append(a) accepted = [n for n in names if is_acceptable_alias(n)] if not accepted: continue entities[ent_file] = {"fm": fm, "raw_text": text, "accepted": accepted} accepted_entities += 1 for n in accepted: automaton.add_word(n.lower(), (str(ent_file), n)) alias_per_entity_count += 1 automaton.make_automaton() print(f" entities considered: {accepted_entities}") print(f" searchable aliases: {alias_per_entity_count}") # 2. 
Scan every page YAML print("Scanning pages ...") hits_by_entity: dict[str, set[str]] = defaultdict(set) pages_scanned = 0 total_hits = 0 for page_file in PAGES_BASE.rglob("p*.md"): try: text = page_file.read_text(encoding="utf-8") except Exception: continue fm, body = parse_frontmatter(text) if not fm: continue page_id = fm.get("page_id") if not page_id: # Derive from filesystem: /p try: rel = page_file.relative_to(PAGES_BASE) page_id = f"{rel.parent}/{rel.stem}" except ValueError: continue ref = f"[[{page_id}]]" searchable = extract_searchable_text(fm, body).lower() pages_scanned += 1 seen_this_page: set[str] = set() for end_idx, (ent_path_str, original) in automaton.iter(searchable): pattern = original.lower() start_idx = end_idx - len(pattern) + 1 # Word boundary check if start_idx > 0 and (searchable[start_idx - 1].isalnum() or searchable[start_idx - 1] == "_"): continue after = end_idx + 1 if after < len(searchable) and (searchable[after].isalnum() or searchable[after] == "_"): continue if ent_path_str in seen_this_page: continue seen_this_page.add(ent_path_str) hits_by_entity[ent_path_str].add(ref) total_hits += 1 print(f" pages scanned: {pages_scanned}") print(f" total hits: {total_hits}") print(f" entities matched: {len(hits_by_entity)}") # 3. 
Write back to entity YAML print("Writing back ...") promoted = 0 upgraded = 0 updated = 0 for ent_path_str, refs in hits_by_entity.items(): ent_file = Path(ent_path_str) rec = entities.get(ent_file) if not rec: continue fm = rec["fm"] raw_text = rec["raw_text"] # Don't double-count refs already in mentioned_in (structured page_refs) existing_mentioned = set(fm.get("mentioned_in") or []) new_text_refs = sorted(refs - existing_mentioned) old_sources = (fm.get("signal_sources") or {}).copy() db_chunks = int(old_sources.get("db_chunks", 0)) page_refs = int(old_sources.get("page_refs", len(existing_mentioned))) cross_refs = int(old_sources.get("cross_refs", 0)) text_refs = len(new_text_refs) old_strength = fm.get("signal_strength", "unverified") new_strength = signal_strength(db_chunks, page_refs, cross_refs, text_refs) new_total = db_chunks + page_refs + cross_refs + text_refs fm["text_mentioned_in"] = new_text_refs sources = old_sources sources["db_chunks"] = db_chunks sources["page_refs"] = page_refs sources["cross_refs"] = cross_refs sources["text_refs"] = text_refs fm["signal_sources"] = sources fm["total_mentions"] = new_total fm["signal_strength"] = new_strength if old_strength == "orphan" and new_strength != "orphan": promoted += 1 if old_strength == "weak" and new_strength == "strong": upgraded += 1 updated += 1 if not args.dry_run: # Preserve body verbatim _, body = parse_frontmatter(raw_text) new_text = dump_frontmatter_preserving_body(fm, body) ent_file.write_text(new_text, encoding="utf-8") # Also: entities not matched at all — they keep their existing state. # But ensure their signal_sources.text_refs is at least set to 0 if missing # (so the YAML schema is consistent). 
backfill_zeros = 0 if not args.dry_run: for ent_file, rec in entities.items(): if str(ent_file) in hits_by_entity: continue fm = rec["fm"] sources = (fm.get("signal_sources") or {}) if "text_refs" not in sources: sources["text_refs"] = 0 fm["signal_sources"] = sources _, body = parse_frontmatter(rec["raw_text"]) ent_file.write_text(dump_frontmatter_preserving_body(fm, body), encoding="utf-8") backfill_zeros += 1 print() print(f" entities updated: {updated}") print(f" promoted orphan → weak: {promoted}") print(f" upgraded weak → strong: {upgraded}") print(f" zero-text-ref backfills: {backfill_zeros}") print(f" dry-run: {args.dry_run}") return 0 if __name__ == "__main__": sys.exit(main())