#!/usr/bin/env python3
"""
03-dedup-entities.py — Phase 5 — Entity dedup + upsert

For every page.md under wiki/pages/**/*.md:
  1. Read frontmatter.entities_extracted
  2. Canonicalize each entity name → kebab-case ASCII-fold id
  3. Aggregate occurrences across pages (same kebab-case = same entity)
  4. Upsert wiki/entities/<class>/<id>.md:
     - If file missing: create with stub frontmatter + bilingual body
     - If file exists: merge aliases, preserve manual edits to body,
       refresh derived stats (total_mentions, documents_count)

Does NOT populate mentioned_in[] — that's lint's job (script 04).
This script just creates/updates entity stubs so wiki-links resolve.

Idempotent: re-running with no new pages produces no changes
(writes are suppressed when the output is identical, ignoring the
volatile last_ingest / last_lint timestamps).

Usage:
  ./03-dedup-entities.py                 # process every page in wiki/pages/
  ./03-dedup-entities.py --doc-id <id>   # only one document
  ./03-dedup-entities.py --dry-run       # report what would change, don't write
"""

from __future__ import annotations

import argparse
import os
import re
import shutil
import sys
import unicodedata
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path

try:
    import yaml
except ImportError:
    # Deferred: _require_yaml() reports the problem and exits as soon as YAML
    # is actually needed, so the pure helpers stay importable without PyYAML.
    yaml = None

_MISSING_YAML_MSG = "Missing pyyaml. Run: pip3 install pyyaml\n"

# Root of the wiki checkout. The UFO_ROOT environment variable overrides the
# historical hard-coded default.
UFO_ROOT = Path(os.environ.get("UFO_ROOT") or "/Users/guto/ufo")
PAGES_BASE = UFO_ROOT / "wiki" / "pages"
ENTITIES_BASE = UFO_ROOT / "wiki" / "entities"
LOG_PATH = UFO_ROOT / "wiki" / "log.md"

WIKI_VERSION = "0.1.0"
SCHEMA_VERSION = "0.1.0"

# (class_name_in_page_extraction, dir_name_under_wiki/entities/, frontmatter type, entity_class field, id_field)
ENTITY_CLASSES = [
    ("people", "people", "entity", "person", "person_id"),
    ("organizations", "organizations", "entity", "organization", "organization_id"),
    ("locations", "locations", "entity", "location", "location_id"),
    ("vehicles", "vehicles", "entity", "vehicle", "vehicle_id"),
    ("operations", "operations", "entity", "operation", "operation_id"),
    ("concepts", "concepts", "entity", "concept", "concept_id"),
    # events and uap_objects have non-trivial ID schemes — handled separately
]

# Per-class extraction key whose first non-empty value is kept on the bucket.
_EXTRA_FIELD_BY_CLASS = {
    "locations": "type",
    "vehicles": "class",
    "operations": "type",
    "concepts": "class",
}

# Per-class (frontmatter key, collected-data key) refreshed on existing files
# only when the frontmatter value is still empty.
_DETAIL_FIELD_BY_CLASS = {
    "locations": ("location_type", "type"),
    "vehicles": ("vehicle_class", "class"),
    "operations": ("operation_type", "type"),
    "concepts": ("concept_class", "class"),
}

# Frontmatter keys ignored when deciding whether a file really changed.
_VOLATILE_KEYS = frozenset({"last_ingest", "last_lint"})

# Closing frontmatter delimiter: "---" alone on a line (trailing blanks allowed).
_FM_CLOSE_RE = re.compile(r"\n---[ \t]*(?:\r?\n|\Z)")


def _require_yaml() -> None:
    """Exit with the install hint (status 1) when PyYAML is not importable."""
    if yaml is None:
        sys.stderr.write(_MISSING_YAML_MSG)
        sys.exit(1)


def utc_now_iso():
    """Return the current UTC time formatted as YYYY-MM-DDTHH:MM:SSZ."""
    return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")


def canonicalize_name(name: str) -> str:
    """Generic name → kebab-case ASCII-fold id.

    Accents are removed via NFKD decomposition, every character outside
    [a-z0-9-] becomes "-", runs of "-" are collapsed and trimmed. Returns ""
    for an empty name.
    """
    if not name:
        return ""
    nfkd = unicodedata.normalize("NFKD", name)
    ascii_str = "".join(c for c in nfkd if not unicodedata.combining(c))
    lower = ascii_str.lower()
    replaced = re.sub(r"[^a-z0-9-]", "-", lower)
    collapsed = re.sub(r"-+", "-", replaced).strip("-")
    if collapsed and collapsed[0].isdigit():
        # IDs cannot start with digit (per CLAUDE.md rule)
        collapsed = "x-" + collapsed
    return collapsed


def _date_to_str(value) -> str:
    """Normalise a frontmatter date value to a string ("NA" when empty).

    YAML resolves unquoted `2004-11-14` to a date object and `1947` to an int,
    so the raw value cannot be handed to `re.match` directly.
    """
    if not value:
        return "NA"
    if hasattr(value, "isoformat"):
        # date / datetime: keep only the YYYY-MM-DD part
        return value.isoformat()[:10]
    return str(value).strip()


def event_id_from_entry(entry: dict) -> str:
    """Build event_id from {label, date}. Date is YYYY-MM-DD, YYYY-MM, YYYY, or NA.

    Returns "EV-<yyyy>-<mm>-<dd>-<slug>", with "XXXX"/"XX" standing in for
    unknown date parts and "unlabeled" for an empty label. The slug is the
    canonicalized label truncated to 40 characters.
    """
    label = entry.get("label") or ""
    if not isinstance(label, str):
        label = str(label)
    date = _date_to_str(entry.get("date"))
    slug = canonicalize_name(label)[:40].strip("-") or "unlabeled"

    m = re.match(r"^(\d{4})-(\d{2})-(\d{2})$", date)
    if m:
        return f"EV-{m.group(1)}-{m.group(2)}-{m.group(3)}-{slug}"
    m = re.match(r"^(\d{4})-(\d{2})$", date)
    if m:
        return f"EV-{m.group(1)}-{m.group(2)}-XX-{slug}"
    m = re.match(r"^(\d{4})$", date)
    if m:
        return f"EV-{m.group(1)}-XX-XX-{slug}"
    return f"EV-XXXX-XX-XX-{slug}"


def uap_object_id_from_entry(entry: dict, event_id: str, index: int) -> str:
    """Build "OBJ-<event_short>-<NN>" for the index-th object of an event.

    <event_short> is "EV<year>-<SLUG>", where SLUG is the event slug with
    dashes removed, upper-cased and cut to 20 characters ("UNK" if empty).
    It is just "UNK" when event_id does not look like "EV-y-m-d-slug".
    `entry` is accepted for interface symmetry and is not read.
    """
    event_short = "UNK"
    if event_id.startswith("EV-"):
        # "2004-11-14-tic-tac-nimitz" → [year, month, day, slug-head, slug-tail]
        parts = event_id[3:].split("-", 4)
        if len(parts) >= 4:
            year = parts[0]
            slug_compact = "".join(parts[3:]).replace("-", "").upper()[:20] or "UNK"
            event_short = f"EV{year}-{slug_compact}"
    return f"OBJ-{event_short}-{index:02d}"


def read_frontmatter_and_body(path: Path) -> tuple[dict, str]:
    """Parse a markdown file. Returns (frontmatter_dict, body_str).

    The frontmatter is the text between a leading "---" and the next line
    consisting only of "---". Files without such a pair are returned as
    ({}, full_content). Unparseable or non-mapping YAML is reported on stderr
    and treated as empty frontmatter. Leading newlines of the body are dropped.
    """
    content = path.read_text(encoding="utf-8")
    if not content.startswith("---"):
        return {}, content
    # Anchor the closing delimiter to a line start so that "---" inside a
    # YAML value does not truncate the frontmatter.
    closing = _FM_CLOSE_RE.search(content, 3)
    if closing is None:
        return {}, content
    fm_str = content[3:closing.start()].strip()
    body = content[closing.end():].lstrip("\n")

    _require_yaml()
    try:
        fm = yaml.safe_load(fm_str) or {}
    except (yaml.YAMLError, ValueError) as e:
        # ValueError: e.g. a timestamp-looking scalar with an impossible date
        sys.stderr.write(f"YAML error in {path}: {e}\n")
        fm = {}
    if not isinstance(fm, dict):
        sys.stderr.write(f"YAML error in {path}: frontmatter is not a mapping\n")
        fm = {}
    return fm, body


def _atomic_write_text(path: Path, content: str) -> None:
    """Write content to path via a sibling temp file and os.replace."""
    tmp_path = path.with_name(f".{path.name}.{os.getpid()}.tmp")
    try:
        tmp_path.write_text(content, encoding="utf-8")
        if path.exists():
            # Keep the permission bits of the file being replaced.
            shutil.copymode(path, tmp_path)
        os.replace(tmp_path, path)
    finally:
        # Only still present when something above failed.
        if tmp_path.exists():
            tmp_path.unlink()


def write_frontmatter_and_body(path: Path, frontmatter: dict, body: str, dry_run: bool = False) -> bool:
    """Atomic write. Returns True if file was changed.

    For idempotency: if the file exists and the only differences are
    `last_ingest` / `last_lint` timestamps, do NOT rewrite.
    With dry_run=True nothing is written, but the return value still says
    whether a write would have happened.
    """
    _require_yaml()
    new_yaml = yaml.dump(frontmatter, allow_unicode=True, sort_keys=False, default_flow_style=False)
    if body.startswith("\n"):
        new_content = f"---\n{new_yaml}---\n{body}"
    else:
        new_content = f"---\n{new_yaml}---\n\n{body}"

    if path.exists():
        if path.read_text(encoding="utf-8") == new_content:
            return False
        # Compare frontmatter excluding volatile timestamps
        existing_fm, existing_body = read_frontmatter_and_body(path)
        snap_old = {k: v for k, v in existing_fm.items() if k not in _VOLATILE_KEYS}
        snap_new = {k: v for k, v in frontmatter.items() if k not in _VOLATILE_KEYS}
        if snap_old == snap_new and existing_body == body:
            return False  # only timestamps differ; treat as unchanged

    if dry_run:
        return True

    path.parent.mkdir(parents=True, exist_ok=True)
    _atomic_write_text(path, new_content)
    return True


def _entry_list(entities: dict, key: str) -> list:
    """Return entities[key] when it is a list/tuple, else an empty list."""
    value = entities.get(key)
    return list(value) if isinstance(value, (list, tuple)) else []


def collect_entities_from_pages(doc_filter: str | None = None) -> dict:
    """
    Walk wiki/pages/**/*.md and collect all entity references.

    Only files whose frontmatter has type == "page" plus a page_id and doc_id
    are considered. When doc_filter is given, pages whose path does not
    contain that string are skipped.

    Returns:
      {
        'people': { canonical_id: { 'aliases': set, 'mentions': [(page_id, role, doc_id), ...], 'roles': set } },
        'organizations': { ... },
        ...
        'events': { event_id: { 'labels': set, 'date': '...', 'mentions': [...] } },
        'uap_objects': { obj_id: { 'shape': ..., 'color': ..., 'mentions': [...], 'event_id': ... } },
      }
    """
    collected = {
        "people": defaultdict(lambda: {"aliases": set(), "mentions": [], "roles": set()}),
        "organizations": defaultdict(lambda: {"aliases": set(), "mentions": []}),
        "locations": defaultdict(lambda: {"aliases": set(), "mentions": [], "type": None}),
        "vehicles": defaultdict(lambda: {"aliases": set(), "mentions": [], "class": None}),
        "operations": defaultdict(lambda: {"aliases": set(), "mentions": [], "type": None}),
        "concepts": defaultdict(lambda: {"aliases": set(), "mentions": [], "class": None}),
        "events": defaultdict(lambda: {"labels": set(), "date": "NA", "mentions": []}),
        "uap_objects": defaultdict(lambda: {"shape": None, "color": None, "size_estimate": None, "mentions": [], "event_id": None}),
    }

    for page_path in sorted(PAGES_BASE.glob("**/*.md")):
        if doc_filter and doc_filter not in str(page_path):
            continue
        fm, _body = read_frontmatter_and_body(page_path)
        if not fm or fm.get("type") != "page":
            continue
        page_id = fm.get("page_id", "")
        doc_id = fm.get("doc_id", "")
        if not page_id or not doc_id:
            continue

        entities = fm.get("entities_extracted") or {}
        if not isinstance(entities, dict):
            continue

        # Standard entity classes
        for class_name, *_rest in ENTITY_CLASSES:
            extra_key = _EXTRA_FIELD_BY_CLASS.get(class_name)
            for entry in _entry_list(entities, class_name):
                name = entry.get("name") if isinstance(entry, dict) else None
                if not name:
                    continue
                if not isinstance(name, str):
                    name = str(name)  # e.g. a bare number parsed by YAML
                canonical = canonicalize_name(name)
                if not canonical:
                    continue
                bucket = collected[class_name][canonical]
                bucket["aliases"].add(name)
                role = (entry.get("role_in_page") if class_name == "people" else None) or "mentioned"
                bucket["mentions"].append((page_id, role, doc_id))
                if class_name == "people":
                    bucket["roles"].add(role)
                elif extra_key and not bucket.get(extra_key):
                    # First non-empty type/class seen wins.
                    bucket[extra_key] = entry.get(extra_key)

        # Events
        page_event_ids: list[str] = []
        for entry in _entry_list(entities, "events"):
            if not isinstance(entry, dict):
                continue
            label = entry.get("label")
            if not label:
                continue
            if not isinstance(label, str):
                label = str(label)
            ev_id = event_id_from_entry(entry)
            page_event_ids.append(ev_id)
            bucket = collected["events"][ev_id]
            bucket["labels"].add(label)
            bucket["mentions"].append((page_id, "documented_in", doc_id))
            date = _date_to_str(entry.get("date"))
            if date != "NA" and bucket["date"] == "NA":
                bucket["date"] = date

        # UAP objects — link to first event on the page if available
        if page_event_ids:
            event_for_obj = page_event_ids[0]
        else:
            event_for_obj = f"EV-XXXX-XX-XX-{canonicalize_name(str(doc_id))[:30]}"
        for idx, entry in enumerate(_entry_list(entities, "uap_objects"), start=1):
            if not isinstance(entry, dict):
                continue  # idx still advances so valid entries keep their position
            obj_id = uap_object_id_from_entry(entry, event_for_obj, idx)
            bucket = collected["uap_objects"][obj_id]
            bucket["shape"] = bucket["shape"] or entry.get("shape")
            bucket["color"] = bucket["color"] or entry.get("color")
            bucket["size_estimate"] = bucket["size_estimate"] or entry.get("size_estimate")
            bucket["event_id"] = bucket["event_id"] or event_for_obj
            bucket["mentions"].append((page_id, "observation", doc_id))

    return collected


def _stub_body(entity_class: str, canonical_name: str) -> str:
    """Standard bilingual stub body for new entities.

    `entity_class` is currently unused; the body only depends on the name.
    """
    return (
        f"# {canonical_name}\n\n"
        "## Description (EN)\n\n"
        "_Stub generated by entity dedup. Will be enriched in Phase 6._\n\n"
        "## Descrição (PT-BR)\n\n"
        "_Stub gerado pela deduplicação de entidades. Será enriquecido na Fase 6._\n"
    )


# Pre-built alias index: {dir_name: {alias_lower: path}} cached on first access.
_ALIAS_INDEX: dict[str, dict[str, Path]] = {}


def _alias_values(fm: dict) -> list:
    """Return fm['aliases'] as a list (a lone string becomes a 1-item list)."""
    raw = fm.get("aliases")
    if isinstance(raw, str):
        return [raw]
    if isinstance(raw, (list, tuple, set)):
        return list(raw)
    return []


def _ensure_alias_index(dir_name: str) -> dict[str, Path]:
    """Build alias→path map for a class folder once, cached.

    Every *.md file under wiki/entities/<dir_name>/ is indexed by its stem,
    its canonical_name and each alias (all lower-cased). Unreadable files are
    reported on stderr and skipped.
    """
    if dir_name in _ALIAS_INDEX:
        return _ALIAS_INDEX[dir_name]

    target_dir = ENTITIES_BASE / dir_name
    index: dict[str, Path] = {}
    if target_dir.exists():
        for entity_path in target_dir.glob("*.md"):
            try:
                fm, _ = read_frontmatter_and_body(entity_path)
            except (OSError, ValueError) as exc:  # ValueError covers decode errors
                sys.stderr.write(f"Skipping unreadable entity {entity_path}: {exc}\n")
                continue
            # Index by stem (canonical_id) AND by all aliases
            index[entity_path.stem.lower()] = entity_path
            cname = fm.get("canonical_name")
            if isinstance(cname, str) and cname.strip():
                index[cname.lower().strip()] = entity_path
            for alias in _alias_values(fm):
                if isinstance(alias, str) and alias.strip():
                    index[alias.lower().strip()] = entity_path
    _ALIAS_INDEX[dir_name] = index
    return index


def _find_existing_entity_by_alias(
    dir_name: str,
    names: set[str],
    canonical_id_candidate: str,
) -> Path | None:
    """Look up an entity file through the cached alias index.

    The canonical id is tried first, then each name. Names are probed in
    sorted order so the result does not depend on set iteration order when
    several aliases point at different files.
    """
    idx = _ensure_alias_index(dir_name)
    canon_needle = canonical_id_candidate.lower()
    if canon_needle in idx:
        return idx[canon_needle]
    for n in sorted(n for n in names if isinstance(n, str) and n):
        key = n.lower().strip()
        if key in idx:
            return idx[key]
    return None


def _register_in_index(dir_name: str, path: Path, names: set[str], canonical_name: str | None = None) -> None:
    """Add an entity file to the in-memory alias index under stem, name and aliases."""
    idx = _ensure_alias_index(dir_name)
    idx[path.stem.lower()] = path
    if canonical_name:
        idx[canonical_name.lower().strip()] = path
    for n in names:
        if isinstance(n, str) and n.strip():
            idx[n.lower().strip()] = path


def _upsert_simple_entity(
    class_name: str,
    dir_name: str,
    type_value: str,
    entity_class: str,
    id_field: str,
    canonical_id: str,
    data: dict,
    dry_run: bool,
) -> tuple[str, bool, Path]:
    """Upsert a person/org/location/vehicle/operation/concept entity file.

    Returns (action, changed_bool, real_path).
    Action is 'created'|'updated'|'unchanged'|'merged-into-existing'.
    real_path is the file actually targeted, which differs from
    <canonical_id>.md when an alias matched another existing entity.
    """
    # Check if an existing entity matches by alias — avoid creating duplicates
    existing = _find_existing_entity_by_alias(dir_name, data.get("aliases", set()), canonical_id)
    merged = bool(existing and existing.stem != canonical_id)
    path = existing if merged else ENTITIES_BASE / dir_name / f"{canonical_id}.md"

    aliases_sorted = sorted(data.get("aliases", set()))
    # canonical_name = first alias by sort order — could be improved
    canonical_name = aliases_sorted[0] if aliases_sorted else canonical_id
    unique_docs = {doc_id for _, _, doc_id in data["mentions"]}
    total_mentions = len(data["mentions"])
    documents_count = len(unique_docs)

    if path.exists():
        fm, body = read_frontmatter_and_body(path)
        # Merge aliases (preserve existing + add new)
        fm["aliases"] = sorted(set(_alias_values(fm)) | set(aliases_sorted))
        # NOTE(review): these counts come from this canonical id only. When
        # several ids merge into one file, or --doc-id restricts the scan,
        # the stored totals are overwritten rather than aggregated.
        fm["total_mentions"] = total_mentions
        fm["documents_count"] = documents_count
        fm["last_ingest"] = utc_now_iso()
        # Refresh entity-specific fields if missing
        fm_key, data_key = _DETAIL_FIELD_BY_CLASS.get(class_name, (None, None))
        if fm_key and not fm.get(fm_key) and data.get(data_key):
            fm[fm_key] = data[data_key]
        changed = write_frontmatter_and_body(path, fm, body, dry_run=dry_run)
        action = "merged-into-existing" if merged else ("updated" if changed else "unchanged")
        return (action, changed, path)

    # Create new (key order below is the order written to the YAML)
    fm = {
        "schema_version": SCHEMA_VERSION,
        "type": type_value,
        "entity_class": entity_class,
        id_field: canonical_id,
        "canonical_name": canonical_name,
        "aliases": aliases_sorted,
    }
    if class_name == "people":
        fm["roles"] = []
        fm["dates"] = {"born": None, "died": None}
    elif class_name == "organizations":
        fm["organization_type"] = None
        fm["country"] = None
    elif class_name == "locations":
        fm["location_type"] = data.get("type")
        fm["country"] = []
        fm["coordinates"] = None
    elif class_name == "vehicles":
        fm["vehicle_class"] = data.get("class")
    elif class_name == "operations":
        fm["operation_type"] = data.get("type")
        fm["status"] = None
    elif class_name == "concepts":
        fm["concept_class"] = data.get("class")
        fm["domain"] = None
        fm["definition_short"] = None
        fm["definition_short_pt_br"] = None

    fm["mentioned_in"] = []  # populated by lint
    fm["total_mentions"] = total_mentions
    fm["documents_count"] = documents_count
    fm["related_concepts" if class_name == "concepts" else "related"] = []
    fm["enrichment_status"] = "none"
    fm["external_sources"] = []
    fm["last_ingest"] = utc_now_iso()
    fm["last_lint"] = None
    fm["wiki_version"] = WIKI_VERSION

    body = _stub_body(entity_class, canonical_name)
    write_frontmatter_and_body(path, fm, body, dry_run=dry_run)
    _register_in_index(dir_name, path, set(aliases_sorted), canonical_name)
    return ("created", True, path)


def _upsert_event(event_id: str, data: dict, dry_run: bool) -> tuple[str, bool, Path]:
    """Upsert wiki/entities/events/<event_id>.md (or an alias-matched file).

    Returns (action, changed_bool, real_path) with the same action values as
    _upsert_simple_entity. New files get date_start/date_end derived from the
    date parts embedded in event_id.
    """
    labels = sorted(data["labels"])
    canonical_name = labels[0] if labels else event_id
    unique_docs = {doc_id for _, _, doc_id in data["mentions"]}
    total_mentions = len(data["mentions"])

    # Alias-match against existing events
    existing = _find_existing_entity_by_alias("events", set(labels), event_id)
    merged = bool(existing and existing.stem != event_id)
    path = existing if merged else ENTITIES_BASE / "events" / f"{event_id}.md"

    # Date parse from event_id
    date_start = "NA"
    m = re.match(r"^EV-(\d{4}|XXXX)-(\d{2}|XX)-(\d{2}|XX)-", event_id)
    if m:
        y, mo, d = m.groups()
        if y != "XXXX":
            if mo != "XX" and d != "XX":
                date_start = f"{y}-{mo}-{d}"
            elif mo != "XX":
                date_start = f"{y}-{mo}"
            else:
                date_start = y

    if path.exists():
        fm, body = read_frontmatter_and_body(path)
        fm["aliases"] = sorted(set(_alias_values(fm)) | set(labels))
        # NOTE(review): overwritten per event id, not aggregated across merges.
        fm["total_mentions"] = total_mentions
        fm["documents_count"] = len(unique_docs)
        fm["last_ingest"] = utc_now_iso()
        changed = write_frontmatter_and_body(path, fm, body, dry_run=dry_run)
        action = "merged-into-existing" if merged else ("updated" if changed else "unchanged")
        return (action, changed, path)

    fm = {
        "schema_version": SCHEMA_VERSION,
        "type": "entity",
        "entity_class": "event",
        "event_id": event_id,
        "canonical_name": canonical_name,
        "aliases": labels,
        "event_class": "uap-encounter",
        "date_start": date_start,
        "date_end": date_start,
        "date_confidence": "low",
        "primary_location": None,
        "observers": [],
        "uap_objects": [],
        "documented_in": [],
        "total_mentions": total_mentions,
        "documents_count": len(unique_docs),
        "narrative_summary_confidence": "low",
        "narrative_summary": "_Stub. Will be enriched in Phase 7._",
        "narrative_summary_pt_br": "_Stub. Será enriquecido na Fase 7._",
        "enrichment_status": "none",
        "external_sources": [],
        "last_ingest": utc_now_iso(),
        "last_lint": None,
        "wiki_version": WIKI_VERSION,
    }
    body = _stub_body("events", canonical_name)
    write_frontmatter_and_body(path, fm, body, dry_run=dry_run)
    _register_in_index("events", path, set(labels), canonical_name)
    return ("created", True, path)


def _find_existing_uap_object_by_event(event_id: str | None, shape: str, color: str, current_id: str) -> Path | None:
    """Find the uap-object file that should receive this observation.

    The file named after current_id always wins. Otherwise an existing object
    anchored to the same event whose shape and color are each either
    unknown/empty or equal to the given ones is treated as the same object.
    Returns None when there is no event, no directory, or no match.
    """
    if not event_id:
        return None
    target_dir = ENTITIES_BASE / "uap-objects"
    if not target_dir.exists():
        return None

    # Exact id first, so the outcome does not depend on directory order.
    own_path = target_dir / f"{current_id}.md"
    if own_path.exists():
        return own_path

    event_ref = f"[[event/{event_id}]]"
    wanted_shape = str(shape).lower()
    wanted_color = str(color).lower()
    for p in target_dir.glob("*.md"):
        try:
            fm, _ = read_frontmatter_and_body(p)
        except (OSError, ValueError) as exc:  # ValueError covers decode errors
            sys.stderr.write(f"Skipping unreadable entity {p}: {exc}\n")
            continue
        if fm.get("observed_in_event") != event_ref:
            continue
        existing_shape = str(fm.get("shape") or "unknown").lower()
        existing_color = str(fm.get("color") or "unknown").lower()
        shape_ok = existing_shape in ("unknown", "", wanted_shape)
        color_ok = existing_color in ("unknown", "", wanted_color)
        if shape_ok and color_ok:
            return p
    return None


def _upsert_uap_object(obj_id: str, data: dict, dry_run: bool) -> tuple[str, bool, Path]:
    """Upsert wiki/entities/uap-objects/<obj_id>.md (or a compatible sibling).

    Returns (action, changed_bool, real_path) with the same action values as
    _upsert_simple_entity.
    """
    shape = data.get("shape") or "unknown"
    color = data.get("color") or "unknown"
    canonical_name = f"{shape} {color} UAP ({obj_id})"
    event_id = data.get("event_id")
    unique_docs = {doc_id for _, _, doc_id in data["mentions"]}
    total_mentions = len(data["mentions"])

    # If an existing uap_object is anchored to the same event with compatible shape/color, merge
    existing = _find_existing_uap_object_by_event(event_id, shape, color, obj_id)
    merged = bool(existing and existing.stem != obj_id)
    path = existing if merged else ENTITIES_BASE / "uap-objects" / f"{obj_id}.md"

    if path.exists():
        fm, body = read_frontmatter_and_body(path)
        # NOTE(review): overwritten per object id, not aggregated across merges.
        fm["total_mentions"] = total_mentions
        fm["documents_count"] = len(unique_docs)
        fm["last_ingest"] = utc_now_iso()
        changed = write_frontmatter_and_body(path, fm, body, dry_run=dry_run)
        action = "merged-into-existing" if merged else ("updated" if changed else "unchanged")
        return (action, changed, path)

    fm = {
        "schema_version": SCHEMA_VERSION,
        "type": "entity",
        "entity_class": "uap_object",
        "uap_object_id": obj_id,
        "canonical_name": canonical_name,
        "observed_in_event": f"[[event/{event_id}]]" if event_id else None,
        "secondary_events": [],
        "shape": shape,
        "color": color,
        "size_estimate_m": {"min": None, "max": None, "confidence_band": "speculation"},
        "features": [],
        "altitude_ft": {"min": None, "max": None, "confidence_band": "speculation"},
        "speed_kts": {"min": None, "max": None, "confidence_band": "speculation"},
        "maneuver_descriptors": [],
        "sensor_observations": [],
        "visual_records": [],
        "total_mentions": total_mentions,
        "documents_count": len(unique_docs),
        "evidence_anchored": [],
        "hypotheses_addressing": [],
        "confidence_band_overall": "low",
        "last_ingest": utc_now_iso(),
        "last_lint": None,
        "wiki_version": WIKI_VERSION,
    }
    body = _stub_body("uap_objects", canonical_name)
    write_frontmatter_and_body(path, fm, body, dry_run=dry_run)
    _register_in_index("uap-objects", path, set(), canonical_name)
    return ("created", True, path)


def _record_result(stats: dict, action: str, changed: bool, source_id: str, real_path: Path) -> None:
    """Update the run counters and print one line for a changed file."""
    # Bucket merged-but-unchanged into "unchanged"
    if action == "merged-into-existing" and not changed:
        stats["unchanged"] += 1
    else:
        stats[action] += 1
    if changed:
        rel = real_path.relative_to(UFO_ROOT)
        tag = f"merged ({source_id} → {real_path.stem})" if action == "merged-into-existing" else action
        print(f" [{tag}] {rel}", flush=True)


def main():
    """CLI entry point: collect entities from pages, upsert files, append to the log."""
    _require_yaml()

    ap = argparse.ArgumentParser(description="Dedup and upsert entities from page extractions.")
    ap.add_argument("--doc-id", help="Only process pages of this doc_id")
    ap.add_argument("--dry-run", action="store_true", help="Report would-be changes without writing")
    args = ap.parse_args()

    print(f"Scanning {PAGES_BASE} for entity references...", flush=True)
    collected = collect_entities_from_pages(doc_filter=args.doc_id)

    totals = {k: len(v) for k, v in collected.items()}
    print(f"Found unique entities: {totals}", flush=True)

    stats = {"created": 0, "updated": 0, "unchanged": 0, "merged-into-existing": 0}

    # Simple classes
    for class_name, dir_name, type_value, entity_class, id_field in ENTITY_CLASSES:
        for canonical_id, data in collected[class_name].items():
            action, changed, real_path = _upsert_simple_entity(
                class_name, dir_name, type_value, entity_class, id_field,
                canonical_id, data, dry_run=args.dry_run,
            )
            _record_result(stats, action, changed, canonical_id, real_path)

    # Events. Remember where each event actually landed so UAP objects can be
    # re-anchored; using the returned path avoids re-querying an alias index
    # that the upserts themselves have since modified.
    event_merge_map: dict[str, str] = {}
    for event_id, data in collected["events"].items():
        action, changed, real_path = _upsert_event(event_id, data, dry_run=args.dry_run)
        if real_path.stem != event_id:
            event_merge_map[event_id] = real_path.stem
        _record_result(stats, action, changed, event_id, real_path)

    # UAP objects — the event_id stored in data may have been merged into a
    # different existing event; remap before upserting.
    for obj_id, data in collected["uap_objects"].items():
        if data.get("event_id") in event_merge_map:
            data["event_id"] = event_merge_map[data["event_id"]]
        action, changed, real_path = _upsert_uap_object(obj_id, data, dry_run=args.dry_run)
        _record_result(stats, action, changed, obj_id, real_path)

    print(f"\nSummary: created={stats['created']}, updated={stats['updated']}, "
          f"merged={stats['merged-into-existing']}, unchanged={stats['unchanged']}", flush=True)

    # Merges that changed a file are writes too, so they also warrant a log entry.
    wrote_something = stats["created"] or stats["updated"] or stats["merged-into-existing"]
    if not args.dry_run and wrote_something:
        with open(LOG_PATH, "a", encoding="utf-8") as fh:
            fh.write(f"\n## {utc_now_iso()} — ENTITY DEDUP (Phase 5)\n")
            fh.write("- operator: archivist\n")
            fh.write("- script: scripts/03-dedup-entities.py\n")
            fh.write(f"- doc_filter: {args.doc_id or '(all)'}\n")
            fh.write(
                f"- created: {stats['created']}\n- updated: {stats['updated']}\n"
                f"- merged: {stats['merged-into-existing']}\n- unchanged: {stats['unchanged']}\n"
            )
            fh.write(f"- totals_after: {totals}\n")


if __name__ == "__main__":
    main()