disclosure-bureau/scripts/maintain/52_mark_generic_entities.py

#!/usr/bin/env python3
"""
Mark entities whose canonical_name is purely conceptual ("Flying disc sighting
reports", "Investigation of Flying Discs", "Document Receipt by FBI"...) with
`is_generic: true`. These are categories the chunker accidentally promoted to
event/operation entities. Hiding them from /e/events, /e/operations, /timeline,
and /graph removes catalog noise without deleting data.

Decision rule (conservative — only flag obvious noise):
  - every significant token of canonical_name is in GENERIC_TOKEN_VOCAB AND
  - it has no specific qualifier (no proper noun, no year, no place name).

We DO NOT touch:
  - person entities (always specific)
  - location entities (always specific)
  - entities with date_start that resolves to a real year
  - entities whose canonical_name contains a proper noun (Capitalized
    Name not in the generic vocabulary)

Idempotent. Re-running flags new generics if any.

Run:
  python3 scripts/maintain/52_mark_generic_entities.py --dry-run
  python3 scripts/maintain/52_mark_generic_entities.py
"""
from __future__ import annotations
import argparse
import re
import sys
from pathlib import Path

import yaml

# Root directory of the entity markdown files scanned by this script.
# NOTE(review): this is an absolute, machine-specific path.
WIKI_ENT = Path("/Users/guto/ufo/wiki/entities")

# Lower-case tokens that carry no instance-specific meaning. A canonical_name
# whose significant tokens ALL belong to this set (and that has no specific
# qualifier) is treated as a CATEGORY rather than an instance.
# pt-br entries are stored without diacritics.
GENERIC_TOKEN_VOCAB = {
    # core event/sighting noise
    "flying", "disc", "discs", "disk", "disks", "saucer", "saucers",
    "sighting", "sightings", "report", "reports", "reporting", "reported",
    "investigation", "investigations", "investigative",
    "observation", "observations", "observed", "observing",
    "unidentified", "object", "objects", "aerial", "phenomena", "phenomenon",
    "uap", "ufo", "ufos",
    # generic process / bureaucracy
    "document", "documents", "receipt", "receipts", "protocol", "protocols",
    "summary", "summaries", "review", "reviews", "incident", "incidents",
    "case", "cases", "event", "events", "encounter", "encounters",
    "evaluation", "analysis", "tracking",
    "memo", "memos", "memorandum", "memoranda", "letter", "letters",
    "communication", "communications", "correspondence",
    "information", "data", "details", "record", "records",
    "filing", "file", "files", "section", "subsection", "branch", "department",
    "office", "general", "matter", "matters", "subject", "subjects",
    # connectors (not significant on their own)
    "of", "the", "a", "an", "and", "or", "with", "on", "for", "to", "from",
    "by", "at", "in", "as", "is", "are",
    # pt-br equivalents (sometimes mixed)
    "voador", "voadores", "disco", "discos", "avistamento", "avistamentos",
    "relatorio", "relatorios", "investigacao", "investigacoes",
    "observacao", "observacoes", "objeto", "objetos", "nao", "identificado",
    "documento", "documentos", "recibo", "recibos", "protocolo", "protocolos",
    "sumario", "resumo", "incidente", "incidentes",
    # FBI bureaucratic ("fbi" itself is required so that the documented
    # example "Document Receipt by FBI" is actually recognised as generic)
    "fbi", "internal", "security", "headquarters", "agent", "agents",
}

# A standalone four-digit number starting with 18, 19 or 20 counts as a year.
YEAR_RE = re.compile(r"\b(18|19|20)\d{2}\b")
# A token is a maximal run of word characters.
TOKEN_RE = re.compile(r"\b[\w]+\b")


def has_specific_qualifier(name: str) -> bool:
    """Return True if *name* carries something that pins it to an instance.

    A name is considered specific when either:
      - it contains a year matched by ``YEAR_RE``, or
      - it contains a token that starts with an uppercase letter, is at
        least 4 characters long, and is not in ``GENERIC_TOKEN_VOCAB``.

    Diacritics are folded before the vocabulary lookup because the pt-br
    vocabulary entries are stored unaccented; without folding, a token such
    as "Relatório" would be mistaken for a proper noun.

    Short all-caps tokens (fewer than 4 characters) are deliberately not
    treated as proper nouns by the length rule.
    """
    import unicodedata

    if YEAR_RE.search(name):
        return True

    # Strip combining marks while preserving case, so the uppercase test
    # below still sees "É..." as "E...".
    folded_name = "".join(
        ch
        for ch in unicodedata.normalize("NFKD", name)
        if not unicodedata.combining(ch)
    )
    for tok in TOKEN_RE.findall(folded_name):
        # Strict proper-noun check: starts with uppercase, length >= 4,
        # not in generic vocab.
        if len(tok) < 4 or not tok[0].isupper():
            continue
        if tok.lower() not in GENERIC_TOKEN_VOCAB:
            return True
    return False


def is_pure_generic(name: str) -> bool:
    """Return True if *name* is composed entirely of generic vocab tokens.

    The name is lower-cased and stripped of diacritics, then tokenised with
    ``TOKEN_RE``. Single-character tokens are ignored. The result is True
    when every remaining token is in ``GENERIC_TOKEN_VOCAB``, and also when
    nothing remains (empty name, punctuation only, single letters only).

    Diacritics are folded because the pt-br vocabulary entries are stored
    unaccented ("relatorio", "nao"), so accented spellings must be reduced
    to the same form before the lookup.
    """
    import unicodedata

    if not name:
        return True
    # Fold before tokenising: combining marks are not word characters and
    # would otherwise split a decomposed accented word into fragments.
    folded = "".join(
        ch
        for ch in unicodedata.normalize("NFKD", name.lower())
        if not unicodedata.combining(ch)
    )
    significant = [t for t in TOKEN_RE.findall(folded) if len(t) > 1]
    # all() over an empty list is True, which covers the "no significant
    # tokens" case.
    return all(t in GENERIC_TOKEN_VOCAB for t in significant)


def parse_entity(path: Path):
    """Read an entity markdown file and parse its YAML front matter.

    Args:
        path: file to read (decoded as UTF-8).

    Returns:
        ``{"path": path, "fm": <dict>, "raw": <full file text>}``, or
        ``None`` when the file cannot be read or decoded, does not start
        with ``---``, has no closing ``---``, contains front matter that
        fails to parse, or parses to something other than a mapping.

    The front matter is located with ``text.split("---", 2)``, the same split
    that ``dump_entity`` applies to ``raw``, so the two stay in agreement.
    """
    try:
        text = path.read_text(encoding="utf-8")
    except (OSError, ValueError):
        # Unreadable file, or bytes that are not valid UTF-8
        # (UnicodeDecodeError is a ValueError).
        return None

    if not text.startswith("---"):
        return None
    parts = text.split("---", 2)
    if len(parts) < 3:
        return None

    try:
        fm = yaml.safe_load(parts[1]) or {}
    except (yaml.YAMLError, ValueError):
        # Malformed YAML; ValueError covers scalars that look like
        # timestamps but are not valid dates.
        return None

    # Callers use fm.get(...); a list or scalar document is not an entity.
    if not isinstance(fm, dict):
        return None
    return {"path": path, "fm": fm, "raw": text}


def dump_entity(entity):
    """Serialise an entity back to markdown text.

    The front matter is regenerated from ``entity["fm"]`` with
    ``yaml.safe_dump`` (key order kept, unicode allowed, wide lines), and
    everything after the second ``---`` of ``entity["raw"]`` is reused
    unchanged. If ``raw`` does not split into three parts on ``---``, it is
    returned as-is.
    """
    original = entity["raw"]
    pieces = original.split("---", 2)
    if len(pieces) >= 3:
        front_matter = yaml.safe_dump(
            entity["fm"],
            sort_keys=False,
            allow_unicode=True,
            width=1000,
        )
        return "".join(("---\n", front_matter, "---", pieces[2]))
    return original


def main() -> int:
    """Scan the entity tree and mark pure-category entities as generic.

    Command-line options:
        --dry-run   report what would be flagged without writing any file.
        --root DIR  entity directory to scan (defaults to ``WIKI_ENT``).

    For every ``*.md`` file under the root (skipping anything inside an
    ``_archived`` directory) whose front matter has an ``entity_class`` of
    event/operation/concept/uap_object, the entity is flagged with
    ``is_generic: true`` when all of the following hold:
      - it is not already flagged,
      - it has a non-empty string ``canonical_name``,
      - its ``date_start`` (if any) does not contain a real year,
      - the name has no specific qualifier and is purely generic.

    Returns:
        0 on success, 1 if the root directory does not exist.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--dry-run", action="store_true",
                    help="report what would be flagged without writing files")
    ap.add_argument("--root", type=Path, default=WIKI_ENT,
                    help="entity directory to scan (default: %(default)s)")
    args = ap.parse_args()
    root = args.root

    # A missing directory would otherwise look like a clean run with 0 hits.
    if not root.is_dir():
        print(f"error: entity directory not found: {root}", file=sys.stderr)
        return 1

    print(f"Scanning {root} ...")
    # Only target entity classes where genericness is meaningful
    target_classes = {"event", "operation", "concept", "uap_object"}

    total = 0
    flagged = 0
    already_flagged = 0
    samples = []

    # Sorted so that the printed sample is reproducible between runs.
    for f in sorted(root.rglob("*.md")):
        if "_archived" in f.parts:
            continue
        ent = parse_entity(f)
        if not ent:
            continue
        fm = ent["fm"]
        # Front matter that is not a mapping cannot describe an entity.
        if not isinstance(fm, dict):
            continue
        cls = fm.get("entity_class")
        if not isinstance(cls, str) or cls not in target_classes:
            continue
        total += 1
        if fm.get("is_generic") is True:
            already_flagged += 1
            continue
        name = fm.get("canonical_name")
        # YAML may parse a bare name such as 1947 as an int; the regex
        # helpers require a string, so anything else is left untouched.
        if not isinstance(name, str) or not name:
            continue
        # Documented rule: never touch entities whose date_start resolves
        # to a real year. str() handles both strings and YAML-parsed dates.
        date_start = fm.get("date_start")
        if date_start is not None and YEAR_RE.search(str(date_start)):
            continue
        # If it has a year, named person/place — skip
        if has_specific_qualifier(name) or not is_pure_generic(name):
            continue
        # Flag it
        fm["is_generic"] = True
        if not args.dry_run:
            f.write_text(dump_entity(ent), encoding="utf-8")
        flagged += 1
        if len(samples) < 25:
            samples.append((cls, name))

    print(f"\nEntities scanned (event/operation/concept/uap_object): {total}")
    print(f"Already flagged is_generic: {already_flagged}")
    print(f"Newly flagged is_generic:   {flagged}")
    print(f"\nSample flagged ({len(samples)}):")
    for cls, name in samples:
        print(f"  [{cls:<10}] {name}")
    return 0


# Script entry point: the process exit status is whatever main() returns.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)
rebuild entity layer from Sonnet-vision reextract pipeline Add reextract pipeline (scripts/reextract/) that rebuilds doc-level entity JSON from Sonnet-vision chunks via Opus, replacing the noisy per-page extraction. Add synthesize scripts to regenerate wiki/entities from the 116 _reextract.json (30), aggregate missing page.md from chunks (31), and reprocess 805 pages the doc-rebuilder agent dropped on context overflow (32). Add maintain scripts 43-56 for chunk-page sync, dedup, generic-entity marking, and typed relation extraction. Web: wire relations API + entity-relations component; entity/timeline/doc pages consume the rebuilt layer. Note: raw/, processing/, wiki/ remain gitignored (bulk data managed separately); the 116 reextract JSONs and 7,798 rebuilt entity files live on disk only. The 27 curated anchor events under wiki/entities/events/ are preserved. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> 2026-05-21 15:20:24 +00:00			`#!/usr/bin/env python3`
			`"""`
			`Mark entities whose canonical_name is purely conceptual ("Flying disc sighting`
			`reports", "Investigation of Flying Discs", "Document Receipt by FBI"...) with`
			`` `is_generic: true`. These are categories the chunker accidentally promoted to ``
			`event/operation entities. Hiding them from /e/events, /e/operations, /timeline,`
			`and /graph removes catalog noise without deleting data.`

			`Decision rule (conservative — only flag obvious noise):`
			`- canonical_name contains GENERIC_PHRASE patterns AND`
			`- has no specific qualifier (no proper noun, no year, no place name).`

			`We DO NOT touch:`
			`- person entities (always specific)`
			`- location entities (always specific)`
			`- entities with date_start that resolves to a real year`
			`- entities whose canonical_name contains a proper noun (Capitalized`
			`Name not in the generic vocabulary)`

			`Idempotent. Re-running flags new generics if any.`

			`Run:`
			`python3 scripts/maintain/52_mark_generic_entities.py --dry-run`
			`python3 scripts/maintain/52_mark_generic_entities.py`
			`"""`
			`from __future__ import annotations`
			`import argparse`
			`import re`
			`import sys`
			`from pathlib import Path`

			`import yaml`

			`WIKI_ENT = Path("/Users/guto/ufo/wiki/entities")`

			`# Phrases that, when forming the BULK of a canonical_name without a specific`
			`# qualifier, indicate the entity is a CATEGORY rather than an instance.`
			`GENERIC_TOKEN_VOCAB = {`
			`# core event/sighting noise`
			`"flying", "disc", "discs", "disk", "disks", "saucer", "saucers",`
			`"sighting", "sightings", "report", "reports", "reporting", "reported",`
			`"investigation", "investigations", "investigative",`
			`"observation", "observations", "observed", "observing",`
			`"unidentified", "object", "objects", "aerial", "phenomena", "phenomenon",`
			`"uap", "ufo", "ufos",`
			`# generic process / bureaucracy`
			`"document", "documents", "receipt", "receipts", "protocol", "protocols",`
			`"summary", "summaries", "review", "reviews", "incident", "incidents",`
			`"case", "cases", "event", "events", "encounter", "encounters",`
			`"evaluation", "analysis", "tracking",`
			`"memo", "memos", "memorandum", "memoranda", "letter", "letters",`
			`"communication", "communications", "correspondence",`
			`"information", "data", "details", "record", "records",`
			`"filing", "file", "files", "section", "subsection", "branch", "department",`
			`"office", "general", "matter", "matters", "subject", "subjects",`
			`# connectors (not significant on their own)`
			`"of", "the", "a", "an", "and", "or", "with", "on", "for", "to", "from",`
			`"by", "at", "in", "as", "is", "are",`
			`# pt-br equivalents (sometimes mixed)`
			`"voador", "voadores", "disco", "discos", "avistamento", "avistamentos",`
			`"relatorio", "relatorios", "investigacao", "investigacoes",`
			`"observacao", "observacoes", "objeto", "objetos", "nao", "identificado",`
			`"documento", "documentos", "recibo", "recibos", "protocolo", "protocolos",`
			`"sumario", "resumo", "incidente", "incidentes",`
			`# FBI bureaucratic`
			`"internal", "security", "headquarters", "agent", "agents",`
			`}`

			`YEAR_RE = re.compile(r"\b(18\|19\|20)\d{2}\b")`
			`TOKEN_RE = re.compile(r"\b[\w]+\b")`


			`def has_specific_qualifier(name: str) -> bool:`
			`"""Return True if name contains a year, a Capitalized proper noun (not in`
			`the generic vocab), or a multi-word proper name suggesting a specific`
			`place/person/case."""`
			`if YEAR_RE.search(name):`
			`return True`
			`# Look at tokens with non-generic Capitalized words`
			`for tok in TOKEN_RE.findall(name):`
			`# Strict proper-noun check: starts with uppercase, length >= 4,`
			`# not in generic vocab`
			`if tok and tok[0].isupper() and len(tok) >= 4:`
			`if tok.lower() not in GENERIC_TOKEN_VOCAB:`
			`return True`
			`# Check for hyphenated identifiers (EV-..., OBJ-...) — those are codes,`
			`# not specific qualifiers UNLESS they have date fields`
			`return False`


			`def is_pure_generic(name: str) -> bool:`
			`"""True if canonical_name is entirely composed of generic vocab tokens."""`
			`if not name: return True`
			`toks = [t.lower() for t in TOKEN_RE.findall(name)]`
			`if not toks: return True`
			`significant = [t for t in toks if len(t) > 1]`
			`if not significant: return True`
			`# Every significant token must be in the generic vocab`
			`return all(t in GENERIC_TOKEN_VOCAB for t in significant)`


			`def parse_entity(path: Path):`
			`try:`
			`text = path.read_text(encoding="utf-8")`
			`if not text.startswith("---"): return None`
			`parts = text.split("---", 2)`
			`if len(parts) < 3: return None`
			`fm = yaml.safe_load(parts[1]) or {}`
			`return {"path": path, "fm": fm, "raw": text}`
			`except Exception:`
			`return None`


			`def dump_entity(entity):`
			`raw = entity["raw"]`
			`parts = raw.split("---", 2)`
			`if len(parts) < 3: return raw`
			`body = parts[2]`
			`return "---\n" + yaml.safe_dump(entity["fm"], sort_keys=False, allow_unicode=True, width=1000) + "---" + body`


			`def main() -> int:`
			`ap = argparse.ArgumentParser()`
			`ap.add_argument("--dry-run", action="store_true")`
			`args = ap.parse_args()`

			`print(f"Scanning {WIKI_ENT} ...")`
			`# Only target entity classes where genericness is meaningful`
			`target_classes = {"event", "operation", "concept", "uap_object"}`

			`total = 0`
			`flagged = 0`
			`already_flagged = 0`
			`samples = []`

			`for f in WIKI_ENT.rglob("*.md"):`
			`if "_archived" in f.parts: continue`
			`ent = parse_entity(f)`
			`if not ent: continue`
			`fm = ent["fm"]`
			`cls = fm.get("entity_class")`
			`if cls not in target_classes: continue`
			`total += 1`
			`if fm.get("is_generic") is True:`
			`already_flagged += 1`
			`continue`
			`name = fm.get("canonical_name") or ""`
			`if not name: continue`
			`# If it has a year, named person/place — skip`
			`if has_specific_qualifier(name): continue`
			`if not is_pure_generic(name): continue`
			`# Flag it`
			`fm["is_generic"] = True`
			`if not args.dry_run:`
			`f.write_text(dump_entity(ent), encoding="utf-8")`
			`flagged += 1`
			`if len(samples) < 25:`
			`samples.append((cls, name))`

			`print(f"\nEntities scanned (event/operation/concept/uap_object): {total}")`
			`print(f"Already flagged is_generic: {already_flagged}")`
			`print(f"Newly flagged is_generic: {flagged}")`
			`print(f"\nSample flagged ({min(len(samples), 25)}):")`
			`for cls, name in samples:`
			`print(f" [{cls:<10}] {name}")`
			`return 0`


			`if __name__ == "__main__":`
			`sys.exit(main())`