Add reextract pipeline (scripts/reextract/) that rebuilds doc-level entity JSON from Sonnet-vision chunks via Opus, replacing the noisy per-page extraction. Add synthesize scripts to regenerate wiki/entities from the 116 _reextract.json (30), aggregate missing page.md from chunks (31), and reprocess 805 pages the doc-rebuilder agent dropped on context overflow (32). Add maintain scripts 43-56 for chunk-page sync, dedup, generic-entity marking, and typed relation extraction. Web: wire relations API + entity-relations component; entity/timeline/doc pages consume the rebuilt layer. Note: raw/, processing/, wiki/ remain gitignored (bulk data managed separately); the 116 reextract JSONs and 7,798 rebuilt entity files live on disk only. The 27 curated anchor events under wiki/entities/events/ are preserved. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
170 lines
6.4 KiB
Python
170 lines
6.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Mark entities whose canonical_name is purely conceptual ("Flying disc sighting
|
|
reports", "Investigation of Flying Discs", "Document Receipt by FBI"...) with
|
|
`is_generic: true`. These are categories the chunker accidentally promoted to
|
|
event/operation entities. Hiding them from /e/events, /e/operations, /timeline,
|
|
and /graph removes catalog noise without deleting data.
|
|
|
|
Decision rule (conservative — only flag obvious noise):
|
|
- canonical_name is made up entirely of GENERIC_TOKEN_VOCAB tokens AND
|
|
- has no specific qualifier (no proper noun, no year, no place name).
|
|
|
|
We DO NOT touch:
|
|
- person entities (always specific)
|
|
- location entities (always specific)
|
|
- entities with date_start that resolves to a real year
|
|
- entities whose canonical_name contains a proper noun (Capitalized
|
|
Name not in the generic vocabulary)
|
|
|
|
Idempotent. Re-running flags new generics if any.
|
|
|
|
Run:
|
|
python3 scripts/maintain/52_mark_generic_entities.py --dry-run
|
|
python3 scripts/maintain/52_mark_generic_entities.py
|
|
"""
|
|
from __future__ import annotations

import argparse
import re
import sys
import unicodedata
from pathlib import Path

import yaml
|
|
|
|
WIKI_ENT = Path("/Users/guto/ufo/wiki/entities")
|
|
|
|
# Lower-case tokens that carry no identifying information on their own.  A
# canonical_name built ONLY from these words (and lacking a specific
# qualifier) names a CATEGORY of things rather than one concrete instance.
# The vocabulary is assembled from themed groups so each group can be
# reviewed and extended independently.

# Core event / sighting noise.
_SIGHTING_TOKENS = (
    "flying", "disc", "discs", "disk", "disks", "saucer", "saucers",
    "sighting", "sightings", "report", "reports", "reporting", "reported",
    "investigation", "investigations", "investigative",
    "observation", "observations", "observed", "observing",
    "unidentified", "object", "objects", "aerial", "phenomena", "phenomenon",
    "uap", "ufo", "ufos",
)

# Generic process / bureaucracy words.
_BUREAUCRACY_TOKENS = (
    "document", "documents", "receipt", "receipts", "protocol", "protocols",
    "summary", "summaries", "review", "reviews", "incident", "incidents",
    "case", "cases", "event", "events", "encounter", "encounters",
    "evaluation", "analysis", "tracking",
    "memo", "memos", "memorandum", "memoranda", "letter", "letters",
    "communication", "communications", "correspondence",
    "information", "data", "details", "record", "records",
    "filing", "file", "files", "section", "subsection", "branch", "department",
    "office", "general", "matter", "matters", "subject", "subjects",
)

# Connectors: never significant by themselves.
_CONNECTOR_TOKENS = (
    "of", "the", "a", "an", "and", "or", "with", "on", "for", "to", "from",
    "by", "at", "in", "as", "is", "are",
)

# Brazilian-Portuguese equivalents (names are sometimes mixed-language).
# These entries are written without diacritics.
_PT_BR_TOKENS = (
    "voador", "voadores", "disco", "discos", "avistamento", "avistamentos",
    "relatorio", "relatorios", "investigacao", "investigacoes",
    "observacao", "observacoes", "objeto", "objetos", "nao", "identificado",
    "documento", "documentos", "recibo", "recibos", "protocolo", "protocolos",
    "sumario", "resumo", "incidente", "incidentes",
)

# FBI bureaucratic terms.
_FBI_TOKENS = (
    "internal", "security", "headquarters", "agent", "agents",
)

GENERIC_TOKEN_VOCAB = {
    *_SIGHTING_TOKENS,
    *_BUREAUCRACY_TOKENS,
    *_CONNECTOR_TOKENS,
    *_PT_BR_TOKENS,
    *_FBI_TOKENS,
}

# A standalone four-digit year in the 1800s, 1900s or 2000s.
YEAR_RE = re.compile(r"\b(18|19|20)\d{2}\b")
# One run of word characters (letters, digits, underscore).
TOKEN_RE = re.compile(r"\b[\w]+\b")
|
|
|
|
|
|
def has_specific_qualifier(name: str) -> bool:
    """Return True if *name* carries something that pins it to one instance.

    A name is considered specific when it contains either

    - a four-digit year matched by ``YEAR_RE``, or
    - a capitalized token of at least four characters whose lower-case form
      is not in ``GENERIC_TOKEN_VOCAB`` (treated as a proper noun).

    Diacritics are folded away before the vocabulary lookup so that accented
    spellings such as "Relatório" or "Investigação" match unaccented
    vocabulary entries ("relatorio", "investigacao") instead of being
    mistaken for proper nouns.

    Args:
        name: The entity's canonical name.

    Returns:
        True when a year or a proper-noun-like token is present.
    """
    if YEAR_RE.search(name):
        return True

    # Canonical decomposition (NFD) splits "ó" into "o" + combining accent;
    # dropping the combining marks leaves the bare letters.
    plain = "".join(
        ch for ch in unicodedata.normalize("NFD", name)
        if not unicodedata.combining(ch)
    )

    for tok in TOKEN_RE.findall(plain):
        # Strict proper-noun check: starts with uppercase, length >= 4, not
        # in the generic vocab.  The length threshold also means short
        # upper-case code prefixes such as "EV" or "OBJ" never count.
        if len(tok) < 4 or not tok[0].isupper():
            continue
        if tok.lower() not in GENERIC_TOKEN_VOCAB:
            return True
    return False
|
|
|
|
|
|
def is_pure_generic(name: str) -> bool:
    """Return True if *name* is composed entirely of generic vocabulary.

    The name is tokenized with ``TOKEN_RE``; single-character tokens are
    ignored, and every remaining token must appear in
    ``GENERIC_TOKEN_VOCAB``.  An empty name, or one with no significant
    tokens, counts as generic.

    Diacritics are folded away before the lookup so that accented spellings
    ("relatório", "não identificado") match unaccented vocabulary entries.

    Args:
        name: The entity's canonical name.

    Returns:
        True when no token outside the generic vocabulary is present.
    """
    if not name:
        return True

    # Canonical decomposition (NFD) splits "ã" into "a" + combining tilde;
    # dropping the combining marks leaves the bare letters.
    plain = "".join(
        ch for ch in unicodedata.normalize("NFD", name)
        if not unicodedata.combining(ch)
    )

    significant = [
        tok.lower() for tok in TOKEN_RE.findall(plain) if len(tok) > 1
    ]
    # all() over an empty list is True, which covers "no tokens" and
    # "only single-character tokens".
    return all(tok in GENERIC_TOKEN_VOCAB for tok in significant)
|
|
|
|
|
|
# A front-matter fence: a line consisting of exactly "---" (trailing blanks
# and a CR are tolerated).  Matching whole lines keeps a "---" that appears
# INSIDE a YAML value or in the Markdown body from being mistaken for a fence.
_FRONT_MATTER_FENCE_RE = re.compile(r"^---[ \t]*\r?$", re.MULTILINE)


def _split_front_matter(text: str):
    """Split *text* into ``(yaml_text, body)``, or return None.

    ``body`` is everything after the three dashes of the closing fence, so
    ``"---\\n" + <yaml> + "---" + body`` rebuilds a file with the same layout.
    None is returned when the text does not open with a fence line or has no
    closing fence line.
    """
    opening = _FRONT_MATTER_FENCE_RE.match(text)
    if opening is None:
        return None
    closing = _FRONT_MATTER_FENCE_RE.search(text, opening.end())
    if closing is None:
        return None
    return text[opening.end():closing.start()], text[closing.start() + 3:]


def parse_entity(path: Path):
    """Load one entity Markdown file and parse its YAML front matter.

    Args:
        path: Path of the ``.md`` file to read (UTF-8).

    Returns:
        ``{"path": path, "fm": <dict>, "raw": <full file text>}``, or None
        when the file cannot be read, has no fenced front matter, the YAML
        cannot be parsed, or the front matter is not a mapping.  Read and
        parse failures are reported on stderr instead of being dropped
        silently.
    """
    try:
        text = path.read_text(encoding="utf-8")
    except (OSError, UnicodeError) as exc:
        print(f"warning: cannot read {path}: {exc}", file=sys.stderr)
        return None

    split = _split_front_matter(text)
    if split is None:
        return None

    try:
        fm = yaml.safe_load(split[0]) or {}
    except Exception as exc:
        # Best-effort scan: one malformed file must not abort the whole run,
        # so any parser failure is reported and the file is skipped.
        print(f"warning: bad front matter in {path}: {exc}", file=sys.stderr)
        return None

    # Front matter that parses to a list or scalar cannot hold entity fields;
    # callers rely on dict methods such as .get().
    if not isinstance(fm, dict):
        return None
    return {"path": path, "fm": fm, "raw": text}


def dump_entity(entity):
    """Serialize *entity* back to file text with its current front matter.

    The YAML block is regenerated from ``entity["fm"]`` (key order kept,
    unicode kept, long lines not wrapped); everything after the closing fence
    of ``entity["raw"]`` is carried over unchanged.

    Args:
        entity: Mapping with ``"raw"`` (original file text) and ``"fm"``
            (front-matter dict), as produced by ``parse_entity``.

    Returns:
        The new file text, or ``entity["raw"]`` unchanged when it has no
        fenced front matter.
    """
    raw = entity["raw"]
    split = _split_front_matter(raw)
    if split is None:
        return raw
    _, body = split
    front = yaml.safe_dump(
        entity["fm"], sort_keys=False, allow_unicode=True, width=1000
    )
    return "---\n" + front + "---" + body
|
|
|
|
|
|
def main() -> int:
    """Scan the entity tree and mark purely generic entities.

    Every ``*.md`` file under the wiki directory (skipping anything inside an
    ``_archived`` directory) whose ``entity_class`` is event, operation,
    concept or uap_object is examined.  An entity gets ``is_generic: true``
    when its ``canonical_name`` consists only of generic vocabulary, has no
    specific qualifier, and its ``date_start`` does not resolve to a real
    year.  Entities already flagged are counted and left untouched, which
    keeps the script idempotent.

    Command-line options:
        --dry-run   Report what would be flagged without writing any file.
        --wiki-dir  Entity directory to scan (defaults to ``WIKI_ENT``).

    Returns:
        Process exit code (always 0).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--dry-run", action="store_true")
    ap.add_argument(
        "--wiki-dir",
        type=Path,
        default=WIKI_ENT,
        help="entity directory to scan (default: %(default)s)",
    )
    args = ap.parse_args()
    wiki_dir = args.wiki_dir

    print(f"Scanning {wiki_dir} ...")
    # Only target entity classes where genericness is meaningful
    target_classes = {"event", "operation", "concept", "uap_object"}

    total = 0
    flagged = 0
    already_flagged = 0
    samples = []

    # Sorting fixes the traversal order so the printed samples are
    # reproducible, and finishes the directory walk before any file is
    # rewritten.
    for f in sorted(wiki_dir.rglob("*.md")):
        if "_archived" in f.parts:
            continue
        ent = parse_entity(f)
        if not ent:
            continue
        fm = ent["fm"]
        if not isinstance(fm, dict):
            # Front matter that is a list/scalar has no entity fields.
            continue
        cls = fm.get("entity_class")
        if cls not in target_classes:
            continue
        total += 1
        if fm.get("is_generic") is True:
            already_flagged += 1
            continue

        name = fm.get("canonical_name")
        # YAML may yield a number or date for an unquoted value; only a
        # non-empty string can be judged by the name heuristics.
        if not isinstance(name, str) or not name:
            continue
        # An entity anchored to a real year is a concrete instance.
        # date_start may be a string, int or date object depending on how
        # YAML parsed it, hence the str() coercion.
        if YEAR_RE.search(str(fm.get("date_start") or "")):
            continue
        # If it has a year, named person/place — skip
        if has_specific_qualifier(name):
            continue
        if not is_pure_generic(name):
            continue

        # Flag it
        fm["is_generic"] = True
        if not args.dry_run:
            f.write_text(dump_entity(ent), encoding="utf-8")
        flagged += 1
        if len(samples) < 25:
            samples.append((cls, name))

    print(f"\nEntities scanned (event/operation/concept/uap_object): {total}")
    print(f"Already flagged is_generic: {already_flagged}")
    print(f"Newly flagged is_generic: {flagged}")
    print(f"\nSample flagged ({min(len(samples), 25)}):")
    for cls, name in samples:
        print(f" [{cls:<10}] {name}")
    return 0
|
|
|
|
|
|
# Script entry point: run main() and use its return value as the process
# exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|