243 lines
9.9 KiB
Python
Executable file
243 lines
9.9 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
21-reextract-entities-gemini.py — Phase 2: rewrite ONLY `entities_extracted` in
each page.md using Gemini 3.0 Flash with explicit anti-fragmentation rules.

Fixes the Haiku extraction bugs:
- "Gudauta Base" was split into location "Gudauta" + organization "Base"
- "Chief Tereoken" was split into "Chief" + "Tereoken"
- Bare common nouns ("Base", "Chief", "Department") promoted to standalone entities
- Variants of the same entity ("FBI" / "F-B-I" / "Federal Bureau") not normalized at source

Preserves everything else in the page.md frontmatter.

Usage:
  ./21-reextract-entities-gemini.py --all --workers 20
  ./21-reextract-entities-gemini.py --doc-id <id>
  ./21-reextract-entities-gemini.py --page <doc-id>/p007   # quick test
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import concurrent.futures
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
import time
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
try:
|
|
import yaml
|
|
except ImportError:
|
|
sys.stderr.write("pip3 install pyyaml\n"); sys.exit(1)
|
|
|
|
try:
|
|
from google import genai
|
|
from google.genai import types as genai_types
|
|
except ImportError:
|
|
sys.stderr.write("pip3 install google-genai\n"); sys.exit(1)
|
|
|
|
|
|
# Root of the working tree. Defaults to the original author's checkout but can
# be redirected with the UFO_ROOT environment variable so the script runs on
# other machines without editing the source.
UFO_ROOT = Path(os.environ.get("UFO_ROOT", "/Users/guto/ufo")).expanduser()
# Per-page Markdown files live under <root>/wiki/pages/<doc-id>/pNNN.md.
PAGES = UFO_ROOT / "wiki" / "pages"
# Rendered page images live under <root>/processing/png/<doc-id>/p-NNN.png.
PNG_BASE = UFO_ROOT / "processing" / "png"

# Model used when --model is not given on the command line.
DEFAULT_MODEL = "gemini-3-flash-preview"
# Upper bound, in seconds, on how long one model request is waited for.
TIMEOUT_S = 180
|
|
|
|
|
|
PROMPT = """You are an OSINT investigator extracting entities from one page of a US Department of War declassified UAP/UFO document. A prior extraction pass made systematic mistakes; your job is to do it right.
|
|
|
|
CRITICAL RULES — NON-NEGOTIABLE:
|
|
|
|
1. **Compound names stay together.** "Gudauta Base" is ONE location, not two entities "Gudauta" + "Base". "Chief Tereoken" is ONE person, not "Chief" + "Tereoken". "FBI Seattle Field Office" is ONE organization. NEVER split a compound name into separate entities.
|
|
|
|
2. **Titles + names are ONE person.** "Chief Tereoken", "LCDR Smith", "Mr. Johnson", "Capt. Davis", "Mrs. Anderson", "Dr. Hynek", "General Marshall" — each is ONE single person entity, with the title as part of the canonical name.
|
|
|
|
3. **NEVER extract bare common nouns as entities.** Skip: "Chief", "Base", "Department", "Office", "Agent", "Bureau", "Captain", "Officer", "File", "Subject", "Memo", "Letter", "Report", "Page", "Bag", "Stamp", "Signature", "Carbon Copy". These are only meaningful when COMBINED with a proper name.
|
|
|
|
4. **Normalize variants at the source.** "F.B.I.", "F-B-I", "FBI", "Federal Bureau of Investigation" → all return as the SINGLE canonical form "Federal Bureau of Investigation" (with "FBI" added to aliases).
|
|
|
|
5. **Distinguish entity types precisely:**
|
|
- `locations`: physical places (cities, countries, military bases, geographic features). "Gudauta Base" → location (it's a military base). "Adapazari, Turkey" → location.
|
|
- `organizations`: institutions, agencies, branches, companies. "FBI", "USAF", "CIA Foreign Branch".
|
|
- `people`: humans with names (titles ok). "J. Edgar Hoover", "Chief Tereoken".
|
|
- `events`: dated incidents with a date or short label. "Tic-Tac Nimitz 2004", "Roswell 1947".
|
|
- `uap_objects`: described UAP themselves. Shape + color + size description.
|
|
- `vehicles`: aircraft, ships, vehicles by model/name. "USS Princeton", "F-18".
|
|
- `operations`: programs, missions, protocols by name. "Project Blue Book", "Operation Mainbrace".
|
|
- `concepts`: legal/scientific/jargon. "FOIA exemption (b)(1)", "GENTEXT", "compartmentalization".
|
|
|
|
Output ONE JSON object only (no markdown fence, no preamble) with this exact schema:
|
|
|
|
{
|
|
"entities_extracted": {
|
|
"people": [{"name": "Full canonical name with title", "role_in_page": "subject|witness|author|signer|mentioned", "aliases": ["alt spellings"]}],
|
|
"organizations": [{"name": "Canonical org name", "aliases": ["FBI", "F.B.I."], "type": "intelligence-agency|military-branch|civilian-agency|...|other"}],
|
|
"locations": [{"name": "Canonical place name including any qualifier (Gudauta Base, not just Gudauta)", "type": "city|region|country|sea|strait|airbase|naval-base|mountain|desert|building|other"}],
|
|
"events": [{"label": "Short distinctive label", "date": "YYYY-MM-DD|YYYY|NA"}],
|
|
"uap_objects": [{"shape": "...", "color": "...", "size_estimate": "..."}],
|
|
"vehicles": [{"name": "...", "class": "aircraft|ship|submarine|spacecraft|satellite|ground|other"}],
|
|
"operations": [{"name": "...", "type": "military-operation|reporting-protocol|research-program|task-force|foia-disclosure|other"}],
|
|
"concepts": [{"name": "...", "class": "legal-instrument|phenomenon-type|doctrine|scientific-term|jargon|program-name|other"}]
|
|
}
|
|
}
|
|
|
|
If a category has no entries, return an empty array. PRESERVE original spelling (do not translate names). Output ONLY the JSON."""
|
|
|
|
|
|
def utc_now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string ending in ``Z``.

    The value has one-second resolution, e.g. ``2025-01-31T12:34:56Z``.
    """
    now_utc = datetime.now(tz=timezone.utc)
    # datetime.__format__ delegates to strftime, so this is the same rendering.
    return f"{now_utc:%Y-%m-%dT%H:%M:%SZ}"
|
|
|
|
|
|
def read_fm(path: Path) -> tuple[dict, str]:
    """Split a page file into its YAML front matter and its Markdown body.

    Args:
        path: File to read (decoded as UTF-8).

    Returns:
        ``(front_matter, body)``. ``front_matter`` is ``{}`` when the file does
        not start with ``---``, has no closing fence, or its YAML is invalid or
        not a mapping. When no front matter block is found at all, ``body`` is
        the whole file text; otherwise it is the text after the closing fence
        with leading newlines removed.
    """
    text = path.read_text(encoding="utf-8")
    if not text.startswith("---"):
        return {}, text

    # The closing fence must be a line of its own. A plain substring search for
    # "---" would stop at the first value that merely contains three dashes,
    # truncating the YAML and pushing the rest of the front matter into the
    # body. Searching from offset 3 skips the opening fence; "^" still only
    # matches at true line starts.
    closing = re.compile(r"^---[ \t]*$", re.MULTILINE).search(text, 3)
    if closing is None:
        return {}, text

    try:
        fm = yaml.safe_load(text[3:closing.start()].strip()) or {}
    except yaml.YAMLError:
        fm = {}
    # Front matter that parses to a scalar or a list is not usable as a mapping.
    if not isinstance(fm, dict):
        fm = {}
    return fm, text[closing.end():].lstrip("\n")
|
|
|
|
|
|
def write_fm(path: Path, fm: dict, body: str) -> None:
    """Write ``fm`` as YAML front matter followed by ``body`` to ``path``.

    The text is written to a temporary sibling file and then moved over
    ``path`` with ``os.replace``, so an interrupted run leaves either the old
    page or the new one, never a half-written file.

    Args:
        path: Destination page file.
        fm: Front matter mapping; key order is preserved in the output.
        body: Markdown body placed after the closing ``---`` fence.
    """
    new_yaml = yaml.dump(fm, allow_unicode=True, sort_keys=False, default_flow_style=False)
    # Keep exactly one blank line between the closing fence and the body.
    sep = "\n" if body.startswith("\n") else "\n\n"

    # The leading dot keeps a leftover temp file out of the "p*.md" globs.
    tmp = path.with_name(f".{path.name}.tmp")
    try:
        tmp.write_text(f"---\n{new_yaml}---{sep}{body}", encoding="utf-8")
        if path.exists():
            # An in-place rewrite would have kept the file mode; do the same.
            os.chmod(tmp, path.stat().st_mode & 0o7777)
        os.replace(tmp, path)
    finally:
        # After a successful replace the temp file is already gone.
        tmp.unlink(missing_ok=True)
|
|
|
|
|
|
def call_gemini(client, png_path: Path, model: str, max_tokens: int = 32768):
    """Send one page image plus PROMPT to the model and return the reply text.

    Args:
        client: Object exposing ``models.generate_content``.
        png_path: PNG file whose bytes are attached to the request.
        model: Model name passed through to the API.
        max_tokens: Value used for ``max_output_tokens``.

    Returns:
        The ``text`` attribute of the response.

    Raises:
        RuntimeError: If no response arrives within ``TIMEOUT_S`` seconds.
        Any exception raised by the underlying request is propagated.
    """
    content = [
        genai_types.Part.from_bytes(data=png_path.read_bytes(), mime_type="image/png"),
        PROMPT,
    ]
    config = genai_types.GenerateContentConfig(
        response_mime_type="application/json",
        temperature=0.1,
        max_output_tokens=max_tokens,
    )

    def _call():
        return client.models.generate_content(model=model, contents=content, config=config)

    # Deliberately not a `with` block: leaving one calls shutdown(wait=True),
    # which would block on the hung request and defeat the timeout below.
    ex = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    try:
        return ex.submit(_call).result(timeout=TIMEOUT_S).text
    except concurrent.futures.TimeoutError:
        raise RuntimeError(f"Gemini hung >{TIMEOUT_S}s") from None
    finally:
        # Do not wait for the worker; a timed-out request is abandoned and its
        # thread finishes (or fails) on its own in the background.
        ex.shutdown(wait=False)
|
|
|
|
|
|
def parse_json_lenient(text: str) -> dict:
    """Parse model output as JSON, tolerating fences and surrounding prose.

    The text is stripped, a leading/trailing Markdown code fence is removed,
    and the result is parsed. If that fails, the span from the first ``{`` to
    the last ``}`` is tried, which recovers replies that put prose around the
    JSON object.

    Args:
        text: Raw model reply. ``None`` is treated as an empty reply.

    Returns:
        The decoded JSON value (expected to be an object).

    Raises:
        json.JSONDecodeError: If no valid JSON can be recovered.
    """
    if text is None:
        # NOTE(review): presumably the SDK can hand back no text at all; report
        # it as a decode failure so callers that retry on bad JSON also retry
        # here instead of crashing on `.strip()`.
        raise json.JSONDecodeError("empty model response", "", 0)

    cleaned = text.strip()
    cleaned = re.sub(r"^```(?:json)?\s*", "", cleaned)
    cleaned = re.sub(r"\s*```$", "", cleaned)
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        start = cleaned.find("{")
        end = cleaned.rfind("}")
        if start < 0 or end <= start:
            raise
        # A truncated reply still fails here because its braces are unbalanced.
        return json.loads(cleaned[start:end + 1])
|
|
|
|
|
|
def _display_path(path: Path) -> str:
    """Return ``path`` relative to UFO_ROOT when it lies inside it, else as given."""
    try:
        return str(path.relative_to(UFO_ROOT))
    except ValueError:
        return str(path)


def _count_entities(groups) -> int:
    """Count list entries across all categories of an entities mapping."""
    if not isinstance(groups, dict):
        return 0
    return sum(len(v) for v in groups.values() if isinstance(v, (list, tuple)))


def process_page(client, page_md: Path, model: str) -> str:
    """Re-extract entities for one page and rewrite its front matter.

    Reads the page's front matter, locates the matching PNG from ``doc_id`` and
    ``page_number``, asks the model for entities, and replaces only
    ``entities_extracted`` (plus two bookkeeping keys) before writing back.

    Args:
        client: API client forwarded to ``call_gemini``.
        page_md: Path of the page Markdown file.
        model: Model name, also recorded in the front matter.

    Returns:
        A status string: ``"ok"``, ``"no-fm"``, ``"bad-fm"``, ``"no-png"``,
        ``"bad-json"``, ``"bad-shape"`` or ``"error"``.
    """
    fm, body = read_fm(page_md)
    if not isinstance(fm, dict) or not fm:
        return "no-fm"

    doc_id = fm.get("doc_id", "")
    try:
        page_num = int(fm.get("page_number") or 0)
    except (TypeError, ValueError):
        # A non-numeric page_number is bad front matter, not a crash.
        return "bad-fm"
    if not doc_id or not page_num:
        return "bad-fm"

    # str() keeps the join working when YAML parsed the id as a number.
    png = PNG_BASE / str(doc_id) / f"p-{page_num:03d}.png"
    if not png.exists():
        return "no-png"

    # Two attempts with progressively higher token budgets
    revision = None
    for tok in (32768, 65536):
        try:
            raw = call_gemini(client, png, model, max_tokens=tok)
            revision = parse_json_lenient(raw)
            break
        except json.JSONDecodeError:
            continue
        except Exception as e:
            sys.stderr.write(f" ✗ {_display_path(page_md)}: {type(e).__name__}: {e}\n")
            return "error"
    if revision is None:
        return "bad-json"

    # The reply must be an object holding an "entities_extracted" object.
    ee = revision.get("entities_extracted") if isinstance(revision, dict) else None
    if not isinstance(ee, dict):
        return "bad-shape"

    # Quick stats for reporting
    old_n = _count_entities(fm.get("entities_extracted"))
    new_n = _count_entities(ee)

    fm["entities_extracted"] = ee
    fm["last_entity_extraction_model"] = model
    fm["last_entity_extraction_at"] = utc_now_iso()
    write_fm(page_md, fm, body)

    rel = _display_path(page_md)
    print(f" ✓ {rel}: entities {old_n} → {new_n}", flush=True)
    return "ok"
|
|
|
|
|
|
def main():
    """Command-line entry point: pick target pages and process them in parallel.

    Targets come from exactly one of ``--page``, ``--doc-id``, ``--pages-file``
    or ``--all`` (checked in that order). ``--max`` caps the number of targets.
    Targets that do not exist on disk are reported on stderr and counted under
    ``missing`` instead of being dropped silently. A per-status summary is
    printed at the end.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--doc-id")
    ap.add_argument("--page", help="<doc-id>/pNNN")
    ap.add_argument("--all", action="store_true")
    ap.add_argument("--pages-file")
    ap.add_argument("--model", default=DEFAULT_MODEL)
    ap.add_argument("--workers", type=int, default=20)
    ap.add_argument("--max", type=int, default=0)
    args = ap.parse_args()

    api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        sys.stderr.write("✗ GEMINI_API_KEY not set\n")
        sys.exit(1)
    client = genai.Client(api_key=api_key)

    if args.page:
        parts = args.page.split("/")
        if len(parts) != 2:
            ap.error("--page must be <doc-id>/pNNN")
        targets = [PAGES / parts[0] / f"{parts[1]}.md"]
    elif args.doc_id:
        targets = sorted((PAGES / args.doc_id).glob("p*.md"))
    elif args.pages_file:
        # One path per line; relative entries are resolved against UFO_ROOT.
        targets = []
        for line in Path(args.pages_file).read_text().splitlines():
            entry = line.strip()
            if entry:
                targets.append(Path(entry) if entry.startswith("/") else UFO_ROOT / entry)
    elif args.all:
        targets = sorted(PAGES.glob("*/p*.md"))
    else:
        ap.error("provide --doc-id, --page, --all, or --pages-file")

    # Only a positive cap is meaningful; a negative one would slice from the end.
    if args.max > 0:
        targets = targets[:args.max]

    # Surface typos in --page / --pages-file instead of reporting an empty run.
    missing = [p for p in targets if not p.exists()]
    for p in missing:
        sys.stderr.write(f"✗ {p}: file not found\n")
    targets = [p for p in targets if p.exists()]

    # ThreadPoolExecutor rejects max_workers <= 0.
    workers = max(1, args.workers)

    print(f"Re-extracting entities from {len(targets)} page(s) with {args.model} ({workers} workers)")
    stats = {"ok": 0, "error": 0, "no-png": 0, "no-fm": 0, "bad-fm": 0, "bad-json": 0,
             "bad-shape": 0, "missing": len(missing)}

    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as pool:
        futs = {pool.submit(process_page, client, p, args.model): p for p in targets}
        for fut in concurrent.futures.as_completed(futs):
            try:
                r = fut.result()
                stats[r] = stats.get(r, 0) + 1
            except Exception as e:
                # One failing page must not abort the rest of the batch.
                sys.stderr.write(f"✗ {futs[fut]}: {e}\n")
                stats["error"] += 1

    print(f"\nDone. {stats}")


if __name__ == "__main__":
    main()
|