rebuild entity layer from Sonnet-vision reextract pipeline

Add reextract pipeline (scripts/reextract/) that rebuilds doc-level entity JSON from Sonnet-vision chunks via Opus, replacing the noisy per-page extraction. Add synthesize scripts to regenerate wiki/entities from the 116 _reextract.json (30), aggregate missing page.md from chunks (31), and reprocess 805 pages the doc-rebuilder agent dropped on context overflow (32). Add maintain scripts 43-56 for chunk-page sync, dedup, generic-entity marking, and typed relation extraction. Web: wire relations API + entity-relations component; entity/timeline/doc pages consume the rebuilt layer. Note: raw/, processing/, wiki/ remain gitignored (bulk data managed separately); the 116 reextract JSONs and 7,798 rebuilt entity files live on disk only. The 27 curated anchor events under wiki/entities/events/ are preserved. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-21 12:20:24 -03:00 · 2026-05-21 12:20:24 -03:00 · a7e9dce6d2
commit a7e9dce6d2
parent 291748df63
39 changed files with 5279 additions and 47 deletions
--- a/.claude/agents/doc-rebuilder.md
+++ b/.claude/agents/doc-rebuilder.md
@ -0,0 +1,205 @@
 ---
 name: doc-rebuilder
 description: Lead orchestrator for rebuilding a complete declassified UAP/UFO document into a lossless, harness-assemblable structure. Produces individual chunk files, an ordered index, and a final assembled document.md.
 tools: Read, Write, Bash, Task
 model: sonnet
 ---
 You orchestrate the rebuild of an entire declassified UAP/UFO document into a structure that lets a deterministic harness rebuild the document perfectly.
 ## Output layout (MANDATORY structure)
 ```
 raw/<doc-id>/
 ├── document.md                    ← FINAL assembled human-readable view (built by you)
 ├── _index.json                    ← Ordered chunk list (machine-readable harness input)
 ├── chunks/
 │   ├── c0001.md                   ← Individual chunk file (one per chunk, zero-padded 4 digits)
 │   ├── c0002.md
 │   └── ...
 ├── images/
 │   ├── IMG-c0023.png              ← Cropped from page PNG (named by chunk_id)
 │   └── ...
 └── tables/
    ├── TBL-001.csv                ← Multi-page tables reconstructed (when applicable)
    └── TBL-001.md                 ← Table description bilingual
 ```
 ## Workflow
 1. **Inspect inputs**:
   - Read `wiki/documents/<doc-id>.md` frontmatter (NOT the body) — just to confirm doc exists
   - List PNG pages: `ls /Users/guto/ufo/processing/png/<doc-id>/p-*.png`
   - List OCR pages: `ls /Users/guto/ufo/processing/ocr/<doc-id>/p-*.txt`
 2. **Process pages in parallel batches of 5**:
   For each page in scope (1..max_pages), spawn `page-rebuilder` subagent via Task with prompt containing:
   - `page_png_path`: absolute path
   - `page_ocr_text`: literal contents of the OCR file (Read it, then inline)
   - `doc_id`, `page_number`, `total_pages`, `doc_title`
   Collect each returned JSON `{page_number, chunks: [...]}`.
 3. **Globally number chunks**:
   After all pages return, iterate pages in ascending page_number. For each chunk in that page (already ordered by `order_in_page`), assign:
   - `chunk_id`: `c<NNNN>` (4-digit zero-padded, globally sequential starting at 1)
   - `order_global`: sequential int (1-indexed)
   Compute `prev_chunk` and `next_chunk` pointers (null at boundaries).
 4. **Analyze images** (parallel):
   For each chunk with `type=image`, in parallel batches of 5:
   - Use Bash + PIL to crop the bbox region:
     ```
     python3 -c "
     from PIL import Image
     im = Image.open('<page_png>')
     W,H = im.size
     x,y,w,h = <bbox_x>, <bbox_y>, <bbox_w>, <bbox_h>
     pad = 0.005
     c = im.crop((max(0,int((x-pad)*W)), max(0,int((y-pad)*H)),
                  min(W,int((x+w+pad)*W)), min(H,int((y+h+pad)*H))))
     c.save('/Users/guto/ufo/raw/<doc-id>/images/IMG-<chunk_id>.png')
     "
     ```
   - Spawn `image-analyst` subagent with the cropped image absolute path
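   - Merge returned fields into the chunk's metadata, renaming the analyst's output keys as needed: `description_en` → `image_description_en`, `description_pt_br` → `image_description_pt_br`, `image_type` (overwrites), `extracted_text`, `ufo_anomaly_check.anomaly_detected` → `ufo_anomaly_detected` (bool), `ufo_anomaly_check.anomaly_type` → `ufo_anomaly_type`, `ufo_anomaly_check.rationale` → `ufo_anomaly_rationale`, `cryptid_anomaly_check.anomaly_detected` → `cryptid_anomaly_detected` (bool), `cryptid_anomaly_check.anomaly_type` → `cryptid_anomaly_type`, `cryptid_anomaly_check.rationale` → `cryptid_anomaly_rationale`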
 5. **Stitch multi-page tables** (when applicable):
   Find consecutive runs where a page's last chunk is `type=table_marker` with `cross_page_hint=continues_to_next` AND the next page's first chunk is `type=table_marker` with `cross_page_hint=continues_from_prev`. Spawn `table-stitcher` and replace the fragments with one merged `table_marker` chunk whose metadata carries `stitched_table` (a list of rows). Assign one `TBL-<NNN>` id, save CSV to `tables/TBL-<NNN>.csv`.
 6. **Write individual chunk files**:
   For EVERY chunk, write `raw/<doc-id>/chunks/c<NNNN>.md`:
   ```
   ---
   chunk_id: c<NNNN>
   type: <type>
   page: <N>
   order_in_page: <N>
   order_global: <N>
   bbox: {x: 0.00, y: 0.00, w: 0.00, h: 0.00}
   classification: <SECRET//NOFORN or null>
   formatting: [bold, all_caps]
   cross_page_hint: self_contained
   prev_chunk: c<NNNN>             # null for first
   next_chunk: c<NNNN>             # null for last
   related_image: IMG-c<NNNN>.png  # null unless type=image
   related_table: TBL-<NNN>        # null unless type=table_marker
   ocr_confidence: 0.95
   ocr_source_lines: [4, 5, 6]
   redaction_code: null
   redaction_inferred_content_type: null
   image_type: null
   ufo_anomaly_detected: false
   cryptid_anomaly_detected: false
   ufo_anomaly_type: null
   ufo_anomaly_rationale: null
   cryptid_anomaly_type: null
   cryptid_anomaly_rationale: null
   image_description_en: null
   image_description_pt_br: null
   extracted_text: null
   source_png: ../../processing/png/<doc>/p-NNN.png
   ---
   **EN:** {content_en}
   **PT-BR:** {content_pt_br}
   ```
   - All boolean metadata fields are written explicitly (false/null are valid).
   - Keep YAML clean — do not include keys with empty objects; null is fine.
 7. **Write `_index.json`** at `raw/<doc-id>/_index.json`:
   ```json
   {
     "doc_id": "<id>",
     "schema_version": "0.2.0",
     "total_pages": <N>,
     "total_chunks": <N>,
     "build_approach": "subagents",
     "build_model": "claude-sonnet-4-6",
     "build_at": "<ISO>",
     "chunks": [
       {
         "chunk_id": "c0001",
         "type": "letterhead",
         "page": 1,
         "order_in_page": 1,
         "order_global": 1,
         "file": "chunks/c0001.md",
         "bbox": {"x": 0.1, "y": 0.05, "w": 0.8, "h": 0.06},
         "preview": "first 80 chars of content_en"
       }
     ]
   }
   ```
 8. **Assemble `document.md`** (human-readable, deterministic):
   Frontmatter:
   ```yaml
   schema_version: "0.2.0"
   type: master_document
   doc_id: <id>
   canonical_title: <title>
   total_pages: <N>
   total_chunks: <N>
   chunk_types_histogram: {...}
   multi_page_tables: [TBL-001, ...]
   ufo_anomalies_flagged: [c0023, c0027]
   cryptid_anomalies_flagged: []
   build_approach: "subagents"
   build_model: claude-sonnet-4-6
   build_at: <ISO>
   ```
   Body — for each page:
   ```
   ## Page N
   <!-- chunk:c0001 src:./chunks/c0001.md -->
   <a id="c0001"></a>
   ### Chunk c0001 — letterhead · p1 · bbox: 0.10/0.05/0.80/0.06
   **EN:** {content_en}
   **PT-BR:** {content_pt_br}
   <details><summary>metadata</summary>
   ```json
   {full chunk metadata as JSON}
   ```
   </details>
   ---
   ```
   For `image` chunks, ALSO embed `![chunk image](./images/IMG-c<NNNN>.png)` and include image_analyst description.
   For `table_marker` with stitched_table, render an HTML `<table>`.
 9. **Final stats line** to stdout:
   ```
   STATS pages=<N> chunks=<N> images=<N> tables=<N> ufo=<N> cryptid=<N> doc_md_bytes=<N>
   ```
 ## Performance
 - Page-rebuilders: parallel batches of 5 (don't exceed 10 concurrent Task spawns).
 - After page-rebuilders complete, image-analysts in parallel batches of 5.
 - Crop ALL images first via Bash, THEN spawn image-analysts (they need the cropped file on disk).
 ## Bilingual policy
 - Brazilian Portuguese (pt-br), NOT European
 - UTF-8 accents preserved: ç, ã, é, í, ó, ú, â, ê, ô, à
 - Verbatim quotes stay in source language
 ## NEVER:
 - Fabricate redacted content
 - Skip a chunk (lossy reconstruction unacceptable)
 - Use chunk types outside the enum defined in page-rebuilder
 - Mix multi-page table fragments without invoking table-stitcher
 - Output explanatory prose in the final document.md (it's the reconstructed document, not a report)
 - Write only document.md without the chunks/ + _index.json — those are required for harness roundtrip
--- a/.claude/agents/image-analyst.md
+++ b/.claude/agents/image-analyst.md
@ -0,0 +1,46 @@
 ---
 name: image-analyst
 description: Analyzes a cropped image region from a scanned document. Produces precise vision description bilingual + explicit UAP/cryptid anomaly check.
 tools: Read
 model: sonnet
 ---
 You are a forensic image analyst for The Disclosure Bureau, specializing in declassified UAP/UFO archive imagery.
 Given the absolute path to an image that has already been cropped from a page PNG, you Read that file and produce a precise analysis with explicit UAP and cryptid anomaly checks.
 ## Output schema
 ONE JSON object, no fence, no preamble:
 ```
 {
  "description_en": "Precise factual description (1-3 sentences)",
  "description_pt_br": "Brazilian Portuguese version, preserve UTF-8 accents",
  "image_type": "photo|sketch|map|chart|stamp|signature|redaction|logo|seal|diagram|other",
  "extracted_text": "Any text visible in the image, verbatim original language",
  "ufo_anomaly_check": {
    "anomaly_detected": false,
    "anomaly_type": null,
    "rationale": "1 sentence reasoning"
  },
  "cryptid_anomaly_check": {
    "anomaly_detected": false,
    "anomaly_type": null,
    "rationale": "1 sentence"
  },
  "confidence": 0.95
 }
 ```
 ## Anomaly criteria (be conservative)
 **UAP**: morphologies consistent with reported UAP — disc, triangle, sphere, cylinder, elongated ellipsoid, cigar, irregular metallic; objects defying obvious aerodynamic explanation; unusual lights or sensor signatures.
 **Cryptid**: non-human entities; beings with anomalous proportions; figures inconsistent with known fauna; biological anomalies.
 False positives erode trust. Flag only when the image GENUINELY matches. If the image is mundane (typed text, signature, official seal, hole-punch marks, standard map), `anomaly_detected: false`.
 Brazilian Portuguese (NOT European). Preserve UTF-8 accents.
 Output ONLY the JSON.
--- a/.claude/agents/page-rebuilder.md
+++ b/.claude/agents/page-rebuilder.md
@ -0,0 +1,127 @@
 ---
 name: page-rebuilder
 description: Rebuilds ONE scanned document page as a sequence of LOSSLESS agentic chunks with bilingual EN+PT-BR content. Output is structured so chunks can be deterministically reassembled into a faithful reproduction of the original page (and document) via a harness.
 tools: Read
 model: sonnet
 ---
 You are a forensic document reconstruction agent for The Disclosure Bureau. Given a single page of a US Department of War declassified UAP/UFO document (PNG image + raw OCR text), you decompose it into LOSSLESS agentic chunks — each chunk is a single semantic unit, and the SUM of chunks rebuilt in `order_in_page` faithfully reproduces the page.
 ## Your inputs (from the spawn prompt)
 - `page_png_path`: absolute path to the page PNG (USE the Read tool to view it)
 - `page_ocr_text`: raw OCR text (layout-preserved)
 - `doc_id`, `page_number`, `total_pages`, `doc_title`
 ## Chunk types — STRICT enum (use EXACTLY one of these 19 string values, no variations)
 **The `type` field MUST be one of these literal strings. Do NOT invent names like `body_paragraph`, `classification_banner`, `header_block`, `subject_line`, `addressee_block`, `signature_block`, `section_header`, `form_reference`, or `distribution_list`. Map every chunk you see onto one of these canonical types:**
 | canonical type | what to map to it | example natural-name variations (do NOT use these) |
 |---|---|---|
 | `letterhead` | top-of-page institutional banner (name + address printed together) | letterhead, masthead |
 | `address_block` | sender (FROM:) or recipient (TO:) address; also distribution list, addressee block, routing list | addressee_block, distribution_list, routing_block, to_block, from_block |
 | `classification_marking` | SECRET, NOFORN, CONFIDENTIAL, RESTRICTED, TOP SECRET printed/typed (NOT inked stamp) | classification_banner, security_banner, classification_label |
 | `heading` | document title, section header, subject line, MEMORANDUM, SUBJECT:, RE:, agenda items | header_block, section_header, subject_line, doc_title, agenda_heading |
 | `paragraph` | body text paragraph (most common type) | body_paragraph, narrative, prose, body_text |
 | `form_field` | labeled field + value (Date: 5 May 1948 · Observer: [REDACTED] · File No: 65-3489) | form_reference, field, label_value, kv_field |
 | `bulleted_item` | single bullet point in a list |  |
 | `numbered_item` | single numbered item (1., 2., a., (i)) |  |
 | `quote_block` | indented or block-quoted passage |  |
 | `caption` | caption directly attached to an image |  |
 | `table_marker` | the full table on this page (one chunk per table) |  |
 | `image` | any embedded image (photo, sketch, map, diagram, chart, logo, seal — but NOT inked stamps or signatures, which are their own types) |  |
 | `stamp` | inked official stamp (round seal, banner stamp, date-received stamp, declass stamp) |  |
 | `signature` | handwritten signature (typed name beneath belongs to the previous chunk) | signature_block, sig |
 | `marginalia` | handwritten margin note, scribble, annotation in margins |  |
 | `redaction` | opaque black/white cover obscuring underlying content (▓▓▓) |  |
 | `footer` | page number, footer text, file tracking number at bottom |  |
 | `blank_area` | substantial blank area (only if needed for layout fidelity) |  |
 | `unknown` | ABSOLUTELY LAST RESORT |  |
 **Validation rule the harness applies**: any `type` field NOT in this list of 19 values is a SCHEMA VIOLATION and the chunk is rejected. Use canonical names only.
 ## Output schema
 ONE JSON object, NO markdown fence, NO preamble:
 ```json
 {
  "page_number": <int>,
  "page_summary_en": "1-2 sentences describing what this page contains",
  "page_summary_pt_br": "1-2 frases em português brasileiro",
  "page_layout": {
    "columns": 1,
    "orientation": "portrait | landscape",
    "page_dimensions_approx": "letter | legal | A4 | other"
  },
  "chunks": [
    {
      "order_in_page": 1,
      "type": "<one of the enum values above>",
      "bbox": {"x": 0.0, "y": 0.0, "w": 0.0, "h": 0.0},
      "content_en": "verbatim or near-verbatim English text (or asset description for non-text chunks)",
      "content_pt_br": "Brazilian Portuguese (NOT European) — preserve UTF-8 accents",
      "metadata": {
        "ocr_confidence": 0.0,
        "ocr_source_lines": [1, 2, 3],
        "classification": "SECRET//NOFORN",
        "redaction_code": "(b)(1) 1.4(a)",
        "redaction_inferred_content_type": "name|date|location|other",
        "image_type": "photo|sketch|map|diagram|chart|stamp|signature|logo|seal|other",
        "formatting": ["bold", "italic", "underline", "all_caps", "handwritten", "typed", "stamped"],
        "cross_page_hint": "self_contained | continues_from_prev | continues_to_next",
        "prev_chunk_hint": "if continues_from_prev: a short description of what to look for on the previous page",
        "next_chunk_hint": "if continues_to_next: a short description of what continues",
        "language_in_source": "en|pt|es|fr|de|other"
      }
    }
  ]
 }
 ```
 ## Critical rules for LOSSLESS reconstruction
 1. **Order ALWAYS by reading order** (top-to-bottom, left-to-right). `order_in_page` is 1-indexed sequential.
 2. **One semantic unit per chunk.** A paragraph = 1 chunk. A multi-line address = 1 chunk. A 4-row table = 1 `table_marker` chunk. An image = 1 chunk. A signature = 1 chunk.
 3. **Sum reproduces the page.** If you concatenate chunks back in `order_in_page`, the result must faithfully match the original page content. NEVER skip content. If something is unclear, mark it as `unknown` with `content_en: "[unreadable text]"`.
 4. **Verbatim preservation in `content_en`.** Names, codes, dates, classification markings stay in original spelling. NO paraphrasing. Preserve misspellings that are printed in the document itself — do not "correct" them as if they were OCR errors (e.g., `TRIANGLUAR` stays as written if that's what the document says).
 5. **Bilingual paired.** Every chunk has both content_en and content_pt_br.
   - Brazilian Portuguese (pt-br), NOT European Portuguese.
   - Preserve UTF-8 accents: ç, ã, é, í, ó, ú, â, ê, ô, à
   - Proper nouns and verbatim quotes stay in source language even inside the pt-br content.
   - Classification markings stay verbatim (SECRET//NOFORN).
   - For non-text chunks (images, stamps), pt-br describes the asset in Brazilian Portuguese.
 6. **Redaction faithfulness.** content_en = `"[REDACTED — <code>]"`. NEVER fabricate hidden content. Optionally infer the TYPE via `redaction_inferred_content_type`.
 7. **OCR source lines.** For text chunks, list `ocr_source_lines` (1-indexed line numbers of the input OCR text this chunk came from). Helps verify provenance.
 8. **Formatting array.** Include all that apply: `bold`, `italic`, `underline`, `all_caps`, `handwritten`, `typed`, `stamped`. Empty array if normal typed text.
 9. **Cross-page hints.** Mark `cross_page_hint`:
   - `continues_from_prev` if this chunk visibly continues from previous page (table rows, mid-sentence paragraph).
   - `continues_to_next` if this chunk visibly continues to next page.
   - `self_contained` otherwise.
 10. **Bbox normalized 0..1.** From the page PNG dimensions. Tight bbox covering JUST the chunk.
 11. **Image chunks**: content_en = brief description (1 sentence). The image-analyst subagent will be invoked separately for full analysis. Just give a placeholder description here.
 ## Pre-flight
 Before generating chunks, study both the PNG and the OCR text. The PNG is ground truth for layout and visual elements. The OCR is helpful for verbatim text but may have errors — trust the PNG when they disagree.
 ## Schema fidelity rules (CRITICAL — broken YAML poisons the entire archive)
 - `ocr_source_lines` MUST be a list of INTEGERS (line numbers from the OCR text, 1-indexed). Example: `[1, 2, 3]`. NEVER put the actual OCR text strings here.
 - `bbox` is `{x: 0.0..1.0, y: 0.0..1.0, w: 0.0..1.0, h: 0.0..1.0}` — four floats. No strings, no `null`.
 - `formatting` MUST be a list of strings from the allowed set: `["bold", "italic", "underline", "all_caps", "handwritten", "typed", "stamped"]`. No other values.
 - Text strings in `content_en`, `content_pt_br`, `redaction_inferred_content_type` must be single-line OR properly multi-line YAML (use `|` block scalar if multi-line). DO NOT include unescaped double-quotes (`"`) inside a double-quoted string — use single-quotes around the value, OR replace inner `"` with `\"` (escape consistently).
 - Boolean fields (`ufo_anomaly_detected`, `cryptid_anomaly_detected`) are literal `true`/`false`, not `"true"`/`"false"`.
 Output ONLY the JSON.
--- a/.claude/agents/table-stitcher.md
+++ b/.claude/agents/table-stitcher.md
@ -0,0 +1,43 @@
 ---
 name: table-stitcher
 description: Reconciles tables that span multiple pages. Given consecutive page PNGs where the last table on page N continues into the first table on page N+1, produces a single stitched CSV with deduped headers and merged rows.
 tools: Read
 model: sonnet
 ---
 You are a table reconciliation agent. Multi-page tables in scanned documents repeat their headers on each page and split rows across page breaks. You produce a single clean stitched output.
 ## Inputs
 - List of (page_png_path, bbox) for each fragment of the same logical table
 - Page numbers ordered
 ## Output
 ONE JSON object:
 ```
 {
  "table_id": "TBL-<DOC>-<NNN>",
  "headers": ["col1", "col2", "col3"],
  "rows": [["v1", "v2", "v3"], ...],
  "spans_pages": ["p007", "p008", "p009"],
  "headers_repeat_on_each_page": true,
  "merged_cross_page_rows": 0,
  "extraction_confidence": 0.95,
  "notes": "any caveats: illegible cells, redactions, ambiguity"
 }
 ```
 ## Rules
 - Read EACH page in order via Read tool, focus on the bbox region.
 - Detect if headers repeat across pages. Drop the duplicates after the first occurrence.
 - A row that visibly continues across page break gets MERGED into one row (concatenate cell text).
 - Preserve ORIGINAL LANGUAGE of all cell text. Do NOT translate.
 - Empty cells: "".
 - Illegible: "???".
 - Redacted: "REDACTED" (or "REDACTED ((b)(1) 1.4(a))" if code visible).
 - Numbers preserve formatting ("24,989").
 Output ONLY the JSON.
--- a/case/gaps/G-0001.md
+++ b/case/gaps/G-0001.md
@ -0,0 +1,79 @@
 ---
 schema_version: "0.1.0"
 type: gap
 gap_id: "G-0001"
 canonical_title: "Pages 1–6 of DOW-UAP-D54 are bit-for-bit identical (same SHA-256)"
 gap_class: unexplained-redaction
 description: |
  Pages 1, 2, 3, 4, 5, and 6 of the PDF `DOW-UAP-D54-Mission-Report-Mediterranean-
  Sea-NA.pdf` were converted to PNG at 200 DPI and produced six IDENTICAL files,
  all with SHA-256 `29030fd640030926c9e98e94f73a3fbc88cb9ac6739778b012eba120084ed1b7`.
  Visually, all six pages show the same image: solid black background (full-page
  redaction) with only the string "1.4(a)" in red at the top-left corner. The OCR
  (`pdftotext -layout`) of each page also produces only the text "1.4(a)".
  The most plausible hypothesis is that during the release process, six originally
  distinct pages (likely six different blocks of classified content) were ALL
  replaced by a single redaction template image, instead of each page having its
  own redaction overlay preserving sub-redacted structure.
 description_pt_br: |
  As páginas 1, 2, 3, 4, 5 e 6 do PDF `DOW-UAP-D54-Mission-Report-Mediterranean-
  Sea-NA.pdf` foram convertidas em PNG @ 200 DPI e produziram seis arquivos
  IDÊNTICOS, todos com SHA-256 `29030fd640030926c9e98e94f73a3fbc88cb9ac6739778b012eba120084ed1b7`.
  Visualmente, as seis páginas mostram a mesma imagem: fundo preto sólido (redação
  de página inteira) com apenas a string "1.4(a)" em vermelho no canto superior
  esquerdo. O OCR (`pdftotext -layout`) de cada página produz apenas o texto "1.4(a)".
  A hipótese mais plausível é que durante o processo de liberação, seis páginas
  originalmente distintas (provavelmente seis blocos diferentes de conteúdo
  classificado) foram TODAS substituídas por uma única imagem-template de redação,
  em vez de cada página ter sua própria sobreposição preservando a estrutura
  sub-redatada.
 detected_in:
  - "[[dow-uap-d54-mission-report-mediterranean-sea-na/p001]]"
  - "[[dow-uap-d54-mission-report-mediterranean-sea-na/p002]]"
  - "[[dow-uap-d54-mission-report-mediterranean-sea-na/p003]]"
  - "[[dow-uap-d54-mission-report-mediterranean-sea-na/p004]]"
  - "[[dow-uap-d54-mission-report-mediterranean-sea-na/p005]]"
  - "[[dow-uap-d54-mission-report-mediterranean-sea-na/p006]]"
 detected_by: archivist
 detected_at: "2026-05-13T08:50:00Z"
 severity: medium
 investigative_impact: |
  Substitution by an identical template erases any residual visual structure
  (margins, headers, paragraph spacing, partial signature blocks) that might
  permit inference about the redacted content. This compromises forensic
  analysis of redaction patterns. For other documents in the corpus with
  partial redactions, it is possible to roughly infer the size/position of
  removed text — here that is impossible.
 investigative_impact_pt_br: |
  A substituição por template idêntico apaga qualquer estrutura visual residual
  (margens, cabeçalhos, espaçamento de parágrafos, blocos parciais de assinatura)
  que poderia permitir inferência sobre o conteúdo redatado. Compromete a análise
  forense de padrões de redação. Para outros documentos do corpus com redações
  parciais é possível inferir aproximadamente o tamanho/posição do texto
  removido — aqui isso é impossível.
 possible_explanations:
  - { explanation: "Redaction template applied in bulk for all SECRET-classified pages", confidence_band: medium }
  - { explanation: "Bug in release/redaction software that duplicated a single page image", confidence_band: low }
  - { explanation: "Deliberate decision to standardize the appearance of fully redacted pages", confidence_band: medium }
 recommended_actions:
  - "Cross-check other documents in the DOW-UAP-D series to see if the pattern repeats"
  - "Compare PDF metadata (xref, font subsetting, image XObject ids) between the 6 pages"
  - "Check whether other corpus PDFs with all-redacted pages exhibit the same SHA collision"
 related_gaps: ["[[gap/G-0002]]"]
 wiki_version: "0.1.0"
 ---
 # Gap G-0001 — Identical pages in DOW-UAP-D54
 Anomaly detected via identical SHA-256 digests across 6 PNGs derived from 6 distinct pages of the original PDF. See `description` / `description_pt_br` in frontmatter.
--- a/case/gaps/G-0002.md
+++ b/case/gaps/G-0002.md
@ -0,0 +1,61 @@
 ---
 schema_version: "0.1.0"
 type: gap
 gap_id: "G-0002"
 canonical_title: "Mismatch between internal title (D31) and filename (D54) in DOW-UAP-D54"
 gap_class: inconsistency
 description: |
  The PDF `DOW-UAP-D54-Mission-Report-Mediterranean-Sea-NA.pdf` carries in its
  PDF metadata 'Title' field the value "DoW-UAP-D31", while its external filename
  (published on war.gov/ufo) uses the identifier "D54".
  This may indicate:
  (a) editorial renumbering between versions — the document was originally
      "D31" during preparation and renumbered to "D54" at release;
  (b) copy/paste error in the release template;
  (c) a separate "D31" document exists whose title was reused by mistake.
 description_pt_br: |
  O PDF `DOW-UAP-D54-Mission-Report-Mediterranean-Sea-NA.pdf` carrega no campo
  PDF metadata 'Title' o valor "DoW-UAP-D31", enquanto seu nome externo de
  arquivo (publicado em war.gov/ufo) usa o identificador "D54".
  Isso pode indicar:
  (a) renumeração editorial entre versões — o documento foi originalmente
      "D31" durante a preparação e renumerado para "D54" no release;
  (b) erro de copy/paste no template de release;
  (c) existe um documento "D31" separado cujo título foi reusado por engano.
 detected_in:
  - "[[dow-uap-d54-mission-report-mediterranean-sea-na]]"
 detected_by: archivist
 detected_at: "2026-05-13T08:50:00Z"
 severity: low
 investigative_impact: |
  Does not affect substantive content (the page-7 UAP observation is independent
  of the report number). But raises doubt about whether a separate "DoW-UAP-D31"
  file exists in the corpus, and about the integrity of the release process.
 investigative_impact_pt_br: |
  Não afeta o conteúdo substantivo (a observação UAP da página 7 é independente
  do número do relatório). Mas levanta dúvida sobre se existe um arquivo
  "DoW-UAP-D31" separado no corpus, e sobre a integridade do processo de release.
 possible_explanations:
  - { explanation: "Editorial renumbering — D31 was internal name, D54 is public ID", confidence_band: medium }
  - { explanation: "Copy-paste error of title from another template document", confidence_band: medium }
  - { explanation: "A separate D31 exists and this D54 inherited its title by mistake", confidence_band: low }
 recommended_actions:
  - "Check whether a separate DOW-UAP-D31 exists in the war.gov/ufo corpus"
  - "Cross-check internal titles of other DOW-UAP-D* documents to detect a pattern"
  - "Compare against AARO's official index if available"
 related_gaps: ["[[gap/G-0001]]"]
 wiki_version: "0.1.0"
 ---
 # Gap G-0002 — Internal identifier vs filename mismatch
 See `description` / `description_pt_br`.
--- a/scripts/maintain/42_sync_entity_stats.py
+++ b/scripts/maintain/42_sync_entity_stats.py
@ -121,6 +121,42 @@ def canonicalize_name(name: str) -> str:
    return collapsed
 def event_id_from_entry(entry: dict) -> str | None:
    """Derive the event id for one extracted event entry.

    Applies the same EV-YYYY-MM-DD-slug rule as scripts/03-dedup-entities.py.

    The label is read from ``label``, falling back to ``name``; when both are
    missing or empty the function returns ``None``. The slug is the
    canonicalized label cut to 40 characters with edge hyphens stripped, or
    ``"unlabeled"`` if nothing is left. Date components the entry does not
    supply are rendered as ``XX`` (``XXXX`` for the year).
    """
    label = entry.get("label") or entry.get("name")
    if not label:
        return None
    date = entry.get("date") or "NA"
    slug = canonicalize_name(label)[:40].strip("-") or "unlabeled"
    date_text = str(date)

    # Try the most specific date shape first: full day, then month, then year.
    m = re.match(r"^(\d{4})-(\d{2})-(\d{2})$", date_text)
    if m is not None:
        return f"EV-{m.group(1)}-{m.group(2)}-{m.group(3)}-{slug}"

    m = re.match(r"^(\d{4})-(\d{2})$", date_text)
    if m is not None:
        return f"EV-{m.group(1)}-{m.group(2)}-XX-{slug}"

    m = re.match(r"^(\d{4})$", date_text)
    if m is not None:
        return f"EV-{m.group(1)}-XX-XX-{slug}"

    # Anything else (including the "NA" placeholder) is an unknown date.
    return f"EV-XXXX-XX-XX-{slug}"
 def uap_object_id_from_event(event_id: str, index: int) -> str:
    """Build a UAP object id: OBJ-EV<year>-<EVENT_SLUG_UPPERCASE>-<NN>.

    Mirrors the scripts/03 logic.

    Args:
        event_id: Event id expected to look like ``EV-<year>-<mm>-<dd>-<slug>``.
            A falsy value, a value without the ``EV-`` prefix, or one with
            fewer than four hyphen-separated fields after the prefix yields
            the ``UNK`` placeholder.
        index: Position of the object, rendered zero-padded to at least two
            digits.

    Returns:
        ``OBJ-EV<year>-<SLUG>-<NN>``, where ``<SLUG>`` is the event slug with
        hyphens removed, upper-cased and cut to 20 characters (``UNK`` when the
        slug is empty); or ``OBJ-UNK-<NN>`` when the event id is unusable.
    """
    event_short = "UNK"
    if event_id and event_id.startswith("EV-"):
        # Fields after the prefix: year, month, day, then the slug. maxsplit=3
        # keeps a hyphenated slug intact in the last field.
        parts = event_id[3:].split("-", 3)
        if len(parts) == 4:
            year = parts[0]
            slug_compact = parts[3].replace("-", "").upper()[:20] or "UNK"
            event_short = f"EV{year}-{slug_compact}"
    return f"OBJ-{event_short}-{index:02d}"
 def utc_iso() -> str:
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``."""
    now_utc = datetime.now(timezone.utc)
    return now_utc.strftime("%Y-%m-%dT%H:%M:%SZ")
@ -192,14 +228,32 @@ def collect_page_refs() -> dict[tuple[str, str], set[str]]:
        # page_id like "doc-abc/p007"
        doc_id = page_path.parent.name
        page_id = f"{doc_id}/{page_path.stem}"
        # Compute the page's event_ids first — UAP objects on the same page
        # are linked to the FIRST event (mirrors scripts/03-dedup-entities.py).
        page_event_ids: list[str] = []
        for entry in (extracted.get("events") or []):
            if isinstance(entry, dict):
                eid = event_id_from_entry(entry)
                if eid:
                    page_event_ids.append(eid)
                    refs[("event", eid)].add(page_id)
        # Then the OBJs, indexed in order, anchored to the first event.
        for idx, entry in enumerate((extracted.get("uap_objects") or []), start=1):
            event_for_obj = page_event_ids[0] if page_event_ids else None
            if not event_for_obj:
                # Same fallback script 03 uses when no event exists on the page.
                event_for_obj = f"EV-XXXX-XX-XX-{canonicalize_name(doc_id)[:30]}"
            obj_id = uap_object_id_from_event(event_for_obj, idx)
            refs[("uap_object", obj_id)].add(page_id)
        # Every other class is handled generically (name-based).
        for folder, entries in extracted.items():
            cls = FOLDER_TO_CLASS.get(folder)
-            if not cls or not isinstance(entries, list):
+            if not cls or cls in {"event", "uap_object"} or not isinstance(entries, list):
                continue
            for entry in entries:
                # entry can be a plain string id, a wikilink, or a dict with
                # a `name` field that we must canonicalize ourselves (matches
                # the algorithm used in scripts/03-dedup-entities.py).
                eid = None
                if isinstance(entry, str):
                    _, parsed_eid = parse_wikilink_target(entry)
@ -210,8 +264,6 @@ def collect_page_refs() -> dict[tuple[str, str], set[str]]:
                           or canonicalize_name(entry.get("name", "")))
                if eid:
                    refs[(cls, eid)].add(page_id)
                    # Also index by every alias, so e.g. "USCENTCOM" matches a
                    # United States Central Command entity if dedup ran on aliases.
                    if isinstance(entry, dict):
                        for alias in (entry.get("aliases") or []):
                            alias_id = canonicalize_name(alias)
@ -372,12 +424,14 @@ def main() -> int:
            stats[strength] += 1
-            # Optional: clean up OBJ entities whose canonical_name is a 100-char
+            # Optional: clean up OBJ entities whose canonical_name is a shape
-            # shape description plus the ID in parentheses. Move the description
+            # description plus the ID in parentheses. Move the description to
-            # to an alias and pick a short readable name from the linked event.
+            # an alias and pick a short readable name from the linked event.
            if args.fix_obj_names and cls == "uap_object":
                cn = str(fm.get("canonical_name") or "")
-                if len(cn) > 80 and "UAP" in cn and "(" in cn and cn.endswith(")"):
+                # Match any OBJ name that embeds the raw ID in parens — that's
                # the unmistakable Sonnet-generated pattern we want to clean up.
                if "UAP (OBJ-" in cn and cn.endswith(")"):
                    obs_event = fm.get("observed_in_event")
                    event_cls, event_id = parse_wikilink_target(obs_event or "")
                    if event_cls == "event" and event_id:
--- a/scripts/maintain/43_fix_chunk_page_from_source_png.py
+++ b/scripts/maintain/43_fix_chunk_page_from_source_png.py
@ -0,0 +1,83 @@
 #!/usr/bin/env python3
 """
 Normalize each chunk's `page:` field to match the actual PNG it was rendered
 against (`source_png`).
 Background: the chunker (Sonnet) populated `page:` with the page-number it
 INFERRED from the document's printed footer/header — which often diverges from
 the PNG index after the PDF→PNG conversion (cover sheets, blank pages, FBI
 section markers, etc).
 The UI routes `/d/<doc>/<pNNN>` by PNG index, so the chunk `page` field MUST
 match the PNG index for the page view to show the right chunks alongside the
 right scan.
 This script rewrites `page:` IN PLACE in every raw chunk markdown where the
 field disagrees with the number embedded in `source_png:`. It is idempotent —
 re-running it on a clean tree is a no-op.
 Run:
    python3 scripts/maintain/43_fix_chunk_page_from_source_png.py [--dry-run]
 """
 from __future__ import annotations
 import re
 import sys
 from pathlib import Path
 from collections import defaultdict
 CHUNKS_ROOT = Path("/Users/guto/ufo/raw")
 PAGE_RE = re.compile(r"^page:\s*(\d+)\s*$", re.M)
 SRC_RE = re.compile(r"source_png:\s*\"?[^\"\n]*?p-?(\d+)\.png", re.M)
 def main() -> int:
    """Align each chunk's `page:` frontmatter value with the index in `source_png:`.

    Walks every `*--subagent/chunks/*.md` under CHUNKS_ROOT, compares the
    integer captured by PAGE_RE with the one captured by SRC_RE, and rewrites
    the file in place when they differ. When `--dry-run` is present in
    sys.argv nothing is written; the same counters are still printed.

    Returns:
        0 always (used as the process exit status).
    """
    dry = "--dry-run" in sys.argv
    fixed = 0
    scanned = 0
    by_doc: dict[str, int] = defaultdict(int)
    samples: list[tuple[str, int, int]] = []
    for chunks_dir in sorted(CHUNKS_ROOT.glob("*--subagent/chunks")):
        doc_id = chunks_dir.parent.name.replace("--subagent", "")
        for f in chunks_dir.glob("*.md"):
            content = f.read_text(encoding="utf-8")
            if not content.startswith("---"):
                continue
            parts = content.split("---", 2)
            if len(parts) < 3:
                continue
            _, fm, body = parts
            page_m = PAGE_RE.search(fm)
            src_m = SRC_RE.search(fm)
            if not (page_m and src_m):
                continue
            scanned += 1
            declared = int(page_m.group(1))
            real = int(src_m.group(1))
            if declared == real:
                continue
            # Replace only the captured digits. PAGE_RE ends in `\s*$`, so the
            # full match can include the newline that follows the number;
            # substituting the whole match would then delete that newline and,
            # when `page:` is the last frontmatter line, glue the closing
            # `---` onto it.
            new_fm = fm[:page_m.start(1)] + str(real) + fm[page_m.end(1):]
            new_content = "---" + new_fm + "---" + body
            if not dry:
                f.write_text(new_content, encoding="utf-8")
            fixed += 1
            by_doc[doc_id] += 1
            if len(samples) < 5:
                samples.append((f"{doc_id}/{f.name}", declared, real))
    print(f"Scanned: {scanned} chunks")
    print(f"Fixed:   {fixed} chunks  ({'dry-run' if dry else 'written'})")
    print(f"Docs touched: {len(by_doc)}")
    if by_doc:
        print("\nTop docs by fix count:")
        for doc, n in sorted(by_doc.items(), key=lambda x: -x[1])[:15]:
            print(f"  {n:>5}  {doc}")
    if samples:
        print("\nSample fixes:")
        for path, d, r in samples:
            print(f"  {path}: page {d} -> {r}")
    return 0
 if __name__ == "__main__":
    sys.exit(main())
--- a/scripts/maintain/44_sync_chunk_page_to_db.py
+++ b/scripts/maintain/44_sync_chunk_page_to_db.py
@ -0,0 +1,74 @@
 #!/usr/bin/env python3
 """
 Resync `chunks.page` in Postgres from the raw chunk markdowns (after running
 43_fix_chunk_page_from_source_png.py).
 This avoids re-embedding — we only touch the integer column.
 Run:
    DATABASE_URL=postgres://... python3 scripts/maintain/44_sync_chunk_page_to_db.py
 """
 from __future__ import annotations
 import os
 import re
 import sys
 from pathlib import Path
 import psycopg
 CHUNKS_ROOT = Path("/Users/guto/ufo/raw")
 PAGE_RE = re.compile(r"^page:\s*(\d+)\s*$", re.M)
 CID_RE = re.compile(r"^chunk_id:\s*(\S+)\s*$", re.M)
 def main() -> int:
    """Push the `page:` value of every on-disk chunk into `chunks.page`.

    Reads the connection string from DATABASE_URL (falling back to
    SUPABASE_DB_URL) and exits with a message when neither is set. Chunk
    frontmatter is read from `*--subagent/chunks/*.md` under CHUNKS_ROOT,
    bulk-loaded into a temp table via COPY, and applied with one UPDATE that
    only touches rows whose page differs.

    Returns:
        0 on completion.
    """
    dsn = os.environ.get("DATABASE_URL") or os.environ.get("SUPABASE_DB_URL")
    if not dsn:
        sys.exit("DATABASE_URL not set")

    # Each record is (doc_id, chunk_id, page).
    rows: list[tuple[str, str, int]] = []
    for directory in sorted(CHUNKS_ROOT.glob("*--subagent/chunks")):
        doc = directory.parent.name.replace("--subagent", "")
        for md_path in directory.glob("*.md"):
            raw = md_path.read_text(encoding="utf-8")
            # Files without a leading frontmatter fence yield no pieces.
            pieces = raw.split("---", 2) if raw.startswith("---") else []
            if len(pieces) < 3:
                continue
            header = pieces[1]
            chunk_match = CID_RE.search(header)
            page_match = PAGE_RE.search(header)
            if chunk_match is None or page_match is None:
                continue
            rows.append((doc, chunk_match.group(1), int(page_match.group(1))))

    print(f"Loaded {len(rows)} chunk records from disk")
    with psycopg.connect(dsn) as conn:
        with conn.cursor() as cur:
            # Stage the on-disk values so the comparison happens server-side.
            cur.execute(
                "CREATE TEMP TABLE _chunk_pages (doc_id TEXT, chunk_id TEXT, page INT)"
            )
            with cur.copy("COPY _chunk_pages (doc_id, chunk_id, page) FROM STDIN") as copier:
                for record in rows:
                    copier.write_row(record)
            cur.execute(
                """
                UPDATE chunks c
                SET page = t.page
                FROM _chunk_pages t
                WHERE c.doc_id = t.doc_id AND c.chunk_id = t.chunk_id
                  AND c.page IS DISTINCT FROM t.page
                """
            )
            print(f"Updated {cur.rowcount} rows in chunks.page")
        conn.commit()
    return 0
 if __name__ == "__main__":
    sys.exit(main())
--- a/scripts/maintain/45_resync_index_json.py
+++ b/scripts/maintain/45_resync_index_json.py
@ -0,0 +1,51 @@
 #!/usr/bin/env python3
 """
 Resync each `_index.json` so its embedded chunks[].page reflects the corrected
 markdown frontmatter (after script 43).
 Idempotent.
 """
 from __future__ import annotations
 import json
 import re
 from pathlib import Path
 CHUNKS_ROOT = Path("/Users/guto/ufo/raw")
 PAGE_RE = re.compile(r"^page:\s*(\d+)\s*$", re.M)
 CID_RE = re.compile(r"^chunk_id:\s*(\S+)\s*$", re.M)
 def main() -> None:
    """Copy `page` values from chunk markdown files into each `_index.json`.

    For every `*--subagent/chunks` directory under CHUNKS_ROOT that has a
    sibling `_index.json` with a non-empty `chunks` list, the `page` of each
    listed chunk is set to the value found in the matching markdown file. The
    index is rewritten only when at least one entry changed.
    """
    docs_rewritten = 0
    for chunk_folder in sorted(CHUNKS_ROOT.glob("*--subagent/chunks")):
        index_file = chunk_folder.parent / "_index.json"
        if not index_file.is_file():
            continue
        index_doc = json.loads(index_file.read_text(encoding="utf-8"))
        listed = index_doc.get("chunks") or []
        if not listed:
            continue

        # chunk_id -> page, taken from the first 2000 characters of each file.
        page_on_disk: dict[str, int] = {}
        for md_file in chunk_folder.glob("*.md"):
            prefix = md_file.read_text(encoding="utf-8")[:2000]
            id_hit = CID_RE.search(prefix)
            page_hit = PAGE_RE.search(prefix)
            if id_hit is None or page_hit is None:
                continue
            page_on_disk[id_hit.group(1)] = int(page_hit.group(1))

        n_fixed = 0
        for item in listed:
            wanted = page_on_disk.get(item.get("chunk_id"))
            if wanted is None or item.get("page") == wanted:
                continue
            item["page"] = wanted
            n_fixed += 1

        if not n_fixed:
            continue
        index_file.write_text(json.dumps(index_doc, indent=2, ensure_ascii=False), encoding="utf-8")
        print(f"  {index_doc.get('doc_id')}: updated {n_fixed} entries")
        docs_rewritten += 1
    print(f"\nDocs touched: {docs_rewritten}")
 if __name__ == "__main__":
    main()
--- a/scripts/maintain/46_text_backfill_mentions.py
+++ b/scripts/maintain/46_text_backfill_mentions.py
@ -0,0 +1,340 @@
 #!/usr/bin/env python3
 """
 Text-based backfill of entity → page references.
 The structured pipelines (Sonnet chunks, Haiku page-level events/entities)
 miss many entities the corpus actually discusses — they extract only what
 they confidently structure into the schema. The vision_description and
 narrative_summary fields routinely *talk about* an event/person/place
 without listing it in the structured arrays.
 This script does a fuzzy back-fill: scans the narrative body of every page
 YAML for textual matches of every entity's canonical_name + aliases, and
 records the hits as a new signal source `text_refs`. Aho-Corasick is used
 so the whole 3k-pages × 34k-entities cross-product collapses to a single
 linear scan per page.
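 Conservative filtering keeps the noise floor low:
  - single-token aliases (no space or hyphen) need at least 5 chars unless
    they contain a digit; multi-word and hyphenated aliases have no minimum
  - blacklist of common stopwords / generic terms
  - word-boundary enforcement (manual check on the characters surrounding
    each Aho-Corasick hit; alphanumerics and "_" count as word characters)
  - skip aliases that contain no letter, i.e. purely numeric / punctuation
    (ASCII-fold-identical-to-id aliases are not filtered yet)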
 YAML output (added in-place on each entity file):
  text_mentioned_in: ['[[doc-id/pNNN]]', ...]   # only refs NOT already in mentioned_in
  signal_sources.text_refs: N
  total_mentions = db_chunks + page_refs + cross_refs + text_refs
  signal_strength recomputed using text_refs as a weak signal
 Run:
  python3 scripts/maintain/46_text_backfill_mentions.py [--dry-run]
 """
 from __future__ import annotations
 import argparse
 import re
 import sys
 from collections import defaultdict
 from pathlib import Path
 from typing import Iterable
 import ahocorasick
 import yaml
 WIKI = Path("/Users/guto/ufo/wiki")
 ENTITIES_BASE = WIKI / "entities"
 PAGES_BASE = WIKI / "pages"
 # Generic / stop-words: never accept these as match patterns even if listed
 # as an alias. Lowercased. PT-BR + EN + universally vague terms.
 BLACKLIST: set[str] = {
    # english stopwords / common
    "the", "and", "for", "with", "from", "this", "that", "these", "those",
    "report", "reports", "object", "objects", "unknown", "unidentified",
    "anomalous", "aerial", "phenomenon", "phenomena", "sighting", "sightings",
    "case", "cases", "incident", "incidents", "event", "events", "encounter",
    "encounters", "observation", "observations", "document", "documents",
    "memo", "memos", "letter", "letters", "table", "tables", "image", "images",
    "general", "section", "agent", "agents", "subject", "subjects",
    "office", "offices", "field", "fields", "summary", "summaries",
    "true", "false", "type", "types", "data", "name", "names",
    "person", "people", "place", "places", "location", "locations",
    "vehicle", "vehicles", "operation", "operations", "concept", "concepts",
    "page", "pages", "chunk", "chunks", "scan", "scans",
    "north", "south", "east", "west", "central",
    "captain", "major", "colonel", "general", "lieutenant", "sergeant",
    "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday",
    "january", "february", "march", "april", "june", "july", "august",
    "september", "october", "november", "december",
    # pt-br stopwords / common
    "para", "como", "este", "esta", "esse", "essa", "isso", "aquele",
    "ainda", "outro", "outra", "outros", "outras", "todos", "todas",
    "relatorio", "relatório", "objeto", "objetos", "documento", "documentos",
    "página", "paginas", "páginas", "evento", "eventos", "incidente",
    "incidentes", "pessoa", "pessoas", "lugar", "lugares", "local", "locais",
    "operação", "operacao", "geral", "agente", "agentes", "campo", "campos",
    "norte", "sul", "leste", "oeste",
    "janeiro", "fevereiro", "março", "marco", "abril", "junho", "julho",
    "agosto", "setembro", "outubro", "novembro", "dezembro",
    # generic acronyms widely embedded in unrelated text
    "uap", "ufo", "usaaf", "usaf", "usa", "fbi", "dod", "nasa",
    "atom", "atoms", "atomic",
 }
 def is_acceptable_alias(name: str) -> bool:
    """Decide whether `name` is specific enough to be used as a search pattern.

    A name is rejected when, after stripping surrounding whitespace, it is
    empty, is listed in BLACKLIST (case-insensitive), contains no letter, or
    is a single token (no space and no hyphen) shorter than 5 characters that
    contains no digit.

    Args:
        name: canonical name or alias taken from an entity's frontmatter.

    Returns:
        True when the name may be added to the matcher, False otherwise.
    """
    n = name.strip()
    if not n or n.lower() in BLACKLIST:
        return False
    # Must contain at least one letter. str.isalpha() accepts any Unicode
    # letter and, unlike a literal À-ÿ range, does not treat × or ÷ as one.
    # This also covers purely numeric / punctuation names, so no separate
    # check for those is needed.
    if not any(ch.isalpha() for ch in n):
        return False
    # Short single tokens are too ambiguous unless a digit makes them specific.
    single_token = " " not in n and "-" not in n
    if single_token and len(n) < 5 and not re.search(r"\d", n):
        return False
    return True
 def parse_frontmatter(text: str) -> tuple[dict, str]:
    """Split a markdown document into (frontmatter mapping, body).

    The frontmatter is the text between the leading `---` and the next `---`
    that starts a line. The body is everything after that closing fence,
    unchanged, so `"---" + frontmatter + "---" + body` reproduces the input.

    Args:
        text: full file contents.

    Returns:
        `(fm, body)`. `fm` is `{}` and `body` is `text` when there is no
        leading fence or no closing fence. `fm` is `{}` when the YAML fails to
        parse or is not a mapping.
    """
    if not text.startswith("---"):
        return {}, text
    # Only a fence at the start of a line closes the frontmatter; a `---`
    # inside a YAML value must not truncate it. With re.M, `^` matches just
    # after a newline, so the opening fence at offset 0 is never re-matched.
    closing = re.compile(r"^---", re.M).search(text, 3)
    if closing is None:
        return {}, text
    try:
        fm = yaml.safe_load(text[3:closing.start()])
    except yaml.YAMLError:
        fm = None
    # Callers use dict methods on the result; lists and scalars become {}.
    if not isinstance(fm, dict):
        fm = {}
    return fm, text[closing.end():]
 def dump_frontmatter_preserving_body(fm: dict, body: str) -> str:
    """Serialize `fm` as a YAML frontmatter block and append `body` verbatim.

    Keys keep their insertion order, non-ASCII characters are written as-is,
    and the YAML line width is set to 1000.
    """
    rendered = yaml.safe_dump(fm, sort_keys=False, allow_unicode=True, width=1000)
    return "".join(("---\n", rendered, "---", body))
 def extract_searchable_text(page_fm: dict, page_body: str) -> str:
    """Join the narrative string fields of a page with its body text.

    Only a fixed set of frontmatter keys is considered, and only when their
    value is a string; the body is always appended last. Parts are separated
    by a newline.
    """
    narrative_keys = (
        "vision_description",
        "vision_description_pt_br",
        "narrative_summary",
        "narrative_summary_pt_br",
        "extracted_text",
    )
    candidates = [page_fm.get(field) for field in narrative_keys]
    pieces = [value for value in candidates if isinstance(value, str)]
    pieces.append(page_body)
    return "\n".join(pieces)
 # Map entity_class -> folder name
 # Keys are the singular class names (the same stems as the legacy `<class>_id`
 # frontmatter keys handled by entity_id_from_fm); values are plural folder
 # names — presumably the sub-directories of ENTITIES_BASE, confirm.
 # No function in this script references this mapping.
 FOLDER_BY_CLASS = {
    "person": "people",
    "organization": "organizations",
    "location": "locations",
    "event": "events",
    "uap_object": "uap-objects",
    "vehicle": "vehicles",
    "operation": "operations",
    "concept": "concepts",
 }
 def entity_id_from_fm(fm: dict) -> tuple[str, str] | None:
    """Return `(entity_class, entity_id)` for an entity frontmatter mapping.

    Lookup order:
      1. `entity_class` plus `<entity_class>_id`, falling back to `entity_id`.
      2. Legacy files: the first non-empty `<class>_id` key, with the class
         derived from the key name.

    Args:
        fm: parsed frontmatter of an entity file.

    Returns:
        The `(class, id)` pair, or None when no non-empty id is present.
    """
    cls = fm.get("entity_class")
    if cls:
        eid = fm.get(f"{cls}_id") or fm.get("entity_id")
        if eid:
            return cls, eid
    # legacy fallback
    for k in ("person_id", "organization_id", "location_id", "event_id",
             "uap_object_id", "vehicle_id", "operation_id", "concept_id"):
        # Require a non-empty value, like the primary path does: a key that is
        # present but null/empty does not identify an entity.
        eid = fm.get(k)
        if eid:
            return k[: -len("_id")], eid
    return None
 def signal_strength(db_chunks: int, page_refs: int, cross_refs: int, text_refs: int) -> str:
    """Classify an entity as "orphan", "weak" or "strong" from its counters.

    "orphan" when the four counters sum to zero. "strong" when any one of
    these holds: at least 3 db chunks, at least 3 page refs, at least one of
    each, or at least 5 text refs. Everything else is "weak". `cross_refs`
    only contributes to the orphan test.
    """
    if db_chunks + page_refs + cross_refs + text_refs == 0:
        return "orphan"
    strong_rules = (
        db_chunks >= 3,
        page_refs >= 3,
        db_chunks >= 1 and page_refs >= 1,
        text_refs >= 5,
    )
    return "strong" if any(strong_rules) else "weak"
 def main() -> int:
    """Backfill `text_mentioned_in` and `signal_sources.text_refs` on entity files.

    Steps:
      1. Load every non-archived entity under ENTITIES_BASE and index its
         acceptable names (canonical name + aliases) in an Aho-Corasick
         automaton, keyed by the lowercased name.
      2. Scan the lowercased narrative text of every `p*.md` page under
         PAGES_BASE and keep whole-word hits, at most one per entity per page.
      3. Rewrite the frontmatter of matched entities, and add a zero
         `text_refs` counter to unmatched entities that lack one.

    Command-line flags: `--dry-run` reports without writing any file;
    `--verbose` is accepted but not used by this function.

    Returns:
        0 always (used as the process exit status).
    """
    p = argparse.ArgumentParser()
    p.add_argument("--dry-run", action="store_true")
    p.add_argument("--verbose", action="store_true")
    args = p.parse_args()
    # 1. Load entities and group their owners by lowercased alias.
    print("Loading entities ...")
    entities: dict[Path, dict] = {}
    # alias_lower -> every entity file listing that alias. An automaton key
    # carries a single value and add_word() replaces it for an existing key,
    # so entities sharing an alias are grouped here first; otherwise only the
    # last one loaded would ever be matched.
    owners_by_alias: dict[str, list[str]] = defaultdict(list)
    alias_per_entity_count = 0
    accepted_entities = 0
    for ent_file in ENTITIES_BASE.rglob("*.md"):
        if "_archived" in ent_file.parts:
            continue
        try:
            text = ent_file.read_text(encoding="utf-8")
        except Exception:
            # Best effort: unreadable entity files are skipped.
            continue
        fm, _body = parse_frontmatter(text)
        if not fm:
            continue
        if entity_id_from_fm(fm) is None:
            continue
        canonical = fm.get("canonical_name") or fm.get("canonical_title")
        aliases = fm.get("aliases") or []
        names = []
        if isinstance(canonical, str):
            names.append(canonical)
        for a in aliases:
            if isinstance(a, str):
                names.append(a)
        accepted = [n for n in names if is_acceptable_alias(n)]
        if not accepted:
            continue
        entities[ent_file] = {"fm": fm, "raw_text": text, "accepted": accepted}
        accepted_entities += 1
        for n in accepted:
            owners_by_alias[n.lower()].append(str(ent_file))
            alias_per_entity_count += 1
    print(f"  entities considered:  {accepted_entities}")
    print(f"  searchable aliases:   {alias_per_entity_count}")
    if not owners_by_alias:
        # No patterns means no entity was retained, so there is nothing to
        # scan for and nothing to write back.
        print("  no searchable aliases found; nothing to do")
        return 0
    automaton = ahocorasick.Automaton()
    for alias_lower, owners in owners_by_alias.items():
        # Store the pattern with its owners so the scan can recover the match
        # length without re-lowercasing.
        automaton.add_word(alias_lower, (alias_lower, owners))
    automaton.make_automaton()
    # 2. Scan every page YAML
    print("Scanning pages ...")
    hits_by_entity: dict[str, set[str]] = defaultdict(set)
    pages_scanned = 0
    total_hits = 0
    for page_file in PAGES_BASE.rglob("p*.md"):
        try:
            text = page_file.read_text(encoding="utf-8")
        except Exception:
            # Best effort: unreadable pages are skipped.
            continue
        fm, body = parse_frontmatter(text)
        if not fm:
            continue
        page_id = fm.get("page_id")
        if not page_id:
            # Derive from filesystem: <doc-id>/p<NNN>
            try:
                rel = page_file.relative_to(PAGES_BASE)
                page_id = f"{rel.parent}/{rel.stem}"
            except ValueError:
                continue
        ref = f"[[{page_id}]]"
        searchable = extract_searchable_text(fm, body).lower()
        pages_scanned += 1
        seen_this_page: set[str] = set()
        for end_idx, (pattern, owners) in automaton.iter(searchable):
            # iter() reports the index of the last matched character.
            start_idx = end_idx - len(pattern) + 1
            # Word boundary check
            if start_idx > 0 and (searchable[start_idx - 1].isalnum() or searchable[start_idx - 1] == "_"):
                continue
            after = end_idx + 1
            if after < len(searchable) and (searchable[after].isalnum() or searchable[after] == "_"):
                continue
            for ent_path_str in owners:
                # Count each entity at most once per page.
                if ent_path_str in seen_this_page:
                    continue
                seen_this_page.add(ent_path_str)
                hits_by_entity[ent_path_str].add(ref)
                total_hits += 1
    print(f"  pages scanned: {pages_scanned}")
    print(f"  total hits:    {total_hits}")
    print(f"  entities matched: {len(hits_by_entity)}")
    # 3. Write back to entity YAML
    print("Writing back ...")
    promoted = 0
    upgraded = 0
    updated = 0
    for ent_path_str, refs in hits_by_entity.items():
        ent_file = Path(ent_path_str)
        rec = entities.get(ent_file)
        if not rec:
            continue
        fm = rec["fm"]
        raw_text = rec["raw_text"]
        # Don't double-count refs already in mentioned_in (structured page_refs)
        existing_mentioned = set(fm.get("mentioned_in") or [])
        new_text_refs = sorted(refs - existing_mentioned)
        old_sources = (fm.get("signal_sources") or {}).copy()
        db_chunks = int(old_sources.get("db_chunks", 0))
        page_refs = int(old_sources.get("page_refs", len(existing_mentioned)))
        cross_refs = int(old_sources.get("cross_refs", 0))
        text_refs = len(new_text_refs)
        old_strength = fm.get("signal_strength", "unverified")
        new_strength = signal_strength(db_chunks, page_refs, cross_refs, text_refs)
        new_total = db_chunks + page_refs + cross_refs + text_refs
        fm["text_mentioned_in"] = new_text_refs
        sources = old_sources
        sources["db_chunks"] = db_chunks
        sources["page_refs"] = page_refs
        sources["cross_refs"] = cross_refs
        sources["text_refs"] = text_refs
        fm["signal_sources"] = sources
        fm["total_mentions"] = new_total
        fm["signal_strength"] = new_strength
        if old_strength == "orphan" and new_strength != "orphan":
            promoted += 1
        if old_strength == "weak" and new_strength == "strong":
            upgraded += 1
        updated += 1
        if not args.dry_run:
            # Preserve body verbatim
            _, body = parse_frontmatter(raw_text)
            new_text = dump_frontmatter_preserving_body(fm, body)
            ent_file.write_text(new_text, encoding="utf-8")
    # Also: entities not matched at all — they keep their existing state.
    # But ensure their signal_sources.text_refs is at least set to 0 if missing
    # (so the YAML schema is consistent).
    backfill_zeros = 0
    if not args.dry_run:
        for ent_file, rec in entities.items():
            if str(ent_file) in hits_by_entity:
                continue
            fm = rec["fm"]
            sources = (fm.get("signal_sources") or {})
            if "text_refs" not in sources:
                sources["text_refs"] = 0
                fm["signal_sources"] = sources
                _, body = parse_frontmatter(rec["raw_text"])
                ent_file.write_text(dump_frontmatter_preserving_body(fm, body), encoding="utf-8")
                backfill_zeros += 1
    print()
    print(f"  entities updated:        {updated}")
    print(f"  promoted orphan → weak:  {promoted}")
    print(f"  upgraded weak → strong:  {upgraded}")
    print(f"  zero-text-ref backfills: {backfill_zeros}")
    print(f"  dry-run: {args.dry_run}")
    return 0
 if __name__ == "__main__":
    sys.exit(main())
--- a/scripts/maintain/47_mark_unsearchable_chunks.sql
+++ b/scripts/maintain/47_mark_unsearchable_chunks.sql
@ -0,0 +1,64 @@
 -- 47_mark_unsearchable_chunks.sql
 -- Add an `is_searchable` flag to public.chunks and turn it OFF for purely
 -- structural fragments that carry no informational content (salutations,
 -- page numbers, classification banners, isolated headings, etc).
 --
 -- These chunks still exist for page reconstruction; they just don't pollute
 -- search/retrieval results anymore.
 --
 -- Idempotent: re-running re-applies the same rules.
 BEGIN;
 ALTER TABLE public.chunks
  ADD COLUMN IF NOT EXISTS is_searchable BOOLEAN NOT NULL DEFAULT TRUE;
 -- Reclassify in a single pass and write only the rows whose flag changes.
 -- A blanket "reset everything to TRUE" rewrites every row on every run, and
 -- because is_searchable is used in an index predicate those updates cannot be
 -- heap-only: each one also adds entries to every index on the table.
 -- The self-join assumes chunk_pk uniquely identifies a chunk row.
 UPDATE public.chunks c
 SET is_searchable = w.flag
 FROM (
  SELECT
    chunk_pk,
    CASE
      -- Always-noise types (semantic-free formatting / scaffolding)
      WHEN type IN (
        'page_number',
        'blank',
        'stamp',
        'classification_banner',
        'classification_marking'
      ) THEN FALSE
      -- Noise when text is short (< 50 chars) — long form letterheads & such
      -- stay searchable
      WHEN type IN (
        'salutation',
        'complimentary_close',
        'section_heading',
        'section_header',
        'heading',
        'title',
        'subtitle',
        'date_line',
        'bulleted_item',
        'field_value',
        'field_entry',
        'table_marker',
        'form_field',
        'form_header',
        'routing_block',
        'distribution_list',
        'file_number',
        'marginalia'
      ) AND LENGTH(COALESCE(content_en, content_pt, '')) < 50 THEN FALSE
      -- Everything else, including rows with a NULL type, is searchable
      ELSE TRUE
    END AS flag
  FROM public.chunks
 ) w
 WHERE c.chunk_pk = w.chunk_pk
  AND c.is_searchable IS DISTINCT FROM w.flag;
 -- Partial b-tree index on chunk_pk covering only the searchable rows.
 -- NOTE(review): this does not shrink the vector / full-text indexes; those
 -- would need their own WHERE is_searchable predicate to skip noise rows.
 CREATE INDEX IF NOT EXISTS chunks_searchable_idx
  ON public.chunks (chunk_pk) WHERE is_searchable;
 COMMIT;
 -- Diagnostic counters
 SELECT
  is_searchable,
  COUNT(*) AS n,
  ROUND(AVG(LENGTH(COALESCE(content_en, content_pt, '')))) AS avg_len
 FROM public.chunks GROUP BY is_searchable ORDER BY is_searchable;
--- a/scripts/maintain/48_hybrid_search_filter_unsearchable.sql
+++ b/scripts/maintain/48_hybrid_search_filter_unsearchable.sql
@ -0,0 +1,89 @@
 -- 48_hybrid_search_filter_unsearchable.sql
 -- Rewrite hybrid_search_chunks to skip chunks with is_searchable=FALSE.
 -- This is the same function as before, just with an extra AND c.is_searchable
 -- in BOTH the bm25 and dense CTEs.
 -- Hybrid retrieval over public.chunks: a full-text ranking (bm25 CTE) and an
 -- embedding-distance ranking (dense CTE) are each cut to the top k rows and
 -- fused with reciprocal-rank fusion, score = 1/(rrf_k + rank) summed over both
 -- lists; a row missing from one list is scored there as rank k + 1.
 -- Rows with is_searchable = FALSE are excluded from both lists.
 --   q_text / q_embedding   query text and query vector
 --   q_lang                 'en' uses ts_en + en_unaccent, anything else ts_pt + pt_unaccent
 --   q_doc_id / q_type / q_classification   optional equality filters (NULL = no filter)
 --   q_ufo_only             when TRUE keep only rows with ufo_anomaly = TRUE
 --   k                      per-list cutoff and final row limit
 --   rrf_k                  fusion constant
 CREATE OR REPLACE FUNCTION public.hybrid_search_chunks(
  q_text text,
  q_embedding vector,
  q_lang text DEFAULT 'pt',
  q_doc_id text DEFAULT NULL,
  q_type text DEFAULT NULL,
  q_classification text DEFAULT NULL,
  q_ufo_only boolean DEFAULT FALSE,
  k integer DEFAULT 100,
  rrf_k integer DEFAULT 60
 )
 RETURNS TABLE(
  chunk_pk bigint,
  doc_id text,
  chunk_id text,
  page integer,
  type text,
  bbox jsonb,
  content_en text,
  content_pt text,
  classification text,
  score double precision,
  bm25_rank integer,
  dense_rank integer
 )
 LANGUAGE plpgsql STABLE
 AS $function$
 BEGIN
  RETURN QUERY
  WITH
  ts_q AS (
    SELECT CASE WHEN q_lang = 'en'
                THEN websearch_to_tsquery('public.en_unaccent'::regconfig, q_text)
                ELSE websearch_to_tsquery('public.pt_unaccent'::regconfig, q_text)
            END AS q
  ),
  bm25 AS (
    SELECT c.chunk_pk,
           row_number() OVER (ORDER BY
             ts_rank_cd(
               CASE WHEN q_lang = 'en' THEN c.ts_en ELSE c.ts_pt END,
               (SELECT q FROM ts_q)
             ) DESC NULLS LAST
           )::INT AS r
    FROM public.chunks c
    WHERE c.is_searchable
      AND (CASE WHEN q_lang = 'en' THEN c.ts_en ELSE c.ts_pt END) @@ (SELECT q FROM ts_q)
      AND (q_doc_id IS NULL OR c.doc_id = q_doc_id)
      AND (q_type IS NULL OR c.type = q_type)
      AND (q_classification IS NULL OR c.classification = q_classification)
      AND (NOT q_ufo_only OR c.ufo_anomaly = TRUE)
    -- LIMIT without ORDER BY keeps an unspecified subset of rows; order by
    -- the computed rank so the k best text matches are the ones retained
    -- (the dense CTE below already orders before its LIMIT).
    ORDER BY r
    LIMIT k
  ),
  dense AS (
    SELECT c.chunk_pk,
           row_number() OVER (ORDER BY c.embedding <=> q_embedding)::INT AS r
    FROM public.chunks c
    WHERE c.is_searchable
      AND c.embedding IS NOT NULL
      AND (q_doc_id IS NULL OR c.doc_id = q_doc_id)
      AND (q_type IS NULL OR c.type = q_type)
      AND (q_classification IS NULL OR c.classification = q_classification)
      AND (NOT q_ufo_only OR c.ufo_anomaly = TRUE)
    ORDER BY c.embedding <=> q_embedding
    LIMIT k
  ),
  fused AS (
    SELECT COALESCE(b.chunk_pk, d.chunk_pk) AS chunk_pk,
           ((1.0::DOUBLE PRECISION / (rrf_k + COALESCE(b.r, k + 1))::DOUBLE PRECISION) +
            (1.0::DOUBLE PRECISION / (rrf_k + COALESCE(d.r, k + 1))::DOUBLE PRECISION)) AS score,
           b.r AS bm25_rank,
           d.r AS dense_rank
    FROM bm25 b
    FULL OUTER JOIN dense d USING (chunk_pk)
  )
  SELECT c.chunk_pk, c.doc_id, c.chunk_id, c.page, c.type, c.bbox,
         c.content_en, c.content_pt, c.classification,
         f.score, f.bm25_rank, f.dense_rank
  FROM fused f
  JOIN public.chunks c USING (chunk_pk)
  ORDER BY f.score DESC
  LIMIT k;
 END
 $function$;
--- a/scripts/maintain/49_dedup_aggressive.py
+++ b/scripts/maintain/49_dedup_aggressive.py
@ -0,0 +1,313 @@
 #!/usr/bin/env python3
 """
 Aggressive entity deduplication — camada 1 (determinístico).
 Hoje há ~34.355 entidades; muitas são variações tipográficas, prefixos de
 papel (Mr./Dr./Major), ou OBJ-* gerados por chunk em vez de por evento.
 Este script faz merge em três camadas, todas alta-confiança:
  A. PROPER_NAME — pessoa com ≥2 tokens onde o nome próprio principal
     (último token significativo + primeiro nome) é único após strip de
     role prefixes. Ex: "Frank M. Brown", "Lt. Frank M. Brown",
     "Special Agent Frank M. Brown" → 1 entidade canônica.
  B. UAP_OBJECT_BY_EVENT — todos os OBJ-EV<year>-<EVENT>-NN do mesmo evento
     são colapsados em 1 OBJ-EV<year>-<EVENT>-00 (NN=00 = canonical).
  C. EXACT_NORMALIZED — após lowercase + strip de pontuação + strip de
     stopwords + strip de sufixos tipo " UAP" / " incident", strings
     idênticas viram 1 entidade.
 For each cluster:
  - Picks the canonical entity: one with summary_status "curated" first, then
    the longest aliases list, then the highest total_mentions, with
    alphabetical order as the final tie-break.
  - Unions aliases[], mentioned_in[], text_mentioned_in[], referenced_by[]
    and related[].
  - Recomputes signal_sources: page_refs/text_refs/cross_refs become the sizes
    of the merged lists (db_chunks keeps the canonical's value because it
    depends on the entity_pk).
  - Moves duplicates to wiki/entities/_archived/.
 Output: lista de merges (cluster → canonical), pra revisar antes de aplicar.
 Run:
  python3 scripts/maintain/49_dedup_aggressive.py --dry-run
  python3 scripts/maintain/49_dedup_aggressive.py  # apply
 """
 from __future__ import annotations
 import argparse
 import re
 import shutil
 import sys
 import unicodedata
 from pathlib import Path
 from collections import defaultdict
 from typing import Iterable
 import yaml
 # Root folder holding the entity markdown files (hard-coded absolute path).
 WIKI_ENT = Path("/Users/guto/ufo/wiki/entities")
 # Destination subtree for merged-away duplicates; main() skips it when loading.
 ARCHIVED = WIKI_ENT / "_archived"
 # One leading title/rank/role word, optionally followed by ".", plus the
 # whitespace after it. Anchored at the start and case-insensitive;
 # aggressive_normalize() applies it repeatedly to peel stacked prefixes.
 ROLE_PREFIX_RE = re.compile(
    r"^("
    r"mr|mrs|ms|dr|prof|sr|sra|sir|dame|lord|lady|"
    r"major|maj|colonel|col|lt|lieutenant|captain|capt|"
    r"general|gen|sergeant|sgt|corporal|cpl|private|pvt|admiral|adm|commander|cmdr|"
    r"agent|special agent|sa|director|deputy director|deputy|"
    r"reverend|rev|professor|"
    r"president|vice president|vp|chairman|secretary|"
    r"detective|det|inspector"
    r")\.?\s+",
    re.IGNORECASE,
 )
 # One leading article/preposition followed by whitespace.
 # NOTE(review): "a" is listed twice in the alternation; the repeat is
 # redundant but harmless.
 STOPWORD_PREFIX_RE = re.compile(r"^(the|a|an|o|os|a|as|de|do|da|dos|das|of|los|las|el|la|le|les)\s+", re.IGNORECASE)
 # Punctuation characters that aggressive_normalize() replaces with a space.
 PUNCT_RE = re.compile(r"[.,;:!?\"'\(\)\[\]_\-]")
 # Runs of whitespace, collapsed to a single space during normalization.
 WS_RE = re.compile(r"\s+")
 # A trailing generic word ("uap", "incident", ...) removed from the end of a name.
 NOISE_SUFFIX_RE = re.compile(r"\s+(uap|incident|case|sighting|event|observation)$", re.IGNORECASE)
 # OBJ-<prefix>-<event slug>-<two digits>. Groups 1 and 2 form the per-event
 # cluster key in dedup_pass_obj_by_event(); group 3 is the trailing counter.
 OBJ_ID_RE = re.compile(r"^OBJ-([A-Z0-9]+)-(.+?)-(\d{2})$")
 def ascii_fold(s: str) -> str:
    """Return ``s`` with accents removed.

    The text is decomposed with Unicode NFD and every combining mark is
    dropped, so "São" becomes "Sao". Characters without a decomposition are
    left untouched.
    """
    decomposed = unicodedata.normalize("NFD", s)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars)
 def aggressive_normalize(name: str) -> str:
    """Reduce a display name to the key used for exact-match clustering.

    Steps, in order: fold accents, trim and lowercase; peel up to three
    stacked role prefixes (e.g. "Special Agent Major Brown"); drop one leading
    article/preposition; turn punctuation into spaces and collapse whitespace;
    drop one trailing noise word such as " uap" or " incident".
    """
    text = ascii_fold(name).strip().lower()
    passes_left = 3
    while passes_left > 0:
        peeled = ROLE_PREFIX_RE.sub("", text)
        if peeled == text:
            # No role prefix left at the start of the name.
            break
        text = peeled
        passes_left -= 1
    text = STOPWORD_PREFIX_RE.sub("", text)
    text = WS_RE.sub(" ", PUNCT_RE.sub(" ", text)).strip()
    return NOISE_SUFFIX_RE.sub("", text).strip()
 # Maps a folder name to an entity_class value.
 # NOTE(review): presumably the subfolders of WIKI_ENT; this mapping is not
 # referenced by any function visible in this script — confirm before removing.
 FOLDER_TO_CLASS = {
    "people": "person",
    "organizations": "organization",
    "locations": "location",
    "events": "event",
    "uap-objects": "uap_object",
    "vehicles": "vehicle",
    "operations": "operation",
    "concepts": "concept",
 }
 def load_entity(path: Path) -> dict | None:
    try:
        text = path.read_text(encoding="utf-8")
        if not text.startswith("---"):
            return None
        parts = text.split("---", 2)
        if len(parts) < 3: return None
        fm = yaml.safe_load(parts[1]) or {}
        body = parts[2]
        return {"path": path, "fm": fm, "body": body, "raw": text}
    except Exception as e:
        return None
 def dump_entity(entity: dict) -> str:
    """Serialize an entity back to markdown: fenced YAML front matter plus body.

    The front matter keeps its key order, allows non-ASCII characters and uses
    a wide line limit (1000). ``entity["body"]`` is appended verbatim after
    the closing fence.
    """
    front_matter = yaml.safe_dump(
        entity["fm"], sort_keys=False, allow_unicode=True, width=1000
    )
    return "".join(("---\n", front_matter, "---", entity["body"]))
 def dedup_pass_obj_by_event(entities: list[dict]) -> dict[str, list[dict]]:
    """Cluster ``uap_object`` entities whose ids differ only in the trailing counter.

    The id is taken from ``uap_object_id`` (falling back to ``entity_id``) and
    must match ``OBJ_ID_RE``; the cluster key is ``OBJ-<group 1>-<group 2>``.
    Only clusters with at least two members are returned.
    """
    by_event: dict[str, list[dict]] = defaultdict(list)
    for ent in entities:
        front = ent["fm"]
        if front.get("entity_class") != "uap_object":
            continue
        object_id = front.get("uap_object_id") or front.get("entity_id") or ""
        parsed = OBJ_ID_RE.match(object_id)
        if parsed is None:
            continue
        prefix, event_slug = parsed.group(1), parsed.group(2)
        by_event[f"OBJ-{prefix}-{event_slug}"].append(ent)
    return {key: members for key, members in by_event.items() if len(members) >= 2}
 def dedup_pass_proper_name(entities: list[dict]) -> dict[str, list[dict]]:
    """Cluster named entities whose ``canonical_name`` normalizes to the same key.

    Applies to person, organization, event, location, operation, concept and
    vehicle entities. The key is ``<entity_class>::<aggressive_normalize(name)>``.
    A normalized name made of a single token is only considered when it has
    at least 8 characters, which keeps short surnames such as "smith" or
    "brown" from colliding. Only clusters with two or more members are returned.
    """
    eligible_classes = ("person", "organization", "event", "location", "operation",
                        "concept", "vehicle")
    buckets: dict[str, list[dict]] = defaultdict(list)
    for ent in entities:
        front = ent["fm"]
        entity_class = front.get("entity_class")
        display_name = front.get("canonical_name") or ""
        if entity_class not in eligible_classes or not display_name:
            continue
        normalized = aggressive_normalize(display_name)
        if not normalized:
            continue
        single_token = len(normalized.split()) < 2
        if single_token and len(normalized) < 8:
            # Too short and too ambiguous to auto-merge.
            continue
        buckets[f"{entity_class}::{normalized}"].append(ent)
    return {key: group for key, group in buckets.items() if len(group) > 1}
 def choose_canonical(cluster: list[dict]) -> dict:
    """Pick the entity that survives a merge.

    Preference order: ``summary_status == "curated"``, then the longest
    ``aliases`` list, then the highest ``total_mentions``, then the
    alphabetically first ``canonical_name``. The whole name is compared, so
    the result does not depend on the order of ``cluster`` unless two members
    tie on every criterion.

    Args:
        cluster: non-empty list of loaded entities (dicts with an ``"fm"`` key).

    Returns:
        The chosen member of ``cluster`` (the same object, not a copy).
    """
    def rank(entity: dict) -> tuple:
        fm = entity["fm"]
        curated = 1 if fm.get("summary_status") == "curated" else 0
        alias_count = len(fm.get("aliases") or [])
        mentions = fm.get("total_mentions") or 0
        name = str(fm.get("canonical_name") or "")
        # Negate the "bigger is better" fields so min() can sort the name
        # ascending in the same key.
        return (-curated, -alias_count, -mentions, name)
    return min(cluster, key=rank)
 def merge_into(canonical: dict, duplicates: list[dict]) -> None:
    """Fold the front matter of ``duplicates`` into ``canonical["fm"]`` in place.

    The list fields aliases, mentioned_in, text_mentioned_in, referenced_by
    and related become the sorted union of the canonical's and the
    duplicates' values. Every ``canonical_name`` involved (the canonical's own
    included) ends up in ``aliases``. ``documents_count``, ``signal_sources``,
    ``total_mentions`` and ``signal_strength`` are then recomputed from the
    merged lists; ``db_chunks`` keeps the canonical's value, and a missing or
    null value counts as 0.

    Args:
        canonical: surviving entity; its ``"fm"`` dict is mutated.
        duplicates: entities being merged away; they are only read.
    """
    cfm = canonical["fm"]
    list_fields = ("aliases", "mentioned_in", "text_mentioned_in", "referenced_by", "related")
    merged: dict[str, set] = {field: set(cfm.get(field) or []) for field in list_fields}
    merged["aliases"].add(cfm.get("canonical_name", ""))
    for dup in duplicates:
        dfm = dup["fm"]
        dup_name = dfm.get("canonical_name")
        if dup_name:
            merged["aliases"].add(dup_name)
        for field in list_fields:
            merged[field].update(dfm.get(field) or [])
    merged["aliases"].discard("")
    merged["aliases"].discard(None)
    for field in list_fields:
        cfm[field] = sorted(merged[field])
    # The text before the first "/" (leading "[" stripped) is counted as the
    # document part of a mention.
    cfm["documents_count"] = len(
        {m.split("/")[0].lstrip("[") for m in merged["mentioned_in"]}
    )
    # page_refs/text_refs/cross_refs are recounted from the merged lists.
    sigs = cfm.get("signal_sources") or {"db_chunks": 0, "cross_refs": 0}
    sigs["page_refs"] = len(merged["mentioned_in"])
    sigs["text_refs"] = len(merged["text_mentioned_in"])
    sigs["cross_refs"] = len(merged["referenced_by"])
    # `or 0` also covers an explicit null in the YAML, which int() rejects.
    sigs["db_chunks"] = int(sigs.get("db_chunks") or 0)
    cfm["signal_sources"] = sigs
    total = sigs["db_chunks"] + sigs["page_refs"] + sigs["cross_refs"] + sigs["text_refs"]
    cfm["total_mentions"] = total
    if total == 0:
        cfm["signal_strength"] = "orphan"
    elif (sigs["db_chunks"] >= 3 or sigs["page_refs"] >= 3
          or (sigs["db_chunks"] >= 1 and sigs["page_refs"] >= 1)
          or sigs["text_refs"] >= 5):
        cfm["signal_strength"] = "strong"
    else:
        cfm["signal_strength"] = "weak"
 def archive_path(p: Path) -> Path:
    """Map an entity file under WIKI_ENT to the same relative location under ARCHIVED.

    Raises ValueError (from ``Path.relative_to``) when ``p`` is not inside WIKI_ENT.
    """
    return ARCHIVED.joinpath(p.relative_to(WIKI_ENT))
 def main() -> int:
    """Command-line entry point: plan the deterministic merges, then apply them.

    Flags:
        --dry-run     print the merge plan and stop without writing anything.
        --limit-pass  run only the OBJ-by-event pass ("obj"), only the
                      proper-name pass ("name"), or both ("all", the default).

    Loads every ``type: entity`` markdown file under WIKI_ENT (skipping
    ``_archived``), builds clusters with the selected passes, picks a canonical
    per cluster and prints the plan. Unless ``--dry-run`` is given, each
    canonical is rewritten with the merged front matter and its duplicates are
    moved under ARCHIVED.

    Returns:
        0 as the process exit status.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--dry-run", action="store_true")
    ap.add_argument("--limit-pass", choices=["all", "obj", "name"], default="all")
    args = ap.parse_args()
    print(f"Loading entities from {WIKI_ENT} ...")
    all_entities: list[dict] = []
    # Sorted so the plan does not depend on filesystem enumeration order.
    for f in sorted(WIKI_ENT.rglob("*.md")):
        if "_archived" in f.parts:
            continue
        ent = load_entity(f)
        if ent and ent["fm"].get("type") == "entity":
            all_entities.append(ent)
    print(f"  loaded {len(all_entities)} entities")
    # Run dedup passes
    clusters: dict[str, list[dict]] = {}
    if args.limit_pass in ("all", "obj"):
        obj_clusters = dedup_pass_obj_by_event(all_entities)
        print(f"\nPass A — OBJ by event: {len(obj_clusters)} clusters ({sum(len(v) for v in obj_clusters.values())} entities → {len(obj_clusters)} canonicals)")
        clusters.update({f"OBJ::{k}": v for k, v in obj_clusters.items()})
    if args.limit_pass in ("all", "name"):
        name_clusters = dedup_pass_proper_name(all_entities)
        print(f"Pass B/C — proper-name normalize: {len(name_clusters)} clusters ({sum(len(v) for v in name_clusters.values())} entities → {len(name_clusters)} canonicals)")
        clusters.update({f"NAME::{k}": v for k, v in name_clusters.items()})
    # An entity may take part in only one merge, even if several passes
    # clustered it.
    seen_paths: set[str] = set()
    plans: list[tuple[str, dict, list[dict]]] = []
    for ckey, cluster in clusters.items():
        cluster = [e for e in cluster if str(e["path"]) not in seen_paths]
        if len(cluster) < 2:
            continue
        canonical = choose_canonical(cluster)
        duplicates = [e for e in cluster if e is not canonical]
        for e in cluster:
            seen_paths.add(str(e["path"]))
        plans.append((ckey, canonical, duplicates))
    plans.sort(key=lambda p: -len(p[2]))  # biggest clusters first
    redundant_total = sum(len(d) for _, _, d in plans)
    # Guard the percentage: an empty or unreadable WIKI_ENT loads zero entities.
    reduction = 100 * redundant_total / len(all_entities) if all_entities else 0.0
    print("\n=== Merge plan ===")
    print(f"  clusters: {len(plans)}")
    print(f"  entities removed: {redundant_total}")
    print(f"  before: {len(all_entities)}  →  after: {len(all_entities) - redundant_total}")
    print(f"  reduction: {reduction:.1f}%\n")
    print("=== Top 20 biggest merges ===")
    for ckey, canonical, dupes in plans[:20]:
        cname = canonical["fm"].get("canonical_name", "?")
        print(f"  {len(dupes)+1:>3} entities → '{cname}'  ({ckey.split('::')[0]})")
        for d in dupes[:4]:
            print(f"        ✗ {d['fm'].get('canonical_name', '?')}")
        if len(dupes) > 4:
            print(f"        ... +{len(dupes)-4}")
    if args.dry_run:
        print("\n(dry-run; nothing written)")
        return 0
    # Apply merges
    print("\nApplying merges ...")
    merged_count = 0
    archived_count = 0
    for ckey, canonical, dupes in plans:
        merge_into(canonical, dupes)
        canonical["path"].write_text(dump_entity(canonical), encoding="utf-8")
        merged_count += 1
        for d in dupes:
            archive_to = archive_path(d["path"])
            archive_to.parent.mkdir(parents=True, exist_ok=True)
            shutil.move(str(d["path"]), str(archive_to))
            archived_count += 1
    print(f"  canonicals updated: {merged_count}")
    print(f"  duplicates archived: {archived_count}")
    return 0
 # Run main() only when executed as a script; its return value becomes the
 # process exit status.
 if __name__ == "__main__":
    sys.exit(main())
--- a/scripts/maintain/50_dedup_fuzzy_trigram.py
+++ b/scripts/maintain/50_dedup_fuzzy_trigram.py
@ -0,0 +1,428 @@
 #!/usr/bin/env python3
 """
 Aggressive entity deduplication — camada 2 (fuzzy trigram).
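 For each entity_class, compares ALL remaining entities via trigram similarity
 (Postgres pg_trgm). A pair is merged automatically when all of these hold:
  - same entity class and similarity >= 0.85 (--threshold);
  - if either name has at most one significant token, similarity >= 0.92
    (--threshold-short; stricter, because short names collide more easily);
  - the names share at least one significant token after stripping role
    prefixes and stopwords;
  - numeric signatures, trailing short code suffixes and variant markers
    (single letters, roman numerals, ordinals) do not differ;
  - the entity file still exists in its active folder (not already archived).
 NOTE: the shared-context check described for ambiguous single-word surnames
 such as "Smith" (same page / same document in most mentions) is not
 implemented in this script; such names only get the stricter threshold above.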
 Run:
  DATABASE_URL=postgres://... python3 scripts/maintain/50_dedup_fuzzy_trigram.py --dry-run
 """
 from __future__ import annotations
 import argparse
 import os
 import re
 import shutil
 import sys
 import unicodedata
 from collections import defaultdict
 from pathlib import Path
 import psycopg
 import yaml
 # Root folder holding the entity markdown files (hard-coded absolute path).
 WIKI_ENT = Path("/Users/guto/ufo/wiki/entities")
 # Destination subtree where main() moves merged-away duplicates.
 ARCHIVED = WIKI_ENT / "_archived"
 # One leading title/rank/role word, optionally followed by ".", plus the
 # whitespace after it. Anchored at the start and case-insensitive;
 # strip_roles() applies it repeatedly to peel stacked prefixes.
 ROLE_PREFIX_RE = re.compile(
    r"^("
    r"mr|mrs|ms|dr|prof|sr|sra|sir|dame|lord|lady|"
    r"major|maj|colonel|col|lt|lieutenant|captain|capt|"
    r"general|gen|sergeant|sgt|corporal|cpl|private|pvt|admiral|adm|commander|cmdr|"
    r"agent|special agent|sa|director|deputy director|deputy|"
    r"reverend|rev|professor|"
    r"president|vice president|vp|chairman|secretary|"
    r"detective|det|inspector"
    r")\.?\s+",
    re.IGNORECASE,
 )
 def ascii_fold(s: str) -> str:
    """Strip accents: NFD-decompose ``s`` and drop every combining mark."""
    without_marks = filter(
        lambda ch: not unicodedata.combining(ch),
        unicodedata.normalize("NFD", s),
    )
    return "".join(without_marks)
 def strip_roles(name: str) -> str:
    """Remove up to three stacked role prefixes from the start of ``name``.

    The prefix pattern is anchored at the very first character, so leading
    whitespace prevents a match. The result is trimmed before returning.
    """
    remaining = name
    passes_left = 3
    while passes_left:
        candidate = ROLE_PREFIX_RE.sub("", remaining)
        if candidate == remaining:
            break
        remaining, passes_left = candidate, passes_left - 1
    return remaining.strip()
 def core_tokens(name: str) -> set[str]:
    """Return the significant lowercase tokens of ``name``.

    Role prefixes are stripped, accents folded and punctuation turned into
    spaces. Tokens of a single character and the listed articles,
    prepositions and name particles are then discarded.
    """
    ignored = {
        "the", "of", "and", "de", "do", "da", "dos", "das", "el", "la", "los", "las",
        "a", "an", "o", "as", "os", "le", "les", "von", "van"
    }
    folded = ascii_fold(strip_roles(name).lower())
    cleaned = re.sub(r"[.,;:!?\"'\(\)\[\]_]", " ", folded)
    return {tok for tok in cleaned.split() if len(tok) > 1 and tok not in ignored}
 # Tokens that mix letters and digits (II-22, B-6, mode4, district17, 17th, 3rd)
 # These are SIGNIFICANT modifiers — if they differ between two names, the
 # names refer to DIFFERENT things.
 # NOTE(review): NUMERIC_TOKEN_RE is not referenced by any function visible in
 # this script; numeric_signature() uses its own inline patterns.
 NUMERIC_TOKEN_RE = re.compile(r"^[a-z]*\d+[a-z]*$|^\d+[a-z]+$|^[a-z]+-?\d+[a-z]*$|^[ivxlcdm]+-?\d+$", re.IGNORECASE)
 # Trailing short code at the very end of a name: 1-3 uppercase letters after
 # " - " or "-" (group 1), or one uppercase letter after whitespace (group 2).
 CODE_SUFFIX_RE = re.compile(r"(?:\s-\s|-)([A-Z]{1,3})$|\s([A-Z])$")
 def code_suffix(name: str) -> str | None:
    """Extract trailing short code (1-3 uppercase letters) like ' - Z',
    ' M', '-R'. These often denote sub-categories that differ semantically
    (FBI classification subdivisions, military variants)."""
    s = name.strip()
    m = CODE_SUFFIX_RE.search(s)
    if not m: return None
    code = (m.group(1) or m.group(2) or "").upper()
    return code if code else None
 # Lowercase roman numerals i..xxx, treated as series markers by
 # is_variant_marker().
 ROMAN_NUMERALS = {
    "i","ii","iii","iv","v","vi","vii","viii","ix","x",
    "xi","xii","xiii","xiv","xv","xvi","xvii","xviii","xix","xx",
    "xxi","xxii","xxiii","xxiv","xxv","xxvi","xxvii","xxviii","xxix","xxx",
 }
 # Lowercase ordinal words: English first..twentieth and Portuguese
 # primeiro..duodecimo, spelled without accents.
 ORDINAL_WORDS = {
    "first","second","third","fourth","fifth","sixth","seventh","eighth",
    "ninth","tenth","eleventh","twelfth","thirteenth","fourteenth","fifteenth",
    "sixteenth","seventeenth","eighteenth","nineteenth","twentieth",
    "primeiro","segundo","terceiro","quarto","quinto","sexto","setimo",
    "oitavo","nono","decimo","undecimo","duodecimo",
 }
 def is_variant_marker(tok: str) -> bool:
    """Tell whether ``tok`` distinguishes one member of a series from another.

    True for all-digit tokens, roman numerals and ordinal words (matched
    case-insensitively), and for a single uppercase letter such as the 'A' in
    'Pioneer A'. A single lowercase letter is not a marker.
    """
    lowered = tok.lower()
    if lowered.isdigit() or lowered in ROMAN_NUMERALS or lowered in ORDINAL_WORDS:
        return True
    return len(tok) == 1 and tok.isalpha() and tok.isupper()
 def single_letter_token_diff(name_a: str, name_b: str) -> bool:
    """Tell whether two names differ ONLY by variant markers.

    Variant markers are single uppercase letters, roman numerals, ordinals and
    digits, e.g.:
        Pioneer Launch          vs PIONEER A Launch       (single letter)
        PIONEER-B Launch        vs PIONEER-C Launch
        XII Tactical Air Cmd    vs XIII Tactical Air Cmd  (romans)
        Ninth Air Force         vs Tenth Air Force        (ordinals)
        Apollo                  vs Apollo 11              (digit)
    Such pairs are variants of one program, not the same instance, so the
    caller should not merge them. Returns False when either name has no
    tokens, when the token multisets are equal, or when any differing token
    is an ordinary word.
    """
    from collections import Counter

    def split_tokens(raw: str) -> list[str]:
        # Hyphens and underscores separate tokens; original case is preserved.
        return re.findall(r"\b[\w]+\b", re.sub(r"[-_]", " ", ascii_fold(raw)))

    originals_a = split_tokens(name_a)
    originals_b = split_tokens(name_b)
    if not originals_a or not originals_b:
        return False
    # Compare case-insensitively, as multisets so repeated tokens count.
    counts_a = Counter(tok.lower() for tok in originals_a)
    counts_b = Counter(tok.lower() for tok in originals_b)
    only_in_a = list((counts_a - counts_b).elements())
    only_in_b = list((counts_b - counts_a).elements())
    if not only_in_a and not only_in_b:
        return False

    def distinguishes(lowered: str, originals: list[str]) -> bool:
        if is_variant_marker(lowered):
            return True
        # The lowercased token hides the case of a single letter; look it up in
        # the original tokens to see whether it was written in uppercase.
        if len(lowered) == 1 and lowered.isalpha():
            return any(o.lower() == lowered and o.isupper() for o in originals)
        return False

    return (all(distinguishes(tok, originals_a) for tok in only_in_a)
            and all(distinguishes(tok, originals_b) for tok in only_in_b))
 def numeric_signature(name: str) -> frozenset[str]:
    """Extract all numeric/ordinal/serial tokens from a name.
    Two names with DIFFERENT numeric signatures CANNOT be merged."""
    s = ascii_fold(name.lower())
    s = re.sub(r"[.,;:!?\"'\(\)\[\]_]", " ", s)
    # Extract all tokens that contain at least one digit
    nums = set()
    for t in re.findall(r"\b[\w-]+\b", s):
        # Pure number
        if re.fullmatch(r"\d+(st|nd|rd|th)?", t):
            # Normalize "17th" → "17"
            nums.add(re.sub(r"(st|nd|rd|th)$", "", t))
        # Letter + digit (II-22, b-6, mode4)
        elif re.search(r"\d", t):
            # Normalize "II-22" / "ii-22" → "ii22"; "b-6" → "b6"
            nums.add(re.sub(r"[-\s]", "", t))
    return frozenset(nums)
 # Maps a folder name to an entity_class value.
 FOLDER_TO_CLASS = {
    "people": "person",
    "organizations": "organization",
    "locations": "location",
    "events": "event",
    "uap-objects": "uap_object",
    "vehicles": "vehicle",
    "operations": "operation",
    "concepts": "concept",
 }
 # Inverse mapping (entity_class -> folder), used by entity_path_for() to find
 # an entity's file under WIKI_ENT.
 CLASS_TO_FOLDER = {v: k for k, v in FOLDER_TO_CLASS.items()}
 def load_entity(path: Path) -> dict | None:
    try:
        text = path.read_text(encoding="utf-8")
        if not text.startswith("---"): return None
        parts = text.split("---", 2)
        if len(parts) < 3: return None
        fm = yaml.safe_load(parts[1]) or {}
        body = parts[2]
        return {"path": path, "fm": fm, "body": body}
    except Exception:
        return None
 def dump_entity(entity: dict) -> str:
    """Render an entity as markdown: ``---`` fenced YAML front matter, then the body.

    Key order is preserved, non-ASCII text is written as-is and lines are not
    wrapped below 1000 columns.
    """
    header = yaml.safe_dump(entity["fm"], sort_keys=False, allow_unicode=True, width=1000)
    pieces = ["---\n", header, "---", entity["body"]]
    return "".join(pieces)
 def entity_path_for(cls: str, entity_id: str) -> Path | None:
    folder = CLASS_TO_FOLDER.get(cls)
    if not folder: return None
    p = WIKI_ENT / folder / f"{entity_id}.md"
    return p if p.exists() else None
 def merge_into(canonical: dict, duplicate: dict) -> None:
    """Fold one duplicate's front matter into ``canonical["fm"]`` in place.

    The list fields aliases, mentioned_in, text_mentioned_in, referenced_by
    and related become the sorted union of both entities' values; both
    canonical names end up in ``aliases``. ``documents_count``,
    ``signal_sources``, ``total_mentions`` and ``signal_strength`` are then
    recomputed from the merged lists. ``db_chunks`` keeps the canonical's
    value, and a missing or null value counts as 0.

    Args:
        canonical: surviving entity; its ``"fm"`` dict is mutated.
        duplicate: entity being merged away; it is only read.
    """
    cfm = canonical["fm"]
    dfm = duplicate["fm"]
    list_fields = ("aliases", "mentioned_in", "text_mentioned_in", "referenced_by", "related")
    merged: dict[str, set] = {
        field: set(cfm.get(field) or []) | set(dfm.get(field) or [])
        for field in list_fields
    }
    merged["aliases"].add(cfm.get("canonical_name", ""))
    if dfm.get("canonical_name"):
        merged["aliases"].add(dfm["canonical_name"])
    merged["aliases"].discard("")
    merged["aliases"].discard(None)
    for field in list_fields:
        cfm[field] = sorted(merged[field])
    # The text before the first "/" (leading "[" stripped) is counted as the
    # document part of a mention.
    cfm["documents_count"] = len({m.split("/")[0].lstrip("[") for m in cfm["mentioned_in"]})
    sigs = cfm.get("signal_sources") or {"db_chunks": 0, "cross_refs": 0}
    sigs["page_refs"] = len(cfm["mentioned_in"])
    sigs["text_refs"] = len(cfm["text_mentioned_in"])
    sigs["cross_refs"] = len(cfm["referenced_by"])
    # `or 0` also covers an explicit null in the YAML, which int() rejects.
    sigs["db_chunks"] = int(sigs.get("db_chunks") or 0)
    cfm["signal_sources"] = sigs
    total = sigs["db_chunks"] + sigs["page_refs"] + sigs["cross_refs"] + sigs["text_refs"]
    cfm["total_mentions"] = total
    if total == 0:
        cfm["signal_strength"] = "orphan"
    elif (sigs["db_chunks"] >= 3 or sigs["page_refs"] >= 3
          or (sigs["db_chunks"] >= 1 and sigs["page_refs"] >= 1)
          or sigs["text_refs"] >= 5):
        cfm["signal_strength"] = "strong"
    else:
        cfm["signal_strength"] = "weak"
 def choose_canonical(a: dict, b: dict) -> tuple[dict, dict]:
    """Order a pair as ``(canonical, duplicate)``.

    Entities are ranked by: curated summary, number of aliases,
    ``total_mentions``, then length of ``canonical_name``. On a full tie
    ``a`` is kept as the canonical.
    """
    def rank(entity: dict) -> tuple:
        front = entity["fm"]
        is_curated = 1 if front.get("summary_status") == "curated" else 0
        alias_count = len(front.get("aliases") or [])
        mentions = front.get("total_mentions") or 0
        name_length = len(front.get("canonical_name") or "")
        return (is_curated, alias_count, mentions, name_length)

    return (a, b) if rank(a) >= rank(b) else (b, a)
 def main() -> int:
    """Command-line entry point: find fuzzy duplicates via pg_trgm and merge them.

    Flags:
        --dry-run          print the candidate clusters and stop.
        --threshold        trigram similarity threshold in [0, 1] (default 0.85).
        --threshold-short  stricter threshold used when either name has at
                           most one significant token (default 0.92).
        --limit            apply at most N merges; ``--limit 0`` applies none.

    The database URL is read from DATABASE_URL (or SUPABASE_DB_URL). Candidate
    pairs come from a self-join of ``entities`` within the same class using the
    pg_trgm ``%`` operator. Each pair is then filtered in Python: shared
    significant token, stricter threshold for short names, equal numeric
    signature, equal code suffix, and no variant-marker difference. Accepted
    pairs are grouped with union-find; for each cluster the best-ranked entity
    is rewritten with the merged front matter and the others are moved under
    ARCHIVED.

    Returns:
        0 as the process exit status. Exits early through ``sys.exit`` when no
        database URL is configured, and through ``argparse`` when
        ``--threshold`` is out of range.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--dry-run", action="store_true")
    ap.add_argument("--threshold", type=float, default=0.85,
                    help="trigram similarity threshold (0..1)")
    ap.add_argument("--threshold-short", type=float, default=0.92,
                    help="higher threshold for single-token names")
    ap.add_argument("--limit", type=int, default=None,
                    help="apply at most N merges (for cautious runs)")
    args = ap.parse_args()
    # The threshold is interpolated into a SET statement below, so reject
    # anything that is not a plain number in the documented range (this also
    # rejects NaN).
    if not 0.0 <= args.threshold <= 1.0:
        ap.error("--threshold must be between 0 and 1")
    dburl = os.environ.get("DATABASE_URL") or os.environ.get("SUPABASE_DB_URL")
    if not dburl: sys.exit("DATABASE_URL not set")
    with psycopg.connect(dburl) as conn:
        with conn.cursor() as cur:
            cur.execute(f"SET pg_trgm.similarity_threshold = {args.threshold}")
            # All entity pairs above threshold in the SAME class, where a > b (avoid duplicates)
            cur.execute("""
                SELECT e1.entity_class,
                       e1.entity_id,  e1.canonical_name,
                       e2.entity_id,  e2.canonical_name,
                       similarity(e1.canonical_name, e2.canonical_name) AS sim
                FROM entities e1
                JOIN entities e2
                  ON e1.entity_class = e2.entity_class
                 AND e1.entity_id    < e2.entity_id
                 AND e1.canonical_name % e2.canonical_name
                ORDER BY sim DESC
            """)
            pairs = cur.fetchall()
    print(f"Trigram candidate pairs (sim >= {args.threshold}): {len(pairs)}")
    # Filter pairs by:
    #  - share at least 1 significant core token (avoids "United States" matching "United Kingdom")
    #  - if both names are single-token AFTER role strip, require higher threshold
    accepted = []
    rejected_short = 0
    rejected_no_overlap = 0
    # Also counts code-suffix and variant-marker rejections (reported together).
    rejected_numeric = 0
    for cls, id_a, name_a, id_b, name_b, sim in pairs:
        toks_a = core_tokens(name_a or "")
        toks_b = core_tokens(name_b or "")
        if not toks_a or not toks_b:
            rejected_no_overlap += 1; continue
        # Must share at least one significant token
        if not (toks_a & toks_b):
            rejected_no_overlap += 1; continue
        # If one side is single-token, require stricter threshold
        if (len(toks_a) <= 1 or len(toks_b) <= 1) and sim < args.threshold_short:
            rejected_short += 1; continue
        # NUMERIC SAFEGUARD: if numeric signatures differ, the names refer to
        # different objects (NAVSTAR II-2 vs II-24, Mode 3 vs Mode 4,
        # 17th District vs 13th District, etc). Reject.
        sig_a = numeric_signature(name_a or "")
        sig_b = numeric_signature(name_b or "")
        if sig_a != sig_b:
            rejected_numeric += 1; continue
        # CODE SUFFIX SAFEGUARD: if EITHER name has a short code suffix
        # (1-3 uppercase letters), they must have IDENTICAL suffixes.
        # 'INTERNAL SECURITY - Z' ≠ 'INTERNAL SECURITY - X' ≠ 'INTERNAL SECURITY' (base).
        cs_a = code_suffix(name_a or "")
        cs_b = code_suffix(name_b or "")
        if (cs_a or cs_b) and cs_a != cs_b:
            rejected_numeric += 1; continue
        # SINGLE-LETTER VARIANT TOKEN: 'PIONEER A Launch' vs 'PIONEER-B Launch'
        # vs 'Pioneer Launch' are distinct missions of the same program.
        if single_letter_token_diff(name_a or "", name_b or ""):
            rejected_numeric += 1; continue
        accepted.append((cls, id_a, name_a, id_b, name_b, sim))
    print(f"  rejected (no token overlap):           {rejected_no_overlap}")
    print(f"  rejected (single-token below {args.threshold_short}):   {rejected_short}")
    print(f"  rejected (numeric signature mismatch): {rejected_numeric}")
    print(f"  ACCEPTED for merge: {len(accepted)}")
    # Build a union-find over accepted pairs so transitive clusters merge correctly
    parent: dict[tuple[str, str], tuple[str, str]] = {}
    def find(x):
        # Walk up to the root, pointing each visited node at its grandparent.
        while parent.get(x, x) != x:
            parent[x] = parent.get(parent[x], parent[x])
            x = parent[x]
        return x
    def union(x, y):
        rx, ry = find(x), find(y)
        if rx != ry: parent[ry] = rx
    for cls, id_a, _, id_b, _, _ in accepted:
        a = (cls, id_a); b = (cls, id_b)
        parent.setdefault(a, a); parent.setdefault(b, b)
        union(a, b)
    clusters: dict[tuple[str, str], list[tuple[str, str]]] = defaultdict(list)
    for node in list(parent.keys()):
        clusters[find(node)].append(node)
    clusters = {k: v for k, v in clusters.items() if len(v) > 1}
    print(f"\nClusters after union-find: {len(clusters)}")
    print(f"Entities to remove: {sum(len(v) - 1 for v in clusters.values())}\n")
    # Sample biggest
    biggest = sorted(clusters.values(), key=lambda c: -len(c))[:15]
    print("=== Top 15 biggest fuzzy clusters ===")
    for cluster in biggest:
        # Load names for display
        names = []
        for cls, eid in cluster:
            p = entity_path_for(cls, eid)
            if p:
                ent = load_entity(p)
                if ent: names.append(ent["fm"].get("canonical_name") or eid)
        if not names: continue
        cls = cluster[0][0]
        print(f"  [{cls}] {len(cluster)} entities:")
        for n in names[:6]: print(f"     - {n}")
        if len(names) > 6: print(f"     ... +{len(names)-6}")
    if args.dry_run:
        print("\n(dry-run; nothing written)")
        return 0
    # Apply merges
    print("\nApplying merges ...")
    applied = 0
    archived = 0
    for cluster in clusters.values():
        # `is not None` so that an explicit --limit 0 applies nothing instead
        # of being mistaken for "no limit".
        if args.limit is not None and applied >= args.limit: break
        # Load all entities; members whose file is gone (e.g. already
        # archived) are skipped.
        loaded = []
        for cls, eid in cluster:
            p = entity_path_for(cls, eid)
            if p:
                ent = load_entity(p)
                if ent: loaded.append(ent)
        if len(loaded) < 2: continue
        # Pick canonical: highest score
        canonical = max(loaded, key=lambda e: (
            1 if e["fm"].get("summary_status") == "curated" else 0,
            len(e["fm"].get("aliases") or []),
            e["fm"].get("total_mentions") or 0,
            len(e["fm"].get("canonical_name") or ""),
        ))
        dupes = [e for e in loaded if e is not canonical]
        for d in dupes:
            merge_into(canonical, d)
        canonical["path"].write_text(dump_entity(canonical), encoding="utf-8")
        for d in dupes:
            rel = d["path"].relative_to(WIKI_ENT)
            arch = ARCHIVED / rel
            arch.parent.mkdir(parents=True, exist_ok=True)
            shutil.move(str(d["path"]), str(arch))
            archived += 1
        applied += 1
    print(f"  canonicals updated: {applied}")
    print(f"  duplicates archived: {archived}")
    return 0
 # Run main() only when executed as a script; its return value becomes the
 # process exit status.
 if __name__ == "__main__":
    sys.exit(main())
--- a/scripts/maintain/51_remap_entity_mentions.py
+++ b/scripts/maintain/51_remap_entity_mentions.py
@ -0,0 +1,159 @@
 #!/usr/bin/env python3
 """
 After scripts 49/50 archive duplicates, the DB's `entities` + `entity_mentions`
 tables still point at the OLD (now-archived) entity_pks. This script:
  1. Re-reads every active entity YAML from wiki/entities/ (skipping _archived)
  2. For each entity, looks up the corresponding entity_pk in the DB by
     (entity_class, entity_id).
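  3. Treats every DB entity whose (entity_class, entity_id) is no longer active
     on disk as merged-away, and resolves it to a canonical entity by matching
     its DB canonical_name against the active YAMLs' aliases[] / canonical_name.
  4. Re-points entity_mentions from each archived entity_pk to the canonical
     entity_pk (INSERT ... ON CONFLICT DO NOTHING, then DELETE of the old rows).
  5. DELETEs the remapped (archived) rows from entities. Archived entities with
     no resolvable canonical are only counted as orphans and left untouched.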
 Idempotent — re-running is a no-op once converged.
 """
 from __future__ import annotations
 import os
 import sys
 from pathlib import Path
 import psycopg
 import yaml
 WIKI_ENT = Path("/Users/guto/ufo/wiki/entities")
 ARCHIVED = WIKI_ENT / "_archived"
 def _front_matter(path: Path) -> dict | None:
    """Return the YAML front matter of an entity file as a dict, or None.

    None is returned when the file cannot be read, does not start with
    ``---``, fails to parse, or parses to something other than a mapping.
    This is deliberately best-effort: one malformed file must not abort the
    whole remap.
    """
    try:
        text = path.read_text(encoding="utf-8")
        if not text.startswith("---"):
            return None
        fm = yaml.safe_load(text.split("---")[1]) or {}
    except Exception:
        return None
    return fm if isinstance(fm, dict) else None


 def _entity_key(fm: dict) -> tuple[str, str] | None:
    """Return (entity_class, entity_id) for a front matter dict, or None.

    The id is read from ``entity_id`` and falls back to ``<entity_class>_id``.
    """
    cls = fm.get("entity_class")
    if not cls:
        return None
    eid = fm.get("entity_id") or fm.get(f"{cls}_id")
    if not eid:
        return None
    return (cls, eid)


 def main() -> int:
    """Re-point DB mentions from archived duplicate entities to their canonicals.

    Reads the connection string from DATABASE_URL (or SUPABASE_DB_URL), scans
    the entity files under WIKI_ENT, then rewrites ``entity_mentions`` and
    ``entities`` and commits once at the end.

    Returns:
        0 on completion. Exits through ``sys.exit`` when no connection string
        is configured.
    """
    dburl = os.environ.get("DATABASE_URL") or os.environ.get("SUPABASE_DB_URL")
    if not dburl: sys.exit("DATABASE_URL not set")

    # One pass over the active YAMLs builds both the active key set and the
    # (class, lowercased alias or canonical name) -> canonical key index.
    print("Scanning active YAMLs ...")
    active: set[tuple[str, str]] = set()
    alias_index: dict[tuple[str, str], tuple[str, str]] = {}
    for f in WIKI_ENT.rglob("*.md"):
        if "_archived" in f.parts: continue
        fm = _front_matter(f)
        if fm is None: continue
        key = _entity_key(fm)
        if key is None: continue
        active.add(key)
        cls = key[0]
        for a in (fm.get("aliases") or []):
            if isinstance(a, str):
                alias_index[(cls, a.strip().lower())] = key
        # Also index canonical_name itself (written last, so it wins over an
        # identical alias of the same entity).
        cn = fm.get("canonical_name")
        if isinstance(cn, str):
            alias_index[(cls, cn.strip().lower())] = key
    print(f"  active entities: {len(active)}")
    print(f"  alias index size: {len(alias_index)}")

    # Informational only: the remap below is driven by the DB rows, not by
    # this count of archived files that carry a class, an id and a name.
    print("\nScanning archived YAMLs ...")
    archived_keys: set[tuple[str, str]] = set()
    for f in ARCHIVED.rglob("*.md") if ARCHIVED.exists() else []:
        fm = _front_matter(f)
        if fm is None: continue
        key = _entity_key(fm)
        if key is None: continue
        if not str(fm.get("canonical_name") or "").strip(): continue
        archived_keys.add(key)
    print(f"  archived entities: {len(archived_keys)}")

    print("\nConnecting to DB ...")
    with psycopg.connect(dburl) as conn:
        with conn.cursor() as cur:
            cur.execute("SELECT entity_pk, entity_class, entity_id, canonical_name FROM entities")
            db_rows = cur.fetchall()
            db_by_key: dict[tuple[str, str], tuple[int, str]] = {
                (cls, eid): (pk, name) for pk, cls, eid, name in db_rows
            }
            print(f"  DB entities: {len(db_rows)}")

            # A DB entity that is no longer active on disk is treated as
            # archived; its canonical is the active entity (same class) whose
            # aliases or canonical_name equal the DB row's canonical_name.
            print("\nResolving remap ...")
            remap_pairs: list[tuple[int, int]] = []  # (old_pk, new_pk)
            orphan_archived: list[tuple[str, str]] = []
            for (cls, eid), (db_pk, db_name) in db_by_key.items():
                if (cls, eid) in active: continue
                # canonical_name may be NULL in the DB; a blank name must never
                # match a blank alias.
                lookup = (db_name or "").strip().lower()
                target = alias_index.get((cls, lookup)) if lookup else None
                tgt_pk_row = db_by_key.get(target) if target else None
                if not tgt_pk_row:
                    orphan_archived.append((cls, eid))
                    continue
                remap_pairs.append((db_pk, tgt_pk_row[0]))
            print(f"  remap pairs: {len(remap_pairs)}")
            print(f"  orphans (archived but no canonical found): {len(orphan_archived)}")
            if remap_pairs:
                cur.execute("CREATE TEMP TABLE _remap (old_pk BIGINT, new_pk BIGINT)")
                with cur.copy("COPY _remap (old_pk, new_pk) FROM STDIN") as cp:
                    for old, new in remap_pairs:
                        cp.write_row((old, new))
                # 1. Insert mention rows for the canonical entity (skipped when
                #    an equivalent row already exists). Only chunk_pk, entity_pk
                #    and surface_form are copied; any other column of
                #    entity_mentions takes its default on the new row.
                cur.execute("""
                    INSERT INTO entity_mentions (chunk_pk, entity_pk, surface_form)
                    SELECT em.chunk_pk, r.new_pk, em.surface_form
                    FROM entity_mentions em
                    JOIN _remap r ON em.entity_pk = r.old_pk
                    ON CONFLICT DO NOTHING
                """)
                inserted = cur.rowcount
                print(f"  new canonical mentions inserted: {inserted}")
                # 2. Delete all old (archived-entity) mentions
                cur.execute("""
                    DELETE FROM entity_mentions em USING _remap r
                    WHERE em.entity_pk = r.old_pk
                """)
                print(f"  archived-entity mentions removed: {cur.rowcount}")
                # 3. Delete the remapped entities from `entities`; orphans stay.
                cur.execute("DELETE FROM entities WHERE entity_pk IN (SELECT old_pk FROM _remap)")
                print(f"  archived entities removed: {cur.rowcount}")
        conn.commit()
    return 0
 if __name__ == "__main__":
    sys.exit(main())
--- a/scripts/maintain/52_mark_generic_entities.py
+++ b/scripts/maintain/52_mark_generic_entities.py
@ -0,0 +1,170 @@
 #!/usr/bin/env python3
 """
 Mark entities whose canonical_name is purely conceptual ("Flying disc sighting
 reports", "Investigation of Flying Discs", "Document Receipt by FBI"...) with
 `is_generic: true`. These are categories the chunker accidentally promoted to
 event/operation entities. Hiding them from /e/events, /e/operations, /timeline,
 and /graph removes catalog noise without deleting data.
 Decision rule (conservative — only flag obvious noise):
  - canonical_name contains GENERIC_PHRASE patterns AND
  - has no specific qualifier (no proper noun, no year, no place name).
 We DO NOT touch:
  - person entities (always specific)
  - location entities (always specific)
  - entities with date_start that resolves to a real year (NOTE: not yet
    enforced — the script only inspects canonical_name, never date_start)
  - entities whose canonical_name contains a proper noun (Capitalized
    Name not in the generic vocabulary)
 Idempotent. Re-running flags new generics if any.
 Run:
  python3 scripts/maintain/52_mark_generic_entities.py --dry-run
  python3 scripts/maintain/52_mark_generic_entities.py
 """
 from __future__ import annotations
 import argparse
 import re
 import sys
 from pathlib import Path
 import yaml
 WIKI_ENT = Path("/Users/guto/ufo/wiki/entities")
 # Phrases that, when forming the BULK of a canonical_name without a specific
 # qualifier, indicate the entity is a CATEGORY rather than an instance.
 # Every entry is lowercase; has_specific_qualifier() and is_pure_generic()
 # lowercase each token before testing membership.
 # NOTE(review): the pt-br entries are spelled without diacritics, and tokens
 # are not accent-folded before lookup — confirm canonical names are stored
 # unaccented, otherwise those entries never match.
 GENERIC_TOKEN_VOCAB = {
    # core event/sighting noise
    "flying", "disc", "discs", "disk", "disks", "saucer", "saucers",
    "sighting", "sightings", "report", "reports", "reporting", "reported",
    "investigation", "investigations", "investigative",
    "observation", "observations", "observed", "observing",
    "unidentified", "object", "objects", "aerial", "phenomena", "phenomenon",
    "uap", "ufo", "ufos",
    # generic process / bureaucracy
    "document", "documents", "receipt", "receipts", "protocol", "protocols",
    "summary", "summaries", "review", "reviews", "incident", "incidents",
    "case", "cases", "event", "events", "encounter", "encounters",
    "evaluation", "analysis", "tracking",
    "memo", "memos", "memorandum", "memoranda", "letter", "letters",
    "communication", "communications", "correspondence",
    "information", "data", "details", "record", "records",
    "filing", "file", "files", "section", "subsection", "branch", "department",
    "office", "general", "matter", "matters", "subject", "subjects",
    # connectors (not significant on their own)
    "of", "the", "a", "an", "and", "or", "with", "on", "for", "to", "from",
    "by", "at", "in", "as", "is", "are",
    # pt-br equivalents (sometimes mixed)
    "voador", "voadores", "disco", "discos", "avistamento", "avistamentos",
    "relatorio", "relatorios", "investigacao", "investigacoes",
    "observacao", "observacoes", "objeto", "objetos", "nao", "identificado",
    "documento", "documentos", "recibo", "recibos", "protocolo", "protocolos",
    "sumario", "resumo", "incidente", "incidentes",
    # FBI bureaucratic
    "internal", "security", "headquarters", "agent", "agents",
 }
 # Four-digit year from 1800 through 2099, delimited by word boundaries.
 YEAR_RE = re.compile(r"\b(18|19|20)\d{2}\b")
 # One token per maximal run of word characters (letters, digits, underscore).
 TOKEN_RE = re.compile(r"\b[\w]+\b")
 def has_specific_qualifier(name: str) -> bool:
    """Tell whether ``name`` carries something that pins it to one instance.

    A name qualifies when it contains a year matched by YEAR_RE, or at least
    one token that starts with an uppercase letter, is four or more characters
    long, and whose lowercase form is absent from GENERIC_TOKEN_VOCAB.
    Hyphenated codes (EV-..., OBJ-...) get no special treatment: their parts
    are judged like any other token.
    """
    if YEAR_RE.search(name) is not None:
        return True
    # Short capitalised tokens (under four characters) never count as a
    # qualifier on their own.
    return any(
        len(word) >= 4
        and word[0].isupper()
        and word.lower() not in GENERIC_TOKEN_VOCAB
        for word in TOKEN_RE.findall(name)
    )
 def is_pure_generic(name: str) -> bool:
    """Report whether ``name`` consists solely of generic vocabulary.

    Tokens are lowercased and single-character tokens are ignored. An empty
    name, or one with no remaining tokens, counts as generic.

    NOTE(review): tokens are compared as-is after lowercasing, with no accent
    folding, so the unaccented pt-br vocabulary entries only match names
    written without diacritics.
    """
    if not name:
        return True
    for raw_token in TOKEN_RE.findall(name):
        word = raw_token.lower()
        # A single non-vocabulary word is enough to make the name specific.
        if len(word) > 1 and word not in GENERIC_TOKEN_VOCAB:
            return False
    return True
 def parse_entity(path: Path):
    """Load an entity markdown file and parse its YAML front matter.

    Returns a dict with ``path``, ``fm`` (parsed front matter, ``{}`` when it
    is empty) and ``raw`` (full file text). Returns None when the file does
    not start with ``---``, has no closing ``---``, or when reading or parsing
    raises.
    """
    try:
        raw = path.read_text(encoding="utf-8")
        if raw[:3] != "---":
            return None
        # Everything between the opening fence and the next "---" is the header.
        header, fence, _rest = raw[3:].partition("---")
        if not fence:
            return None
        front = yaml.safe_load(header) or {}
    except Exception:
        return None
    return {"path": path, "fm": front, "raw": raw}
 def dump_entity(entity):
    """Serialise an entity back to markdown with regenerated front matter.

    ``entity["fm"]` is dumped as YAML (key order kept, unicode allowed, lines
    not wrapped below 1000 columns) and placed in front of the body taken from
    ``entity["raw"]``. When the raw text does not split into three parts on
    ``---``, it is returned unchanged.
    """
    pieces = entity["raw"].split("---", 2)
    if len(pieces) != 3:
        return entity["raw"]
    front = yaml.safe_dump(entity["fm"], sort_keys=False, allow_unicode=True, width=1000)
    return "---\n" + front + "---" + pieces[2]
 def main() -> int:
    """Flag purely generic entities with ``is_generic: true`` in their YAML.

    Walks every non-archived ``*.md`` under WIKI_ENT, considers only the
    event / operation / concept / uap_object classes, and flags an entity when
    its canonical_name has no specific qualifier and is made up only of
    generic vocabulary. With ``--dry-run`` nothing is written; counts and up
    to 25 samples are printed either way. Returns 0.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    opts = parser.parse_args()

    print(f"Scanning {WIKI_ENT} ...")
    # Only target entity classes where genericness is meaningful
    target_classes = {"event", "operation", "concept", "uap_object"}
    scanned = 0
    newly_flagged = 0
    previously_flagged = 0
    samples = []

    for entity_file in WIKI_ENT.rglob("*.md"):
        if "_archived" in entity_file.parts:
            continue
        ent = parse_entity(entity_file)
        if not ent:
            continue
        fm = ent["fm"]
        kind = fm.get("entity_class")
        if kind not in target_classes:
            continue
        scanned += 1
        if fm.get("is_generic") is True:
            previously_flagged += 1
            continue
        name = fm.get("canonical_name") or ""
        # Skip unnamed entities, anything with a year or proper noun, and
        # anything containing a non-vocabulary word.
        if not name or has_specific_qualifier(name) or not is_pure_generic(name):
            continue
        fm["is_generic"] = True
        if not opts.dry_run:
            entity_file.write_text(dump_entity(ent), encoding="utf-8")
        newly_flagged += 1
        if len(samples) < 25:
            samples.append((kind, name))

    print(f"\nEntities scanned (event/operation/concept/uap_object): {scanned}")
    print(f"Already flagged is_generic: {previously_flagged}")
    print(f"Newly flagged is_generic:   {newly_flagged}")
    print(f"\nSample flagged ({min(len(samples), 25)}):")
    for kind, name in samples:
        print(f"  [{kind:<10}] {name}")
    return 0
 if __name__ == "__main__":
    sys.exit(main())
--- a/scripts/maintain/53_add_is_generic_to_db.sql
+++ b/scripts/maintain/53_add_is_generic_to_db.sql
@ -0,0 +1,10 @@
 -- 53_add_is_generic_to_db.sql
 -- Add public.entities.is_generic BOOLEAN. Populated by 54_sync_is_generic.py
 -- which reads each YAML's is_generic and writes it to the DB.
 -- Both statements use IF NOT EXISTS, so the script can be re-run.
 BEGIN;
 -- Existing rows receive FALSE through the column default.
 ALTER TABLE public.entities
  ADD COLUMN IF NOT EXISTS is_generic BOOLEAN NOT NULL DEFAULT FALSE;
 -- Partial index: only rows with is_generic = TRUE are indexed.
 CREATE INDEX IF NOT EXISTS entities_is_generic_idx
  ON public.entities (is_generic) WHERE is_generic = TRUE;
 COMMIT;
--- a/scripts/maintain/54_sync_is_generic.py
+++ b/scripts/maintain/54_sync_is_generic.py
@ -0,0 +1,55 @@
 #!/usr/bin/env python3
 """
 Sync `is_generic` flag from each entity YAML to public.entities table.
 """
 from __future__ import annotations
 import os
 import sys
 from pathlib import Path
 import psycopg
 import yaml
 WIKI_ENT = Path("/Users/guto/ufo/wiki/entities")
 def main() -> int:
    """Copy each active entity's ``is_generic`` flag from YAML into the DB.

    Collects (entity_class, entity_id, is_generic) from every non-archived
    ``*.md`` under WIKI_ENT, loads the triples into a temp table, and updates
    only the ``entities`` rows whose flag differs. Returns 0; exits through
    ``sys.exit`` when no connection string is configured.
    """
    dburl = os.environ.get("DATABASE_URL") or os.environ.get("SUPABASE_DB_URL")
    if not dburl:
        sys.exit("DATABASE_URL not set")

    rows: list[tuple[str, str, bool]] = []
    for entity_file in WIKI_ENT.rglob("*.md"):
        if "_archived" in entity_file.parts:
            continue
        try:
            content = entity_file.read_text(encoding="utf-8")
            if not content.startswith("---"):
                continue
            meta = yaml.safe_load(content.split("---")[1]) or {}
        except Exception:
            # Unreadable or unparsable files are skipped, not fatal.
            continue
        kind = meta.get("entity_class")
        if not kind:
            continue
        # Older files keep the id under "<class>_id" instead of "entity_id".
        ident = meta.get("entity_id") or meta.get(f"{kind}_id")
        if not ident:
            continue
        rows.append((kind, ident, bool(meta.get("is_generic"))))
    print(f"Loaded {len(rows)} entities from YAML")

    with psycopg.connect(dburl) as conn:
        with conn.cursor() as cur:
            cur.execute("CREATE TEMP TABLE _gen (entity_class TEXT, entity_id TEXT, is_generic BOOL)")
            with cur.copy("COPY _gen (entity_class, entity_id, is_generic) FROM STDIN") as cp:
                for row in rows:
                    cp.write_row(row)
            # IS DISTINCT FROM keeps the update (and rowcount) to real changes.
            cur.execute("""
                UPDATE entities e SET is_generic = g.is_generic
                FROM _gen g
                WHERE e.entity_class = g.entity_class
                  AND e.entity_id = g.entity_id
                  AND e.is_generic IS DISTINCT FROM g.is_generic
            """)
            print(f"  rows updated: {cur.rowcount}")
            cur.execute("SELECT COUNT(*) FROM entities WHERE is_generic")
            print(f"  total is_generic=TRUE in DB: {cur.fetchone()[0]}")
        conn.commit()
    return 0
 if __name__ == "__main__":
    sys.exit(main())
--- a/scripts/maintain/55_relations_schema.sql
+++ b/scripts/maintain/55_relations_schema.sql
@ -0,0 +1,50 @@
 -- 55_relations_schema.sql
 -- Typed relations between entities. Replaces noisy co-mention with semantic
 -- edges like (Person, witnessed, Event), (Event, occurred_at, Location),
 -- (Person, signed, Document), etc.
 -- Safe to re-run: every statement is IF [NOT] EXISTS guarded, and the type
 -- check is dropped and re-created so newly added values take effect.
 BEGIN;
 CREATE TABLE IF NOT EXISTS public.relations (
  relation_pk    BIGSERIAL PRIMARY KEY,
  source_class   TEXT NOT NULL,
  source_id      TEXT NOT NULL,
  relation_type  TEXT NOT NULL,
  target_class   TEXT NOT NULL,
  target_id      TEXT NOT NULL,
  evidence_ref   TEXT,                 -- e.g. '[[doc-id/p007]]' or chunk_id
  confidence     TEXT NOT NULL DEFAULT 'medium',  -- high|medium|low
  extracted_by   TEXT NOT NULL DEFAULT 'yaml',    -- yaml|regex|llm|manual
  created_at     TIMESTAMPTZ NOT NULL DEFAULT NOW(),
  -- NOTE: evidence_ref is nullable and NULLs compare as distinct here, so rows
  -- without an evidence_ref are never deduplicated by this constraint.
  UNIQUE (source_class, source_id, relation_type, target_class, target_id, evidence_ref)
 );
 -- Enum check on relation_type (extensible — add new values as we discover them)
 -- ADD CONSTRAINT has no IF NOT EXISTS form, so drop any previous definition
 -- first; otherwise a second run fails and rolls back the whole transaction.
 ALTER TABLE public.relations
  DROP CONSTRAINT IF EXISTS relations_type_check;
 ALTER TABLE public.relations
  ADD CONSTRAINT relations_type_check
  CHECK (relation_type IN (
    'witnessed',          -- (person, witnessed, event)
    'occurred_at',        -- (event, occurred_at, location)
    'involves_uap',       -- (event, involves_uap, uap_object)
    'documented_in',      -- (event, documented_in, document)
    'authored',           -- (person, authored, document)
    'signed',             -- (person, signed, document)
    'mentioned_by',       -- (person, mentioned_by, document)
    'employed_by',        -- (person, employed_by, organization)
    'operated_by',        -- (operation, operated_by, organization)
    'investigated',       -- (person, investigated, event)
    'commanded',          -- (person, commanded, organization)
    'related_to',         -- generic fallback (lower-quality)
    'similar_to',         -- (event, similar_to, event)
    'precedes',           -- (event, precedes, event)
    'follows'             -- (event, follows, event)
  ));
 CREATE INDEX IF NOT EXISTS relations_source_idx
  ON public.relations (source_class, source_id);
 CREATE INDEX IF NOT EXISTS relations_target_idx
  ON public.relations (target_class, target_id);
 CREATE INDEX IF NOT EXISTS relations_type_idx
  ON public.relations (relation_type);
 COMMIT;
--- a/scripts/maintain/56_extract_relations.py
+++ b/scripts/maintain/56_extract_relations.py
@ -0,0 +1,211 @@
 #!/usr/bin/env python3
 """
 Extract typed relations from existing page YAMLs.
 For each wiki/pages/<doc>/p<NNN>.md, examines the structured fields the Haiku
 extracted (events[], people[], organizations[], primary_location, uap_objects[])
 and produces relations:
  observers in events  →  (person, witnessed, event)
  primary_location     →  (event, occurred_at, location)
  uap_objects in event →  (event, involves_uap, uap_object)
  every event on page  →  (event, documented_in, document)
  every person on page →  (person, mentioned_by, document)
 ID mapping mirrors scripts/03-dedup-entities.py logic:
  - person:    slugify(name) → person_id
  - event:     EV-YYYY-MM-DD-slug(label)
  - location:  slugify(canonical location name)
  - uap_object: OBJ-EV<year>-<EVENT>-NN
 """
 from __future__ import annotations
 import os
 import re
 import sys
 import unicodedata
 from datetime import datetime
 from pathlib import Path
 import psycopg
 import yaml
 WIKI = Path("/Users/guto/ufo/wiki")
 PAGES_BASE = WIKI / "pages"
 def ascii_fold(s: str) -> str:
    """Strip combining marks from ``s`` after NFD decomposition.

    Accented letters lose their diacritics ("ã" becomes "a"). Characters with
    no decomposition are returned unchanged, so the result is not guaranteed
    to be pure ASCII.
    """
    decomposed = unicodedata.normalize("NFD", s)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars)
 def slugify(s: str) -> str:
    """Turn ``s`` into a lowercase, hyphen-separated slug.

    Diacritics are removed, the text is lowercased, every run of characters
    outside ``a-z0-9`` collapses to one hyphen, and leading/trailing hyphens
    are trimmed. May return an empty string.
    """
    lowered = ascii_fold(s).lower()
    return re.sub(r"[^a-z0-9]+", "-", lowered).strip("-")
 def person_id(name: str) -> str | None:
    """Derive a person id (slug) from a display name.

    One leading title or rank ("Dr.", "Major", "Special Agent", ...) is
    removed, case-insensitively, before slugifying. Returns None when the
    name is empty or slugifies to nothing.
    """
    cleaned = (name or "").strip()
    if not cleaned:
        return None
    # Only a single prefix is stripped, and only when whitespace follows it.
    title_prefix = (
        r"^(Mr|Mrs|Ms|Dr|Prof|Sr|Sra|Major|Maj|Col|Colonel|Lt|Lieutenant|Capt|Captain|"
        r"Gen|General|Sgt|Sergeant|Agent|Special Agent|SA|Director|Deputy|Rev|Reverend|"
        r"Inspector|Det|Detective)\.?\s+"
    )
    untitled = re.sub(title_prefix, "", cleaned, flags=re.IGNORECASE)
    slug = slugify(untitled)
    return slug if slug else None
 def event_id(label: str, date: str | None) -> str | None:
    """Build an ``EV-YYYY-MM-DD-<slug>`` id from an event label and date.

    The first ``YYYY[-MM[-DD]]`` pattern found in ``str(date)`` supplies the
    date parts; any part that is missing is written as ``XXXX`` / ``XX``.
    Returns None when the label is empty.
    """
    text = (label or "").strip()
    if not text:
        return None
    found = re.search(r"(\d{4})(?:-(\d{2}))?(?:-(\d{2}))?", str(date)) if date else None
    if found is None:
        year, month, day = "XXXX", "XX", "XX"
    else:
        year = found.group(1)
        month = found.group(2) or "XX"
        day = found.group(3) or "XX"
    return f"EV-{year}-{month}-{day}-{slugify(text)}"
 def location_id(name: str) -> str | None:
    """Slugify a location name; None when the name is empty or whitespace."""
    cleaned = (name or "").strip()
    return slugify(cleaned) if cleaned else None
 def parse_page_yaml(path: Path) -> dict | None:
    """Return the parsed YAML front matter of a page file.

    Yields ``{}`` when the front matter is empty, and None when the file does
    not begin with ``---`` or when reading/parsing raises.
    """
    try:
        raw = path.read_text(encoding="utf-8")
        if raw[:3] != "---":
            return None
        loaded = yaml.safe_load(raw.split("---", 2)[1])
        return loaded if loaded else {}
    except Exception:
        return None
 def main() -> int:
    """Extract typed relations from page front matter and insert them.

    For every ``p*.md`` under PAGES_BASE the page's events, people, primary
    location and UAP objects are turned into relation rows, deduplicated in
    memory, staged in a temp table and inserted into ``public.relations``
    with ``ON CONFLICT DO NOTHING``. Returns 0; exits through ``sys.exit``
    when no connection string is configured.
    """
    dburl = os.environ.get("DATABASE_URL") or os.environ.get("SUPABASE_DB_URL")
    if not dburl:
        sys.exit("DATABASE_URL not set")

    def name_of(item):
        """Accept either a bare string or a dict carrying a ``name`` key."""
        if isinstance(item, str):
            return item
        if isinstance(item, dict):
            return item.get("name")
        return None

    print("Scanning page YAMLs ...")
    rows: list[tuple[str, str, str, str, str, str, str, str]] = []
    pages_processed = 0
    for page_path in PAGES_BASE.rglob("p*.md"):
        fm = parse_page_yaml(page_path)
        if not fm:
            continue
        pages_processed += 1
        # Derive doc_id/page_id from path
        try:
            rel = page_path.relative_to(PAGES_BASE)
            doc_id = str(rel.parent)
            page_id = f"{doc_id}/{page_path.stem}"
        except ValueError:
            continue
        doc_ref = f"[[{page_id}]]"

        # Each list is read from entities_extracted first, then top level.
        ents = fm.get("entities_extracted") or {}
        events = ents.get("events") or fm.get("events") or []
        people = ents.get("people") or fm.get("people") or []
        primary_loc = fm.get("primary_location")
        uap_objs = ents.get("uap_objects") or fm.get("uap_objects") or []

        # Pair every usable event dict with its id once; all later steps reuse it.
        identified = []
        for ev in events:
            if not isinstance(ev, dict):
                continue
            eid = event_id(ev.get("label") or ev.get("name"),
                           ev.get("date") or ev.get("date_start"))
            if eid:
                identified.append((ev, eid))
        page_event_ids: list[str] = [eid for _ev, eid in identified]

        # 1. observers in events → witnessed
        for ev, eid in identified:
            for obs in ev.get("observers") or []:
                pid = person_id(name_of(obs))
                if pid:
                    rows.append(("person", pid, "witnessed", "event", eid, doc_ref, "high", "yaml"))

        # 2. people on page mentioned_by document (target is the doc_id; the
        #    page itself is only recorded as evidence)
        for person in people:
            pid = person_id(name_of(person))
            if pid:
                rows.append(("person", pid, "mentioned_by", "document", doc_id, doc_ref, "high", "yaml"))

        # 3. primary_location relates page events
        if primary_loc:
            lid = location_id(name_of(primary_loc))
            if lid:
                rows.extend(
                    ("event", eid, "occurred_at", "location", lid, doc_ref, "medium", "yaml")
                    for eid in page_event_ids
                )

        # 4. uap_objects → involves_uap, all attached to the page's first event
        if page_event_ids and uap_objs:
            first_event = page_event_ids[0]
            year_match = re.search(r"EV-(\d{4})-", first_event)
            year_token = year_match.group(1) if year_match else "XXXX"
            event_slug = first_event.split("-", 4)[-1].upper()
            for i, _obj in enumerate(uap_objs, 1):
                obj_id = f"OBJ-EV{year_token}-{event_slug}-{i:02d}"
                rows.append(("event", first_event, "involves_uap", "uap_object", obj_id, doc_ref, "medium", "yaml"))

        # 5. events on page → documented_in
        rows.extend(
            ("event", eid, "documented_in", "document", doc_id, doc_ref, "high", "yaml")
            for eid in page_event_ids
        )

    print(f"Pages processed: {pages_processed}")
    print(f"Relations extracted: {len(rows)}")

    # Dedupe on (source, relation, target, evidence); the first row seen wins.
    seen: set[tuple] = set()
    deduped: list[tuple] = []
    for r in rows:
        key = r[:6]
        if key in seen:
            continue
        seen.add(key)
        deduped.append(r)
    print(f"Relations after dedup: {len(deduped)}")
    if not deduped:
        return 0

    with psycopg.connect(dburl) as conn:
        with conn.cursor() as cur:
            cur.execute("CREATE TEMP TABLE _rel (LIKE public.relations INCLUDING DEFAULTS)")
            with cur.copy("""COPY _rel
                            (source_class, source_id, relation_type,
                             target_class, target_id, evidence_ref,
                             confidence, extracted_by)
                            FROM STDIN""") as cp:
                for r in deduped:
                    cp.write_row(r)
            cur.execute("""
                INSERT INTO public.relations
                    (source_class, source_id, relation_type,
                     target_class, target_id, evidence_ref,
                     confidence, extracted_by)
                SELECT source_class, source_id, relation_type,
                       target_class, target_id, evidence_ref,
                       confidence, extracted_by
                FROM _rel
                ON CONFLICT DO NOTHING
            """)
            print(f"  inserted (after ON CONFLICT): {cur.rowcount}")
            cur.execute("SELECT relation_type, COUNT(*) FROM public.relations GROUP BY relation_type ORDER BY 2 DESC")
            print("\n=== Relation counts ===")
            for t, n in cur.fetchall():
                print(f"  {n:>7}  {t}")
        conn.commit()
    return 0
 if __name__ == "__main__":
    sys.exit(main())
--- a/scripts/maintain/run_full_dedup_pipeline.sh
+++ b/scripts/maintain/run_full_dedup_pipeline.sh
@ -0,0 +1,54 @@
 #!/usr/bin/env bash
 # Full dedup pipeline:
 #   1. Layer 1 (deterministic merges)
 #   2. Layer 2 (fuzzy trigram with numeric + code-suffix guards)
 #   3. DB remap (entity_mentions → canonical entity_pk)
 #   4. Re-sync signal_strength + total_mentions
 #   5. Re-run text backfill
 #
 # Runs against /Users/guto/ufo. DATABASE_URL is built below from
 # POSTGRES_PASSWORD, and the SSH tunnel on local port 5433 is opened on demand
 # using VPS_HOST / VPS_PORT / VPS_USER / VPS_PASSWORD. These variables are
 # expected to come from the sourced .env; `set -u` aborts if one is missing.
 set -euo pipefail
 cd /Users/guto/ufo
 source /Users/guto/ufo/infra/disclosure-stack/.env
 export DATABASE_URL="postgres://postgres:${POSTGRES_PASSWORD}@localhost:5433/postgres"
 # Succeeds when something is already listening on local TCP port 5433.
 # `ss` only exists on Linux, so fall back to `lsof` elsewhere (e.g. macOS);
 # without the fallback a healthy tunnel was killed and reopened on every run.
 # grep reads all of its input (no -q) so `ss` cannot die of SIGPIPE and fail
 # the check under `pipefail`; the pattern anchors on ":5433" followed by
 # whitespace so unrelated ports such as 15433 do not match.
 tunnel_is_up() {
  if command -v ss >/dev/null 2>&1; then
    ss -ltn 2>/dev/null | grep ':5433[[:space:]]' >/dev/null
  else
    lsof -nP -iTCP:5433 -sTCP:LISTEN >/dev/null 2>&1
  fi
 }
 # Ensure tunnel
 tunnel_is_up || {
  pkill -f "ssh.*5433:172" 2>/dev/null || true
  sleep 1
  # Pass the password through the SSHPASS environment variable (sshpass -e)
  # instead of -p, which exposes it in the process list.
  SSHPASS="$VPS_PASSWORD" sshpass -e ssh -o StrictHostKeyChecking=accept-new \
    -p "$VPS_PORT" -fN -L 5433:172.27.0.2:5432 "${VPS_USER}@${VPS_HOST}"
  sleep 2
 }
 # Each step shows only the tail of its output; with pipefail a failing Python
 # step still aborts the pipeline.
 echo "=== [1/5] Layer 1 dedup (deterministic) — already applied; re-running idempotent"
 python3 scripts/maintain/49_dedup_aggressive.py 2>&1 | tail -5
 echo ""
 echo "=== [2/5] Layer 2 dedup (fuzzy trigram) ==="
 python3 scripts/maintain/50_dedup_fuzzy_trigram.py 2>&1 | tail -8
 echo ""
 echo "=== [3/5] Remap entity_mentions in DB ==="
 python3 scripts/maintain/51_remap_entity_mentions.py 2>&1 | tail -10
 echo ""
 echo "=== [4/5] Resync signal_strength ==="
 python3 scripts/maintain/42_sync_entity_stats.py --fix-obj-names 2>&1 | tail -10
 echo ""
 echo "=== [5/5] Re-run text backfill on the new canonical set ==="
 python3 scripts/maintain/46_text_backfill_mentions.py 2>&1 | tail -8
 echo ""
 echo "=== Done. Final entity counts ==="
 python3 -c "
 from pathlib import Path
 p = Path('wiki/entities')
 active = sum(1 for f in p.rglob('*.md') if '_archived' not in f.parts)
 archived = sum(1 for f in (p / '_archived').rglob('*.md')) if (p / '_archived').exists() else 0
 print(f'  active:   {active}')
 print(f'  archived: {archived}')
 print(f'  total:    {active + archived}')
 "
--- a/scripts/reextract/build_doc_text.py
+++ b/scripts/reextract/build_doc_text.py
@ -0,0 +1,99 @@
 #!/usr/bin/env python3
 """
 build_doc_text.py — Reconstruct the FULL document text from already-extracted
 chunks, with chunk-id markers so Sonnet can cite back via evidence_chunks.
 Input:  raw/<doc-id>--subagent/_index.json + chunks/c*.md
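 Output: stdout — a three-line header (DOCUMENT_ID, TOTAL_PAGES, TOTAL_CHUNKS)
        followed by the concatenated EN text of the document, with markers:
        [chunk c0042 · page 7 · type:<chunk type>]
        <content_en verbatim>
        [chunk c0043 · page 7 · type:<chunk type>]
        <content_en verbatim>
        ...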
 Run:
  python3 scripts/reextract/build_doc_text.py <doc-id>
 """
 from __future__ import annotations
 import json
 import re
 import sys
 from pathlib import Path
 RAW = Path("/Users/guto/ufo/raw")
 def split_frontmatter(text: str) -> tuple[dict, str]:
    """Split a chunk file into (front matter fields, body).

    The front matter is NOT parsed as YAML: chunks carry free-text fields that
    break strict YAML, so only unindented ``key: value`` lines whose key is
    made of letters and underscores are collected, with values kept as
    stripped strings. When the text has no leading ``---`` or no closing
    ``---``, the result is ``({}, text)``.
    """
    pieces = text.split("---", 2) if text.startswith("---") else []
    if len(pieces) < 3:
        return {}, text
    _before, header, body = pieces
    key_value = re.compile(r"^([a-zA-Z_]+):\s*(.*)$")
    fields: dict = {}
    for row in header.splitlines():
        hit = key_value.match(row)
        if hit is not None:
            fields[hit.group(1)] = hit.group(2).strip()
    return fields, body
 def extract_en_section(body: str) -> str:
    """Pull the EN paragraph out of a bilingual chunk body.

    Bodies look like::

        **EN:** <english text>
        **PT-BR:** <portuguese text>

    Returns the stripped text between the ``**EN:**`` marker and the following
    ``**PT-BR:**`` marker (or the end of the body). When there is no
    ``**EN:**`` marker the whole stripped body is returned, which covers
    chunks that store their text without the bilingual layout. An EN section
    that is present but empty yields ``""``.
    """
    # No greedy whitespace run after the marker: it would swallow the newline
    # that introduces **PT-BR:**, and an empty EN section would then capture
    # the Portuguese text instead. Leading whitespace is removed by strip().
    m = re.search(r"\*\*EN:\*\*(.*?)(?:\n\s*\*\*PT-BR:\*\*|\Z)", body, re.S)
    if m is not None:
        return m.group(1).strip()
    return body.strip()
 def main() -> int:
    """Print the reconstructed EN text of one document to stdout.

    Usage: ``build_doc_text.py <doc-id>``. Reads
    ``RAW/<doc-id>--subagent/_index.json``, walks its chunks in
    ``order_global`` order and emits, per chunk, a
    ``[chunk <id> · page <n> · type:<t>]`` marker followed by the chunk's EN
    text. Chunks with no text at all are omitted. Returns 0; exits through
    ``sys.exit`` on a missing argument or a missing index file.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: build_doc_text.py <doc-id>")
    doc_id = sys.argv[1]
    chunks_dir = RAW / f"{doc_id}--subagent" / "chunks"
    idx_path = RAW / f"{doc_id}--subagent" / "_index.json"
    if not idx_path.is_file():
        sys.exit(f"_index.json not found for {doc_id}")
    idx = json.loads(idx_path.read_text(encoding="utf-8"))
    # `or` guards against explicit nulls: a null "chunks" list or a null
    # order_global would otherwise raise inside sorted().
    entries = sorted(idx.get("chunks") or [], key=lambda x: x.get("order_global") or 0)
    out_lines: list[str] = [f"DOCUMENT_ID: {doc_id}",
                            f"TOTAL_PAGES: {idx.get('total_pages')}",
                            f"TOTAL_CHUNKS: {len(entries)}", ""]
    for entry in entries:
        cid = entry.get("chunk_id")
        page = entry.get("page")
        ctype = entry.get("type", "?")
        chunk_path = chunks_dir / f"{cid}.md"
        if not chunk_path.is_file():
            # Report on stderr so stdout stays clean for the consumer.
            print(f"warning: chunk file missing, skipped: {chunk_path}", file=sys.stderr)
            continue
        text = chunk_path.read_text(encoding="utf-8")
        fm, body = split_frontmatter(text)
        en = extract_en_section(body)
        # For pure-image chunks the body may be (nearly) empty; prefer the
        # description stored in the frontmatter when one exists. A short but
        # real text is kept when there is no such description, instead of the
        # chunk being dropped.
        if len(en) < 5:
            fallback = fm.get("image_description_en") or fm.get("extracted_text") or ""
            fallback = fallback.strip().strip('"\'')
            if fallback:
                en = fallback
        if not en: continue
        out_lines.append(f"[chunk {cid} · page {page} · type:{ctype}]")
        out_lines.append(en.strip())
        out_lines.append("")
    print("\n".join(out_lines))
    return 0
 if __name__ == "__main__":
    sys.exit(main())
--- a/scripts/reextract/enums.yaml
+++ b/scripts/reextract/enums.yaml
@ -0,0 +1,205 @@
 # Closed enums for Sonnet re-extraction.
 # Pós-extração, validator rejeita JSON com qualquer valor fora destas listas.
 # Adicione novos valores aqui — NUNCA deixe Sonnet inventar.
 doc_classification:
  - mission_report
  - intelligence_memo
  - fbi_internal_correspondence
  - press_clipping
  - photo_with_caption
  - sketch_or_diagram
  - witness_statement
  - radio_transcript
  - foia_release
  - operation_log
  - policy_document
  - administrative_form
  - blank_page_or_separator
  - investigation_file
  - aviation_incident_report
  - debriefing_transcript
  - other_specify
 noise_emission:
  - none        # 100% investigativo
  - low         # >70% investigativo, algum roteamento/carimbo
  - medium      # 40-70% investigativo
  - high        # <40% investigativo (roteamento interno, ementa, índice)
 investigative_value:
  - critical
  - high
  - medium
  - low
  - none
 primary_topics:
  - flying_disc_sightings
  - uap_encounter
  - radar_visual_correlation
  - aviation_incident
  - foreign_object_recovery
  - operation_paperclip
  - cold_war_intelligence
  - nuclear_facility_overflight
  - astronaut_observation
  - photographic_evidence
  - contactee_phenomena
  - underwater_unidentified_object
  - transmedium_observation
  - government_disclosure
  - debunking_explanation
  - witness_interrogation
  - policy_directive
  - administrative_routing
  - other
 event_class:
  - uap_encounter
  - press_release
  - investigation_opened
  - investigation_closed
  - testimony_recorded
  - document_published
  - meeting_held
  - flight_operation
  - radar_detection
  - photo_analysis
  - personnel_change
  - policy_change
  - communication_sent
  - communication_received
  - arrest
  - trial
  - death
  - launch_event
  - recovery_operation
  - intercept_attempt
  - other_specify
 person_class:
  - military_officer
  - enlisted_personnel
  - civilian_witness
  - government_official
  - law_enforcement
  - scientist
  - journalist
  - pilot
  - radar_operator
  - intelligence_officer
  - foreign_national
  - clergy
  - civilian
  - astronaut
  - politician
  - lawyer
  - business_person
  - unknown
 org_class:
  - military_unit
  - military_branch
  - government_agency
  - civilian_agency
  - law_enforcement
  - intelligence_agency
  - research_institution
  - civilian_organization
  - foreign_government
  - media_organization
  - contactee_group
  - religious_organization
  - corporation
  - unknown
 geo_class:
  - city
  - state
  - country
  - region
  - military_base
  - airfield
  - building
  - waterway
  - mountain
  - desert
  - rural_area
  - sea_or_ocean
  - coastal
  - lake
  - river
  - submerged
  - airspace
  - unknown
 uap_shape:
  - disc
  - cigar
  - sphere
  - triangle
  - rectangle
  - cluster
  - light_only
  - cone
  - dome
  - irregular
  - tic_tac
  - cylindrical
  - cross
  - boomerang
  - unknown
 uap_medium:
  - air
  - sea_surface
  - submerged
  - transmedium
  - space
  - ground
  - multiple
  - unknown
 uap_color:
  - white
  - silver_metallic
  - black
  - red
  - orange
  - yellow
  - green
  - blue
  - multicolored
  - dark_unspecified
  - bright_unspecified
  - unknown
 date_confidence:
  - exact
  - month
  - year
  - decade
  - unknown
 confidence:
  - high
  - medium
  - low
 relation_type:
  - witnessed
  - occurred_at
  - involves_uap
  - documented_in
  - authored
  - signed
  - mentioned_by
  - employed_by
  - operated_by
  - investigated
  - commanded
  - related_to
  - similar_to
  - precedes
  - follows
--- a/scripts/reextract/prompt-system.md
+++ b/scripts/reextract/prompt-system.md
@ -0,0 +1,165 @@
 # Sonnet System Prompt — Re-Extração Investigativa
 Você é um analista forense investigativo do **Disclosure Bureau**. Recebe o **texto completo de um documento declassificado UAP/UFO** (já extraído por OCR/vision em pass anterior) e produz **um único JSON estruturado** com as entidades, eventos e relações investigativas que estão NO TEXTO.
 ## Cobertura
 **EXTRAIA TUDO** que o texto deste segmento documenta. Não limite contagens. Se o segmento tem 100 eventos distintos, retorne 100. O sistema é alimentado por análise investigativa séria — ausência de evento é perda de evidência.
 Você pode estar recebendo APENAS UM SEGMENTO de um documento maior (ver `SEGMENT N OF M` no início, se presente). Extraia exaustivamente o que está NESTE segmento — outros segmentos cobrirão o resto.
 Critério único de não-inclusão: **falta de evidência textual neste segmento.**
 ## Regras invioláveis
 1. **Você está LENDO texto, não vendo imagem.** Cada chunk é precedido por um marcador no formato `[chunk c0042 · page 7 · type:<tipo>]`. Use o `chunk_id` desse marcador (ex.: `c0042`) em todo campo `evidence_chunks` para apontar de volta ao texto que justifica sua extração.
 2. **NUNCA invente.** Se o nome de uma pessoa está redacted (`[REDACTED]`, `[NAME UNCLEAR]`, `▓▓▓`), retorne `"unknown"` no campo `name` e marque `confidence: "low"`. NÃO complete nomes parciais por inferência (`Mr. [redacted]` ≠ `Mr. Smith`).
 3. **NÃO traduza nomes próprios.** "Roswell" fica "Roswell". "Major Jesse Marcel" fica "Major Jesse Marcel". Locais brasileiros mantêm acentuação ("São Paulo", "Pará").
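 4. **Use APENAS os valores dos enums.** O valor `other_specify` existe SOMENTE em `doc_classification` e `event_class` (ver regra 11): se o conceito não cabe em nenhum valor desses dois enums, use `other_specify` e adicione campo livre `other_specify_note` com 1 frase. Nos demais enums, escolha o valor mais próximo conforme a regra 11. NÃO invente novos valores de enum.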
 5. **Cada `event`, `person`, `organization`, `location`, `relation` DEVE ter `evidence_chunks: ["c0042", ...]`.** Sem evidência, não inclua. Esse é o teste de Locard: se não há rastro no texto, não é evidência.
 6. **Não duplique entidades.** Se "Major Jesse Marcel" e "J. Marcel" são a mesma pessoa pelo contexto, escolha 1 `name` canônico e liste todas as ocorrências em `aliases_in_doc`.
 7. **Eventos = instâncias específicas.** "Flying disc sighting reports" no plural genérico NÃO é evento. "Sighting of unknown disc on 1947-07-08 over Roswell NM by William Brazel" é evento. Se não há data + local + observer ou objeto, **não é evento** — é tópico (`primary_topics`).
 8. **Foco investigativo.** Carimbos de roteamento, listas de distribuição, números de série, banners de classificação — **não são entidades**. Catalogue só o que serve para uma análise de caso real.
 9. **Bilíngue só onde pedido.** `narrative_summary` e `narrative_summary_pt_br`. Resto fica em inglês (chave de schema internacional).
 ## Modo de operação
 **Você roda em 5 passes separadas por segmento.** Cada chamada (`OUTPUT MODE` block ao final) pede UM tipo só:
 1. events
 2. people
 3. organizations + locations
 4. relations
 5. doc-level metadata
 Retorne **apenas o JSON descrito no OUTPUT MODE** — não tente preencher campos de outras passes (eles vêm em chamadas separadas). Não envolva em markdown fence. Não adicione preâmbulo nem postscript. JSON puro.
 ## Schema de referência (completo, distribuído entre as 5 passes)
 ```json
 {
  "doc_id": "<o doc_id que recebeu>",
  "doc_classification": "<enum doc_classification>",
  "doc_classification_note": "<frase opcional se other_specify>",
  "doc_period": "<YYYY ou YYYY-YYYY>",
  "primary_topics": ["<enum primary_topics>", "..."],
  "noise_emission": "<enum noise_emission>",
  "investigative_value": "<enum investigative_value>",
  "doc_summary_en": "2-3 sentences English summary of what this document IS and why it matters.",
  "doc_summary_pt_br": "2-3 frases em português brasileiro: o que é o documento e por que importa.",
  "events": [
    {
      "label": "Roswell debris recovery press release",
      "date_start": "1947-07-08",
      "date_end": "1947-07-08",
      "date_confidence": "<enum date_confidence>",
      "event_class": "<enum event_class>",
      "primary_location_name": "Roswell Army Air Field, New Mexico, USA",
      "primary_location_geo_class": "<enum geo_class>",
      "observers": [
        {"name": "Major Jesse Marcel", "role_at_event": "<enum person_class>"}
      ],
      "uap_objects_observed": [
        {
          "shape": "<enum uap_shape>",
          "color": "<enum uap_color>",
          "medium": "<enum uap_medium>",
          "size_estimate_m": null,
          "altitude_ft": null,
          "speed_kts": null,
          "maneuver_notes": null
        }
      ],
      "evidence_chunks": ["c0042", "c0043"],
      "narrative_summary": "1-2 sentences English.",
      "narrative_summary_pt_br": "1-2 frases PT-BR.",
      "confidence": "<enum confidence>"
    }
  ],
  "people": [
    {
      "name": "Major Jesse Marcel",
      "aliases_in_doc": ["J. Marcel", "Marcel"],
      "person_class": "<enum person_class>",
      "affiliation": "USAAF 509th Bombardment Group",
      "role_at_doc_date": "intelligence officer",
      "evidence_chunks": ["c0042", "c0050"],
      "confidence": "<enum confidence>"
    }
  ],
  "organizations": [
    {
      "name": "USAAF 509th Bombardment Group",
      "aliases_in_doc": ["509th", "509 BG"],
      "org_class": "<enum org_class>",
      "country": "USA",
      "evidence_chunks": ["c0042"],
      "confidence": "<enum confidence>"
    }
  ],
  "locations": [
    {
      "name": "Roswell Army Air Field",
      "aliases_in_doc": ["RAAF", "Roswell airfield"],
      "geo_class": "<enum geo_class>",
      "country": "USA",
      "region_or_state": "New Mexico",
      "evidence_chunks": ["c0042"],
      "confidence": "<enum confidence>"
    }
  ],
  "relations": [
    {
      "source_class": "<enum>",     // person | event | organization | location | uap_object | document
      "source_name":  "Major Jesse Marcel",
      "type":         "<enum relation_type>",
      "target_class": "<enum>",
      "target_name":  "Roswell debris recovery press release",
      "evidence_chunks": ["c0042"],
      "confidence": "<enum confidence>"
    }
  ]
 }
 ```
 ## Notas finais
 - Se o documento é puro ruído administrativo (`noise_emission: high`), retorne arrays vazios mas preencha `doc_classification`, `noise_emission`, `doc_summary_*`. Não force a achar eventos onde não há.
 - Se observar conexão entre documentos (este memo cita aquele), use `relations` com `target_class: "document"` e `target_name: <doc_id>` quando o doc-id estiver mencionado.
 - Se observar UAP submerso ou transmedium (objeto entrando/saindo da água), garanta `uap_medium: submerged` ou `transmedium` e seja explícito no `narrative_summary`.
 ## Regras de validação invioláveis (causaram rejeição em rodadas anteriores)
 10. **Formatos de data permitidos** — `YYYY` (só ano), `YYYY-MM` (ano + mês), `YYYY-MM-DD` (data completa), `XXXX` / `XXXX-XX` / `XXXX-XX-XX` (totalmente desconhecida). **NÃO use `2023-09-XX` nem `2023-XX-XX`** — se você sabe o mês mas não o dia, escreva `2023-09`. Se você sabe o ano mas não o mês, escreva `2023`.
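 11. **`other_specify` só existe em DOIS enums:** `doc_classification` e `event_class`. Para QUALQUER outro enum — `geo_class`, `person_class`, `org_class`, `uap_shape`, `uap_medium`, `uap_color`, `date_confidence`, `noise_emission`, `investigative_value`, `confidence`, `relation_type`, `primary_topics` — escolha o valor do enum que MAIS SE APROXIMA. Se nada se aproxima, use `unknown` nos enums que possuem esse valor (`geo_class`, `person_class`, `org_class`, `uap_shape`, `uap_medium`, `uap_color`, `date_confidence`), `other` em `primary_topics` e `related_to` em `relation_type`. `noise_emission`, `investigative_value` e `confidence` não têm valor de escape — escolha sempre o mais próximo. **NUNCA `other_specify` fora dos 2 enums permitidos.**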
 12. **`source_class` e `target_class` em `relations`** SÃO RESTRITOS A exatamente esta lista: `person`, `event`, `organization`, `location`, `uap_object`, `document`. **NÃO use `vehicle`** — se o item é uma cápsula/aeronave que é o objeto observado, mapeie para `uap_object`. Se é uma aeronave operada por militar (não-UAP), use `organization` (a unidade militar) ou simplesmente omita a relação.
 13. **`evidence_chunks` deve usar APENAS os IDs de chunks visíveis neste segmento.** O texto traz marcadores `[chunk c0042 · page 7]`. **ANTES de incluir um chunk_id em `evidence_chunks`, confirme que esse marcador aparece literalmente no texto deste segmento.** Se você acha que viu `c0026` mas não consegue localizá-lo no texto, NÃO use. Inventar chunk_id quebra a procedência (Locard) e é o erro mais grave possível.
 14. **Toda entidade extraída TEM que ter `evidence_chunks` não-vazio.** Se você não consegue apontar para um chunk concreto presente no texto, a entidade não está documentada — **NÃO A INCLUA NA SAÍDA**. Melhor 50 entidades bem-evidenciadas do que 60 onde 1 não tem rastro.
 ## Fallbacks explícitos quando o conceito não cabe no enum
 Listas completas dos enums críticos (use APENAS estes valores):
 **`geo_class`** = `city | state | country | region | military_base | airfield | building | waterway | mountain | desert | rural_area | sea_or_ocean | coastal | lake | river | submerged | airspace | unknown`. **Não existe `space`** — para órbita lunar / superfície da Lua / observações de astronauta em espaço, use `unknown` e ponha o contexto no campo livre (`primary_location_name` recebe "Lunar orbit" ou "Apollo capsule, translunar coast"). Não force outro enum.
 **`event_class`** = `uap_encounter | press_release | investigation_opened | investigation_closed | testimony_recorded | document_published | meeting_held | flight_operation | radar_detection | photo_analysis | personnel_change | policy_change | communication_sent | communication_received | arrest | trial | death | launch_event | recovery_operation | intercept_attempt | other_specify`. **Não existe `debriefing_transcript` como event_class** — debriefings de astronauta ou militar mapeiam para `testimony_recorded`. Use `other_specify` + `other_specify_note` somente se realmente nenhum dos 20 se aproxima.
 **`relation_type`** = `witnessed | occurred_at | involves_uap | documented_in | authored | signed | mentioned_by | employed_by | operated_by | investigated | commanded | related_to | similar_to | precedes | follows`. Sem `other_specify` — se nada cabe, use `related_to`.
 Retorne **APENAS** o JSON. Sem texto explicativo antes ou depois. Sem markdown fence (```). JSON puro parseável.
--- a/scripts/reextract/run.py
+++ b/scripts/reextract/run.py
@ -0,0 +1,417 @@
 #!/usr/bin/env python3
 """
 run.py — Re-extract a document through the Claude Code CLI
 (`claude -p --model opus`), with segmented, multi-pass processing for
 large docs.
 Strategy:
  - Build doc text from already-extracted chunks (build_doc_text.py).
  - If the text fits within SEGMENT_INPUT_CHARS (~15k tokens), treat it as a
    single segment.
  - Otherwise, split the doc into overlapping segments of at most
    SEGMENT_INPUT_CHARS each, cutting at chunk-marker lines so chunk_id
    markers are preserved.
  - Run every segment through the 5 passes in PASSES (events, people,
    orgs + locations, relations, doc-level metadata), one CLI call per pass.
  - For multi-segment docs, MERGE the per-segment JSONs: entities are deduped
    by lowercased name (events by label + date_start) and relations by
    (source_class, source_name, type, target_class, target_name), with
    evidence_chunks unioned.
 The merged JSON is meant to cover the entire document — no entity should be
 dropped because the doc was "too big".
 """
 from __future__ import annotations
 import json
 import os
 import re
 import subprocess
 import sys
 from pathlib import Path
 import yaml
 # Absolute locations of the re-extraction scripts and of the raw document tree.
 # NOTE(review): these are hard-coded to one machine's checkout; running from
 # another location requires editing them.
 REX_DIR = Path("/Users/guto/ufo/scripts/reextract")
 RAW = Path("/Users/guto/ufo/raw")
 # Helper script, prompt and schema files that live next to this script.
 BUILD_DOC = REX_DIR / "build_doc_text.py"
 PROMPT_SYSTEM = REX_DIR / "prompt-system.md"
 ENUMS_YAML = REX_DIR / "enums.yaml"
 VALIDATE = REX_DIR / "validate.py"
 # Token budget per call.
 # Output is capped at 32k tokens per call (call_claude sets
 # CLAUDE_CODE_MAX_OUTPUT_TOKENS=32000). We partition the extraction
 # into 5 separate calls per segment, each producing a small piece of the JSON,
 # so that each piece stays well under the ceiling.
 SEGMENT_INPUT_CHARS = 60_000      # ~15k tokens input per segment (at ~4 chars/token)
 SEGMENT_OVERLAP_CHARS = 3_000     # target overlap between consecutive segments
 # Per-segment extraction is split into 5 passes. Each pass gets the same
 # document text but a different "output mode" instruction asking for ONE
 # category only. Keeping the text identical across passes is intended to let
 # the claude CLI reuse its prompt cache (NOTE(review): not verified here).
 # Each entry is (pass_name, instruction); the instruction is appended to the
 # prompt under the "OUTPUT MODE" heading by build_full_prompt.
 PASSES = [
    ("events",
     "Return a JSON object with ONLY the events array, exhaustively extracted "
     "from THIS segment:\n\n"
     "{\"events\": [{...event objects per the schema...}]}\n\n"
     "Use the event schema rules from the system prompt. Include "
     "uap_objects_observed, observers (with role_at_event), "
     "primary_location_name/geo_class, evidence_chunks, both narratives. "
     "Do NOT include people/organizations/locations/relations/doc-level fields."),
    ("people",
     "Return a JSON object with ONLY the people array, exhaustively extracted "
     "from THIS segment:\n\n"
     "{\"people\": [{...person objects per the schema...}]}\n\n"
     "Use the person schema rules. Each entry: name, aliases_in_doc, "
     "person_class, affiliation, role_at_doc_date, evidence_chunks, confidence. "
     "Skip pure routing-list entries (FBI distribution slips, etc.) unless they "
     "are subjects/witnesses/authors of real content."),
    ("orgs_locs",
     "Return a JSON object with ONLY organizations[] and locations[], exhaustively "
     "extracted from THIS segment:\n\n"
     "{\"organizations\": [...], \"locations\": [...]}\n\n"
     "Use the schema rules. Include aliases_in_doc, class enums, country, "
     "region_or_state (locations), evidence_chunks."),
    ("relations",
     "Return a JSON object with ONLY the relations array, exhaustively extracted "
     "from THIS segment:\n\n"
     "{\"relations\": [{...relation objects...}]}\n\n"
     "Priority: typed investigative relations (witnessed, occurred_at, signed, "
     "involves_uap, investigated, authored, employed_by, commanded). "
     "mentioned_by ONLY for clearly investigative people (authors of memos, "
     "subjects of investigations); skip mentioned_by for pure routing-list "
     "names (Tolson, Ladd routing slips). All other types as found."),
    ("doc_meta",
     "Return a JSON object with ONLY the document-level fields:\n\n"
     "{\"doc_classification\": \"...\", \"doc_classification_note\": null|\"...\", "
     "\"doc_period\": \"YYYY\" or \"YYYY-YYYY\", "
     "\"primary_topics\": [\"...\"], "
     "\"noise_emission\": \"...\", \"investigative_value\": \"...\", "
     "\"doc_summary_en\": \"2-3 sentences\", \"doc_summary_pt_br\": \"2-3 frases\"}\n\n"
     "These reflect the FULL document this segment is part of (you may not see "
     "every page in this segment, but classify based on what you do see)."),
 ]
 def build_doc_text(doc_id: str) -> str:
    """Run build_doc_text.py for ``doc_id`` and return what it printed.

    The helper script is executed with ``python3`` as a child process and its
    stdout (decoded as UTF-8) is returned. If the child exits non-zero the
    whole program exits with the child's stderr in the message.
    """
    command = ["python3", str(BUILD_DOC), doc_id]
    proc = subprocess.run(command, capture_output=True, text=True, encoding="utf-8")
    if proc.returncode == 0:
        return proc.stdout
    sys.exit(f"build_doc_text failed: {proc.stderr}")
 def segment_doc(text: str, max_chars: "int | None" = None,
                 overlap_chars: "int | None" = None) -> list[str]:
    """Split doc text into overlapping segments at chunk-marker boundaries.

    A segment ends on the last ``[chunk c... · ...]`` marker line that fits
    under the size cap, so a chunk is only ever cut when it is by itself
    larger than the cap. The next segment starts at the first marker inside
    the trailing ``overlap_chars`` window of the previous one, which repeats
    a few whole chunks for context.

    Args:
        text: Full document text, as produced by build_doc_text.py.
        max_chars: Maximum segment size in characters. Defaults to
            SEGMENT_INPUT_CHARS.
        overlap_chars: Target overlap in characters. Defaults to
            SEGMENT_OVERLAP_CHARS.

    Returns:
        The list of segments in document order. Text that already fits, or
        that contains no chunk markers at all, is returned as one segment.

    Raises:
        ValueError: if ``max_chars`` is not positive.
    """
    if max_chars is None:
        max_chars = SEGMENT_INPUT_CHARS
    if overlap_chars is None:
        overlap_chars = SEGMENT_OVERLAP_CHARS
    if max_chars <= 0:
        raise ValueError("max_chars must be positive")
    if len(text) <= max_chars:
        return [text]
    # Find all chunk marker line positions (line starts).
    marker_re = re.compile(r"^\[chunk c\d+ ·.*$", re.MULTILINE)
    starts = [m.start() for m in marker_re.finditer(text)]
    if not starts:
        return [text]
    segments: list[str] = []
    seg_start = 0
    prev_end = 0  # end offset of the previously emitted segment
    while seg_start < len(text):
        cap = seg_start + max_chars
        if cap >= len(text):
            segments.append(text[seg_start:])
            break
        # The cut must lie beyond both the segment start and the previous
        # cut, otherwise a segment could consist solely of overlap and add
        # nothing new. Take the LAST such marker before the cap.
        floor = max(seg_start, prev_end)
        candidates = [s for s in starts if floor < s < cap]
        # No marker fits: the current chunk alone exceeds the cap, cut at cap.
        seg_end = candidates[-1] if candidates else cap
        segments.append(text[seg_start:seg_end])
        prev_end = seg_end
        # Restart at the first marker inside the overlap window. It must be
        # strictly after the current start: when a segment is shorter than
        # the window, the window reaches back to (or before) seg_start, and
        # restarting there would never advance.
        window_start = seg_end - overlap_chars
        overlap_candidates = [s for s in starts
                              if window_start <= s < seg_end and s > seg_start]
        seg_start = overlap_candidates[0] if overlap_candidates else seg_end
    return segments
 def build_full_prompt(doc_id: str, doc_text: str, segment_meta: str = "",
                       pass_instruction: str = "",
                       error_feedback: str = "") -> str:
    """Assemble the complete prompt text for one CLI call.

    Layout, in order: the system prompt file, the closed-enum YAML (fenced),
    a heading carrying ``doc_id`` and ``segment_meta``, the document text
    (fenced), an optional OUTPUT MODE section built from
    ``pass_instruction``, an optional validation-feedback section built from
    ``error_feedback``, and a closing request for the JSON.

    Both prompt files are re-read from disk on every call.
    """
    system = PROMPT_SYSTEM.read_text(encoding="utf-8")
    enums = ENUMS_YAML.read_text(encoding="utf-8")
    parts: list[str] = [
        f"{system}\n\n",
        "## CLOSED ENUMS (use ONLY these values)\n\n",
        f"```yaml\n{enums}\n```\n\n",
        f"## DOCUMENT TO ANALYZE — doc_id: {doc_id}\n",
        f"{segment_meta}\n\n",
        f"```\n{doc_text}\n```\n",
    ]
    if pass_instruction:
        # Per-pass instruction: restricts the answer to one category.
        parts.append(
            "\n\n## OUTPUT MODE (THIS CALL ONLY)\n\n"
            f"{pass_instruction}\n\n"
            "Return ONLY the JSON object described above. No markdown fence, "
            "no preamble, no postscript. JSON only.\n"
        )
    if error_feedback:
        # Retry context: shows the model the validator errors to correct.
        parts.append(
            "\n\n## PREVIOUS ATTEMPT FAILED VALIDATION\n\n"
            "Your previous JSON had these errors. Fix them and return a corrected JSON:\n"
            f"```\n{error_feedback}\n```\n\n"
            "Re-emit the FULL corrected JSON.\n"
        )
    parts.append("\n")
    parts.append("Return the JSON now.")
    return "".join(parts)
 def call_claude(prompt: str) -> tuple[str, dict]:
    """Send ``prompt`` to the ``claude`` CLI and return ``(response_text, meta)``.

    The CLI runs in print mode (``-p``) with ``--model opus`` and
    ``--output-format text``; the prompt is fed through stdin. Every tool in
    the disallowed list is switched off so the model cannot start multi-turn
    tool use, which fails in ``-p`` mode. stdout goes straight into a
    temporary file instead of a pipe (the pipe route was reported to truncate
    large outputs — NOTE(review): not verified here); the file is deleted
    before returning.

    ``meta`` carries a fixed ``stop_reason`` placeholder (text output has no
    metadata) and the response length in characters. A non-zero CLI exit
    status terminates the program with up to 4000 characters of stderr.
    """
    import tempfile
    blocked_tools = ",".join((
        "AskUserQuestion", "Bash", "Edit", "Write", "Read", "Task", "Glob", "Grep",
        "TaskCreate", "TaskUpdate", "TaskList", "TaskGet", "TaskStop", "TaskOutput",
        "Skill", "ScheduleWakeup", "Monitor", "WebSearch", "WebFetch", "NotebookEdit",
        "EnterPlanMode", "ExitPlanMode", "EnterWorktree", "ExitWorktree",
        "CronCreate", "CronDelete", "CronList", "RemoteTrigger", "ToolSearch",
        "PushNotification", "ListMcpResourcesTool", "ReadMcpResourceTool",
        "ShareOnboardingGuide",
    ))
    child_env = dict(os.environ)
    child_env["CLAUDE_CODE_MAX_OUTPUT_TOKENS"] = "32000"
    command = ["claude", "-p", "--model", "opus", "--output-format", "text",
               "--disallowed-tools", blocked_tools]
    # Reserve a uniquely named file, then close it so it can be reopened below.
    placeholder = tempfile.NamedTemporaryFile(mode="w+", suffix=".txt", delete=False, encoding="utf-8")
    tmp_path = placeholder.name
    placeholder.close()
    try:
        with open(tmp_path, "wb") as sink:
            proc = subprocess.run(
                command,
                input=prompt.encode("utf-8"),
                stdout=sink,
                stderr=subprocess.PIPE,
                env=child_env,
            )
        if proc.returncode != 0:
            stderr_head = proc.stderr.decode('utf-8', errors='replace')[:4000]
            sys.exit(
                f"claude CLI failed (rc={proc.returncode})\n"
                f"stderr: {stderr_head}"
            )
        with open(tmp_path, "r", encoding="utf-8") as source:
            output = source.read()
        meta = {"stop_reason": "text-format-no-meta", "chars": len(output)}
        return (output, meta)
    finally:
        # Best-effort cleanup; a vanished temp file is not an error.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
 def extract_json_block(s: str) -> str:
    """Return the JSON object embedded in a model response, as text.

    Steps:
      1. Strip surrounding whitespace; if the response opens with a markdown
         fence, drop every line that starts with a fence.
      2. Try to decode exactly one JSON value starting at the first ``{`` and
         return precisely the text it spans. This keeps trailing prose that
         happens to contain ``}`` out of the result.
      3. If that decode fails (e.g. truncated output), fall back to the span
         from the first ``{`` to the last ``}`` so the caller's ``json.loads``
         reports the real parse error.

    Text with no usable braces is returned stripped and otherwise unchanged.
    """
    s = s.strip()
    if s.startswith("```"):
        s = "\n".join(line for line in s.splitlines() if not line.startswith("```"))
        s = s.strip()
    start = s.find("{")
    if start < 0:
        return s
    try:
        # raw_decode stops at the end of the first complete value and reports
        # that offset, ignoring whatever follows it.
        _value, stop = json.JSONDecoder().raw_decode(s, start)
    except ValueError:
        pass
    else:
        return s[start:stop]
    end = s.rfind("}")
    if end > start:
        return s[start:end + 1]
    return s
 def merge_extractions(segments: list[dict], doc_id: str) -> dict:
    """Merge per-segment JSONs into a single doc-level JSON.

    Dedup keys (names are compared stripped and lowercased):
      - events:    (label, date_start); events with a blank label are skipped
      - people / organizations / locations: name; nameless entries are skipped
      - relations: (source_class, source_name, type, target_class, target_name)

    When two records share a key, the first one seen is kept and enriched:
    evidence_chunks are unioned (sorted), aliases_in_doc are unioned (sorted,
    minus the canonical name) and the higher confidence wins.

    Top-level scalar fields (classification, note, period, summaries) come
    from the FIRST segment that has a non-empty value. primary_topics is the
    union in first-seen order. noise_emission and investigative_value take
    the highest-ranked value found across segments.

    Args:
        segments: One dict per segment, each holding whatever keys the
            passes produced for it. The dicts are not modified.
        doc_id: Document id to stamp on the merged result.

    Returns:
        A new dict with every doc-level key present (None or [] when no
        segment supplied a value).
    """
    out: dict = {
        "doc_id": doc_id,
        "doc_classification": None,
        "doc_classification_note": None,
        "doc_period": None,
        "primary_topics": [],
        "noise_emission": None,
        "investigative_value": None,
        "doc_summary_en": None,
        "doc_summary_pt_br": None,
        "events": [],
        "people": [],
        "organizations": [],
        "locations": [],
        "relations": [],
    }
    noise_order = {"none": 0, "low": 1, "medium": 2, "high": 3}
    inv_order = {"none": 0, "low": 1, "medium": 2, "high": 3, "critical": 4}
    conf_order = {"low": 0, "medium": 1, "high": 2}

    def lower_key(s) -> str:
        return str(s or "").strip().lower()

    def str_items(value) -> list:
        # Normalise a field that should be a list of strings: tolerate a bare
        # string, drop anything that is not a string.
        if isinstance(value, str):
            return [value]
        if isinstance(value, (list, tuple)):
            return [v for v in value if isinstance(v, str)]
        return []

    def absorb(existing: dict, item: dict) -> None:
        # Fold a duplicate into the kept record: union evidence, keep the
        # higher confidence.
        ev = set(str_items(existing.get("evidence_chunks"))) | set(str_items(item.get("evidence_chunks")))
        existing["evidence_chunks"] = sorted(ev)
        new_conf, old_conf = item.get("confidence"), existing.get("confidence")
        new_rank = conf_order.get(new_conf, 0) if isinstance(new_conf, str) else 0
        old_rank = conf_order.get(old_conf, 0) if isinstance(old_conf, str) else 0
        if new_rank > old_rank:
            existing["confidence"] = new_conf

    # Top-level pickup from first non-null
    for seg in segments:
        for k in ("doc_classification", "doc_classification_note", "doc_period",
                  "doc_summary_en", "doc_summary_pt_br"):
            if not out.get(k) and seg.get(k):
                out[k] = seg[k]
    # primary_topics: union, preserve first-seen order
    seen_topics: set = set()
    for seg in segments:
        for t in str_items(seg.get("primary_topics")):
            if t and t not in seen_topics:
                seen_topics.add(t)
                out["primary_topics"].append(t)
    # noise / investigative — take the highest-ranked value across segments
    noise_max, inv_max = None, None
    for seg in segments:
        n = seg.get("noise_emission")
        if isinstance(n, str) and n in noise_order and (
                noise_max is None or noise_order[n] > noise_order[noise_max]):
            noise_max = n
        i = seg.get("investigative_value")
        if isinstance(i, str) and i in inv_order and (
                inv_max is None or inv_order[i] > inv_order[inv_max]):
            inv_max = i
    out["noise_emission"] = noise_max
    out["investigative_value"] = inv_max

    # Entities — dedup by canonical key
    def merge_list(key: str, key_fn):
        merged: dict = {}
        for seg in segments:
            for item in seg.get(key) or []:
                if not isinstance(item, dict): continue
                k = key_fn(item)
                if not k: continue
                existing = merged.get(k)
                if existing is None:
                    # Shallow copy so the caller's segment dicts stay untouched.
                    merged[k] = dict(item)
                    continue
                absorb(existing, item)
                # Only records that carry aliases get the field; merged events
                # must not grow a spurious empty aliases_in_doc.
                if "aliases_in_doc" in existing or "aliases_in_doc" in item:
                    al = set(str_items(existing.get("aliases_in_doc"))) | set(str_items(item.get("aliases_in_doc")))
                    name = existing.get("name")
                    if isinstance(name, str):
                        al.discard(name)
                    existing["aliases_in_doc"] = sorted(al)
        return list(merged.values())

    def event_key(e: dict):
        # A tuple is always truthy, so a blank label has to be rejected here;
        # otherwise all unlabeled events would be fused into one record.
        label = lower_key(e.get("label"))
        if not label:
            return None
        return (label, str(e.get("date_start") or ""))

    out["events"] = merge_list("events", event_key)
    out["people"] = merge_list("people",
        lambda p: lower_key(p.get("name")))
    out["organizations"] = merge_list("organizations",
        lambda o: lower_key(o.get("name")))
    out["locations"] = merge_list("locations",
        lambda l: lower_key(l.get("name")))
    # Relations: dedup by (source_class, source_name_lower, type, target_class, target_name_lower)
    rel_seen: dict = {}
    for seg in segments:
        for r in seg.get("relations") or []:
            if not isinstance(r, dict): continue
            key = (
                r.get("source_class"),
                lower_key(r.get("source_name")),
                r.get("type"),
                r.get("target_class"),
                lower_key(r.get("target_name")),
            )
            existing = rel_seen.get(key)
            if existing is None:
                rel_seen[key] = dict(r)
            else:
                absorb(existing, r)
    out["relations"] = list(rel_seen.values())
    return out
 def main() -> int:
    """Re-extract one document end to end.

    Steps: build the doc text, split it into segments, run every
    (segment, pass) combination through the claude CLI on a thread pool,
    merge the pieces, write ``_reextract.json`` next to the document's
    chunks and run the validator on it.

    A pass whose response is not a parseable JSON object is recorded as a
    failure: its raw response is saved to
    ``_reextract_raw_seg<N>_<pass>.txt`` in the same directory, the failure
    is listed on stderr at the end, and the exit status is non-zero. The
    merged file is still written from the passes that succeeded.

    Environment:
        REEXTRACT_INNER_MAX: maximum concurrent CLI calls (default 5).

    Returns:
        The validator's exit status, or 1 when the validator passed but at
        least one extraction pass failed.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: run.py <doc-id>")
    doc_id = sys.argv[1]
    out_path = RAW / f"{doc_id}--subagent" / "_reextract.json"
    print("[1/N] Building doc text ...")
    doc_text = build_doc_text(doc_id)
    print(f"      {len(doc_text)} chars (~{len(doc_text) // 4} tokens)")
    segments = segment_doc(doc_text)
    n_seg = len(segments)
    print(f"[2/N] Splitting into {n_seg} segment(s) of ~{SEGMENT_INPUT_CHARS // 1000}k chars each")
    for i, s in enumerate(segments, 1):
        print(f"        segment {i}: {len(s)} chars")
    from concurrent.futures import ThreadPoolExecutor, as_completed

    def run_pass(seg_idx: int, seg_text: str, pass_name: str,
                 pass_instr: str) -> tuple[int, str, dict | None, str, str]:
        # Returns (seg_idx, pass_name, piece, error, raw). On success error
        # and raw are empty; on failure piece is None and raw holds the
        # unparsed response for debugging.
        meta_label = f"\n\n[SEGMENT {seg_idx + 1} OF {n_seg}] — extract everything in THIS segment exhaustively.\n"
        prompt = build_full_prompt(doc_id, seg_text, segment_meta=meta_label,
                                   pass_instruction=pass_instr)
        raw, _meta = call_claude(prompt)
        json_text = extract_json_block(raw)
        try:
            piece = json.loads(json_text)
        except json.JSONDecodeError as e:
            return (seg_idx, pass_name, None, f"{e} | raw_len={len(raw)}", raw)
        if not isinstance(piece, dict):
            # Valid JSON but not an object: nothing can be merged from it.
            return (seg_idx, pass_name, None,
                    f"expected a JSON object, got {type(piece).__name__} | raw_len={len(raw)}",
                    raw)
        return (seg_idx, pass_name, piece, "", "")

    # Fire ALL (segment, pass) jobs in parallel
    extracted: list[dict] = [{"doc_id": doc_id} for _ in range(n_seg)]
    n_jobs = n_seg * len(PASSES)
    print(f"[3/N] Firing {n_jobs} parallel passes ({n_seg} segments × {len(PASSES)} passes)")
    errors: list[str] = []
    completed = 0
    # Cap inner concurrency so outer-WORKERS × inner doesn't fork-bomb the box.
    # Clamped to at least 1 because ThreadPoolExecutor rejects max_workers <= 0.
    INNER_MAX = max(1, int(os.environ.get("REEXTRACT_INNER_MAX", "5")))
    with ThreadPoolExecutor(max_workers=min(INNER_MAX, n_jobs)) as pool:
        futures = []
        for seg_idx, seg in enumerate(segments):
            for pass_name, pass_instr in PASSES:
                futures.append(pool.submit(run_pass, seg_idx, seg, pass_name, pass_instr))
        for fut in as_completed(futures):
            seg_idx, pass_name, piece, err, raw = fut.result()
            completed += 1
            tag = f"seg{seg_idx+1}/{pass_name}"
            if err:
                errors.append(f"{tag}: {err}")
                # Keep the unparsed response so the failure can be inspected.
                debug = out_path.parent / f"_reextract_raw_seg{seg_idx+1}_{pass_name}.txt"
                try:
                    debug.parent.mkdir(parents=True, exist_ok=True)
                    debug.write_text(raw, encoding="utf-8")
                except OSError as exc:
                    print(f"      could not write {debug}: {exc}", file=sys.stderr)
                print(f"  [{completed}/{n_jobs}] {tag} FAILED — {err[:120]}")
            else:
                for k, v in piece.items():
                    extracted[seg_idx][k] = v
                print(f"  [{completed}/{n_jobs}] {tag} OK")
    print(f"[4/N] Merging {n_seg} extraction(s) ...")
    merged = merge_extractions(extracted, doc_id) if n_seg > 1 else {**extracted[0], "doc_id": doc_id}
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(merged, indent=2, ensure_ascii=False), encoding="utf-8")
    print(f"      saved {out_path}")
    print("[5/N] Validating ...")
    v = subprocess.run(
        ["python3", str(VALIDATE), doc_id, str(out_path)],
        capture_output=True, text=True, encoding="utf-8",
    )
    print(v.stdout.strip())
    if v.stderr and v.stderr.strip():
        print(v.stderr.strip(), file=sys.stderr)
    if errors:
        # Failed passes mean the saved JSON is missing whole categories;
        # make that visible and reflect it in the exit status.
        print(f"[!] {len(errors)} of {n_jobs} pass(es) failed — output is INCOMPLETE:",
              file=sys.stderr)
        for line in errors:
            print(f"      {line}", file=sys.stderr)
        return v.returncode or 1
    return v.returncode
 # Script entry point: main()'s return value becomes the process exit status.
 if __name__ == "__main__":
    sys.exit(main())
--- a/scripts/reextract/run_parallel.sh
+++ b/scripts/reextract/run_parallel.sh
@ -0,0 +1,97 @@
 #!/usr/bin/env bash
 # Parallel re-extraction orchestrator.
 #
 # - Lists every doc that has raw/<doc>--subagent/_index.json
 # - Skips docs that already have _reextract.json (idempotent)
 # - Uses an mkdir-based per-doc lock to prevent two workers from racing
 # - Runs N workers in parallel (default 4, override via WORKERS=N)
 # - Logs each doc to raw/<doc>--subagent/_reextract.log
 #
 # Run:
 #   ./run_parallel.sh                 # all docs, 4 workers (the default)
 #   WORKERS=8 ./run_parallel.sh       # 8 workers
 #   ./run_parallel.sh DOC1 DOC2       # specific docs only
 # -u: referencing an unset variable is an error; pipefail: a pipeline fails if
 # any stage fails. Note that -e is not set, so a failing command does not
 # abort the script.
 set -uo pipefail
 # Project root and the paths derived from it.
 UFO="/Users/guto/ufo"
 RAW="$UFO/raw"
 RUN="$UFO/scripts/reextract/run.py"
 # Number of parallel workers; taken from the environment when provided.
 WORKERS="${WORKERS:-4}"
 # Resolve the work list: with no CLI arguments, discover every
 # raw/<doc>--subagent directory that carries an _index.json; otherwise use
 # the doc IDs given on the command line as-is.
 if [ "$#" -eq 0 ]; then
  DOCS=()
  for d in "$RAW"/*--subagent; do
    if [ -f "$d/_index.json" ]; then
      # Directory basename minus the --subagent suffix is the doc ID.
      doc_id=${d##*/}
      doc_id=${doc_id%--subagent}
      DOCS+=("$doc_id")
    fi
  done
 else
  DOCS=("$@")
 fi
 printf '%s\n' \
  "=== Re-extract orchestrator ===" \
  "  docs queued: ${#DOCS[@]}" \
  "  workers:     $WORKERS" \
  ""
 # process_one DOC_ID
 #   Re-extract a single document by running $RUN on it, capturing all output in
 #   raw/<doc>--subagent/_reextract.log. Prints one status line per outcome
 #   ([MISS]/[SKIP]/[LOCK], or [BEGIN] followed by [OK]/[FAIL]) and returns 0 in
 #   every case, including failure.
 process_one() {
  local doc_id="$1"
  local sub="$RAW/$doc_id--subagent"
  local out="$sub/_reextract.json"
  local log="$sub/_reextract.log"
  local lock="$sub/.reextract.lock"
  # Declared separately from their assignments so `local` does not mask the
  # exit status of the command substitutions.
  local started elapsed
  # Without its working directory the lock mkdir below would fail and the doc
  # would be misreported as locked by another worker; say what is really wrong.
  if [ ! -d "$sub" ]; then
    echo "[MISS] $doc_id (no directory $sub)"
    return 0
  fi
  # Skip if already extracted and the output parses as JSON. The path is passed
  # to Python through argv rather than spliced into the source text, so quotes
  # or other special characters in the path cannot break (or alter) the check.
  if [ -f "$out" ] && python3 -c 'import json, sys; json.load(open(sys.argv[1], encoding="utf-8"))' "$out" 2>/dev/null; then
    echo "[SKIP] $doc_id (already extracted)"
    return 0
  fi
  # Acquire lock via mkdir (atomic)
  if ! mkdir "$lock" 2>/dev/null; then
    echo "[LOCK] $doc_id (another worker has it)"
    return 0
  fi
  # Release the lock if the shell exits early. %q shell-quotes the path so the
  # trap body stays well-formed whatever characters the doc ID contains.
  trap "rmdir $(printf '%q' "$lock") 2>/dev/null || true" EXIT
  started=$(date +%s)
  echo "[BEGIN] $doc_id"
  if python3 "$RUN" "$doc_id" > "$log" 2>&1; then
    elapsed=$(($(date +%s) - started))
    echo "[OK]    $doc_id (${elapsed}s)"
  else
    elapsed=$(($(date +%s) - started))
    echo "[FAIL]  $doc_id (${elapsed}s) — see $log"
  fi
  # Normal path: release the lock ourselves and disarm the EXIT trap.
  rmdir "$lock" 2>/dev/null || true
  trap - EXIT
 }
 # Child `bash -c` workers need the function and the paths it reads.
 export -f process_one
 export RAW RUN
 # Fan the doc IDs out to $WORKERS concurrent bash workers via xargs.
 # - NUL-delimited input with -0 passes each ID through verbatim (no quote or
 #   backslash interpretation by xargs).
 # - -I {} already runs one command per input item; GNU xargs treats -n and -I
 #   as mutually exclusive, so -n 1 is dropped.
 # - The count guard avoids feeding xargs an empty item when nothing is queued
 #   (and avoids expanding an empty array under `set -u` on older bash).
 if [ "${#DOCS[@]}" -gt 0 ]; then
  printf '%s\0' "${DOCS[@]}" | xargs -0 -P "$WORKERS" -I {} bash -c 'process_one "$@"' _ {}
 fi
 echo ""
 echo "=== Done. Summary: ==="
 # A doc counts as OK only when its _reextract.json exists and parses as JSON;
 # a missing or unparsable output counts as FAIL.
 ok=0; fail=0
 # ${DOCS[@]+...} expands to nothing for an empty list, which keeps `set -u`
 # from rejecting the expansion on bash versions older than 4.4.
 for d in ${DOCS[@]+"${DOCS[@]}"}; do
  out="$RAW/$d--subagent/_reextract.json"
  # The path goes to Python via argv so special characters in it are harmless.
  if [ -f "$out" ] && python3 -c 'import json, sys; json.load(open(sys.argv[1], encoding="utf-8"))' "$out" 2>/dev/null; then
    ok=$((ok + 1))
  else
    fail=$((fail + 1))
  fi
 done
 echo "  OK:   $ok"
 echo "  FAIL: $fail"
--- a/scripts/reextract/validate.py
+++ b/scripts/reextract/validate.py
@ -0,0 +1,169 @@
 #!/usr/bin/env python3
 """
 validate.py — Validate a Sonnet re-extraction JSON against the closed enums
 in enums.yaml. Returns exit 0 if valid; prints errors and exits 1 otherwise.
 Run:
  python3 scripts/reextract/validate.py <doc-id> [<json-path>]
 """
 from __future__ import annotations
 import json
 import re
 import sys
 from pathlib import Path
 import yaml
 REX_DIR = Path("/Users/guto/ufo/scripts/reextract")
 OUT_DIR = Path("/Users/guto/ufo/raw")
 def load_enums() -> dict[str, set[str]]:
    """Parse REX_DIR/enums.yaml and return each top-level key mapped to the set of its values."""
    enums_text = (REX_DIR / "enums.yaml").read_text(encoding="utf-8")
    parsed = yaml.safe_load(enums_text)
    allowed: dict[str, set[str]] = {}
    for enum_name, members in parsed.items():
        allowed[enum_name] = set(members)
    return allowed
 def validate(data: dict, enums: dict[str, set[str]], doc_id: str) -> list[str]:
    """Check one re-extraction payload against the closed enums and structural rules.

    Args:
        data: Parsed ``_reextract.json`` content; expected to be a JSON object.
        enums: Enum name mapped to its set of allowed values.
        doc_id: Document id the payload must carry in its ``doc_id`` field.

    Returns:
        Human-readable error strings; an empty list means the payload is valid.

    Malformed values (a list or dict where an enum scalar is expected, a
    non-string date, a non-object payload) are reported as validation errors
    rather than raising.
    """
    errs: list[str] = []
    if not isinstance(data, dict):
        return [f"top: not object (got {type(data).__name__})"]
    date_re = re.compile(r"^\d{4}(-\d{2}(-\d{2})?)?$|^XXXX(-XX(-XX)?)?$")
    def is_member(value, allowed) -> bool:
        # Unhashable values (lists/dicts) cannot be looked up in a set; treat
        # them as "not a member" so they surface as ordinary validation errors.
        try:
            return value in allowed
        except TypeError:
            return False
    def check_enum(value, enum_name: str, ctx: str):
        # None means "field absent" and is always accepted.
        if value is None:
            return
        if not is_member(value, enums.get(enum_name, set())):
            errs.append(f"{ctx}: '{value}' not in enum:{enum_name}")
    def check_list_enum(values, enum_name: str, ctx: str):
        # Non-list values (including a missing field) are silently ignored.
        if not isinstance(values, list):
            return
        for i, v in enumerate(values):
            check_enum(v, enum_name, f"{ctx}[{i}]")
    # top-level
    if data.get("doc_id") != doc_id:
        errs.append(f"top: doc_id mismatch: '{data.get('doc_id')}' != '{doc_id}'")
    check_enum(data.get("doc_classification"), "doc_classification", "top.doc_classification")
    check_enum(data.get("noise_emission"), "noise_emission", "top.noise_emission")
    check_enum(data.get("investigative_value"), "investigative_value", "top.investigative_value")
    check_list_enum(data.get("primary_topics"), "primary_topics", "top.primary_topics")
    # known chunk IDs from index — to verify evidence_chunks exist
    idx_path = OUT_DIR / f"{doc_id}--subagent" / "_index.json"
    known_chunks: set[str] = set()
    if idx_path.is_file():
        try:
            idx = json.loads(idx_path.read_text(encoding="utf-8"))
            known_chunks = {c.get("chunk_id") for c in idx.get("chunks", [])}
        except Exception:
            # Best effort: with an unreadable index, chunk existence is simply
            # not verified (known_chunks stays empty).
            pass
    def check_evidence(refs, ctx: str):
        if not isinstance(refs, list):
            errs.append(f"{ctx}: evidence_chunks must be list")
            return
        if not refs:
            errs.append(f"{ctx}: evidence_chunks empty")
            return
        for r in refs:
            if not isinstance(r, str) or not re.match(r"^c\d+$", r):
                errs.append(f"{ctx}: bad chunk_id '{r}'")
            elif known_chunks and r not in known_chunks:
                errs.append(f"{ctx}: unknown chunk_id '{r}' (not in _index.json)")
    # events
    for i, ev in enumerate(data.get("events") or []):
        ctx = f"events[{i}]"
        if not isinstance(ev, dict):
            errs.append(f"{ctx}: not object"); continue
        check_enum(ev.get("event_class"), "event_class", f"{ctx}.event_class")
        check_enum(ev.get("date_confidence"), "date_confidence", f"{ctx}.date_confidence")
        check_enum(ev.get("primary_location_geo_class"), "geo_class", f"{ctx}.primary_location_geo_class")
        check_enum(ev.get("confidence"), "confidence", f"{ctx}.confidence")
        check_evidence(ev.get("evidence_chunks"), ctx)
        for j, o in enumerate(ev.get("observers") or []):
            check_enum(o.get("role_at_event") if isinstance(o, dict) else None,
                       "person_class", f"{ctx}.observers[{j}].role_at_event")
        for j, u in enumerate(ev.get("uap_objects_observed") or []):
            if not isinstance(u, dict): continue
            check_enum(u.get("shape"), "uap_shape", f"{ctx}.uap[{j}].shape")
            check_enum(u.get("color"), "uap_color", f"{ctx}.uap[{j}].color")
            check_enum(u.get("medium"), "uap_medium", f"{ctx}.uap[{j}].medium")
        # date format: YYYY, YYYY-MM, YYYY-MM-DD, or the XXXX placeholder forms.
        # A non-string value (e.g. a bare numeric year) is a format error too.
        for k in ("date_start", "date_end"):
            v = ev.get(k)
            if v and not (isinstance(v, str) and date_re.match(v)):
                errs.append(f"{ctx}.{k}: bad date format '{v}'")
    # people
    for i, p in enumerate(data.get("people") or []):
        ctx = f"people[{i}]"
        if not isinstance(p, dict):
            errs.append(f"{ctx}: not object"); continue
        check_enum(p.get("person_class"), "person_class", f"{ctx}.person_class")
        check_enum(p.get("confidence"), "confidence", f"{ctx}.confidence")
        check_evidence(p.get("evidence_chunks"), ctx)
    # organizations
    for i, o in enumerate(data.get("organizations") or []):
        ctx = f"organizations[{i}]"
        if not isinstance(o, dict):
            errs.append(f"{ctx}: not object"); continue
        check_enum(o.get("org_class"), "org_class", f"{ctx}.org_class")
        check_enum(o.get("confidence"), "confidence", f"{ctx}.confidence")
        check_evidence(o.get("evidence_chunks"), ctx)
    # locations
    for i, l in enumerate(data.get("locations") or []):
        ctx = f"locations[{i}]"
        if not isinstance(l, dict):
            errs.append(f"{ctx}: not object"); continue
        check_enum(l.get("geo_class"), "geo_class", f"{ctx}.geo_class")
        check_enum(l.get("confidence"), "confidence", f"{ctx}.confidence")
        check_evidence(l.get("evidence_chunks"), ctx)
    # relations
    valid_classes = {"person", "event", "organization", "location", "uap_object", "document"}
    # Rendered in sorted order so the error text is identical from run to run
    # (a raw set repr varies with string hash randomisation).
    valid_classes_text = "{" + ", ".join(repr(c) for c in sorted(valid_classes)) + "}"
    for i, r in enumerate(data.get("relations") or []):
        ctx = f"relations[{i}]"
        if not isinstance(r, dict):
            errs.append(f"{ctx}: not object"); continue
        check_enum(r.get("type"), "relation_type", f"{ctx}.type")
        check_enum(r.get("confidence"), "confidence", f"{ctx}.confidence")
        check_evidence(r.get("evidence_chunks"), ctx)
        # Unlike the enum fields, both endpoint classes are mandatory: None fails.
        for k in ("source_class", "target_class"):
            v = r.get(k)
            if not is_member(v, valid_classes):
                errs.append(f"{ctx}.{k}: '{v}' not in {valid_classes_text}")
    return errs
 def main() -> int:
    """Command-line entry point: validate one document's re-extraction JSON.

    Usage: ``validate.py <doc-id> [<json-path>]``. Without an explicit path the
    file defaults to ``OUT_DIR/<doc-id>--subagent/_reextract.json``. Returns 0
    when the payload is valid and 1 after printing validation errors; a usage
    error, a missing file or a JSON parse error ends the process through
    ``sys.exit`` with a message.
    """
    args = sys.argv[1:]
    if not args:
        sys.exit("usage: validate.py <doc-id> [<json-path>]")
    doc_id = args[0]
    if len(args) > 1:
        target = Path(args[1])
    else:
        target = Path(str(OUT_DIR / f"{doc_id}--subagent" / "_reextract.json"))
    if not target.is_file():
        sys.exit(f"json not found: {target}")
    try:
        data = json.loads(target.read_text(encoding="utf-8"))
    except json.JSONDecodeError as e:
        sys.exit(f"JSON parse error: {e}")
    errs = validate(data, load_enums(), doc_id)
    if errs:
        # Only the first 50 problems are listed; the rest are summarised.
        shown = errs[:50]
        hidden = len(errs) - len(shown)
        print(f"❌ {len(errs)} validation errors for {doc_id}:")
        for message in shown:
            print(f"  - {message}")
        if hidden > 0:
            print(f"  ... +{hidden} more")
        return 1
    print(f"✓ valid: {doc_id}")
    summary_rows = (
        ("events:", "events"),
        ("people:", "people"),
        ("orgs:  ", "organizations"),
        ("locs:  ", "locations"),
        ("rels:  ", "relations"),
    )
    for label, key in summary_rows:
        print(f"  {label} {len(data.get(key) or [])}")
    return 0
 if __name__ == "__main__":
    sys.exit(main())
--- a/scripts/synthesize/30_rebuild_wiki_from_reextract.py
+++ b/scripts/synthesize/30_rebuild_wiki_from_reextract.py
@ -0,0 +1,447 @@
 #!/usr/bin/env python3
 """
 30_rebuild_wiki_from_reextract.py — Rebuild wiki/entities/ from scratch using
 the 116 _reextract.json files as the SOLE source of truth.
 Pipeline:
  1. Load every raw/<doc>--subagent/_reextract.json
  2. Load every raw/<doc>--subagent/_index.json (chunk_id → page map)
  3. Cross-doc dedup:
       person/org/loc:  by canonical_name (lowercase, ASCII-fold)
       event:           by event_id (EV-YYYY-MM-DD-slug)
       uap_object:      per (event, observed_index) — never deduped cross-event
  4. Generate IDs per CLAUDE.md regex
  5. Write wiki/entities/{type}/<id>.md (clean frontmatter + EN/PT-BR body stubs)
  6. Print summary
 Does NOT touch DB. DB sync is a separate step.
 Idempotent: re-running with same inputs produces same outputs (deterministic).
 """
 from __future__ import annotations
 import json
 import re
 import sys
 import unicodedata
 from collections import defaultdict
 from datetime import datetime, timezone
 from pathlib import Path
 import yaml
 UFO = Path("/Users/guto/ufo")
 RAW = UFO / "raw"
 ENT = UFO / "wiki" / "entities"
 SCHEMA_VERSION = "0.1.0"
 WIKI_VERSION = "0.1.0"
 NOW = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
 def canonicalize_name(name: str) -> str:
    """Turn a display name into a kebab-case, ASCII-folded identifier.

    Accents are removed via NFKD decomposition, the text is lower-cased, every
    character outside ``[a-z0-9-]`` becomes a dash, dash runs collapse to one and
    edge dashes are trimmed. An id that would start with a digit gets an ``x-``
    prefix. Empty input yields an empty string.
    """
    if not name:
        return ""
    decomposed = unicodedata.normalize("NFKD", name)
    folded = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    dashed = re.sub(r"[^a-z0-9-]", "-", folded.lower())
    slug = re.sub(r"-+", "-", dashed).strip("-")
    return "x-" + slug if slug[:1].isdigit() else slug
 def event_id_from(label: str, date_start: str | None) -> str:
    """Build an ``EV-YYYY-MM-DD-<slug>`` event id from a label and a start date.

    Args:
        label: Event label; canonicalised and truncated to 40 characters for the
            slug (``unlabeled`` when nothing usable remains).
        date_start: ``YYYY``, ``YYYY-MM`` or ``YYYY-MM-DD``. Missing date parts
            become ``XX``; anything else (None, empty, unparsable) yields
            ``XXXX-XX-XX``. Non-string values such as a bare numeric year are
            converted to text first instead of raising.

    Returns:
        The event id string.
    """
    slug = canonicalize_name(label or "")[:40].strip("-") or "unlabeled"
    date = str(date_start).strip() if date_start else ""
    # One pattern covers all three precisions; absent groups come back as None.
    m = re.match(r"^(\d{4})(?:-(\d{2})(?:-(\d{2}))?)?$", date)
    if not m:
        return f"EV-XXXX-XX-XX-{slug}"
    year = m.group(1)
    month = m.group(2) or "XX"
    day = m.group(3) or "XX"
    return f"EV-{year}-{month}-{day}-{slug}"
 def uap_object_id(event_id: str, index: int) -> str:
    """Derive an object id from its parent event id and a position index.

    For an ``EV-<year>-<mm>-<dd>-<slug>`` id the result is
    ``OBJ-EV<year>-<SLUG>-<index>``, where SLUG is the upper-cased slug with
    everything but A-Z/0-9 removed, cut to 20 characters (``UNK`` if empty) and
    the index is zero-padded to two digits. Any other input gives
    ``OBJ-UNK-<index>``.
    """
    fallback = f"OBJ-UNK-{index:02d}"
    if not event_id.startswith("EV-"):
        return fallback
    fields = event_id[3:].split("-", 4)
    if len(fields) < 4:
        return fallback
    year = fields[0]
    slug_text = "-".join(fields[3:])
    token = re.sub(r"[^A-Z0-9]", "", slug_text.upper())[:20]
    return f"OBJ-EV{year}-{token or 'UNK'}-{index:02d}"
 def dump_yaml(obj: dict) -> str:
    """Serialise ``obj`` as block-style YAML, keeping key order and unicode, without surrounding whitespace."""
    rendered = yaml.safe_dump(
        obj,
        sort_keys=False,
        allow_unicode=True,
        default_flow_style=False,
        width=10_000,
    )
    return rendered.strip()
 def write_entity(path: Path, frontmatter: dict, body_title: str) -> None:
    """Write an entity markdown file: YAML frontmatter, an H1 title and empty EN / PT-BR description headings.

    Parent directories are created when missing; an existing file is overwritten.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    sections = [
        f"---\n{dump_yaml(frontmatter)}\n---\n\n",
        f"# {body_title}\n\n",
        "## Description (EN)\n\n",
        "## Descrição (PT-BR)\n",
    ]
    path.write_text("".join(sections), encoding="utf-8")
 def load_chunk_to_page(doc_id: str) -> dict[str, int]:
    """Return the chunk_id → page mapping read from the document's _index.json.

    Entries lacking a chunk_id or a page are left out. A missing, unreadable or
    malformed index yields an empty mapping.
    """
    idx_path = RAW / f"{doc_id}--subagent" / "_index.json"
    if not idx_path.is_file():
        return {}
    try:
        index = json.loads(idx_path.read_text(encoding="utf-8"))
        mapping = {}
        for entry in index.get("chunks") or []:
            chunk_id = entry.get("chunk_id")
            if chunk_id and entry.get("page") is not None:
                mapping[chunk_id] = entry.get("page")
        return mapping
    except Exception:
        return {}
 def page_refs_for(doc_id: str, chunks: list[str], chunk_to_page: dict[str, int]) -> list[str]:
    """Map chunk ids to de-duplicated ``[[<doc_id>/pNNN]]`` links, sorted by page number.

    Chunks absent from ``chunk_to_page`` are ignored; a falsy ``chunks`` gives [].
    """
    looked_up = (chunk_to_page.get(chunk_id) for chunk_id in chunks or [])
    unique_pages = {int(page) for page in looked_up if page is not None}
    refs = []
    for number in sorted(unique_pages):
        refs.append(f"[[{doc_id}/p{number:03d}]]")
    return refs
 # ─────────────────────────────────────────────────────────────────────────────
 # AGGREGATION
 # ─────────────────────────────────────────────────────────────────────────────
 class EntityBucket:
    """Accumulator for a single entity as it is encountered in several documents."""
    __slots__ = ("ent_id", "canonical_name", "aliases", "first_class",
                 "by_doc", "extra")
    def __init__(self, ent_id: str, canonical_name: str):
        self.ent_id = ent_id
        self.canonical_name = canonical_name
        # Every surface form seen for this entity (names, labels, in-doc aliases).
        self.aliases: set[str] = set()
        # First non-empty class reported for the entity; later ones are ignored.
        self.first_class: str | None = None
        # doc_id → {"chunks": sorted evidence chunk ids, "raw": first raw entity seen in that doc}
        self.by_doc: dict[str, dict] = {}
        # Free-form, type-specific scratch space.
        self.extra: dict = {}
    def add_occurrence(self, doc_id: str, raw_entity: dict, ent_class: str | None):
        """Record one sighting of the entity in ``doc_id``.

        Keeps the first non-empty class, collects the name/label and in-doc
        aliases, and merges the entity's evidence chunks into the per-document
        record (the raw entity stored is the first one seen for that document).
        """
        if ent_class and self.first_class is None:
            self.first_class = ent_class
        display = raw_entity.get("name") or raw_entity.get("label")
        if display:
            self.aliases.add(display.strip())
        for alias in raw_entity.get("aliases_in_doc") or []:
            if alias and alias.strip():
                self.aliases.add(alias.strip())
        record = self.by_doc.setdefault(doc_id, {"chunks": [], "raw": raw_entity})
        evidence = raw_entity.get("evidence_chunks") or []
        record["chunks"] = sorted(set(record["chunks"]) | set(evidence))
 def merge_dates(buckets: dict[str, EntityBucket], get_date) -> dict[str, dict]:
    """Collect, per bucket key, the set of truthy dates that ``get_date`` extracts from each per-document raw entity.

    Returns ``{key: {"dates": {...}}}``; buckets that yield no date are omitted.
    """
    collected: dict[str, dict] = {}
    for key, bucket in buckets.items():
        for occurrence in bucket.by_doc.values():
            found = get_date(occurrence["raw"])
            if not found:
                continue
            collected.setdefault(key, {}).setdefault("dates", set()).add(found)
    return collected
 def aggregate_all() -> dict:
    """Walk every raw/<doc>--subagent/_reextract.json and aggregate entities across documents.

    Returns:
        A dict with ``docs_processed`` (int), ``people`` / ``organizations`` /
        ``locations`` / ``events`` / ``uap_objects`` (id → EntityBucket),
        ``relations`` (raw relation dicts, each tagged with its ``doc_id``) and
        ``chunk_maps`` (doc_id → chunk_id → page).

    Unreadable files and malformed entries (a non-object payload or entity, a
    non-string name or label) are skipped instead of aborting the rebuild.
    """
    people: dict[str, EntityBucket] = {}
    orgs:   dict[str, EntityBucket] = {}
    locs:   dict[str, EntityBucket] = {}
    events: dict[str, EntityBucket] = {}
    # Keyed by the id derived from (event id, position within the event), so
    # the same event seen in several documents shares its object buckets.
    uap_objs: dict[str, EntityBucket] = {}
    relations: list[dict] = []
    docs_processed = 0
    chunk_maps: dict[str, dict[str, int]] = {}
    # (JSON key, target registry, field holding the entity class) for the three
    # entity kinds that are deduplicated by canonicalised name.
    named_kinds = (
        ("people", people, "person_class"),
        ("organizations", orgs, "org_class"),
        ("locations", locs, "geo_class"),
    )
    for jpath in sorted(RAW.glob("*--subagent/_reextract.json")):
        doc_id = jpath.parent.name.removesuffix("--subagent")
        try:
            data = json.loads(jpath.read_text(encoding="utf-8"))
        except Exception as e:
            print(f"  skip {doc_id}: {e}", file=sys.stderr); continue
        if not isinstance(data, dict):
            print(f"  skip {doc_id}: top-level JSON is not an object", file=sys.stderr); continue
        docs_processed += 1
        chunk_maps[doc_id] = load_chunk_to_page(doc_id)
        # people / organizations / locations
        for json_key, registry, class_field in named_kinds:
            for item in data.get(json_key) or []:
                if not isinstance(item, dict): continue
                raw_name = item.get("name")
                name = raw_name.strip() if isinstance(raw_name, str) else ""
                if not name or name.lower() == "unknown": continue
                ent_id = canonicalize_name(name)
                if not ent_id: continue
                bucket = registry.get(ent_id)
                if bucket is None:
                    bucket = registry[ent_id] = EntityBucket(ent_id, name)
                bucket.add_occurrence(doc_id, item, item.get(class_field))
        # events
        for e in data.get("events") or []:
            if not isinstance(e, dict): continue
            raw_label = e.get("label")
            label = raw_label.strip() if isinstance(raw_label, str) else ""
            if not label: continue
            eid = event_id_from(label, e.get("date_start"))
            bucket = events.get(eid)
            if bucket is None:
                bucket = events[eid] = EntityBucket(eid, label)
            bucket.add_occurrence(doc_id, e, e.get("event_class"))
            # uap_objects — never cross-event-deduped; inherit parent event's evidence_chunks
            event_chunks = e.get("evidence_chunks") or []
            for i, u in enumerate(e.get("uap_objects_observed") or [], 1):
                if not isinstance(u, dict): continue
                uid = uap_object_id(eid, i)
                ubucket = uap_objs.get(uid)
                if ubucket is None:
                    ubucket = uap_objs[uid] = EntityBucket(uid, f"{label} — object {i}")
                u_with_evidence = {**u, "evidence_chunks": u.get("evidence_chunks") or event_chunks}
                ubucket.add_occurrence(doc_id, u_with_evidence, u.get("shape"))
                ubucket.extra.setdefault("event_id", eid)
        # relations — collected raw, mapped to canonical IDs later
        for r in data.get("relations") or []:
            if not isinstance(r, dict): continue
            relations.append({"doc_id": doc_id, **r})
    return {
        "docs_processed": docs_processed,
        "people": people, "organizations": orgs, "locations": locs,
        "events": events, "uap_objects": uap_objs,
        "relations": relations, "chunk_maps": chunk_maps,
    }
 # ─────────────────────────────────────────────────────────────────────────────
 # WRITERS
 # ─────────────────────────────────────────────────────────────────────────────
 def write_person(b: EntityBucket, chunk_maps: dict) -> None:
    """Write the markdown entity file for one aggregated person bucket.

    The frontmatter carries the aliases, the first-seen person_class, the
    distinct affiliations and roles, wiki page references and mention counts.
    The file is written to ENT/people/<ent_id>.md.
    """
    def distinct(field: str) -> list:
        # Stripped, de-duplicated, sorted non-empty values of `field` across documents.
        found = set()
        for occ in b.by_doc.values():
            value = occ["raw"].get(field)
            if value:
                found.add(value.strip())
        found.discard("")
        return sorted(found)
    page_links = set()
    for doc_id, occ in b.by_doc.items():
        page_links.update(page_refs_for(doc_id, occ["chunks"], chunk_maps.get(doc_id, {})))
    mention_total = 0
    for occ in b.by_doc.values():
        mention_total += len(occ["chunks"])
    fm = {
        "schema_version": SCHEMA_VERSION,
        "type": "entity",
        "entity_class": "person",
        "person_id": b.ent_id,
        "canonical_name": b.canonical_name,
        "aliases": sorted(b.aliases),
        "person_class": b.first_class,
        "affiliations": distinct("affiliation"),
        "roles": distinct("role_at_doc_date"),
        "mentioned_in": sorted(page_links),
        "total_mentions": mention_total,
        "documents_count": len(b.by_doc),
        "enrichment_status": "none",
        "last_ingest": NOW,
        "wiki_version": WIKI_VERSION,
        "source": "reextract-v1",
    }
    target = ENT / "people" / f"{b.ent_id}.md"
    write_entity(target, fm, b.canonical_name)
 def write_org(b: EntityBucket, chunk_maps: dict) -> None:
    """Write the markdown entity file for one aggregated organization bucket.

    The frontmatter lists the aliases, the first-seen org_class, the distinct
    countries, wiki page references and mention counts. The file is written to
    ENT/organizations/<ent_id>.md.
    """
    occurrences = list(b.by_doc.items())
    refs = set()
    for doc_id, occ in occurrences:
        for ref in page_refs_for(doc_id, occ["chunks"], chunk_maps.get(doc_id, {})):
            refs.add(ref)
    country_names = set()
    for _, occ in occurrences:
        country = occ["raw"].get("country")
        if country:
            country_names.add(country.strip())
    country_names.discard("")
    fm = {
        "schema_version": SCHEMA_VERSION,
        "type": "entity",
        "entity_class": "organization",
        "organization_id": b.ent_id,
        "canonical_name": b.canonical_name,
        "aliases": sorted(b.aliases),
        "org_class": b.first_class,
        "countries": sorted(country_names),
        "mentioned_in": sorted(refs),
        "total_mentions": sum(len(occ["chunks"]) for _, occ in occurrences),
        "documents_count": len(b.by_doc),
        "enrichment_status": "none",
        "last_ingest": NOW,
        "wiki_version": WIKI_VERSION,
        "source": "reextract-v1",
    }
    target = ENT / "organizations" / f"{b.ent_id}.md"
    write_entity(target, fm, b.canonical_name)
 def write_location(b: EntityBucket, chunk_maps: dict) -> None:
    """Write the markdown entity file for one aggregated location bucket.

    The frontmatter lists the aliases, the first-seen geo_class, the distinct
    countries and regions/states, wiki page references and mention counts. The
    file is written to ENT/locations/<ent_id>.md.
    """
    raws = [occ["raw"] for occ in b.by_doc.values()]
    def collect(field: str) -> list:
        # Stripped, de-duplicated, sorted non-empty values of `field`.
        values = {raw.get(field).strip() for raw in raws if raw.get(field)}
        values.discard("")
        return sorted(values)
    page_links = set()
    for doc_id, occ in b.by_doc.items():
        page_links |= set(page_refs_for(doc_id, occ["chunks"], chunk_maps.get(doc_id, {})))
    chunk_counts = [len(occ["chunks"]) for occ in b.by_doc.values()]
    fm = {
        "schema_version": SCHEMA_VERSION,
        "type": "entity",
        "entity_class": "location",
        "location_id": b.ent_id,
        "canonical_name": b.canonical_name,
        "aliases": sorted(b.aliases),
        "geo_class": b.first_class,
        "countries": collect("country"),
        "regions_or_states": collect("region_or_state"),
        "mentioned_in": sorted(page_links),
        "total_mentions": sum(chunk_counts),
        "documents_count": len(b.by_doc),
        "enrichment_status": "none",
        "last_ingest": NOW,
        "wiki_version": WIKI_VERSION,
        "source": "reextract-v1",
    }
    target = ENT / "locations" / f"{b.ent_id}.md"
    write_entity(target, fm, b.canonical_name)
 def write_event(b: EntityBucket, chunk_maps: dict) -> None:
    """Write the markdown entity file for one aggregated event bucket.

    Across the documents mentioning the event it keeps the smallest date_start
    and the largest date_end (by sort order), the distinct primary location
    names and geo classes, and the longest narrative summary per language.
    date_confidence is always written as None. The file is written to
    ENT/events/<ent_id>.md.
    """
    raws = [occ["raw"] for occ in b.by_doc.values()]
    def truthy_values(field: str) -> set:
        return {raw.get(field) for raw in raws if raw.get(field)}
    def longest_text(field: str):
        # The first-seen longest stripped value wins; None when all are empty.
        texts = [(raw.get(field) or "").strip() for raw in raws]
        return max(texts, key=len, default="") or None
    page_links = set()
    for doc_id, occ in b.by_doc.items():
        page_links.update(page_refs_for(doc_id, occ["chunks"], chunk_maps.get(doc_id, {})))
    starts = sorted(truthy_values("date_start"))
    ends = sorted(truthy_values("date_end"))
    place_names = {
        raw.get("primary_location_name").strip()
        for raw in raws if raw.get("primary_location_name")
    }
    place_names.discard("")
    geo_classes = truthy_values("primary_location_geo_class")
    geo_classes.discard(None)
    fm = {
        "schema_version": SCHEMA_VERSION,
        "type": "entity",
        "entity_class": "event",
        "event_id": b.ent_id,
        "canonical_name": b.canonical_name,
        "aliases": sorted(b.aliases),
        "event_class": b.first_class,
        "date_start": starts[0] if starts else None,
        "date_end": ends[-1] if ends else None,
        "date_confidence": None,
        "primary_location_names": sorted(place_names),
        "primary_location_geo_classes": sorted(geo_classes),
        "narrative_summary_en": longest_text("narrative_summary"),
        "narrative_summary_pt_br": longest_text("narrative_summary_pt_br"),
        "mentioned_in": sorted(page_links),
        "total_mentions": sum(len(occ["chunks"]) for occ in b.by_doc.values()),
        "documents_count": len(b.by_doc),
        "enrichment_status": "none",
        "last_ingest": NOW,
        "wiki_version": WIKI_VERSION,
        "source": "reextract-v1",
    }
    target = ENT / "events" / f"{b.ent_id}.md"
    write_entity(target, fm, b.canonical_name)
 def write_uap_object(b: EntityBucket, chunk_maps: dict) -> None:
    """Write the markdown entity file for one UAP object bucket.

    The physical attributes (shape, color, medium, size, altitude, speed and
    maneuver notes) come from the first document's raw entry only. The parent
    event id is read from ``b.extra``. The file is written to
    ENT/uap-objects/<ent_id>.md.
    """
    page_links = set()
    for doc_id, occ in b.by_doc.items():
        page_links.update(page_refs_for(doc_id, occ["chunks"], chunk_maps.get(doc_id, {})))
    first_raw = next(iter(b.by_doc.values()))["raw"]
    fm = {
        "schema_version": SCHEMA_VERSION,
        "type": "entity",
        "entity_class": "uap_object",
        "uap_object_id": b.ent_id,
        "canonical_name": b.canonical_name,
        "event_id": b.extra.get("event_id"),
    }
    # Dict insertion order is the order the frontmatter keys are emitted in.
    for attr in ("shape", "color", "medium", "size_estimate_m",
                 "altitude_ft", "speed_kts", "maneuver_notes"):
        fm[attr] = first_raw.get(attr)
    fm["mentioned_in"] = sorted(page_links)
    fm["total_mentions"] = sum(len(occ["chunks"]) for occ in b.by_doc.values())
    fm["documents_count"] = len(b.by_doc)
    fm["last_ingest"] = NOW
    fm["wiki_version"] = WIKI_VERSION
    fm["source"] = "reextract-v1"
    target = ENT / "uap-objects" / f"{b.ent_id}.md"
    write_entity(target, fm, b.canonical_name)
 def main():
    """Rebuild the entity markdown files and the relations index from all _reextract.json files.

    Steps: aggregate every document's extraction, write one markdown file per
    aggregated entity, then save the raw relations to ENT/_relations.json.
    Progress and counts are printed to stdout.
    """
    print(f"[1/3] Aggregating from {RAW}/*--subagent/_reextract.json ...")
    agg = aggregate_all()
    print(f"      docs processed:  {agg['docs_processed']}")
    print(f"      unique people:   {len(agg['people'])}")
    print(f"      unique orgs:     {len(agg['organizations'])}")
    print(f"      unique locs:     {len(agg['locations'])}")
    print(f"      unique events:   {len(agg['events'])}")
    print(f"      uap objects:     {len(agg['uap_objects'])}")
    print(f"      raw relations:   {len(agg['relations'])}")
    print(f"\n[2/3] Writing entity markdown files ...")
    cmaps = agg["chunk_maps"]
    # One writer per entity kind; the order here is also the report order.
    writers = (
        ("people", write_person),
        ("organizations", write_org),
        ("locations", write_location),
        ("events", write_event),
        ("uap_objects", write_uap_object),
    )
    written = {}
    for kind, writer in writers:
        written[kind] = 0
        for b in agg[kind].values():
            writer(b, cmaps)
            written[kind] += 1
    for k, n in written.items(): print(f"      {k}: {n}")
    print(f"\n[3/3] Saving relations index to wiki/entities/_relations.json (for downstream ingest)")
    rels_path = ENT / "_relations.json"
    # The entity writers only create per-type subdirectories, and only when they
    # write something; make sure ENT itself exists so an empty aggregation does
    # not fail at this step.
    rels_path.parent.mkdir(parents=True, exist_ok=True)
    rels_path.write_text(json.dumps({
        "schema_version": SCHEMA_VERSION,
        "rebuilt_at": NOW,
        "count": len(agg["relations"]),
        "relations": agg["relations"],
    }, indent=2, ensure_ascii=False), encoding="utf-8")
    print(f"      saved {len(agg['relations'])} relations to {rels_path}")
    print(f"\n✓ done.")
 if __name__ == "__main__":
    main()
--- a/scripts/synthesize/31_aggregate_pages_from_chunks.py
+++ b/scripts/synthesize/31_aggregate_pages_from_chunks.py
@ -0,0 +1,212 @@
 #!/usr/bin/env python3
 """
 31_aggregate_pages_from_chunks.py — Generate thin wiki/pages/<doc>/p<NNN>.md
 files for pages where the chunks/ already have content but the per-page vision
 pipeline (02-vision-page.py) never produced an aggregator file.
 Source of truth: raw/<doc>--subagent/_index.json + chunks/c*.md (Sonnet-extracted)
 Output:          wiki/pages/<doc>/p<NNN>.md (thin aggregator, tagged source:chunk-aggregator)
 Skips pages that already have a wiki/pages/<doc>/p<NNN>.md file (idempotent).
 Run:
  python3 scripts/synthesize/31_aggregate_pages_from_chunks.py
  python3 scripts/synthesize/31_aggregate_pages_from_chunks.py --doc-id <id>   # one doc
  python3 scripts/synthesize/31_aggregate_pages_from_chunks.py --dry-run
 """
 from __future__ import annotations
 import argparse
 import json
 import re
 import sys
 from collections import defaultdict
 from datetime import datetime, timezone
 from pathlib import Path
 UFO = Path("/Users/guto/ufo")
 RAW = UFO / "raw"
 PNG_BASE = UFO / "processing" / "png"
 PAGES_BASE = UFO / "wiki" / "pages"
 SCHEMA_VERSION = "0.1.0"
 WIKI_VERSION = "0.1.0"
 NOW = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
 def split_frontmatter(text: str) -> tuple[dict, str]:
    """Split a chunk file into a flat front-matter dict and the remaining body.

    Only top-level ``key: value`` lines are parsed (keys matching ``[a-zA-Z_]+``).
    Values are kept as raw stripped strings: a YAML ``null`` stays the string
    ``"null"`` and surrounding quotes are not removed. Indented (nested) lines
    are ignored.

    The closing delimiter is the first ``---`` that starts a line, so a value
    that merely contains ``---`` (for example a quoted separator) does not
    truncate the front matter. If no line-anchored delimiter exists, the first
    ``---`` anywhere after the opening one is used instead.

    Returns:
        ``(front_matter, body)``; ``({}, text)`` when ``text`` does not start
        with ``---`` or has no closing delimiter. ``body`` is everything after
        the closing ``---``, including the newline that follows it.
    """
    if not text.startswith("---"):
        return {}, text
    rest = text[3:]
    closing = re.search(r"^---", rest, re.M)
    if closing:
        header, body = rest[:closing.start()], rest[closing.end():]
    else:
        # No delimiter at the start of a line: fall back to the first inline one.
        pieces = rest.split("---", 1)
        if len(pieces) < 2:
            return {}, text
        header, body = pieces
    fm: dict = {}
    for line in header.splitlines():
        m = re.match(r"^([a-zA-Z_]+):\s*(.*)$", line)
        if m:
            fm[m.group(1)] = m.group(2).strip()
    return fm, body
 def extract_bilingual(body: str) -> tuple[str, str]:
    """Pull the ``**EN:**`` and ``**PT-BR:**`` sections out of a chunk body.

    Returns ``(en, pt_br)``, each stripped; a missing section yields ``""``.
    """
    def _capture(pattern: str) -> str:
        hit = re.search(pattern, body, re.S)
        return hit.group(1).strip() if hit else ""

    # The EN section ends at a PT-BR marker on a following line, or at end of text.
    english = _capture(r"\*\*EN:\*\*\s*(.*?)(?:\n\s*\*\*PT-BR:\*\*|\Z)")
    portuguese = _capture(r"\*\*PT-BR:\*\*\s*(.*?)\Z")
    return english, portuguese
 def find_missing_pages() -> dict[str, list[int]]:
    """Map each doc id to the sorted pages that have a PNG but no wiki page file.

    A page counts as missing when ``PNG_BASE/<doc>/p-<N>.png`` exists and
    ``PAGES_BASE/<doc>/p<NNN>.md`` is not a file. Docs with nothing missing
    are omitted.
    """
    page_re = re.compile(r"p-(\d+)\.png$")
    gaps: dict[str, list[int]] = {}
    for png in PNG_BASE.glob("*/p-*.png"):
        hit = page_re.match(png.name)
        if hit is None:
            continue
        number = int(hit.group(1))
        doc = png.parent.name
        if (PAGES_BASE / doc / f"p{number:03d}.md").is_file():
            continue
        gaps.setdefault(doc, []).append(number)
    return {doc: sorted(numbers) for doc, numbers in gaps.items()}
 def build_page_md(doc_id: str, page_num: int) -> str | None:
    """Assemble one page markdown file from the doc's ``_index.json`` and ``chunks/``.

    Args:
        doc_id: Document id; the index is read from ``RAW/<doc_id>--subagent/_index.json``.
        page_num: Page number to aggregate.

    Returns:
        The full page markdown (YAML front matter plus one block per chunk), or
        ``None`` when the index file is missing or lists no chunks for the page.
    """
    def _fm_text(chunk_fm: dict, key: str) -> str:
        # split_frontmatter keeps raw strings, so a YAML null arrives as the
        # literal text "null"; treat it (and "~") as empty instead of content.
        raw = (chunk_fm.get(key) or "").strip()
        if raw in ("null", "~"):
            return ""
        return raw.strip('"\'')

    sub = RAW / f"{doc_id}--subagent"
    idx_path = sub / "_index.json"
    if not idx_path.is_file(): return None
    idx = json.loads(idx_path.read_text(encoding="utf-8"))
    chunks_for_page = [c for c in (idx.get("chunks") or []) if c.get("page") == page_num]
    if not chunks_for_page:
        return None  # no chunk data → can't aggregate
    chunks_for_page.sort(key=lambda x: x.get("order_in_page", 0))
    total_pages = idx.get("total_pages")
    rel_png = f"../../../processing/png/{doc_id}/p-{page_num:03d}.png"
    # Aggregate per-chunk EN/PT/metadata
    body_blocks: list[str] = []
    types_seen: set[str] = set()
    chunk_ids: list[str] = []
    has_redaction = has_image = has_table = has_stamp = has_signature = False
    classifications: set[str] = set()
    for c in chunks_for_page:
        cid = c.get("chunk_id")
        chunk_ids.append(cid)
        ctype = c.get("type") or "?"
        types_seen.add(ctype)
        chunk_path = sub / "chunks" / f"{cid}.md"
        # The chunk stays listed in chunks_on_page even when its file is absent.
        if not chunk_path.is_file(): continue
        text = chunk_path.read_text(encoding="utf-8")
        chunk_fm, chunk_body = split_frontmatter(text)
        en, pt = extract_bilingual(chunk_body)
        if not en and not pt:
            # fall back to extracted_text / image_description fields
            en = _fm_text(chunk_fm, "image_description_en") or _fm_text(chunk_fm, "extracted_text")
            pt = _fm_text(chunk_fm, "image_description_pt_br")
        # Heuristic flags
        if ctype in ("redaction", "redacted_block"): has_redaction = True
        if "image" in ctype or "photo" in ctype or "diagram" in ctype or "sketch" in ctype or "map" in ctype:
            has_image = True
        if "table" in ctype: has_table = True
        if "stamp" in ctype: has_stamp = True
        if "signature" in ctype: has_signature = True
        cls = chunk_fm.get("classification")
        if cls and cls != "null": classifications.add(cls)
        # Body block
        block = f"### Chunk `{cid}` — type: {ctype}\n"
        bbox = c.get("bbox") or {}
        if bbox:
            block += f"_bbox_: x={bbox.get('x')}, y={bbox.get('y')}, w={bbox.get('w')}, h={bbox.get('h')}\n\n"
        if en:    block += f"**EN:** {en}\n\n"
        if pt:    block += f"**PT-BR:** {pt}\n"
        body_blocks.append(block.rstrip())
    # Content classification
    content_class = []
    if has_image: content_class.append("contains-photos")
    if has_table: content_class.append("contains-tables")
    if has_stamp: content_class.append("contains-stamps")
    if has_signature: content_class.append("contains-signatures")
    if has_redaction: content_class.append("redaction-heavy")
    if not content_class: content_class.append("text-only")
    # Page-level inferred type (best-effort)
    if "classification_banner" in types_seen and len(types_seen) <= 3:
        page_type = "cover"
    elif "header" in types_seen and "transcript_block" in types_seen:
        page_type = "transcript"
    elif has_table and not body_blocks:
        page_type = "table_only"
    elif "letterhead" in types_seen:
        page_type = "memo"
    else:
        page_type = "mixed"
    # Frontmatter
    page_fm = {
        "schema_version": SCHEMA_VERSION,
        "type": "page",
        "page_id": f"{doc_id}/p{page_num:03d}",
        "doc_id": doc_id,
        "page_number": page_num,
        "total_pages": total_pages,
        "png_path": rel_png,
        "page_type": page_type,
        "content_classification": content_class,
        "classification_markings": [{"level": c} for c in sorted(classifications)] if classifications else [],
        "chunks_on_page": chunk_ids,
        "chunk_count": len(chunk_ids),
        "source": "chunk-aggregator",
        "source_note": "Page-md generated from chunks built by Sonnet vision (raw/<doc>--subagent/chunks/). Per-page vision Haiku pipeline (02-vision-page.py) never produced an output for this page.",
        "last_ingest": NOW,
        "wiki_version": WIKI_VERSION,
    }
    import yaml
    yaml_block = yaml.safe_dump(page_fm, sort_keys=False, allow_unicode=True,
                                default_flow_style=False, width=10_000).rstrip()
    body = "\n\n".join(body_blocks) if body_blocks else "_(no extractable text — see chunk files directly)_"
    return f"---\n{yaml_block}\n---\n\n# Page {page_num} of {doc_id}\n\n{body}\n"
 def main():
    """CLI entry point: inventory pages without a wiki page file and generate them from chunks.

    Flags: ``--doc-id`` restricts the run to one document; ``--dry-run`` only
    prints the per-document inventory. Returns 0, used as the exit status.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--doc-id", default=None)
    parser.add_argument("--dry-run", action="store_true")
    opts = parser.parse_args()

    missing = find_missing_pages()
    if opts.doc_id:
        missing = {opts.doc_id: missing.get(opts.doc_id, [])}
    docs_with_gaps = sum(1 for pages in missing.values() if pages)
    total_missing = sum(len(pages) for pages in missing.values())
    print(f"[1/2] Inventory: {docs_with_gaps} docs, {total_missing} missing pages")

    if opts.dry_run:
        # Largest backlog first.
        for doc, pages in sorted(missing.items(), key=lambda kv: -len(kv[1])):
            if pages:
                print(f"  {doc}: {len(pages)}")
        return 0

    print(f"\n[2/2] Generating thin aggregator page.md files ...")
    written = 0
    skipped_no_chunks = 0
    for doc, pages in missing.items():
        for number in pages:
            markdown = build_page_md(doc, number)
            if markdown is None:
                skipped_no_chunks += 1
                continue
            target = PAGES_BASE / doc / f"p{number:03d}.md"
            target.parent.mkdir(parents=True, exist_ok=True)
            target.write_text(markdown, encoding="utf-8")
            written += 1
    print(f"      written: {written}")
    print(f"      skipped (no chunk data): {skipped_no_chunks}")
    print(f"\n✓ done.")
    return 0
 if __name__ == "__main__":
    sys.exit(main())
--- a/scripts/synthesize/32_reprocess_missing_pages.py
+++ b/scripts/synthesize/32_reprocess_missing_pages.py
@ -0,0 +1,375 @@
 #!/usr/bin/env python3
 """
 32_reprocess_missing_pages.py — Reprocess pages that the doc-rebuilder agent
 silently dropped due to context-window overflow.
 For each doc:
  1. Read raw/<doc>--subagent/_index.json (current chunk inventory)
  2. Find missing pages: PNGs that exist but have no chunks
  3. For each missing page, call `claude -p --model sonnet` with the page PNG
     and ask for a chunks JSON (matching the page-rebuilder schema)
  4. Append new chunks to _index.json with continued global IDs (chunk_id, order_global)
  5. Write new chunks/c<NNNN>.md files
 Idempotent — re-running skips pages already processed.
 Uses WORKERS=2 to avoid hammering OAuth rate limits.
 Usage:
  python3 scripts/synthesize/32_reprocess_missing_pages.py --dry-run
  python3 scripts/synthesize/32_reprocess_missing_pages.py --doc-id <id>
  WORKERS=2 python3 scripts/synthesize/32_reprocess_missing_pages.py
 """
 from __future__ import annotations
 import argparse
 import json
 import os
 import re
 import subprocess
 import sys
 import tempfile
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import datetime, timezone
 from pathlib import Path
 UFO = Path("/Users/guto/ufo")
 RAW = UFO / "raw"
 PNG_BASE = UFO / "processing" / "png"
 NOW = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
 SONNET_MODEL = "sonnet"
 WORKERS = int(os.environ.get("WORKERS", "2"))
 PROMPT_TEMPLATE = """You are a page-rebuilder for a declassified UAP/UFO government document.
 You will receive the path of ONE page image. Read it with the Read tool, then analyze it carefully and extract ALL content as structured chunks.
 DOCUMENT_ID: {doc_id}
 PAGE_NUMBER: {page_num}
 PNG_PATH: {png_path}
 Return ONE JSON object with this exact structure (no markdown fence, no preamble, no postscript):
 {{
  "page_number": {page_num},
  "chunks": [
    {{
      "order_in_page": 1,
      "type": "<chunk_type>",
      "content_en": "<English verbatim text or visual description>",
      "content_pt_br": "<Brazilian Portuguese translation>",
      "bbox": {{"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}},
      "classification": null,
      "formatting": [],
      "cross_page_hint": "self_contained",
      "ocr_confidence": 0.85,
      "redaction_code": null,
      "redaction_inferred_content_type": null,
      "image_type": null,
      "ufo_anomaly_detected": false,
      "ufo_anomaly_type": null,
      "ufo_anomaly_rationale": null,
      "cryptid_anomaly_detected": false,
      "cryptid_anomaly_type": null,
      "cryptid_anomaly_rationale": null,
      "image_description_en": null,
      "image_description_pt_br": null,
      "extracted_text": null
    }}
  ]
 }}
 CHUNK TYPES (use only these): letterhead, classification_banner, header, subheader, paragraph, list_item, caption, footnote, page_number, signature_block, stamp, redaction_block, image, table_marker, form_field, watermark, separator, blank, annotation, transcript_block.
 RULES:
 1. Extract EVERY element on the page — nothing is skipped.
 2. bbox is normalized coords (0.0..1.0) relative to the page image.
 3. content_en is verbatim OCR text for text chunks; for images, describe what you see.
 4. content_pt_br is Brazilian Portuguese (NOT European Portuguese). Preserve UTF-8 accents.
 5. For redactions: type="redaction_block", content_en="[REDACTED]", content_pt_br="[REDACTADO]".
 6. For images/photos/diagrams/sketches/maps: type="image", describe in image_description_en/pt_br.
 7. For stamps: type="stamp".
 8. classification: extract markings if visible ("SECRET", "CONFIDENTIAL", "UNCLASSIFIED", etc.).
 9. formatting: any of ["bold", "italic", "underline", "all_caps", "handwritten", "typewritten"].
 10. cross_page_hint: "self_contained" | "continues_to_next" | "continues_from_prev".
 11. ufo_anomaly_detected: true if chunk contains UAP/UFO sighting data, coordinates, witness accounts, anomalous phenomena.
 12. If page is truly blank: return one chunk with type="blank".
 13. Order chunks top-to-bottom, left-to-right.
 Return ONLY the JSON. No markdown. No commentary.
 """
 DISALLOWED = (
    "AskUserQuestion,Bash,Edit,Write,Task,Glob,Grep,"
    "TaskCreate,TaskUpdate,TaskList,TaskGet,TaskStop,TaskOutput,"
    "Skill,ScheduleWakeup,Monitor,WebSearch,WebFetch,NotebookEdit,"
    "EnterPlanMode,ExitPlanMode,EnterWorktree,ExitWorktree,"
    "CronCreate,CronDelete,CronList,RemoteTrigger,ToolSearch,"
    "PushNotification,ListMcpResourcesTool,ReadMcpResourceTool,"
    "ShareOnboardingGuide"
 )  # NOTE: Read is allowed (we need vision)
 def extract_json_block(s: str) -> str:
    """Trim model output down to the outermost ``{...}`` span.

    If the stripped text starts with a code fence, every line beginning with a
    fence marker is dropped first. When no opening brace is followed by a later
    closing brace, the stripped (and de-fenced) text is returned unchanged.
    """
    text = s.strip()
    if text.startswith("```"):
        kept = [ln for ln in text.splitlines() if not ln.startswith("```")]
        text = "\n".join(kept).strip()
    first = text.find("{")
    last = text.rfind("}")
    if first < 0 or last <= first:
        return text
    return text[first:last + 1]
 def call_sonnet_vision(doc_id: str, page_num: int) -> dict | None:
    """Ask ``claude -p`` to chunk one page image and return the parsed JSON object.

    Args:
        doc_id: Document id; the image is ``PNG_BASE/<doc_id>/p-<NNN>.png``.
        page_num: Page number, zero-padded to three digits in the file name.

    Returns:
        The decoded JSON object, or ``None`` when the PNG is missing, the CLI
        times out or exits non-zero, or the reply is not a JSON object. Each
        failure is reported on stderr.
    """
    png_path = PNG_BASE / doc_id / f"p-{page_num:03d}.png"
    if not png_path.is_file(): return None
    prompt = PROMPT_TEMPLATE.format(doc_id=doc_id, page_num=page_num, png_path=str(png_path))
    env = {**os.environ, "CLAUDE_CODE_MAX_OUTPUT_TOKENS": "32000"}
    tag = f"{doc_id} p{page_num:03d}"
    # stdout is collected in a temp file rather than a pipe; TemporaryFile
    # removes itself on close, so nothing is left behind on any exit path.
    with tempfile.TemporaryFile() as out_f:
        try:
            r = subprocess.run(
                ["claude", "-p", "--model", SONNET_MODEL, "--output-format", "text",
                 "--disallowed-tools", DISALLOWED],
                input=prompt.encode("utf-8"),
                stdout=out_f, stderr=subprocess.PIPE, env=env,
                timeout=300,
            )
        except subprocess.TimeoutExpired:
            # A hung call is a per-page failure like any other, not a crash.
            print(f"  [TIME] {tag} — claude timed out after 300s", file=sys.stderr)
            return None
        if r.returncode != 0:
            print(f"  [FAIL] {doc_id} p{page_num:03d} — claude rc={r.returncode}: {r.stderr.decode('utf-8', errors='replace')[:300]}", file=sys.stderr)
            return None
        out_f.seek(0)
        raw = out_f.read().decode("utf-8", errors="replace")
    js = extract_json_block(raw)
    try:
        parsed = json.loads(js)
    except json.JSONDecodeError as e:
        print(f"  [JSON] {doc_id} p{page_num:03d} — {e} | raw_len={len(raw)}", file=sys.stderr)
        return None
    if not isinstance(parsed, dict):
        # Callers use .get() on the result; a bare list or scalar is unusable.
        print(f"  [JSON] {tag} — expected a JSON object, got {type(parsed).__name__}", file=sys.stderr)
        return None
    return parsed
 def find_missing_pages_per_doc() -> dict[str, list[int]]:
    """For each doc, list the pages that have a PNG but no chunk in ``_index.json``.

    Page counts are read with ``pdfinfo`` for every ``RAW/*.pdf`` and keyed by a
    slug of the file stem. When a count is known for a doc, PNG pages numbered
    above it are ignored (this is how the Poppler phantom page is excluded);
    otherwise every PNG page is considered. Docs without a readable
    ``_index.json`` are skipped.

    Returns:
        ``{doc_id: [page, ...]}`` for docs with at least one missing page.
    """
    import unicodedata
    result: dict[str, list[int]] = {}
    # Try to map pdf_pages by exact filename matching
    pdf_pages_map: dict[str, int] = {}
    for p in RAW.glob("*.pdf"):
        try:
            out = subprocess.run(["pdfinfo", str(p)], capture_output=True, text=True, timeout=30).stdout
            m = re.search(r"Pages:\s+(\d+)", out)
            if not m:
                continue
            # filename → doc_id (same algorithm as page-rebuilder did)
            nfd = unicodedata.normalize("NFD", p.stem)
            ascii_str = "".join(c for c in nfd if not unicodedata.combining(c))
            slug = re.sub(r"-+", "-", re.sub(r"[^a-z0-9-]", "-", ascii_str.lower())).strip("-")
            if slug and slug[0].isdigit(): slug = "doc-" + slug
            pdf_pages_map[slug] = int(m.group(1))
        except Exception:
            # Best-effort: without a page count the doc falls back to its last PNG.
            continue
    for png_dir in PNG_BASE.glob("*/"):
        doc_id = png_dir.name
        matches = (re.match(r"p-(\d+)\.png", p.name) for p in png_dir.glob("p-*.png"))
        pngs = sorted(int(m.group(1)) for m in matches if m)
        if not pngs: continue
        idx_path = RAW / f"{doc_id}--subagent" / "_index.json"
        if not idx_path.is_file(): continue
        try:
            idx = json.loads(idx_path.read_text(encoding="utf-8"))
        except Exception: continue
        # `or []` also covers an index whose "chunks" key is present but null.
        pages_in_chunks = {c.get("page") for c in (idx.get("chunks") or []) if c.get("page")}
        # Filter: only pages 1..pdf_pages (avoid Poppler phantom)
        pdf_pages = pdf_pages_map.get(doc_id)
        upper_bound = pdf_pages if pdf_pages else pngs[-1]
        missing = [p for p in pngs if p <= upper_bound and p not in pages_in_chunks]
        if missing: result[doc_id] = missing
    return result
 def render_chunk_md(chunk: dict) -> str:
    """Render a chunk dict to the chunk.md format.

    The front matter holds the known keys present in ``chunk``, in a fixed
    order. The body comes from the private ``_body_en`` / ``_body_pt`` entries
    and is omitted when both are empty. ``chunk`` is read only, never modified.
    """
    import yaml
    # Read rather than pop, so the caller's dict is left intact; a None body
    # renders as empty text instead of the literal "None".
    body_en = chunk.get("_body_en") or ""
    body_pt = chunk.get("_body_pt") or ""
    # YAML keys in stable order
    fm_keys = [
        "chunk_id", "type", "page", "order_in_page", "order_global", "bbox",
        "classification", "formatting", "cross_page_hint", "prev_chunk", "next_chunk",
        "related_image", "related_table", "ocr_confidence", "ocr_source_lines",
        "redaction_code", "redaction_inferred_content_type", "image_type",
        "ufo_anomaly_detected", "cryptid_anomaly_detected",
        "ufo_anomaly_type", "ufo_anomaly_rationale",
        "cryptid_anomaly_type", "cryptid_anomaly_rationale",
        "image_description_en", "image_description_pt_br", "extracted_text",
        "source_png",
    ]
    fm = {k: chunk[k] for k in fm_keys if k in chunk}
    yaml_block = yaml.safe_dump(fm, sort_keys=False, allow_unicode=True,
                                default_flow_style=False, width=10_000).rstrip()
    body = f"**EN:** {body_en}\n\n**PT-BR:** {body_pt}\n" if (body_en or body_pt) else ""
    return f"---\n{yaml_block}\n---\n\n{body}"
 def integrate_page_chunks(doc_id: str, page_num: int, page_result: dict, idx: dict) -> int:
    """Append one page's new chunks to ``idx`` and write their chunk .md files.

    Args:
        doc_id: Document id; files go to ``RAW/<doc_id>--subagent/chunks/``.
        page_num: Page the chunks belong to.
        page_result: Parsed model reply; its ``chunks`` list is integrated.
        idx: The doc's loaded ``_index.json``. It is extended in place, and the
            caller is responsible for saving it.

    Returns:
        Number of chunks added (0 when ``page_result`` has no chunks).
    """
    chunks = page_result.get("chunks") or []
    if not chunks: return 0
    sub = RAW / f"{doc_id}--subagent"
    chunks_dir = sub / "chunks"
    chunks_dir.mkdir(exist_ok=True)
    existing = idx.get("chunks") or []
    # New chunks continue after BOTH the highest order_global and the highest
    # numeric chunk_id, so an index where the two have drifted apart can never
    # have one of its existing chunk files overwritten.
    last_order = max((c.get("order_global") or 0 for c in existing), default=0)
    id_hits = (re.fullmatch(r"c(\d+)", str(c.get("chunk_id") or "")) for c in existing)
    last_id = max((int(m.group(1)) for m in id_hits if m), default=0)
    first_num = max(last_order, last_id) + 1
    new_ids = [f"c{first_num + offset:04d}" for offset in range(len(chunks))]
    rel_png = f"../../processing/png/{doc_id}/p-{page_num:03d}.png"
    added = 0
    new_index_entries = []
    for i, c in enumerate(chunks, 1):
        num = first_num + i - 1
        cid = new_ids[i - 1]
        ctype = c.get("type") or "paragraph"
        en = c.get("content_en") or ""
        pt = c.get("content_pt_br") or ""
        # Only a missing confidence gets the default; a reported 0.0 is kept.
        confidence = c.get("ocr_confidence")
        if confidence is None:
            confidence = 0.85
        entry = {
            "chunk_id": cid,
            "type": ctype,
            "page": page_num,
            "order_in_page": c.get("order_in_page") or i,
            "order_global": num,
            "file": f"chunks/{cid}.md",
            "bbox": c.get("bbox") or {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
            "preview": (en or pt or "")[:120],
        }
        new_index_entries.append(entry)
        chunk_dict = {
            "chunk_id": cid,
            "type": ctype,
            "page": page_num,
            "order_in_page": entry["order_in_page"],
            "order_global": num,
            "bbox": entry["bbox"],
            "classification": c.get("classification"),
            "formatting": c.get("formatting") or [],
            "cross_page_hint": c.get("cross_page_hint") or "self_contained",
            "prev_chunk": f"c{num-1:04d}" if num > 1 else None,
            # Link forward inside this batch; the last new chunk has no successor.
            "next_chunk": new_ids[i] if i < len(new_ids) else None,
            "related_image": None,
            "related_table": None,
            "ocr_confidence": confidence,
            "ocr_source_lines": [],
            "redaction_code": c.get("redaction_code"),
            "redaction_inferred_content_type": c.get("redaction_inferred_content_type"),
            "image_type": c.get("image_type"),
            "ufo_anomaly_detected": bool(c.get("ufo_anomaly_detected")),
            "cryptid_anomaly_detected": bool(c.get("cryptid_anomaly_detected")),
            "ufo_anomaly_type": c.get("ufo_anomaly_type"),
            "ufo_anomaly_rationale": c.get("ufo_anomaly_rationale"),
            "cryptid_anomaly_type": c.get("cryptid_anomaly_type"),
            "cryptid_anomaly_rationale": c.get("cryptid_anomaly_rationale"),
            "image_description_en": c.get("image_description_en"),
            "image_description_pt_br": c.get("image_description_pt_br"),
            "extracted_text": c.get("extracted_text"),
            "source_png": rel_png,
            "_body_en": en, "_body_pt": pt,
        }
        (chunks_dir / f"{cid}.md").write_text(render_chunk_md(chunk_dict), encoding="utf-8")
        added += 1
    # An index whose "chunks" key is missing or null gets a fresh list.
    if idx.get("chunks") is None:
        idx["chunks"] = []
    idx["chunks"].extend(new_index_entries)
    return added
 import threading
 # One lock per doc_id (only contended when 2+ workers process pages of same doc)
 _doc_locks: dict[str, threading.Lock] = {}
 _locks_mutex = threading.Lock()
 def _doc_lock(doc_id: str) -> threading.Lock:
    with _locks_mutex:
        if doc_id not in _doc_locks: _doc_locks[doc_id] = threading.Lock()
        return _doc_locks[doc_id]
 def process_one_page(doc_id: str, page_num: int) -> tuple[bool, int]:
    """Chunk one page with the vision model and record the result in ``_index.json``.

    The model call runs outside the lock. The read, update and write of the
    index happen while holding the document's lock.

    Returns:
        ``(ok, chunks_added)``. ``ok`` is False when the model gave no result,
        the page is already in the index, or integration raised.
    """
    tag = f"{doc_id} p{page_num:03d}"
    page_result = call_sonnet_vision(doc_id, page_num)
    if not page_result:
        print(f"  [SKIP] {tag} — no result", flush=True)
        return (False, 0)
    index_file = RAW / f"{doc_id}--subagent" / "_index.json"
    with _doc_lock(doc_id):
        index = json.loads(index_file.read_text(encoding="utf-8"))
        # Idempotent: the page may have been integrated in the meantime.
        known_pages = (entry.get("page") for entry in index.get("chunks") or [])
        if any(page == page_num for page in known_pages):
            print(f"  [SKIP] {tag} — already present", flush=True)
            return (False, 0)
        try:
            added = integrate_page_chunks(doc_id, page_num, page_result, index)
        except Exception as exc:
            print(f"  [ERR ] {tag} — integrate: {exc}", flush=True)
            return (False, 0)
        serialized = json.dumps(index, indent=2, ensure_ascii=False)
        index_file.write_text(serialized, encoding="utf-8")
    print(f"  [OK  ] {tag} — {added} chunks", flush=True)
    return (True, added)
 def main():
    """CLI entry point: find pages without chunks and reprocess them in parallel.

    Flags: ``--doc-id`` restricts the run to one document; ``--page`` (only
    valid together with ``--doc-id``) forces a single page; ``--dry-run`` prints
    the inventory without calling the model. Returns 0, used as the exit status.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--doc-id", default=None)
    ap.add_argument("--page", type=int, default=None, help="single page for testing")
    ap.add_argument("--dry-run", action="store_true")
    args = ap.parse_args()
    if args.page is not None and not args.doc_id:
        # Without this check --page was silently ignored and the script went on
        # to reprocess every missing page of every document.
        ap.error("--page requires --doc-id")
    missing = find_missing_pages_per_doc()
    if args.doc_id:
        missing = {args.doc_id: missing.get(args.doc_id, [])}
    if args.page is not None and args.doc_id:
        missing = {args.doc_id: [args.page]}
    # Flatten (doc, page) job list — page-level parallelism
    jobs: list[tuple[str, int]] = [(d, p) for d, ps in missing.items() for p in ps]
    total = len(jobs)
    print(f"[1/2] {len(missing)} docs · {total} page-jobs")
    if args.dry_run:
        for d, ps in sorted(missing.items(), key=lambda kv: -len(kv[1])):
            if ps: print(f"  {d}: {len(ps)} pages → {ps[:5]}{'...' if len(ps)>5 else ''}")
        return 0
    if total == 0:
        print("Nothing to do.")
        return 0
    print(f"\n[2/2] Processing with WORKERS={WORKERS} (page-level parallel) ...")
    pages_done = chunks_added = 0
    completed = 0
    with ThreadPoolExecutor(max_workers=WORKERS) as pool:
        futs = {pool.submit(process_one_page, d, p): (d, p) for d, p in jobs}
        for fut in as_completed(futs):
            d, p = futs[fut]
            completed += 1
            try:
                ok, n = fut.result()
                if ok:
                    pages_done += 1
                    chunks_added += n
            except Exception as e:
                # One failed page must not abort the rest of the batch.
                print(f"  [ERR ] {d} p{p:03d}: {e}", flush=True)
            if completed % 25 == 0:
                print(f"  ... [progress] {completed}/{total}  pages_done={pages_done}  chunks={chunks_added}", flush=True)
    print(f"\n✓ {pages_done}/{total} pages processed, {chunks_added} new chunks.")
    return 0
 if __name__ == "__main__":
    sys.exit(main())
--- a/web/app/api/relations/route.ts
+++ b/web/app/api/relations/route.ts
@ -0,0 +1,60 @@
 /**
 * /api/relations — read typed relations for an entity.
 *
 * GET /api/relations?class=person&id=j-edgar-hoover
 *   → Returns relations where this entity is source OR target, split by
 *     direction into `outgoing` and `incoming`. Each list is ordered by
 *     confidence, then relation_type, and capped at 200 rows.
 */
 import { NextRequest } from "next/server";
 import { pgQuery } from "@/lib/retrieval/db";
 export const runtime = "nodejs";
 export const dynamic = "force-dynamic";
 /** Build a JSON `Response` carrying `data` with the given HTTP status (200 by default). */
 function json(data: unknown, status = 200) {
  const body = JSON.stringify(data);
  const headers = { "content-type": "application/json" };
  return new Response(body, { status, headers });
 }
 /** One row of `public.relations`, with exactly the columns the GET handler selects. */
 interface Relation {
  source_class: string;
  source_id: string;
  relation_type: string;
  target_class: string;
  target_id: string;
  // The only column typed as nullable.
  evidence_ref: string | null;
  // NOTE(review): typed as string, yet GET sorts with `ORDER BY confidence DESC`.
  // Confirm whether the column holds labels or numeric text, since that decides the sort order.
  confidence: string;
 }
 /**
  * Handle `GET /api/relations?class=…&id=…`.
  *
  * Responds 400 when `class` or `id` is missing or empty; otherwise returns
  * `{ outgoing, incoming }`, where the entity is the source or the target
  * respectively (at most 200 rows each). Any failure while querying yields a
  * 503 with `error: "db_unavailable"` and the underlying message.
  */
 export async function GET(req: NextRequest) {
  const u = new URL(req.url);
  const cls = u.searchParams.get("class") ?? "";
  const id = u.searchParams.get("id") ?? "";
  if (!cls || !id) return json({ error: "class and id required" }, 400);
  const outgoingSql =
      `SELECT source_class, source_id, relation_type, target_class, target_id,
              evidence_ref, confidence
       FROM public.relations
       WHERE source_class = $1 AND source_id = $2
       ORDER BY confidence DESC, relation_type, target_class, target_id
       LIMIT 200`;
  const incomingSql =
      `SELECT source_class, source_id, relation_type, target_class, target_id,
              evidence_ref, confidence
       FROM public.relations
       WHERE target_class = $1 AND target_id = $2
       ORDER BY confidence DESC, relation_type, source_class, source_id
       LIMIT 200`;
  try {
    // The two lookups are independent, so issue them together instead of
    // waiting for the first before starting the second.
    const [outgoing, incoming] = await Promise.all([
      pgQuery<Relation>(outgoingSql, [cls, id]),
      pgQuery<Relation>(incomingSql, [cls, id]),
    ]);
    return json({ outgoing, incoming });
  } catch (e: unknown) {
    // Anything can be thrown; only read `.message` from real Error objects.
    const message = e instanceof Error ? e.message : String(e);
    return json({ error: "db_unavailable", message }, 503);
  }
 }
--- a/web/app/api/timeline/route.ts
+++ b/web/app/api/timeline/route.ts
@ -103,6 +103,8 @@ export async function GET(req: Request) {
      ) as TimelineEntry["summary_status"];
      // Default: hide events without a real narrative.
      if (!includeUnsynthesized && summary_status === "none") continue;
      // Always hide generic concept-entities (categories, not real events).
      if (fm.is_generic === true) continue;
      if (q && !canonical.toLowerCase().includes(q) && !narrative.toLowerCase().includes(q)) {
        continue;
      }
--- a/web/app/d/[docId]/[page]/page.tsx
+++ b/web/app/d/[docId]/[page]/page.tsx
@ -53,8 +53,6 @@ export default async function DocPageView({
  if (!idx) notFound();
  const pageChunks = byPage.get(pageNum) ?? [];
  if (pageChunks.length === 0) notFound();
  const pngUrl = `/api/static/processing/png/${docId}/p-${m[1]}.png`;
  const totalPages = idx.total_pages;
@ -123,7 +121,18 @@ export default async function DocPageView({
          <h2 className="font-mono text-xs uppercase tracking-widest text-[#7fdbff] mb-2">
            trechos (ordem de leitura)
          </h2>
-          <DocRendererV2 docId={docId} chunksByPage={[[pageNum, pageChunks]]} />
+          {pageChunks.length === 0 ? (
            <div className="border border-[rgba(0,255,156,0.15)] bg-[#0a121e] rounded p-6 text-sm text-[#c8d4e6]">
              <p className="font-mono text-[#7fdbff] mb-2">▍ página sem trechos extraídos</p>
              <p className="text-[#5a6678] text-xs">
                O scan existe (veja à esquerda) mas o processo de chunking não gerou trechos
                para esta página específica. Pode ser página em branco, divisor de seção
                ou conteúdo sem texto extraível. Próxima execução do chunker preencherá.
              </p>
            </div>
          ) : (
            <DocRendererV2 docId={docId} chunksByPage={[[pageNum, pageChunks]]} />
          )}
        </article>
      </div>
--- a/web/app/e/[cls]/[id]/page.tsx
+++ b/web/app/e/[cls]/[id]/page.tsx
@ -10,6 +10,7 @@ import { MarkdownBody } from "@/components/markdown-body";
 import { ChatBubble } from "@/components/chat-bubble";
 import { AuthBar } from "@/components/auth-bar";
 import { EntityGraphMini } from "@/components/entity-graph-mini";
 import { EntityRelations } from "@/components/entity-relations";
 import {
  getEntityCore,
  getEntityMentionsByDoc,
@ -101,7 +102,11 @@ export default async function EntityPage({
  const totalMentions = core?.total_mentions ?? 0;
  const documentsCount = core?.documents_count ?? 0;
  const strength = core?.signal_strength ?? "unverified";
-  const sigs = core?.signal_sources ?? { db_chunks: 0, page_refs: 0, cross_refs: 0 };
+  const sigs = core?.signal_sources ?? { db_chunks: 0, page_refs: 0, cross_refs: 0, text_refs: 0 };
  // Derived display class: orphan + curated narrative is not noise — it's a
  // knowledge-curated entity the corpus simply doesn't mention. Label it apart.
  const displayStrength: "strong" | "weak" | "curated" | "orphan" | "unverified" =
    strength === "orphan" && core?.summary_status === "curated" ? "curated" : (strength as any);
  const classColor = CLASS_COLOR[folder as EntityClass];
  const classBg = CLASS_BG[folder as EntityClass];
@ -167,42 +172,54 @@ export default async function EntityPage({
          )}
          <div
            className={`px-4 py-3 bg-[#0a121e] border rounded ${
-              strength === "strong"
+              displayStrength === "strong"
                ? "border-[#00ff9c]"
-                : strength === "weak"
+                : displayStrength === "weak"
                  ? "border-[#ffa500]"
-                  : strength === "orphan"
+                  : displayStrength === "curated"
-                    ? "border-[#ff6b6b]"
+                    ? "border-[#a78bfa]"
-                    : "border-[#5a6678]"
+                    : displayStrength === "orphan"
                      ? "border-[#ff6b6b]"
                      : "border-[#5a6678]"
            }`}
-            title="Cruzamento dos 3 sinais que confirmam esta entidade no corpus."
+            title="Cruzamento dos sinais que confirmam esta entidade no corpus."
          >
            <div className="font-mono text-[10px] uppercase tracking-widest text-[#5a6678]">
              força do sinal
            </div>
            <div
              className={`font-mono text-sm mt-0.5 ${
-                strength === "strong"
+                displayStrength === "strong"
                  ? "text-[#00ff9c]"
-                  : strength === "weak"
+                  : displayStrength === "weak"
                    ? "text-[#ffa500]"
-                    : strength === "orphan"
+                    : displayStrength === "curated"
-                      ? "text-[#ff6b6b]"
+                      ? "text-[#a78bfa]"
-                      : "text-[#8896aa]"
+                      : displayStrength === "orphan"
                        ? "text-[#ff6b6b]"
                        : "text-[#8896aa]"
              }`}
            >
-              {strength === "strong" && "forte"}
+              {displayStrength === "strong" && "forte"}
-              {strength === "weak" && "fraca"}
+              {displayStrength === "weak" && "fraca"}
-              {strength === "orphan" && "órfã"}
+              {displayStrength === "curated" && "curado"}
-              {strength === "unverified" && "não verificada"}
+              {displayStrength === "orphan" && "órfã"}
              {displayStrength === "unverified" && "não verificada"}
            </div>
            <div className="font-mono text-[9px] text-[#5a6678] mt-1 leading-tight">
-              {sigs.db_chunks} chunks · {sigs.page_refs} págs · {sigs.cross_refs} backlinks
+              {sigs.db_chunks} chunks · {sigs.page_refs} págs · {sigs.cross_refs} backlinks · {sigs.text_refs} textuais
            </div>
          </div>
        </div>
-        {strength === "orphan" && (
+        {strength === "orphan" && core?.summary_status === "curated" && (
          <p className="mt-4 text-xs text-[#a78bfa] font-mono leading-relaxed">
            📚 conhecimento curado · este evento/entidade faz parte do registro UAP/UFO
            mundial mas <strong>não foi mencionado</strong> nos PDFs deste corpus (war.gov/ufo).
            Narrativa abaixo vem de fonte curada manualmente, não de extração.
          </p>
        )}
        {strength === "orphan" && core?.summary_status !== "curated" && (
          <p className="mt-4 text-xs text-[#ff6b6b] font-mono">
            ⚠ entidade não confirmada: nenhuma página, chunk ou outra entidade aponta para
            ela. Pode ser extração ruidosa do pipeline original.
@ -287,10 +304,17 @@ export default async function EntityPage({
        <aside className="lg:sticky lg:top-6 lg:self-start space-y-6">
          <section>
            <h3 className="font-mono text-[10px] text-[#8896aa] uppercase tracking-widest mb-2">
-              Aparece em {documentsCount} documento(s)
+              Aparece em {mentionGroups.length} documento(s)
            </h3>
            {mentionGroups.length === 0 ? (
-              <p className="text-[#5a6678] text-xs italic">Sem dados de mention ainda.</p>
+              displayStrength === "curated" ? (
                <p className="text-[#a78bfa] text-xs italic leading-relaxed">
                  Não documentado nos PDFs deste corpus. Conteúdo abaixo vem de fonte
                  curada (registro UAP mundial), não de extração de documentos.
                </p>
              ) : (
                <p className="text-[#5a6678] text-xs italic">Sem dados de mention ainda.</p>
              )
            ) : (
              <ul className="space-y-1 max-h-[50vh] overflow-y-auto pr-1">
                {mentionGroups.map((m) => (
@ -303,6 +327,14 @@ export default async function EntityPage({
                        <span className="text-[#7fdbff] group-hover:text-[#00ff9c] truncate flex-1">
                          {m.canonical_title ?? m.doc_id}
                        </span>
                        {m.text_only && (
                          <span
                            title="Menção textual encontrada via back-fill (alias dentro do corpo narrativo); pipeline estruturado não pegou."
                            className="text-[9px] text-[#a78bfa] border border-[rgba(167,139,250,0.40)] px-1 rounded shrink-0"
                          >
                            texto
                          </span>
                        )}
                        <span className="text-[#00ff9c] tabular-nums shrink-0">{m.mention_count}×</span>
                      </div>
                      <div className="flex items-center gap-2 mt-0.5 font-mono text-[10px] text-[#5a6678]">
@ -318,6 +350,13 @@ export default async function EntityPage({
            )}
          </section>
          <section className="border-t border-[rgba(0,255,156,0.12)] pt-4">
            <h3 className="font-mono text-[10px] text-[#8896aa] uppercase tracking-widest mb-3">
              Relações tipadas
            </h3>
            <EntityRelations entityClass={entityClassSingular} entityId={id} />
          </section>
          <EntityGraphMini
            entityClassSingular={entityClassSingular}
            entityId={id}
--- a/web/app/e/[cls]/page.tsx
+++ b/web/app/e/[cls]/page.tsx
@ -46,7 +46,7 @@ interface EntityRow {
  enrichment_status: string | null;
 }
-async function listEntities(cls: EntityClass): Promise<EntityRow[]> {
+async function listEntities(cls: EntityClass, includeGeneric = false): Promise<EntityRow[]> {
  const dir = path.join(WIKI, "entities", cls);
  let files: string[] = [];
  try {
@ -59,6 +59,9 @@ async function listEntities(cls: EntityClass): Promise<EntityRow[]> {
    try {
      const raw = await fs.readFile(path.join(dir, f), "utf-8");
      const fm = matter(raw).data as Record<string, unknown>;
      // Hide generic concept-entities (e.g. "Flying disc sighting reports") —
      // they're categories, not real instances. Opt-in via ?include_generic=1.
      if (!includeGeneric && fm.is_generic === true) continue;
      rows.push({
        id: f.replace(/\.md$/, ""),
        canonical_name: String(fm.canonical_name ?? f.replace(/\.md$/, "")),
@ -77,13 +80,17 @@ async function listEntities(cls: EntityClass): Promise<EntityRow[]> {
 export default async function EntityListPage({
  params,
  searchParams,
 }: {
  params: Promise<{ cls: string }>;
  searchParams?: Promise<{ include_generic?: string }>;
 }) {
  const { cls } = await params;
  const sp = (await searchParams) ?? {};
  const includeGeneric = sp.include_generic === "1";
  const folder = classKeyToFolder(cls);
  if (!folder) notFound();
-  const entities = await listEntities(folder as EntityClass);
+  const entities = await listEntities(folder as EntityClass, includeGeneric);
  return (
    <main className="min-h-screen p-6 md:p-10 max-w-5xl mx-auto">
--- a/web/components/entity-relations.tsx
+++ b/web/components/entity-relations.tsx
@ -0,0 +1,152 @@
 /**
 * EntityRelations — typed relations panel for an entity page.
 *
 * Renders semantically-typed edges (Person witnessed Event, Event documented_in
 * Document, etc.) grouped by relation_type and direction, instead of the
 * noisy co-mention list.
 */
 "use client";
 import { useEffect, useState } from "react";
 import Link from "next/link";
 /**
  * One typed, directed edge (source → target) as carried in the
  * `/api/relations` response arrays.
  */
 interface Relation {
  /** Class key of the entity the edge starts from (passed to entityHref for incoming edges). */
  source_class: string;
  /** Id of the entity the edge starts from. */
  source_id: string;
  /** Edge type; used as the grouping key and to look up a label in TYPE_LABEL_PT. */
  relation_type: string;
  /** Class key of the entity the edge points to (passed to entityHref for outgoing edges). */
  target_class: string;
  /** Id of the entity the edge points to. */
  target_id: string;
  /** Nullable evidence reference; not rendered by this component. */
  evidence_ref: string | null;
  /** Confidence value as a string; not rendered by this component. */
  confidence: string;
 }
 /** JSON shape this component expects back from `/api/relations`. */
 interface ApiResponse {
  /** Edges rendered in the "out" direction (the linked entity is the edge target). */
  outgoing: Relation[];
  /** Edges rendered in the "in" direction (the linked entity is the edge source). */
  incoming: Relation[];
  /** Optional error message; the component does not display it. */
  error?: string;
 }
 /**
  * Portuguese display labels per `relation_type`.
  *
  * `out` is the heading used when the edge is listed among the outgoing
  * relations, `in` when it is listed among the incoming ones. A relation type
  * that has no entry here is shown with its raw `relation_type` string.
  */
 const TYPE_LABEL_PT: Record<string, { out: string; in: string }> = {
  witnessed: { out: "testemunhou", in: "foi testemunhado por" },
  occurred_at: { out: "ocorreu em", in: "foi local de" },
  involves_uap: { out: "envolve UAP", in: "observado em" },
  documented_in: { out: "documentado em", in: "documenta" },
  authored: { out: "autoria de", in: "autor:" },
  signed: { out: "assinou", in: "assinado por" },
  mentioned_by: { out: "mencionado em", in: "menciona" },
  employed_by: { out: "trabalhou em", in: "empregou" },
  operated_by: { out: "operada por", in: "operou" },
  investigated: { out: "investigou", in: "investigado por" },
  commanded: { out: "comandou", in: "comandado por" },
  related_to: { out: "relacionado a", in: "relacionado por" },
  similar_to: { out: "similar a", in: "similar de" },
  precedes: { out: "precede", in: "precedido por" },
  follows: { out: "segue", in: "seguido por" },
 };
 /** Maps a singular entity class key to its plural folder segment under `/e/`. */
 const ENTITY_FOLDER: Record<string, string> = {
  person: "people",
  organization: "organizations",
  location: "locations",
  event: "events",
  uap_object: "uap-objects",
  vehicle: "vehicles",
  operation: "operations",
  concept: "concepts",
 };
 /**
  * Builds the in-app link for the entity at the other end of a relation.
  *
  * - `document` entities are linked as `/d/<id>`.
  * - Every other class is linked as `/e/<folder>/<id>`; a class that has no
  *   entry in ENTITY_FOLDER is used verbatim as the folder segment.
  *
  * @param cls Singular class key of the linked entity.
  * @param id Entity identifier; percent-encoded so characters such as `#`,
  *   `?` or spaces cannot break out of the path segment.
  * @returns Root-relative href.
  */
 function entityHref(cls: string, id: string): string {
  const safeId = encodeURIComponent(id);
  if (cls === "document") return `/d/${safeId}`;
  // Own-property check: a bare object lookup would also resolve inherited
  // members ("constructor", "toString", …) to functions, which would then be
  // stringified into the path instead of falling back to `cls`.
  const known = Object.prototype.hasOwnProperty.call(ENTITY_FOLDER, cls);
  const folder = (known ? ENTITY_FOLDER[cls] : undefined) ?? cls;
  return `/e/${folder}/${safeId}`;
 }
 /**
  * Typed-relations panel for an entity page.
  *
  * Fetches `/api/relations` for the given entity whenever the props change and
  * lists the returned edges grouped by `relation_type`, with outgoing and
  * incoming edges in separate sections. Each group shows at most 12 links
  * followed by a "+N" overflow line.
  *
  * @param entityClass Class key sent as the `class` query parameter.
  * @param entityId Entity id sent as the `id` query parameter.
  */
 export function EntityRelations({
  entityClass,
  entityId,
 }: {
  entityClass: string;
  entityId: string;
 }) {
  const [data, setData] = useState<ApiResponse | null>(null);
  const [loading, setLoading] = useState(true);
  useEffect(() => {
    // Flag flipped by the cleanup so a late response for stale props is ignored.
    let aborted = false;
    setLoading(true);
    // The payload is untrusted JSON: anything that is not an array becomes [].
    const toRelationList = (value: unknown): Relation[] =>
      Array.isArray(value) ? (value as Relation[]) : [];
    const query = `class=${encodeURIComponent(entityClass)}&id=${encodeURIComponent(entityId)}`;
    fetch(`/api/relations?${query}`)
      .then((r) => r.json())
      .then((j: unknown) => {
        if (aborted) return;
        // An error body such as { error: "…" } carries no edge arrays; normalise
        // it here so the render path can always read `.length` safely.
        const body = (typeof j === "object" && j !== null ? j : {}) as Partial<ApiResponse>;
        const next: ApiResponse = {
          outgoing: toRelationList(body.outgoing),
          incoming: toRelationList(body.incoming),
        };
        if (typeof body.error === "string") next.error = body.error;
        setData(next);
        setLoading(false);
      })
      .catch(() => {
        // Network or JSON-parse failure: show the empty state instead of crashing.
        if (aborted) return;
        setData({ outgoing: [], incoming: [] });
        setLoading(false);
      });
    return () => { aborted = true; };
  }, [entityClass, entityId]);
  if (loading) {
    return <div className="font-mono text-xs text-[#5a6678]">carregando relações…</div>;
  }
  if (!data || (data.outgoing.length === 0 && data.incoming.length === 0)) {
    return (
      <div className="font-mono text-xs text-[#5a6678] italic">
        sem relações tipadas extraídas para esta entidade.
      </div>
    );
  }
  // Group by relation_type for outgoing and incoming separately
  const groupOut: Record<string, Relation[]> = {};
  for (const r of data.outgoing) (groupOut[r.relation_type] ||= []).push(r);
  const groupIn: Record<string, Relation[]> = {};
  for (const r of data.incoming) (groupIn[r.relation_type] ||= []).push(r);
  // Renders one relation_type group; `dir` selects which end of the edge is
  // the "other" entity and which label variant is used.
  const renderGroup = (type: string, list: Relation[], dir: "out" | "in") => {
    const label = TYPE_LABEL_PT[type]?.[dir] ?? type;
    return (
      <div key={`${dir}-${type}`} className="mb-3">
        <div className="font-mono text-[10px] uppercase tracking-widest text-[#7fdbff] mb-1">
          {label} · <span className="text-[#5a6678]">{list.length}</span>
        </div>
        <ul className="space-y-0.5 text-xs">
          {list.slice(0, 12).map((r, i) => {
            const otherClass = dir === "out" ? r.target_class : r.source_class;
            const otherId = dir === "out" ? r.target_id : r.source_id;
            return (
              <li key={i} className="font-mono text-[#c8d4e6]">
                <Link
                  href={entityHref(otherClass, otherId)}
                  className="hover:text-[#00ff9c] truncate"
                  title={`${otherClass}/${otherId}`}
                >
                  <span className="text-[#5a6678]">[{otherClass[0]}]</span> {otherId}
                </Link>
              </li>
            );
          })}
          {list.length > 12 && (
            <li className="font-mono text-[10px] text-[#5a6678]">… +{list.length - 12}</li>
          )}
        </ul>
      </div>
    );
  };
  return (
    <div className="space-y-4">
      {Object.keys(groupOut).length > 0 && (
        <section>
          <h3 className="font-mono text-xs uppercase tracking-widest text-[#00ff9c] mb-2">
            Relações desta entidade →
          </h3>
          {Object.entries(groupOut).map(([t, list]) => renderGroup(t, list, "out"))}
        </section>
      )}
      {Object.keys(groupIn).length > 0 && (
        <section>
          <h3 className="font-mono text-xs uppercase tracking-widest text-[#ffa500] mb-2">
            ← Entidades que apontam para esta
          </h3>
          {Object.entries(groupIn).map(([t, list]) => renderGroup(t, list, "in"))}
        </section>
      )}
    </div>
  );
 }
--- a/web/lib/retrieval/entity-pages.ts
+++ b/web/lib/retrieval/entity-pages.ts
@ -43,8 +43,10 @@ export interface EntityCore {
    db_chunks: number;
    page_refs: number;
    cross_refs: number;
    text_refs: number;
  };
-  mentioned_in: string[];      // [[doc-id/p007]]
+  mentioned_in: string[];      // [[doc-id/p007]] — structured page refs (Haiku)
  text_mentioned_in: string[]; // [[doc-id/p007]] — text-only matches (back-fill)
  referenced_by: string[];     // [[class/id]] cross-links
  enrichment_status: string | null;
  narrative_summary: string | null;
@ -132,8 +134,10 @@ export async function getEntityCore(
      db_chunks: num(sigSources.db_chunks, 0),
      page_refs: num(sigSources.page_refs, 0),
      cross_refs: num(sigSources.cross_refs, 0),
      text_refs: num(sigSources.text_refs, 0),
    },
    mentioned_in: arr(fm.mentioned_in),
    text_mentioned_in: arr(fm.text_mentioned_in),
    referenced_by: arr(fm.referenced_by),
    enrichment_status: strOrNull(fm.enrichment_status),
    narrative_summary: strOrNull(fm.narrative_summary),
@ -150,6 +154,7 @@ export interface EntityMentionGroup {
  classification: string | null;
  mention_count: number;
  pages: number[];
  text_only: boolean; // true when all refs came from the text back-fill (no structured Haiku evidence)
 }
 /**
@ -164,22 +169,25 @@ export async function getEntityMentionsByDoc(
 ): Promise<EntityMentionGroup[]> {
  const fm = await readEntityYaml(entityClass, entityId);
  if (!fm) return [];
-  const refs = arr(fm.mentioned_in);
+  const structuredRefs = arr(fm.mentioned_in);
  const textRefs = arr(fm.text_mentioned_in);
  // Each ref looks like "[[doc-id/p007]]". Strip wikilink delimiters.
-  const byDoc = new Map<string, Set<number>>();
+  const byDoc = new Map<string, { structured: Set<number>; text: Set<number> }>();
-  for (const ref of refs) {
+  const addRef = (ref: string, source: "structured" | "text") => {
    const m = ref.match(/\[\[([^\]|]+?)\]\]/);
    const target = (m ? m[1] : ref).trim();
    const [docId, pageStr] = target.split("/", 2);
-    if (!docId) continue;
+    if (!docId) return;
    const pageNum = pageStr ? parseInt(pageStr.replace(/^p/, ""), 10) : NaN;
-    if (!byDoc.has(docId)) byDoc.set(docId, new Set());
+    if (!byDoc.has(docId)) byDoc.set(docId, { structured: new Set(), text: new Set() });
-    if (Number.isFinite(pageNum)) byDoc.get(docId)!.add(pageNum);
+    if (Number.isFinite(pageNum)) byDoc.get(docId)![source].add(pageNum);
-  }
+  };
  for (const r of structuredRefs) addRef(r, "structured");
  for (const r of textRefs) addRef(r, "text");
  // Hydrate each doc's metadata from wiki/documents/<doc-id>.md
  const groups: EntityMentionGroup[] = [];
-  for (const [docId, pages] of byDoc) {
+  for (const [docId, sets] of byDoc) {
    let canonical_title: string | null = null;
    let collection: string | null = null;
    let page_count: number | null = null;
@ -197,14 +205,16 @@ export async function getEntityMentionsByDoc(
    } catch {
      /* doc missing — use raw id */
    }
    const merged = new Set<number>([...sets.structured, ...sets.text]);
    groups.push({
      doc_id: docId,
      canonical_title,
      collection,
      page_count,
      classification,
-      mention_count: pages.size,
+      mention_count: merged.size,
-      pages: Array.from(pages).sort((a, b) => a - b),
+      pages: Array.from(merged).sort((a, b) => a - b),
      text_only: sets.structured.size === 0 && sets.text.size > 0,
    });
  }
  groups.sort((a, b) => b.mention_count - a.mention_count);
--- a/web/lib/retrieval/graph.ts
+++ b/web/lib/retrieval/graph.ts
@ -70,7 +70,7 @@ export async function getNeighbors(
            e.total_mentions, e.documents_count, c.weight, c.sample_chunks
     FROM coloc c
     JOIN public.entities e ON e.entity_pk = c.other_pk
-     WHERE 1=1 ${classFilter}
+     WHERE NOT e.is_generic ${classFilter}
     ORDER BY c.weight DESC
     LIMIT $${params.length}`,
    params,
@ -142,6 +142,7 @@ export async function getGraphSeed(opts: {
       WHERE LENGTH(TRIM(canonical_name)) >= 4
         AND canonical_name !~ '^[A-Z]{1,3}$'
         AND canonical_name !~ '^[0-9.()-]+$'
         AND NOT is_generic
         ${classFilter}
     )
     SELECT entity_pk, entity_class, entity_id, canonical_name, total_mentions, documents_count, entity_class_short