diff --git a/.claude/agents/doc-rebuilder.md b/.claude/agents/doc-rebuilder.md new file mode 100644 index 0000000..9ad6cbe --- /dev/null +++ b/.claude/agents/doc-rebuilder.md @@ -0,0 +1,205 @@ +--- +name: doc-rebuilder +description: Lead orchestrator for rebuilding a complete declassified UAP/UFO document into a lossless, harness-assemblable structure. Produces individual chunk files, an ordered index, and a final assembled document.md. +tools: Read, Write, Bash, Task +model: sonnet +--- + +You orchestrate the rebuild of an entire declassified UAP/UFO document into a structure that lets a deterministic harness rebuild the document perfectly. + +## Output layout (MANDATORY structure) + +``` +raw// +├── document.md ← FINAL assembled human-readable view (built by you) +├── _index.json ← Ordered chunk list (machine-readable harness input) +├── chunks/ +│ ├── c0001.md ← Individual chunk file (one per chunk, zero-padded 4 digits) +│ ├── c0002.md +│ └── ... +├── images/ +│ ├── IMG-c0023.png ← Cropped from page PNG (named by chunk_id) +│ └── ... +└── tables/ + ├── TBL-001.csv ← Multi-page tables reconstructed (when applicable) + └── TBL-001.md ← Table description bilingual +``` + +## Workflow + +1. **Inspect inputs**: + - Read `wiki/documents/.md` frontmatter (NOT the body) — just to confirm doc exists + - List PNG pages: `ls /Users/guto/ufo/processing/png//p-*.png` + - List OCR pages: `ls /Users/guto/ufo/processing/ocr//p-*.txt` + +2. **Process pages in parallel batches of 5**: + For each page in scope (1..max_pages), spawn `page-rebuilder` subagent via Task with prompt containing: + - `page_png_path`: absolute path + - `page_ocr_text`: literal contents of the OCR file (Read it, then inline) + - `doc_id`, `page_number`, `total_pages`, `doc_title` + + Collect each returned JSON `{page_number, chunks: [...]}`. + +3. **Globally number chunks**: + After all pages return, iterate pages in ascending page_number. 
For each chunk in that page (already ordered by `order_in_page`), assign: + - `chunk_id`: `c` (4-digit zero-padded, globally sequential starting at 1) + - `order_global`: sequential int (1-indexed) + Compute `prev_chunk` and `next_chunk` pointers (null at boundaries). + +4. **Analyze images** (parallel): + For each chunk with `type=image`, in parallel batches of 5: + - Use Bash + PIL to crop the bbox region: + ``` + python3 -c " + from PIL import Image + im = Image.open('') + W,H = im.size + x,y,w,h = , , , + pad = 0.005 + c = im.crop((max(0,int((x-pad)*W)), max(0,int((y-pad)*H)), + min(W,int((x+w+pad)*W)), min(H,int((y+h+pad)*H)))) + c.save('/Users/guto/ufo/raw//images/IMG-.png') + " + ``` + - Spawn `image-analyst` subagent with the cropped image absolute path + - Merge returned fields into the chunk's metadata: `image_description_en`, `image_description_pt_br`, `image_type` (overwrites), `extracted_text`, `ufo_anomaly_detected` (bool), `ufo_anomaly_type`, `ufo_anomaly_rationale`, `cryptid_anomaly_detected` (bool), `cryptid_anomaly_type`, `cryptid_anomaly_rationale` + +5. **Stitch multi-page tables** (when applicable): + Find consecutive runs where a page's last chunk is `type=table_marker` with `cross_page_hint=continues_to_next` AND the next page's first chunk is `type=table_marker` with `cross_page_hint=continues_from_prev`. Spawn `table-stitcher` and replace the fragments with one merged `table_marker` chunk whose metadata carries `stitched_table` (a list of rows). Assign one `TBL-` id, save CSV to `tables/TBL-.csv`. + +6. 
**Write individual chunk files**: + For EVERY chunk, write `raw//chunks/c.md`: + ``` + --- + chunk_id: c + type: + page: + order_in_page: + order_global: + bbox: {x: 0.00, y: 0.00, w: 0.00, h: 0.00} + classification: + formatting: [bold, all_caps] + cross_page_hint: self_contained + prev_chunk: c # null for first + next_chunk: c # null for last + related_image: IMG-c.png # null unless type=image + related_table: TBL- # null unless type=table_marker + ocr_confidence: 0.95 + ocr_source_lines: [4, 5, 6] + redaction_code: null + redaction_inferred_content_type: null + image_type: null + ufo_anomaly_detected: false + cryptid_anomaly_detected: false + ufo_anomaly_type: null + ufo_anomaly_rationale: null + cryptid_anomaly_type: null + cryptid_anomaly_rationale: null + image_description_en: null + image_description_pt_br: null + extracted_text: null + source_png: ../../processing/png//p-NNN.png + --- + + **EN:** {content_en} + + **PT-BR:** {content_pt_br} + ``` + + - All boolean metadata fields are written explicitly (false/null are valid). + - Keep YAML clean — do not include keys with empty objects; null is fine. + +7. **Write `_index.json`** at `raw//_index.json`: + ```json + { + "doc_id": "", + "schema_version": "0.2.0", + "total_pages": , + "total_chunks": , + "build_approach": "subagents", + "build_model": "claude-sonnet-4-6", + "build_at": "", + "chunks": [ + { + "chunk_id": "c0001", + "type": "letterhead", + "page": 1, + "order_in_page": 1, + "order_global": 1, + "file": "chunks/c0001.md", + "bbox": {"x": 0.1, "y": 0.05, "w": 0.8, "h": 0.06}, + "preview": "first 80 chars of content_en" + } + ] + } + ``` + +8. **Assemble `document.md`** (human-readable, deterministic): + Frontmatter: + ```yaml + schema_version: "0.2.0" + type: master_document + doc_id: + canonical_title: + total_pages: <N> + total_chunks: <N> + chunk_types_histogram: {...} + multi_page_tables: [TBL-001, ...] 
+ ufo_anomalies_flagged: [c0023, c0027] + cryptid_anomalies_flagged: [] + build_approach: "subagents" + build_model: claude-sonnet-4-6 + build_at: <ISO> + ``` + + Body — for each page: + ``` + ## Page N + + <!-- chunk:c0001 src:./chunks/c0001.md --> + <a id="c0001"></a> + ### Chunk c0001 — letterhead · p1 · bbox: 0.10/0.05/0.80/0.06 + + **EN:** {content_en} + + **PT-BR:** {content_pt_br} + + <details><summary>metadata</summary> + + ```json + {full chunk metadata as JSON} + ``` + + </details> + + --- + ``` + + For `image` chunks, ALSO embed `![chunk image](./images/IMG-c<NNNN>.png)` and include image_analyst description. + For `table_marker` with stitched_table, render an HTML `<table>`. + +9. **Final stats line** to stdout: + ``` + STATS pages=<N> chunks=<N> images=<N> tables=<N> ufo=<N> cryptid=<N> doc_md_bytes=<N> + ``` + +## Performance + +- Page-rebuilders: parallel batches of 5 (don't exceed 10 concurrent Task spawns). +- After page-rebuilders complete, image-analysts in parallel batches of 5. +- Crop ALL images first via Bash, THEN spawn image-analysts (they need the cropped file on disk). 
+ +## Bilingual policy + +- Brazilian Portuguese (pt-br), NOT European +- UTF-8 accents preserved: ç, ã, é, í, ó, ú, â, ê, ô, à +- Verbatim quotes stay in source language + +## NEVER: + +- Fabricate redacted content +- Skip a chunk (lossy reconstruction unacceptable) +- Use chunk types outside the enum defined in page-rebuilder +- Mix multi-page table fragments without invoking table-stitcher +- Output explanatory prose in the final document.md (it's the reconstructed document, not a report) +- Write only document.md without the chunks/ + _index.json — those are required for harness roundtrip diff --git a/.claude/agents/image-analyst.md b/.claude/agents/image-analyst.md new file mode 100644 index 0000000..e6d5e76 --- /dev/null +++ b/.claude/agents/image-analyst.md @@ -0,0 +1,46 @@ +--- +name: image-analyst +description: Analyzes a cropped image region from a scanned document. Produces precise vision description bilingual + explicit UAP/cryptid anomaly check. +tools: Read +model: sonnet +--- + +You are a forensic image analyst for The Disclosure Bureau, specializing in declassified UAP/UFO archive imagery. + +Given the absolute path to an image region already cropped from a page PNG, you Read that cropped image and produce a precise analysis with explicit UAP and cryptid anomaly checks. 
+ +## Output schema + +ONE JSON object, no fence, no preamble: + +``` +{ + "description_en": "Precise factual description (1-3 sentences)", + "description_pt_br": "Brazilian Portuguese version, preserve UTF-8 accents", + "image_type": "photo|sketch|map|chart|stamp|signature|redaction|logo|seal|diagram|other", + "extracted_text": "Any text visible in the image, verbatim original language", + "ufo_anomaly_check": { + "anomaly_detected": false, + "anomaly_type": null, + "rationale": "1 sentence reasoning" + }, + "cryptid_anomaly_check": { + "anomaly_detected": false, + "anomaly_type": null, + "rationale": "1 sentence" + }, + "confidence": 0.95 +} +``` + +## Anomaly criteria (be conservative) + +**UAP**: morphologies consistent with reported UAP — disc, triangle, sphere, cylinder, elongated ellipsoid, cigar, irregular metallic; objects defying obvious aerodynamic explanation; unusual lights or sensor signatures. + +**Cryptid**: non-human entities; beings with anomalous proportions; figures inconsistent with known fauna; biological anomalies. + +False positives erode trust. Flag only when the image GENUINELY matches. If the image is mundane (typed text, signature, official seal, hole-punch marks, standard map), `anomaly_detected: false`. + +Brazilian Portuguese (NOT European). Preserve UTF-8 accents. + +Output ONLY the JSON. diff --git a/.claude/agents/page-rebuilder.md b/.claude/agents/page-rebuilder.md new file mode 100644 index 0000000..d297e43 --- /dev/null +++ b/.claude/agents/page-rebuilder.md @@ -0,0 +1,127 @@ +--- +name: page-rebuilder +description: Rebuilds ONE scanned document page as a sequence of LOSSLESS agentic chunks with bilingual EN+PT-BR content. Output is structured so chunks can be deterministically reassembled into a faithful reproduction of the original page (and document) via a harness. +tools: Read +model: sonnet +--- + +You are a forensic document reconstruction agent for The Disclosure Bureau. 
Given a single page of a US Department of War declassified UAP/UFO document (PNG image + raw OCR text), you decompose it into LOSSLESS agentic chunks — each chunk is a single semantic unit, and the SUM of chunks rebuilt in `order_in_page` faithfully reproduces the page. + +## Your inputs (from the spawn prompt) + +- `page_png_path`: absolute path to the page PNG (USE the Read tool to view it) +- `page_ocr_text`: raw OCR text (layout-preserved) +- `doc_id`, `page_number`, `total_pages`, `doc_title` + +## Chunk types — STRICT enum (use EXACTLY one of these 19 string values, no variations) + +**The `type` field MUST be one of these literal strings. Do NOT invent names like `body_paragraph`, `classification_banner`, `header_block`, `subject_line`, `addressee_block`, `signature_block`, `section_header`, `form_reference`, or `distribution_list`. Map every chunk you see onto one of these canonical types:** + +| canonical type | what to map to it | example natural-name variations (do NOT use these) | +|---|---|---| +| `letterhead` | top-of-page institutional banner (name + address printed together) | letterhead, masthead | +| `address_block` | sender (FROM:) or recipient (TO:) address; also distribution list, addressee block, routing list | addressee_block, distribution_list, routing_block, to_block, from_block | +| `classification_marking` | SECRET, NOFORN, CONFIDENTIAL, RESTRICTED, TOP SECRET printed/typed (NOT inked stamp) | classification_banner, security_banner, classification_label | +| `heading` | document title, section header, subject line, MEMORANDUM, SUBJECT:, RE:, agenda items | header_block, section_header, subject_line, doc_title, agenda_heading | +| `paragraph` | body text paragraph (most common type) | body_paragraph, narrative, prose, body_text | +| `form_field` | labeled field + value (Date: 5 May 1948 · Observer: [REDACTED] · File No: 65-3489) | form_reference, field, label_value, kv_field | +| `bulleted_item` | single bullet point in a list | | +| 
`numbered_item` | single numbered item (1., 2., a., (i)) | | +| `quote_block` | indented or block-quoted passage | | +| `caption` | caption directly attached to an image | | +| `table_marker` | the full table on this page (one chunk per table) | | +| `image` | any embedded image (photo, sketch, map, diagram, chart, logo, seal — but NOT inked stamps or signatures, which are their own types) | | +| `stamp` | inked official stamp (round seal, banner stamp, date-received stamp, declass stamp) | | +| `signature` | handwritten signature (typed name beneath belongs to the previous chunk) | signature_block, sig | +| `marginalia` | handwritten margin note, scribble, annotation in margins | | +| `redaction` | opaque black/white cover obscuring underlying content (▓▓▓) | | +| `footer` | page number, footer text, file tracking number at bottom | | +| `blank_area` | substantial blank area (only if needed for layout fidelity) | | +| `unknown` | ABSOLUTELY LAST RESORT | | + +**Validation rule the harness applies**: any `type` field NOT in this list of 19 values is a SCHEMA VIOLATION and the chunk is rejected. Use canonical names only. 
+ +## Output schema + +ONE JSON object, NO markdown fence, NO preamble: + +```json +{ + "page_number": <int>, + "page_summary_en": "1-2 sentences describing what this page contains", + "page_summary_pt_br": "1-2 frases em português brasileiro", + "page_layout": { + "columns": 1, + "orientation": "portrait | landscape", + "page_dimensions_approx": "letter | legal | A4 | other" + }, + "chunks": [ + { + "order_in_page": 1, + "type": "<one of the enum values above>", + "bbox": {"x": 0.0, "y": 0.0, "w": 0.0, "h": 0.0}, + "content_en": "verbatim or near-verbatim English text (or asset description for non-text chunks)", + "content_pt_br": "Brazilian Portuguese (NOT European) — preserve UTF-8 accents", + "metadata": { + "ocr_confidence": 0.0, + "ocr_source_lines": [1, 2, 3], + "classification": "SECRET//NOFORN", + "redaction_code": "(b)(1) 1.4(a)", + "redaction_inferred_content_type": "name|date|location|other", + "image_type": "photo|sketch|map|diagram|chart|stamp|signature|logo|seal|other", + "formatting": ["bold", "italic", "underline", "all_caps", "handwritten", "typed", "stamped"], + "cross_page_hint": "self_contained | continues_from_prev | continues_to_next", + "prev_chunk_hint": "if continues_from_prev: a short description of what to look for on the previous page", + "next_chunk_hint": "if continues_to_next: a short description of what continues", + "language_in_source": "en|pt|es|fr|de|other" + } + } + ] +} +``` + +## Critical rules for LOSSLESS reconstruction + +1. **Order ALWAYS by reading order** (top-to-bottom, left-to-right). `order_in_page` is 1-indexed sequential. + +2. **One semantic unit per chunk.** A paragraph = 1 chunk. A multi-line address = 1 chunk. A 4-row table = 1 `table_marker` chunk. An image = 1 chunk. A signature = 1 chunk. + +3. **Sum reproduces the page.** If you concatenate chunks back in `order_in_page`, the result must faithfully match the original page content. NEVER skip content. 
If something is unclear, mark it as `unknown` with `content_en: "[unreadable text]"`. + +4. **Verbatim preservation in `content_en`.** Names, codes, dates, classification markings stay in original spelling. NO paraphrasing. Preserve OCR errors that are likely correct (e.g., `TRIANGLUAR` stays as written if that's what the document says). + +5. **Bilingual paired.** Every chunk has both content_en and content_pt_br. + - Brazilian Portuguese (pt-br), NOT European Portuguese. + - Preserve UTF-8 accents: ç, ã, é, í, ó, ú, â, ê, ô, à + - Proper nouns and verbatim quotes stay in source language even inside the pt-br content. + - Classification markings stay verbatim (SECRET//NOFORN). + - For non-text chunks (images, stamps), pt-br describes the asset in Brazilian Portuguese. + +6. **Redaction faithfulness.** content_en = `"[REDACTED — <code>]"`. NEVER fabricate hidden content. Optionally infer the TYPE via `redaction_inferred_content_type`. + +7. **OCR source lines.** For text chunks, list `ocr_source_lines` (1-indexed line numbers of the input OCR text this chunk came from). Helps verify provenance. + +8. **Formatting array.** Include all that apply: `bold`, `italic`, `underline`, `all_caps`, `handwritten`, `typed`, `stamped`. Empty array if normal typed text. + +9. **Cross-page hints.** Mark `cross_page_hint`: + - `continues_from_prev` if this chunk visibly continues from previous page (table rows, mid-sentence paragraph). + - `continues_to_next` if this chunk visibly continues to next page. + - `self_contained` otherwise. + +10. **Bbox normalized 0..1.** From the page PNG dimensions. Tight bbox covering JUST the chunk. + +11. **Image chunks**: content_en = brief description (1 sentence). The image-analyst subagent will be invoked separately for full analysis. Just give a placeholder description here. + +## Pre-flight + +Before generating chunks, study both the PNG and the OCR text. The PNG is ground truth for layout and visual elements. 
The OCR is helpful for verbatim text but may have errors — trust the PNG when they disagree. + +## Schema fidelity rules (CRITICAL — broken YAML poisons the entire archive) + +- `ocr_source_lines` MUST be a list of INTEGERS (line numbers from the OCR text, 1-indexed). Example: `[1, 2, 3]`. NEVER put the actual OCR text strings here. +- `bbox` is `{x: 0.0..1.0, y: 0.0..1.0, w: 0.0..1.0, h: 0.0..1.0}` — four floats. No strings, no `null`. +- `formatting` MUST be a list of strings from the allowed set: `["bold", "italic", "underline", "all_caps", "handwritten", "typed", "stamped"]`. No other values. +- Text strings in `content_en`, `content_pt_br`, `redaction_inferred_content_type` must be single-line OR properly multi-line YAML (use `|` block scalar if multi-line). DO NOT include unescaped double-quotes (`"`) inside a double-quoted string — use single-quotes around the value, OR replace inner `"` with `\"` (escape consistently). +- Boolean fields (`ufo_anomaly_detected`, `cryptid_anomaly_detected`) are literal `true`/`false`, not `"true"`/`"false"`. + +Output ONLY the JSON. diff --git a/.claude/agents/table-stitcher.md b/.claude/agents/table-stitcher.md new file mode 100644 index 0000000..eb872e4 --- /dev/null +++ b/.claude/agents/table-stitcher.md @@ -0,0 +1,43 @@ +--- +name: table-stitcher +description: Reconciles tables that span multiple pages. Given consecutive page PNGs where the last table on page N continues to first table on page N+1, produces a single stitched CSV with deduped headers and merged rows. +tools: Read +model: sonnet +--- + +You are a table reconciliation agent. Multi-page tables in scanned documents repeat their headers on each page and split rows across page breaks. You produce a single clean stitched output. 
+ +## Inputs + +- List of (page_png_path, bbox) for each fragment of the same logical table +- Page numbers ordered + +## Output + +ONE JSON object: + +``` +{ + "table_id": "TBL-<DOC>-<NNN>", + "headers": ["col1", "col2", "col3"], + "rows": [["v1", "v2", "v3"], ...], + "spans_pages": ["p007", "p008", "p009"], + "headers_repeat_on_each_page": true, + "merged_cross_page_rows": 0, + "extraction_confidence": 0.95, + "notes": "any caveats: illegible cells, redactions, ambiguity" +} +``` + +## Rules + +- Read EACH page in order via Read tool, focus on the bbox region. +- Detect if headers repeat across pages. Drop the duplicates after the first occurrence. +- A row that visibly continues across page break gets MERGED into one row (concatenate cell text). +- Preserve ORIGINAL LANGUAGE of all cell text. Do NOT translate. +- Empty cells: "". +- Illegible: "???". +- Redacted: "REDACTED" (or "REDACTED ((b)(1) 1.4(a))" if code visible). +- Numbers preserve formatting ("24,989"). + +Output ONLY the JSON. diff --git a/case/gaps/G-0001.md b/case/gaps/G-0001.md new file mode 100644 index 0000000..c94b664 --- /dev/null +++ b/case/gaps/G-0001.md @@ -0,0 +1,79 @@ +--- +schema_version: "0.1.0" +type: gap +gap_id: "G-0001" +canonical_title: "Pages 1–6 of DOW-UAP-D54 are bit-for-bit identical (same SHA-256)" +gap_class: unexplained-redaction + +description: | + Pages 1, 2, 3, 4, 5, and 6 of the PDF `DOW-UAP-D54-Mission-Report-Mediterranean- + Sea-NA.pdf` were converted to PNG at 200 DPI and produced six IDENTICAL files, + all with SHA-256 `29030fd640030926c9e98e94f73a3fbc88cb9ac6739778b012eba120084ed1b7`. + + Visually, all six pages show the same image: solid black background (full-page + redaction) with only the string "1.4(a)" in red at the top-left corner. The OCR + (`pdftotext -layout`) of each page also produces only the text "1.4(a)". 
+ + The most plausible hypothesis is that during the release process, six originally + distinct pages (likely six different blocks of classified content) were ALL + replaced by a single redaction template image, instead of each page having its + own redaction overlay preserving sub-redacted structure. + +description_pt_br: | + As páginas 1, 2, 3, 4, 5 e 6 do PDF `DOW-UAP-D54-Mission-Report-Mediterranean- + Sea-NA.pdf` foram convertidas em PNG @ 200 DPI e produziram seis arquivos + IDÊNTICOS, todos com SHA-256 `29030fd640030926c9e98e94f73a3fbc88cb9ac6739778b012eba120084ed1b7`. + + Visualmente, as seis páginas mostram a mesma imagem: fundo preto sólido (redação + de página inteira) com apenas a string "1.4(a)" em vermelho no canto superior + esquerdo. O OCR (`pdftotext -layout`) de cada página produz apenas o texto "1.4(a)". + + A hipótese mais plausível é que durante o processo de liberação, seis páginas + originalmente distintas (provavelmente seis blocos diferentes de conteúdo + classificado) foram TODAS substituídas por uma única imagem-template de redação, + em vez de cada página ter sua própria sobreposição preservando a estrutura + sub-redatada. + +detected_in: + - "[[dow-uap-d54-mission-report-mediterranean-sea-na/p001]]" + - "[[dow-uap-d54-mission-report-mediterranean-sea-na/p002]]" + - "[[dow-uap-d54-mission-report-mediterranean-sea-na/p003]]" + - "[[dow-uap-d54-mission-report-mediterranean-sea-na/p004]]" + - "[[dow-uap-d54-mission-report-mediterranean-sea-na/p005]]" + - "[[dow-uap-d54-mission-report-mediterranean-sea-na/p006]]" +detected_by: archivist +detected_at: "2026-05-13T08:50:00Z" + +severity: medium +investigative_impact: | + Substitution by an identical template erases any residual visual structure + (margins, headers, paragraph spacing, partial signature blocks) that might + permit inference about the redacted content. This compromises forensic + analysis of redaction patterns. 
For other documents in the corpus with + partial redactions, it is possible to roughly infer the size/position of + removed text — here that is impossible. +investigative_impact_pt_br: | + A substituição por template idêntico apaga qualquer estrutura visual residual + (margens, cabeçalhos, espaçamento de parágrafos, blocos parciais de assinatura) + que poderia permitir inferência sobre o conteúdo redatado. Compromete a análise + forense de padrões de redação. Para outros documentos do corpus com redações + parciais é possível inferir aproximadamente o tamanho/posição do texto + removido — aqui isso é impossível. + +possible_explanations: + - { explanation: "Redaction template applied in bulk for all SECRET-classified pages", confidence_band: medium } + - { explanation: "Bug in release/redaction software that duplicated a single page image", confidence_band: low } + - { explanation: "Deliberate decision to standardize the appearance of fully redacted pages", confidence_band: medium } + +recommended_actions: + - "Cross-check other documents in the DOW-UAP-D series to see if the pattern repeats" + - "Compare PDF metadata (xref, font subsetting, image XObject ids) between the 6 pages" + - "Check whether other corpus PDFs with all-redacted pages produce identical SHA-256 hashes" + +related_gaps: ["[[gap/G-0002]]"] +wiki_version: "0.1.0" +--- + +# Gap G-0001 — Identical pages in DOW-UAP-D54 + +Anomaly detected via identical SHA-256 hashes across 6 PNGs derived from 6 distinct pages of the original PDF. See `description` / `description_pt_br` in frontmatter. 
diff --git a/case/gaps/G-0002.md b/case/gaps/G-0002.md new file mode 100644 index 0000000..e292ed5 --- /dev/null +++ b/case/gaps/G-0002.md @@ -0,0 +1,61 @@ +--- +schema_version: "0.1.0" +type: gap +gap_id: "G-0002" +canonical_title: "Mismatch between internal title (D31) and filename (D54) in DOW-UAP-D54" +gap_class: inconsistency + +description: | + The PDF `DOW-UAP-D54-Mission-Report-Mediterranean-Sea-NA.pdf` carries in its + PDF metadata 'Title' field the value "DoW-UAP-D31", while its external filename + (published on war.gov/ufo) uses the identifier "D54". + + This may indicate: + (a) editorial renumbering between versions — the document was originally + "D31" during preparation and renumbered to "D54" at release; + (b) copy/paste error in the release template; + (c) a separate "D31" document exists whose title was reused by mistake. + +description_pt_br: | + O PDF `DOW-UAP-D54-Mission-Report-Mediterranean-Sea-NA.pdf` carrega no campo + PDF metadata 'Title' o valor "DoW-UAP-D31", enquanto seu nome externo de + arquivo (publicado em war.gov/ufo) usa o identificador "D54". + + Isso pode indicar: + (a) renumeração editorial entre versões — o documento foi originalmente + "D31" durante a preparação e renumerado para "D54" no release; + (b) erro de copy/paste no template de release; + (c) existe um documento "D31" separado cujo título foi reusado por engano. + +detected_in: + - "[[dow-uap-d54-mission-report-mediterranean-sea-na]]" +detected_by: archivist +detected_at: "2026-05-13T08:50:00Z" + +severity: low +investigative_impact: | + Does not affect substantive content (the page-7 UAP observation is independent + of the report number). But raises doubt about whether a separate "DoW-UAP-D31" + file exists in the corpus, and about the integrity of the release process. +investigative_impact_pt_br: | + Não afeta o conteúdo substantivo (a observação UAP da página 7 é independente + do número do relatório). 
Mas levanta dúvida sobre se existe um arquivo + "DoW-UAP-D31" separado no corpus, e sobre a integridade do processo de release. + +possible_explanations: + - { explanation: "Editorial renumbering — D31 was internal name, D54 is public ID", confidence_band: medium } + - { explanation: "Copy-paste error of title from another template document", confidence_band: medium } + - { explanation: "A separate D31 exists and this D54 inherited its title by mistake", confidence_band: low } + +recommended_actions: + - "Check whether a separate DOW-UAP-D31 exists in the war.gov/ufo corpus" + - "Cross-check internal titles of other DOW-UAP-D* documents to detect a pattern" + - "Compare against AARO's official index if available" + +related_gaps: ["[[gap/G-0001]]"] +wiki_version: "0.1.0" +--- + +# Gap G-0002 — Internal identifier vs filename mismatch + +See `description` / `description_pt_br`. diff --git a/scripts/maintain/42_sync_entity_stats.py b/scripts/maintain/42_sync_entity_stats.py index 9ce6d31..7902696 100644 --- a/scripts/maintain/42_sync_entity_stats.py +++ b/scripts/maintain/42_sync_entity_stats.py @@ -121,6 +121,42 @@ def canonicalize_name(name: str) -> str: return collapsed +def event_id_from_entry(entry: dict) -> str | None: + """Same EV-YYYY-MM-DD-slug id rule as scripts/03-dedup-entities.py.""" + label = entry.get("label") or entry.get("name") + if not label: + return None + date = entry.get("date") or "NA" + slug = canonicalize_name(label)[:40].strip("-") or "unlabeled" + m = re.match(r"^(\d{4})-(\d{2})-(\d{2})$", str(date)) + if m: + return f"EV-{m.group(1)}-{m.group(2)}-{m.group(3)}-{slug}" + m = re.match(r"^(\d{4})-(\d{2})$", str(date)) + if m: + return f"EV-{m.group(1)}-{m.group(2)}-XX-{slug}" + m = re.match(r"^(\d{4})$", str(date)) + if m: + return f"EV-{m.group(1)}-XX-XX-{slug}" + return f"EV-XXXX-XX-XX-{slug}" + + +def uap_object_id_from_event(event_id: str, index: int) -> str: + """OBJ-EV<year>-<EVENT_SLUG_UPPERCASE>-<NN>, mirroring scripts/03 logic.""" + 
if event_id and event_id.startswith("EV-"): + rest = event_id[3:] + parts = rest.split("-", 4) + if len(parts) >= 4: + year = parts[0] + slug_part = "-".join(parts[3:]) if len(parts) > 3 else "unk" + slug_compact = slug_part.replace("-", "").upper()[:20] or "UNK" + event_short = f"EV{year}-{slug_compact}" + else: + event_short = "UNK" + else: + event_short = "UNK" + return f"OBJ-{event_short}-{index:02d}" + + def utc_iso() -> str: return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") @@ -192,14 +228,32 @@ def collect_page_refs() -> dict[tuple[str, str], set[str]]: # page_id like "doc-abc/p007" doc_id = page_path.parent.name page_id = f"{doc_id}/{page_path.stem}" + + # Compute the page's event_ids first — UAP objects on the same page + # are linked to the FIRST event (mirrors scripts/03-dedup-entities.py). + page_event_ids: list[str] = [] + for entry in (extracted.get("events") or []): + if isinstance(entry, dict): + eid = event_id_from_entry(entry) + if eid: + page_event_ids.append(eid) + refs[("event", eid)].add(page_id) + + # Then the OBJs, indexed in order, anchored to the first event. + for idx, entry in enumerate((extracted.get("uap_objects") or []), start=1): + event_for_obj = page_event_ids[0] if page_event_ids else None + if not event_for_obj: + # Same fallback script 03 uses when no event exists on the page. + event_for_obj = f"EV-XXXX-XX-XX-{canonicalize_name(doc_id)[:30]}" + obj_id = uap_object_id_from_event(event_for_obj, idx) + refs[("uap_object", obj_id)].add(page_id) + + # Every other class is handled generically (name-based). for folder, entries in extracted.items(): cls = FOLDER_TO_CLASS.get(folder) - if not cls or not isinstance(entries, list): + if not cls or cls in {"event", "uap_object"} or not isinstance(entries, list): continue for entry in entries: - # entry can be a plain string id, a wikilink, or a dict with - # a `name` field that we must canonicalize ourselves (matches - # the algorithm used in scripts/03-dedup-entities.py). 
eid = None if isinstance(entry, str): _, parsed_eid = parse_wikilink_target(entry) @@ -210,8 +264,6 @@ def collect_page_refs() -> dict[tuple[str, str], set[str]]: or canonicalize_name(entry.get("name", ""))) if eid: refs[(cls, eid)].add(page_id) - # Also index by every alias, so e.g. "USCENTCOM" matches a - # United States Central Command entity if dedup ran on aliases. if isinstance(entry, dict): for alias in (entry.get("aliases") or []): alias_id = canonicalize_name(alias) @@ -372,12 +424,14 @@ def main() -> int: stats[strength] += 1 - # Optional: clean up OBJ entities whose canonical_name is a 100-char - # shape description plus the ID in parentheses. Move the description - # to an alias and pick a short readable name from the linked event. + # Optional: clean up OBJ entities whose canonical_name is a shape + # description plus the ID in parentheses. Move the description to + # an alias and pick a short readable name from the linked event. if args.fix_obj_names and cls == "uap_object": cn = str(fm.get("canonical_name") or "") - if len(cn) > 80 and "UAP" in cn and "(" in cn and cn.endswith(")"): + # Match any OBJ name that embeds the raw ID in parens — that's + # the unmistakable Sonnet-generated pattern we want to clean up. + if "UAP (OBJ-" in cn and cn.endswith(")"): obs_event = fm.get("observed_in_event") event_cls, event_id = parse_wikilink_target(obs_event or "") if event_cls == "event" and event_id: diff --git a/scripts/maintain/43_fix_chunk_page_from_source_png.py b/scripts/maintain/43_fix_chunk_page_from_source_png.py new file mode 100644 index 0000000..fc1f669 --- /dev/null +++ b/scripts/maintain/43_fix_chunk_page_from_source_png.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 +""" +Normalize each chunk's `page:` field to match the actual PNG it was rendered +against (`source_png`). 
+ +Background: the chunker (Sonnet) populated `page:` with the page-number it +INFERRED from the document's printed footer/header — which often diverges from +the PNG index after the PDF→PNG conversion (cover sheets, blank pages, FBI +section markers, etc). + +The UI routes `/d/<doc>/<pNNN>` by PNG index, so the chunk `page` field MUST +match the PNG index for the page view to show the right chunks alongside the +right scan. + +This script rewrites `page:` IN PLACE in every raw chunk markdown where the +field disagrees with the number embedded in `source_png:`. It is idempotent — +re-running it on a clean tree is a no-op. + +Run: + python3 scripts/maintain/43_fix_chunk_page_from_source_png.py [--dry-run] +""" +from __future__ import annotations +import re +import sys +from pathlib import Path +from collections import defaultdict + +CHUNKS_ROOT = Path("/Users/guto/ufo/raw") +PAGE_RE = re.compile(r"^page:\s*(\d+)\s*$", re.M) +SRC_RE = re.compile(r"source_png:\s*\"?[^\"\n]*?p-?(\d+)\.png", re.M) + + +def main() -> int: + dry = "--dry-run" in sys.argv + fixed = 0 + scanned = 0 + by_doc: dict[str, int] = defaultdict(int) + samples: list[tuple[str, int, int]] = [] + + for chunks_dir in sorted(CHUNKS_ROOT.glob("*--subagent/chunks")): + doc_id = chunks_dir.parent.name.replace("--subagent", "") + for f in chunks_dir.glob("*.md"): + content = f.read_text(encoding="utf-8") + if not content.startswith("---"): + continue + parts = content.split("---", 2) + if len(parts) < 3: + continue + _, fm, body = parts + page_m = PAGE_RE.search(fm) + src_m = SRC_RE.search(fm) + if not (page_m and src_m): + continue + scanned += 1 + declared = int(page_m.group(1)) + real = int(src_m.group(1)) + if declared == real: + continue + new_fm = PAGE_RE.sub(f"page: {real}", fm, count=1) + new_content = "---" + new_fm + "---" + body + if not dry: + f.write_text(new_content, encoding="utf-8") + fixed += 1 + by_doc[doc_id] += 1 + if len(samples) < 5: + samples.append((f"{doc_id}/{f.name}", declared, 
real)) + + print(f"Scanned: {scanned} chunks") + print(f"Fixed: {fixed} chunks ({'dry-run' if dry else 'written'})") + print(f"Docs touched: {len(by_doc)}") + if by_doc: + print("\nTop docs by fix count:") + for doc, n in sorted(by_doc.items(), key=lambda x: -x[1])[:15]: + print(f" {n:>5} {doc}") + if samples: + print("\nSample fixes:") + for path, d, r in samples: + print(f" {path}: page {d} -> {r}") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/maintain/44_sync_chunk_page_to_db.py b/scripts/maintain/44_sync_chunk_page_to_db.py new file mode 100644 index 0000000..cd6473b --- /dev/null +++ b/scripts/maintain/44_sync_chunk_page_to_db.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 +""" +Resync `chunks.page` in Postgres from the raw chunk markdowns (after running +43_fix_chunk_page_from_source_png.py). + +This avoids re-embedding — we only touch the integer column. + +Run: + DATABASE_URL=postgres://... python3 scripts/maintain/44_sync_chunk_page_to_db.py +""" +from __future__ import annotations +import os +import re +import sys +from pathlib import Path + +import psycopg + +CHUNKS_ROOT = Path("/Users/guto/ufo/raw") +PAGE_RE = re.compile(r"^page:\s*(\d+)\s*$", re.M) +CID_RE = re.compile(r"^chunk_id:\s*(\S+)\s*$", re.M) + + +def main() -> int: + dburl = os.environ.get("DATABASE_URL") or os.environ.get("SUPABASE_DB_URL") + if not dburl: + sys.exit("DATABASE_URL not set") + + updates: list[tuple[str, str, int]] = [] # (doc_id, chunk_id, page) + for chunks_dir in sorted(CHUNKS_ROOT.glob("*--subagent/chunks")): + doc_id = chunks_dir.parent.name.replace("--subagent", "") + for f in chunks_dir.glob("*.md"): + content = f.read_text(encoding="utf-8") + if not content.startswith("---"): + continue + parts = content.split("---", 2) + if len(parts) < 3: + continue + fm = parts[1] + cid_m = CID_RE.search(fm) + page_m = PAGE_RE.search(fm) + if not (cid_m and page_m): + continue + updates.append((doc_id, cid_m.group(1), int(page_m.group(1)))) + + 
print(f"Loaded {len(updates)} chunk records from disk") + + with psycopg.connect(dburl) as conn: + with conn.cursor() as cur: + cur.execute( + "CREATE TEMP TABLE _chunk_pages (doc_id TEXT, chunk_id TEXT, page INT)" + ) + with cur.copy("COPY _chunk_pages (doc_id, chunk_id, page) FROM STDIN") as cp: + for row in updates: + cp.write_row(row) + + cur.execute( + """ + UPDATE chunks c + SET page = t.page + FROM _chunk_pages t + WHERE c.doc_id = t.doc_id AND c.chunk_id = t.chunk_id + AND c.page IS DISTINCT FROM t.page + """ + ) + changed = cur.rowcount + print(f"Updated {changed} rows in chunks.page") + + conn.commit() + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/maintain/45_resync_index_json.py b/scripts/maintain/45_resync_index_json.py new file mode 100644 index 0000000..93b6f46 --- /dev/null +++ b/scripts/maintain/45_resync_index_json.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +""" +Resync each `_index.json` so its embedded chunks[].page reflects the corrected +markdown frontmatter (after script 43). + +Idempotent. 
+""" +from __future__ import annotations +import json +import re +from pathlib import Path + +CHUNKS_ROOT = Path("/Users/guto/ufo/raw") +PAGE_RE = re.compile(r"^page:\s*(\d+)\s*$", re.M) +CID_RE = re.compile(r"^chunk_id:\s*(\S+)\s*$", re.M) + + +def main() -> None: + touched = 0 + for chunks_dir in sorted(CHUNKS_ROOT.glob("*--subagent/chunks")): + idx_path = chunks_dir.parent / "_index.json" + if not idx_path.is_file(): + continue + idx = json.loads(idx_path.read_text(encoding="utf-8")) + chunks = idx.get("chunks") or [] + if not chunks: + continue + # Build chunk_id -> page from disk + truth: dict[str, int] = {} + for f in chunks_dir.glob("*.md"): + head = f.read_text(encoding="utf-8")[:2000] + cm = CID_RE.search(head) + pm = PAGE_RE.search(head) + if cm and pm: + truth[cm.group(1)] = int(pm.group(1)) + changed = 0 + for entry in chunks: + cid = entry.get("chunk_id") + real = truth.get(cid) + if real is not None and entry.get("page") != real: + entry["page"] = real + changed += 1 + if changed: + idx_path.write_text(json.dumps(idx, indent=2, ensure_ascii=False), encoding="utf-8") + print(f" {idx.get('doc_id')}: updated {changed} entries") + touched += 1 + print(f"\nDocs touched: {touched}") + + +if __name__ == "__main__": + main() diff --git a/scripts/maintain/46_text_backfill_mentions.py b/scripts/maintain/46_text_backfill_mentions.py new file mode 100644 index 0000000..6acdf54 --- /dev/null +++ b/scripts/maintain/46_text_backfill_mentions.py @@ -0,0 +1,340 @@ +#!/usr/bin/env python3 +""" +Text-based backfill of entity → page references. + +The structured pipelines (Sonnet chunks, Haiku page-level events/entities) +miss many entities the corpus actually discusses — they extract only what +they confidently structure into the schema. The vision_description and +narrative_summary fields routinely *talk about* an event/person/place +without listing it in the structured arrays. 
+ +This script does a fuzzy back-fill: scans the narrative body of every page +YAML for textual matches of every entity's canonical_name + aliases, and +records the hits as a new signal source `text_refs`. Aho-Corasick is used +so the whole 3k-pages × 34k-entities cross-product collapses to a single +linear scan per page. + +Conservative filtering keeps the noise floor low: + - single-word aliases need at least 5 chars unless they contain a digit (e.g. "B52") + - blacklist of common stopwords / generic terms + - word-boundary enforcement (manual check on both sides of each AC hit) + - skip aliases that are purely numeric/punctuation or contain no letter + +YAML output (added in-place on each entity file): + text_mentioned_in: ['[[doc-id/pNNN]]', ...] # only refs NOT already in mentioned_in + signal_sources.text_refs: N + total_mentions = db_chunks + page_refs + cross_refs + text_refs + signal_strength recomputed using text_refs as a weak signal + +Run: + python3 scripts/maintain/46_text_backfill_mentions.py [--dry-run] +""" +from __future__ import annotations + +import argparse +import re +import sys +from collections import defaultdict +from pathlib import Path +from typing import Iterable + +import ahocorasick +import yaml + +WIKI = Path("/Users/guto/ufo/wiki") +ENTITIES_BASE = WIKI / "entities" +PAGES_BASE = WIKI / "pages" + +# Generic / stop-words: never accept these as match patterns even if listed +# as an alias. Lowercased. PT-BR + EN + universally vague terms.
+BLACKLIST: set[str] = { + # english stopwords / common + "the", "and", "for", "with", "from", "this", "that", "these", "those", + "report", "reports", "object", "objects", "unknown", "unidentified", + "anomalous", "aerial", "phenomenon", "phenomena", "sighting", "sightings", + "case", "cases", "incident", "incidents", "event", "events", "encounter", + "encounters", "observation", "observations", "document", "documents", + "memo", "memos", "letter", "letters", "table", "tables", "image", "images", + "general", "section", "agent", "agents", "subject", "subjects", + "office", "offices", "field", "fields", "summary", "summaries", + "true", "false", "type", "types", "data", "name", "names", + "person", "people", "place", "places", "location", "locations", + "vehicle", "vehicles", "operation", "operations", "concept", "concepts", + "page", "pages", "chunk", "chunks", "scan", "scans", + "north", "south", "east", "west", "central", + "captain", "major", "colonel", "general", "lieutenant", "sergeant", + "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", + "january", "february", "march", "april", "june", "july", "august", + "september", "october", "november", "december", + # pt-br stopwords / common + "para", "como", "este", "esta", "esse", "essa", "isso", "aquele", + "ainda", "outro", "outra", "outros", "outras", "todos", "todas", + "relatorio", "relatório", "objeto", "objetos", "documento", "documentos", + "página", "paginas", "páginas", "evento", "eventos", "incidente", + "incidentes", "pessoa", "pessoas", "lugar", "lugares", "local", "locais", + "operação", "operacao", "geral", "agente", "agentes", "campo", "campos", + "norte", "sul", "leste", "oeste", + "janeiro", "fevereiro", "março", "marco", "abril", "junho", "julho", + "agosto", "setembro", "outubro", "novembro", "dezembro", + # generic acronyms widely embedded in unrelated text + "uap", "ufo", "usaaf", "usaf", "usa", "fbi", "dod", "nasa", + "atom", "atoms", "atomic", +} + + +def 
is_acceptable_alias(name: str) -> bool: + n = name.strip() + if not n: + return False + nl = n.lower() + if nl in BLACKLIST: + return False + # Must contain at least one letter + if not re.search(r"[a-zA-ZÀ-ÿ]", n): + return False + # Purely numeric or punctuation + if re.fullmatch(r"[\d\s\-_.,]+", n): + return False + # Single-word too short (5 char min unless contains a digit) + if " " not in n and "-" not in n and len(n) < 5 and not re.search(r"\d", n): + return False + return True + + +def parse_frontmatter(text: str) -> tuple[dict, str]: + if not text.startswith("---"): + return {}, text + parts = text.split("---", 2) + if len(parts) < 3: + return {}, text + try: + fm = yaml.safe_load(parts[1]) or {} + except yaml.YAMLError: + fm = {} + return fm, parts[2] + + +def dump_frontmatter_preserving_body(fm: dict, body: str) -> str: + return "---\n" + yaml.safe_dump(fm, sort_keys=False, allow_unicode=True, width=1000) + "---" + body + + +def extract_searchable_text(page_fm: dict, page_body: str) -> str: + """Pick narrative-only fields from a page YAML — avoid YAML keys, IDs, enums.""" + parts: list[str] = [] + for key in ( + "vision_description", + "vision_description_pt_br", + "narrative_summary", + "narrative_summary_pt_br", + "extracted_text", + ): + v = page_fm.get(key) + if isinstance(v, str): + parts.append(v) + parts.append(page_body) + return "\n".join(parts) + + +# Map entity_class -> folder name +FOLDER_BY_CLASS = { + "person": "people", + "organization": "organizations", + "location": "locations", + "event": "events", + "uap_object": "uap-objects", + "vehicle": "vehicles", + "operation": "operations", + "concept": "concepts", +} + + +def entity_id_from_fm(fm: dict) -> tuple[str, str] | None: + cls = fm.get("entity_class") + if cls: + eid_key = f"{cls}_id" + eid = fm.get(eid_key) or fm.get("entity_id") + if eid: + return cls, eid + # legacy fallback + for k in ("person_id", "organization_id", "location_id", "event_id", + "uap_object_id", "vehicle_id", 
"operation_id", "concept_id"): + if k in fm: + return k.replace("_id", ""), fm[k] + return None + + +def signal_strength(db_chunks: int, page_refs: int, cross_refs: int, text_refs: int) -> str: + total = db_chunks + page_refs + cross_refs + text_refs + if total == 0: + return "orphan" + if db_chunks >= 3 or page_refs >= 3 or (db_chunks >= 1 and page_refs >= 1) or text_refs >= 5: + return "strong" + return "weak" + + +def main() -> int: + p = argparse.ArgumentParser() + p.add_argument("--dry-run", action="store_true") + p.add_argument("--verbose", action="store_true") + args = p.parse_args() + + # 1. Load entities + collect (alias_lower → list of (entity_file_path, original_alias)) + print("Loading entities ...") + automaton = ahocorasick.Automaton() + entities: dict[Path, dict] = {} + alias_per_entity_count = 0 + accepted_entities = 0 + for ent_file in ENTITIES_BASE.rglob("*.md"): + if "_archived" in ent_file.parts: + continue + try: + text = ent_file.read_text(encoding="utf-8") + except Exception: + continue + fm, _body = parse_frontmatter(text) + if not fm: + continue + if entity_id_from_fm(fm) is None: + continue + canonical = fm.get("canonical_name") or fm.get("canonical_title") + aliases = fm.get("aliases") or [] + names = [] + if isinstance(canonical, str): + names.append(canonical) + for a in aliases: + if isinstance(a, str): + names.append(a) + accepted = [n for n in names if is_acceptable_alias(n)] + if not accepted: + continue + entities[ent_file] = {"fm": fm, "raw_text": text, "accepted": accepted} + accepted_entities += 1 + for n in accepted: + automaton.add_word(n.lower(), (str(ent_file), n)) + alias_per_entity_count += 1 + + automaton.make_automaton() + print(f" entities considered: {accepted_entities}") + print(f" searchable aliases: {alias_per_entity_count}") + + # 2. 
Scan every page YAML + print("Scanning pages ...") + hits_by_entity: dict[str, set[str]] = defaultdict(set) + pages_scanned = 0 + total_hits = 0 + for page_file in PAGES_BASE.rglob("p*.md"): + try: + text = page_file.read_text(encoding="utf-8") + except Exception: + continue + fm, body = parse_frontmatter(text) + if not fm: + continue + page_id = fm.get("page_id") + if not page_id: + # Derive from filesystem: <doc-id>/p<NNN> + try: + rel = page_file.relative_to(PAGES_BASE) + page_id = f"{rel.parent}/{rel.stem}" + except ValueError: + continue + ref = f"[[{page_id}]]" + searchable = extract_searchable_text(fm, body).lower() + pages_scanned += 1 + seen_this_page: set[str] = set() + for end_idx, (ent_path_str, original) in automaton.iter(searchable): + pattern = original.lower() + start_idx = end_idx - len(pattern) + 1 + # Word boundary check + if start_idx > 0 and (searchable[start_idx - 1].isalnum() or searchable[start_idx - 1] == "_"): + continue + after = end_idx + 1 + if after < len(searchable) and (searchable[after].isalnum() or searchable[after] == "_"): + continue + if ent_path_str in seen_this_page: + continue + seen_this_page.add(ent_path_str) + hits_by_entity[ent_path_str].add(ref) + total_hits += 1 + + print(f" pages scanned: {pages_scanned}") + print(f" total hits: {total_hits}") + print(f" entities matched: {len(hits_by_entity)}") + + # 3. 
Write back to entity YAML + print("Writing back ...") + promoted = 0 + upgraded = 0 + updated = 0 + for ent_path_str, refs in hits_by_entity.items(): + ent_file = Path(ent_path_str) + rec = entities.get(ent_file) + if not rec: + continue + fm = rec["fm"] + raw_text = rec["raw_text"] + + # Don't double-count refs already in mentioned_in (structured page_refs) + existing_mentioned = set(fm.get("mentioned_in") or []) + new_text_refs = sorted(refs - existing_mentioned) + + old_sources = (fm.get("signal_sources") or {}).copy() + db_chunks = int(old_sources.get("db_chunks", 0)) + page_refs = int(old_sources.get("page_refs", len(existing_mentioned))) + cross_refs = int(old_sources.get("cross_refs", 0)) + text_refs = len(new_text_refs) + + old_strength = fm.get("signal_strength", "unverified") + new_strength = signal_strength(db_chunks, page_refs, cross_refs, text_refs) + new_total = db_chunks + page_refs + cross_refs + text_refs + + fm["text_mentioned_in"] = new_text_refs + sources = old_sources + sources["db_chunks"] = db_chunks + sources["page_refs"] = page_refs + sources["cross_refs"] = cross_refs + sources["text_refs"] = text_refs + fm["signal_sources"] = sources + fm["total_mentions"] = new_total + fm["signal_strength"] = new_strength + + if old_strength == "orphan" and new_strength != "orphan": + promoted += 1 + if old_strength == "weak" and new_strength == "strong": + upgraded += 1 + updated += 1 + + if not args.dry_run: + # Preserve body verbatim + _, body = parse_frontmatter(raw_text) + new_text = dump_frontmatter_preserving_body(fm, body) + ent_file.write_text(new_text, encoding="utf-8") + + # Also: entities not matched at all — they keep their existing state. + # But ensure their signal_sources.text_refs is at least set to 0 if missing + # (so the YAML schema is consistent). 
+ backfill_zeros = 0 + if not args.dry_run: + for ent_file, rec in entities.items(): + if str(ent_file) in hits_by_entity: + continue + fm = rec["fm"] + sources = (fm.get("signal_sources") or {}) + if "text_refs" not in sources: + sources["text_refs"] = 0 + fm["signal_sources"] = sources + _, body = parse_frontmatter(rec["raw_text"]) + ent_file.write_text(dump_frontmatter_preserving_body(fm, body), encoding="utf-8") + backfill_zeros += 1 + + print() + print(f" entities updated: {updated}") + print(f" promoted orphan → weak: {promoted}") + print(f" upgraded weak → strong: {upgraded}") + print(f" zero-text-ref backfills: {backfill_zeros}") + print(f" dry-run: {args.dry_run}") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/maintain/47_mark_unsearchable_chunks.sql b/scripts/maintain/47_mark_unsearchable_chunks.sql new file mode 100644 index 0000000..10d183c --- /dev/null +++ b/scripts/maintain/47_mark_unsearchable_chunks.sql @@ -0,0 +1,64 @@ +-- 47_mark_unsearchable_chunks.sql +-- Add an `is_searchable` flag to public.chunks and turn it OFF for purely +-- structural fragments that carry no informational content (salutations, +-- page numbers, classification banners, isolated headings, etc). +-- +-- These chunks still exist for page reconstruction; they just don't pollute +-- search/retrieval results anymore. +-- +-- Idempotent: re-running re-applies the same rules. 
+ +BEGIN; + +ALTER TABLE public.chunks + ADD COLUMN IF NOT EXISTS is_searchable BOOLEAN NOT NULL DEFAULT TRUE; + +-- Reset all to true first so reclassification is clean +UPDATE public.chunks SET is_searchable = TRUE; + +-- Always-noise types (semantic-free formatting / scaffolding) +UPDATE public.chunks SET is_searchable = FALSE +WHERE type IN ( + 'page_number', + 'blank', + 'stamp', + 'classification_banner', + 'classification_marking' +); + +-- Noise when text is short (< 50 chars) — long form letterheads & such stay searchable +UPDATE public.chunks SET is_searchable = FALSE +WHERE type IN ( + 'salutation', + 'complimentary_close', + 'section_heading', + 'section_header', + 'heading', + 'title', + 'subtitle', + 'date_line', + 'bulleted_item', + 'field_value', + 'field_entry', + 'table_marker', + 'form_field', + 'form_header', + 'routing_block', + 'distribution_list', + 'file_number', + 'marginalia' +) +AND LENGTH(COALESCE(content_en, content_pt, '')) < 50; + +-- Partial index: only the searchable ~83% of rows are indexed in vector / fts +CREATE INDEX IF NOT EXISTS chunks_searchable_idx + ON public.chunks (chunk_pk) WHERE is_searchable; + +COMMIT; + +-- Diagnostic counters +SELECT + is_searchable, + COUNT(*) AS n, + ROUND(AVG(LENGTH(COALESCE(content_en, content_pt, '')))) AS avg_len +FROM public.chunks GROUP BY is_searchable ORDER BY is_searchable; diff --git a/scripts/maintain/48_hybrid_search_filter_unsearchable.sql b/scripts/maintain/48_hybrid_search_filter_unsearchable.sql new file mode 100644 index 0000000..5254749 --- /dev/null +++ b/scripts/maintain/48_hybrid_search_filter_unsearchable.sql @@ -0,0 +1,89 @@ +-- 48_hybrid_search_filter_unsearchable.sql +-- Rewrite hybrid_search_chunks to skip chunks with is_searchable=FALSE. +-- This is the same function as before, just with an extra AND c.is_searchable +-- in BOTH the bm25 and dense CTEs. 
+ +CREATE OR REPLACE FUNCTION public.hybrid_search_chunks( + q_text text, + q_embedding vector, + q_lang text DEFAULT 'pt', + q_doc_id text DEFAULT NULL, + q_type text DEFAULT NULL, + q_classification text DEFAULT NULL, + q_ufo_only boolean DEFAULT FALSE, + k integer DEFAULT 100, + rrf_k integer DEFAULT 60 +) +RETURNS TABLE( + chunk_pk bigint, + doc_id text, + chunk_id text, + page integer, + type text, + bbox jsonb, + content_en text, + content_pt text, + classification text, + score double precision, + bm25_rank integer, + dense_rank integer +) +LANGUAGE plpgsql STABLE +AS $function$ +BEGIN + RETURN QUERY + WITH + ts_q AS ( + SELECT CASE WHEN q_lang = 'en' + THEN websearch_to_tsquery('public.en_unaccent'::regconfig, q_text) + ELSE websearch_to_tsquery('public.pt_unaccent'::regconfig, q_text) + END AS q + ), + bm25 AS ( + SELECT c.chunk_pk, + row_number() OVER (ORDER BY + ts_rank_cd( + CASE WHEN q_lang = 'en' THEN c.ts_en ELSE c.ts_pt END, + (SELECT q FROM ts_q) + ) DESC NULLS LAST + )::INT AS r + FROM public.chunks c + WHERE c.is_searchable + AND (CASE WHEN q_lang = 'en' THEN c.ts_en ELSE c.ts_pt END) @@ (SELECT q FROM ts_q) + AND (q_doc_id IS NULL OR c.doc_id = q_doc_id) + AND (q_type IS NULL OR c.type = q_type) + AND (q_classification IS NULL OR c.classification = q_classification) + AND (NOT q_ufo_only OR c.ufo_anomaly = TRUE) + ORDER BY r LIMIT k -- explicit sort: LIMIT without ORDER BY may keep arbitrary rows instead of the k best-ranked + ), + dense AS ( + SELECT c.chunk_pk, + row_number() OVER (ORDER BY c.embedding <=> q_embedding)::INT AS r + FROM public.chunks c + WHERE c.is_searchable + AND c.embedding IS NOT NULL + AND (q_doc_id IS NULL OR c.doc_id = q_doc_id) + AND (q_type IS NULL OR c.type = q_type) + AND (q_classification IS NULL OR c.classification = q_classification) + AND (NOT q_ufo_only OR c.ufo_anomaly = TRUE) + ORDER BY c.embedding <=> q_embedding + LIMIT k + ), + fused AS ( + SELECT COALESCE(b.chunk_pk, d.chunk_pk) AS chunk_pk, + ((1.0::DOUBLE PRECISION / (rrf_k + COALESCE(b.r, k + 1))::DOUBLE PRECISION) + + (1.0::DOUBLE PRECISION / (rrf_k +
COALESCE(d.r, k + 1))::DOUBLE PRECISION)) AS score, + b.r AS bm25_rank, + d.r AS dense_rank + FROM bm25 b + FULL OUTER JOIN dense d USING (chunk_pk) + ) + SELECT c.chunk_pk, c.doc_id, c.chunk_id, c.page, c.type, c.bbox, + c.content_en, c.content_pt, c.classification, + f.score, f.bm25_rank, f.dense_rank + FROM fused f + JOIN public.chunks c USING (chunk_pk) + ORDER BY f.score DESC + LIMIT k; +END +$function$; diff --git a/scripts/maintain/49_dedup_aggressive.py b/scripts/maintain/49_dedup_aggressive.py new file mode 100644 index 0000000..7b98cf8 --- /dev/null +++ b/scripts/maintain/49_dedup_aggressive.py @@ -0,0 +1,313 @@ +#!/usr/bin/env python3 +""" +Aggressive entity deduplication — camada 1 (determinístico). + +Hoje há ~34.355 entidades; muitas são variações tipográficas, prefixos de +papel (Mr./Dr./Major), ou OBJ-* gerados por chunk em vez de por evento. +Este script faz merge em três camadas, todas alta-confiança: + + A. PROPER_NAME — pessoa com ≥2 tokens onde o nome próprio principal + (último token significativo + primeiro nome) é único após strip de + role prefixes. Ex: "Frank M. Brown", "Lt. Frank M. Brown", + "Special Agent Frank M. Brown" → 1 entidade canônica. + + B. UAP_OBJECT_BY_EVENT — todos os OBJ-EV<year>-<EVENT>-NN do mesmo evento + são colapsados em 1 OBJ-EV<year>-<EVENT>-00 (NN=00 = canonical). + + C. EXACT_NORMALIZED — após lowercase + strip de pontuação + strip de + stopwords + strip de sufixos tipo " UAP" / " incident", strings + idênticas viram 1 entidade. + +Para cada cluster: + - Escolhe canonical: o mais longo OU o que tem narrative_summary curado, + com fallback no primeiro alfabético. + - Une aliases[], mentioned_in[], text_mentioned_in[], referenced_by[]. + - Recalcula signal_sources somando page_refs/text_refs (db_chunks fica + com o do canonical pq depende do entity_pk). + - Move duplicatas para wiki/entities/_archived/. + +Output: lista de merges (cluster → canonical), pra revisar antes de aplicar. 
+ +Run: + python3 scripts/maintain/49_dedup_aggressive.py --dry-run + python3 scripts/maintain/49_dedup_aggressive.py # apply +""" +from __future__ import annotations +import argparse +import re +import shutil +import sys +import unicodedata +from pathlib import Path +from collections import defaultdict +from typing import Iterable + +import yaml + +WIKI_ENT = Path("/Users/guto/ufo/wiki/entities") +ARCHIVED = WIKI_ENT / "_archived" + +ROLE_PREFIX_RE = re.compile( + r"^(" + r"mr|mrs|ms|dr|prof|sr|sra|sir|dame|lord|lady|" + r"major|maj|colonel|col|lt|lieutenant|captain|capt|" + r"general|gen|sergeant|sgt|corporal|cpl|private|pvt|admiral|adm|commander|cmdr|" + r"agent|special agent|sa|director|deputy director|deputy|" + r"reverend|rev|professor|" + r"president|vice president|vp|chairman|secretary|" + r"detective|det|inspector" + r")\.?\s+", + re.IGNORECASE, +) +STOPWORD_PREFIX_RE = re.compile(r"^(the|a|an|o|os|a|as|de|do|da|dos|das|of|los|las|el|la|le|les)\s+", re.IGNORECASE) +PUNCT_RE = re.compile(r"[.,;:!?\"'\(\)\[\]_\-]") +WS_RE = re.compile(r"\s+") +NOISE_SUFFIX_RE = re.compile(r"\s+(uap|incident|case|sighting|event|observation)$", re.IGNORECASE) +OBJ_ID_RE = re.compile(r"^OBJ-([A-Z0-9]+)-(.+?)-(\d{2})$") + + +def ascii_fold(s: str) -> str: + return "".join( + c for c in unicodedata.normalize("NFD", s) + if not unicodedata.combining(c) + ) + + +def aggressive_normalize(name: str) -> str: + s = ascii_fold(name).strip().lower() + # strip role prefixes (repeat: "Special Agent Major Brown") + for _ in range(3): + new = ROLE_PREFIX_RE.sub("", s) + if new == s: break + s = new + s = STOPWORD_PREFIX_RE.sub("", s) + s = PUNCT_RE.sub(" ", s) + s = WS_RE.sub(" ", s).strip() + s = NOISE_SUFFIX_RE.sub("", s).strip() + return s + + +FOLDER_TO_CLASS = { + "people": "person", + "organizations": "organization", + "locations": "location", + "events": "event", + "uap-objects": "uap_object", + "vehicles": "vehicle", + "operations": "operation", + "concepts": "concept", +} + + +def 
load_entity(path: Path) -> dict | None: + try: + text = path.read_text(encoding="utf-8") + if not text.startswith("---"): + return None + parts = text.split("---", 2) + if len(parts) < 3: return None + fm = yaml.safe_load(parts[1]) or {} + body = parts[2] + return {"path": path, "fm": fm, "body": body, "raw": text} + except Exception as e: + return None + + +def dump_entity(entity: dict) -> str: + return "---\n" + yaml.safe_dump( + entity["fm"], sort_keys=False, allow_unicode=True, width=1000 + ) + "---" + entity["body"] + + +def dedup_pass_obj_by_event(entities: list[dict]) -> dict[str, list[dict]]: + """OBJ-EVYYYY-EVENT-NN → group by EVENT base (drop NN).""" + clusters: dict[str, list[dict]] = defaultdict(list) + for e in entities: + if e["fm"].get("entity_class") != "uap_object": + continue + eid = e["fm"].get("uap_object_id") or e["fm"].get("entity_id") or "" + m = OBJ_ID_RE.match(eid) + if not m: continue + # Group by EV<year>-<EVENT_SLUG> + key = f"OBJ-{m.group(1)}-{m.group(2)}" + clusters[key].append(e) + return {k: v for k, v in clusters.items() if len(v) > 1} + + +def dedup_pass_proper_name(entities: list[dict]) -> dict[str, list[dict]]: + """Person/organization/event/location: cluster by aggressive_normalize. + Only auto-merge if the normalized form has ≥2 tokens (avoids "smith" only). 
+ """ + clusters: dict[str, list[dict]] = defaultdict(list) + for e in entities: + cls = e["fm"].get("entity_class") + if cls not in ("person", "organization", "event", "location", "operation", + "concept", "vehicle"): + continue + name = e["fm"].get("canonical_name") or "" + if not name: continue + norm = aggressive_normalize(name) + if not norm: continue + # Require ≥2 tokens OR ≥8 chars to avoid "smith" / "brown" collisions + n_tokens = len(norm.split()) + if n_tokens < 2 and len(norm) < 8: + continue + key = f"{cls}::{norm}" + clusters[key].append(e) + return {k: v for k, v in clusters.items() if len(v) > 1} + + +def choose_canonical(cluster: list[dict]) -> dict: + """Pick canonical: prefer summary_status == "curated", then the most aliases, + then the most total_mentions, then the name whose first character sorts earliest.""" + def score(e: dict) -> tuple: + fm = e["fm"] + curated = 1 if fm.get("summary_status") == "curated" else 0 + n_aliases = len(fm.get("aliases") or []) + mentions = fm.get("total_mentions") or 0 + # Negated code point of the first character only, so max() favours names starting earlier in the alphabet + name_for_sort = str(fm.get("canonical_name") or "") + return (curated, n_aliases, mentions, -ord(name_for_sort[0]) if name_for_sort else 0) + return max(cluster, key=score) + + +def merge_into(canonical: dict, duplicates: list[dict]) -> None: + """Merge fields from duplicates into canonical (in place).""" + cfm = canonical["fm"] + cfm.setdefault("aliases", []) + cfm.setdefault("mentioned_in", []) + cfm.setdefault("text_mentioned_in", []) + cfm.setdefault("referenced_by", []) + cfm.setdefault("related", []) + + # Collect aliases (the canonical's own name and each duplicate's canonical_name are added as aliases) + all_aliases = set(cfm["aliases"] or []) + all_aliases.add(cfm.get("canonical_name", "")) + all_mentions = set(cfm["mentioned_in"] or []) + all_text_mentions = set(cfm["text_mentioned_in"] or []) + all_referenced = set(cfm["referenced_by"] or []) + all_related = set(cfm["related"] or []) + page_refs_sum = int((cfm.get("signal_sources") or {}).get("page_refs") or
0) + text_refs_sum = int((cfm.get("signal_sources") or {}).get("text_refs") or 0) + + for d in duplicates: + dfm = d["fm"] + dcanonical = dfm.get("canonical_name") + if dcanonical: all_aliases.add(dcanonical) + for a in (dfm.get("aliases") or []): all_aliases.add(a) + for m in (dfm.get("mentioned_in") or []): all_mentions.add(m) + for m in (dfm.get("text_mentioned_in") or []): all_text_mentions.add(m) + for r in (dfm.get("referenced_by") or []): all_referenced.add(r) + for r in (dfm.get("related") or []): all_related.add(r) + + all_aliases.discard("") + all_aliases.discard(None) + cfm["aliases"] = sorted(all_aliases) + cfm["mentioned_in"] = sorted(all_mentions) + cfm["text_mentioned_in"] = sorted(all_text_mentions) + cfm["referenced_by"] = sorted(all_referenced) + cfm["related"] = sorted(all_related) + cfm["documents_count"] = len({m.split("/")[0].lstrip("[") for m in all_mentions}) + # Recompute signal_sources (page_refs/text_refs are sums; db_chunks stays as canonical's) + sigs = cfm.get("signal_sources") or {"db_chunks": 0, "cross_refs": 0} + sigs["page_refs"] = len(all_mentions) + sigs["text_refs"] = len(all_text_mentions) + sigs["cross_refs"] = len(all_referenced) + sigs["db_chunks"] = int(sigs.get("db_chunks", 0)) + cfm["signal_sources"] = sigs + cfm["total_mentions"] = sigs["db_chunks"] + sigs["page_refs"] + sigs["cross_refs"] + sigs["text_refs"] + # Recompute signal_strength + total = cfm["total_mentions"] + if total == 0: + cfm["signal_strength"] = "orphan" + elif (sigs["db_chunks"] >= 3 or sigs["page_refs"] >= 3 + or (sigs["db_chunks"] >= 1 and sigs["page_refs"] >= 1) + or sigs["text_refs"] >= 5): + cfm["signal_strength"] = "strong" + else: + cfm["signal_strength"] = "weak" + + +def archive_path(p: Path) -> Path: + rel = p.relative_to(WIKI_ENT) + return ARCHIVED / rel + + +def main() -> int: + ap = argparse.ArgumentParser() + ap.add_argument("--dry-run", action="store_true") + ap.add_argument("--limit-pass", choices=["all", "obj", "name"], default="all") 
+ args = ap.parse_args() + + print(f"Loading entities from {WIKI_ENT} ...") + all_entities: list[dict] = [] + for f in WIKI_ENT.rglob("*.md"): + if "_archived" in f.parts: continue + ent = load_entity(f) + if ent and ent["fm"].get("type") == "entity": + all_entities.append(ent) + print(f" loaded {len(all_entities)} entities") + + # Run dedup passes + clusters: dict[str, list[dict]] = {} + if args.limit_pass in ("all", "obj"): + obj_clusters = dedup_pass_obj_by_event(all_entities) + print(f"\nPass A — OBJ by event: {len(obj_clusters)} clusters ({sum(len(v) for v in obj_clusters.values())} entities → {len(obj_clusters)} canonicals)") + clusters.update({f"OBJ::{k}": v for k, v in obj_clusters.items()}) + if args.limit_pass in ("all", "name"): + name_clusters = dedup_pass_proper_name(all_entities) + print(f"Pass B/C — proper-name normalize: {len(name_clusters)} clusters ({sum(len(v) for v in name_clusters.values())} entities → {len(name_clusters)} canonicals)") + clusters.update({f"NAME::{k}": v for k, v in name_clusters.items()}) + + # Deduplicate entities across passes (avoid double-merge) + seen_paths: set[str] = set() + plans: list[tuple[str, dict, list[dict]]] = [] + for ckey, cluster in clusters.items(): + # Filter out already-seen + cluster = [e for e in cluster if str(e["path"]) not in seen_paths] + if len(cluster) < 2: continue + canonical = choose_canonical(cluster) + duplicates = [e for e in cluster if e is not canonical] + for e in cluster: seen_paths.add(str(e["path"])) + plans.append((ckey, canonical, duplicates)) + + plans.sort(key=lambda p: -len(p[2])) # biggest clusters first + redundant_total = sum(len(d) for _, _, d in plans) + print(f"\n=== Merge plan ===") + print(f" clusters: {len(plans)}") + print(f" entities removed: {redundant_total}") + print(f" before: {len(all_entities)} → after: {len(all_entities) - redundant_total}") + print(f" reduction: {100*redundant_total/len(all_entities):.1f}%\n") + + print("=== Top 20 biggest merges ===") + for 
ckey, canonical, dupes in plans[:20]: + cname = canonical["fm"].get("canonical_name", "?") + print(f" {len(dupes)+1:>3} entities → '{cname}' ({ckey.split('::')[0]})") + for d in dupes[:4]: + print(f" ✗ {d['fm'].get('canonical_name', '?')}") + if len(dupes) > 4: print(f" ... +{len(dupes)-4}") + + if args.dry_run: + print("\n(dry-run; nothing written)") + return 0 + + # Apply merges + print("\nApplying merges ...") + merged_count = 0 + archived_count = 0 + for ckey, canonical, dupes in plans: + merge_into(canonical, dupes) + canonical["path"].write_text(dump_entity(canonical), encoding="utf-8") + merged_count += 1 + for d in dupes: + archive_to = archive_path(d["path"]) + archive_to.parent.mkdir(parents=True, exist_ok=True) + shutil.move(str(d["path"]), str(archive_to)) + archived_count += 1 + print(f" canonicals updated: {merged_count}") + print(f" duplicates archived: {archived_count}") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/maintain/50_dedup_fuzzy_trigram.py b/scripts/maintain/50_dedup_fuzzy_trigram.py new file mode 100644 index 0000000..7329ad1 --- /dev/null +++ b/scripts/maintain/50_dedup_fuzzy_trigram.py @@ -0,0 +1,428 @@ +#!/usr/bin/env python3 +""" +Aggressive entity deduplication — camada 2 (fuzzy trigram). + +Para cada entity_class, compara TODAS as entidades restantes via similaridade +trigram (Postgres pg_trgm). Merge automático quando: + - similarity >= 0.85 e ambos os nomes têm ≥2 tokens significativos OU + - similarity >= 0.92 (mais tolerante para nomes curtos) + - mesma classe + - estado: NÃO já arquivada + - mesmo "núcleo" (último token após strip de role prefixes) + +Para nomes ambíguos (single-word sobrenome como "Smith"), só faz merge se +houver contexto compartilhado (mesma página, mesmo documento na maioria das +menções). + +Run: + DATABASE_URL=postgres://... 
python3 scripts/maintain/50_dedup_fuzzy_trigram.py --dry-run +""" +from __future__ import annotations +import argparse +import os +import re +import shutil +import sys +import unicodedata +from collections import defaultdict +from pathlib import Path + +import psycopg +import yaml + +WIKI_ENT = Path("/Users/guto/ufo/wiki/entities") +ARCHIVED = WIKI_ENT / "_archived" + +ROLE_PREFIX_RE = re.compile( + r"^(" + r"mr|mrs|ms|dr|prof|sr|sra|sir|dame|lord|lady|" + r"major|maj|colonel|col|lt|lieutenant|captain|capt|" + r"general|gen|sergeant|sgt|corporal|cpl|private|pvt|admiral|adm|commander|cmdr|" + r"agent|special agent|sa|director|deputy director|deputy|" + r"reverend|rev|professor|" + r"president|vice president|vp|chairman|secretary|" + r"detective|det|inspector" + r")\.?\s+", + re.IGNORECASE, +) + + +def ascii_fold(s: str) -> str: + return "".join(c for c in unicodedata.normalize("NFD", s) if not unicodedata.combining(c)) + + +def strip_roles(name: str) -> str: + s = name + for _ in range(3): + new = ROLE_PREFIX_RE.sub("", s) + if new == s: break + s = new + return s.strip() + + +def core_tokens(name: str) -> set[str]: + """Significant tokens of a name (no roles, no stopwords, lowercased).""" + s = ascii_fold(strip_roles(name).lower()) + s = re.sub(r"[.,;:!?\"'\(\)\[\]_]", " ", s) + toks = [t for t in s.split() if len(t) > 1 and t not in { + "the", "of", "and", "de", "do", "da", "dos", "das", "el", "la", "los", "las", + "a", "an", "o", "as", "os", "le", "les", "von", "van" + }] + return set(toks) + + +# Tokens that mix letters and digits (II-22, B-6, mode4, district17, 17th, 3rd) +# These are SIGNIFICANT modifiers — if they differ between two names, the +# names refer to DIFFERENT things. 
+NUMERIC_TOKEN_RE = re.compile(r"^[a-z]*\d+[a-z]*$|^\d+[a-z]+$|^[a-z]+-?\d+[a-z]*$|^[ivxlcdm]+-?\d+$", re.IGNORECASE) + + +CODE_SUFFIX_RE = re.compile(r"(?:\s-\s|-)([A-Z]{1,3})$|\s([A-Z])$") + + +def code_suffix(name: str) -> str | None: + """Extract trailing short code (1-3 uppercase letters) like ' - Z', + ' M', '-R'. These often denote sub-categories that differ semantically + (FBI classification subdivisions, military variants).""" + s = name.strip() + m = CODE_SUFFIX_RE.search(s) + if not m: return None + code = (m.group(1) or m.group(2) or "").upper() + return code if code else None + + +ROMAN_NUMERALS = { + "i","ii","iii","iv","v","vi","vii","viii","ix","x", + "xi","xii","xiii","xiv","xv","xvi","xvii","xviii","xix","xx", + "xxi","xxii","xxiii","xxiv","xxv","xxvi","xxvii","xxviii","xxix","xxx", +} +ORDINAL_WORDS = { + "first","second","third","fourth","fifth","sixth","seventh","eighth", + "ninth","tenth","eleventh","twelfth","thirteenth","fourteenth","fifteenth", + "sixteenth","seventeenth","eighteenth","nineteenth","twentieth", + "primeiro","segundo","terceiro","quarto","quinto","sexto","setimo", + "oitavo","nono","decimo","undecimo","duodecimo", +} + + +def is_variant_marker(tok: str) -> bool: + """True if `tok` is the kind of token that distinguishes instances of a + series: 'A', 'B', 'II', 'XIII', 'Ninth', 'Fourth', '5', etc.""" + t = tok.lower() + if t.isdigit(): return True + if t in ROMAN_NUMERALS: return True + if t in ORDINAL_WORDS: return True + # Single uppercase letter (e.g. 'A' in 'Pioneer A') + if len(tok) == 1 and tok.isalpha() and tok.isupper(): return True + return False + + +def single_letter_token_diff(name_a: str, name_b: str) -> bool: + """Returns True if the two names differ by tokens that are 'variant + markers' — letters, romans, ordinals. 
Catches: + Pioneer Launch vs PIONEER A Launch (single letter) + PIONEER-B Launch vs PIONEER-C Launch + XII Tactical Air Cmd vs XIII Tactical Air Cmd (romans) + Ninth Air Force vs Tenth Air Force (ordinals) + Apollo vs Apollo 11 (digit) + These are variants of the same program, NOT the same instance. + """ + def toks(s: str) -> list[str]: + s = ascii_fold(s.lower()) + s = re.sub(r"[-_]", " ", s) + return [t for t in re.findall(r"\b[\w]+\b", s) if t] + + # Lowercase tokens for set diff, but remember the original case to detect + # the single-uppercase-letter case. + ta_orig = re.findall(r"\b[\w]+\b", re.sub(r"[-_]", " ", ascii_fold(name_a))) + tb_orig = re.findall(r"\b[\w]+\b", re.sub(r"[-_]", " ", ascii_fold(name_b))) + ta = [t.lower() for t in ta_orig] + tb = [t.lower() for t in tb_orig] + if not ta or not tb: return False + from collections import Counter + ca, cb = Counter(ta), Counter(tb) + diff_a = list((ca - cb).elements()) + diff_b = list((cb - ca).elements()) + if not diff_a and not diff_b: return False + # Helper: variant marker check considering original case for single letters + def marker_or_single_letter(lower_tok: str, src: list[str]) -> bool: + if is_variant_marker(lower_tok): return True + # Single letter not flagged above because we only allowed UPPERCASE. + # Re-check via original-case forms in the source name. + if len(lower_tok) == 1 and lower_tok.isalpha(): + # See if it appears as uppercase in original tokens + for o in src: + if o.lower() == lower_tok and o.isupper(): return True + return False + + a_all_markers = all(marker_or_single_letter(t, ta_orig) for t in diff_a) if diff_a else True + b_all_markers = all(marker_or_single_letter(t, tb_orig) for t in diff_b) if diff_b else True + if a_all_markers and b_all_markers and (diff_a or diff_b): + return True + return False + + +def numeric_signature(name: str) -> frozenset[str]: + """Extract all numeric/ordinal/serial tokens from a name. 
+ Two names with DIFFERENT numeric signatures CANNOT be merged.""" + s = ascii_fold(name.lower()) + s = re.sub(r"[.,;:!?\"'\(\)\[\]_]", " ", s) + # Extract all tokens that contain at least one digit + nums = set() + for t in re.findall(r"\b[\w-]+\b", s): + # Pure number + if re.fullmatch(r"\d+(st|nd|rd|th)?", t): + # Normalize "17th" → "17" + nums.add(re.sub(r"(st|nd|rd|th)$", "", t)) + # Letter + digit (II-22, b-6, mode4) + elif re.search(r"\d", t): + # Normalize "II-22" / "ii-22" → "ii22"; "b-6" → "b6" + nums.add(re.sub(r"[-\s]", "", t)) + return frozenset(nums) + + +FOLDER_TO_CLASS = { + "people": "person", + "organizations": "organization", + "locations": "location", + "events": "event", + "uap-objects": "uap_object", + "vehicles": "vehicle", + "operations": "operation", + "concepts": "concept", +} +CLASS_TO_FOLDER = {v: k for k, v in FOLDER_TO_CLASS.items()} + + +def load_entity(path: Path) -> dict | None: + try: + text = path.read_text(encoding="utf-8") + if not text.startswith("---"): return None + parts = text.split("---", 2) + if len(parts) < 3: return None + fm = yaml.safe_load(parts[1]) or {} + body = parts[2] + return {"path": path, "fm": fm, "body": body} + except Exception: + return None + + +def dump_entity(entity: dict) -> str: + return "---\n" + yaml.safe_dump(entity["fm"], sort_keys=False, allow_unicode=True, width=1000) + "---" + entity["body"] + + +def entity_path_for(cls: str, entity_id: str) -> Path | None: + folder = CLASS_TO_FOLDER.get(cls) + if not folder: return None + p = WIKI_ENT / folder / f"{entity_id}.md" + return p if p.exists() else None + + +def merge_into(canonical: dict, duplicate: dict) -> None: + cfm = canonical["fm"]; dfm = duplicate["fm"] + cfm.setdefault("aliases", []); cfm.setdefault("mentioned_in", []) + cfm.setdefault("text_mentioned_in", []); cfm.setdefault("referenced_by", []) + cfm.setdefault("related", []) + all_aliases = set(cfm["aliases"] or []); all_aliases.add(cfm.get("canonical_name", "")) + if 
dfm.get("canonical_name"): all_aliases.add(dfm["canonical_name"]) + for a in (dfm.get("aliases") or []): all_aliases.add(a) + all_aliases.discard(""); all_aliases.discard(None) + cfm["aliases"] = sorted(all_aliases) + cfm["mentioned_in"] = sorted(set(cfm["mentioned_in"] or []) | set(dfm.get("mentioned_in") or [])) + cfm["text_mentioned_in"] = sorted(set(cfm["text_mentioned_in"] or []) | set(dfm.get("text_mentioned_in") or [])) + cfm["referenced_by"] = sorted(set(cfm["referenced_by"] or []) | set(dfm.get("referenced_by") or [])) + cfm["related"] = sorted(set(cfm["related"] or []) | set(dfm.get("related") or [])) + cfm["documents_count"] = len({m.split("/")[0].lstrip("[") for m in cfm["mentioned_in"]}) + sigs = cfm.get("signal_sources") or {"db_chunks": 0, "cross_refs": 0} + sigs["page_refs"] = len(cfm["mentioned_in"]) + sigs["text_refs"] = len(cfm["text_mentioned_in"]) + sigs["cross_refs"] = len(cfm["referenced_by"]) + sigs["db_chunks"] = int(sigs.get("db_chunks", 0)) + cfm["signal_sources"] = sigs + total = sigs["db_chunks"] + sigs["page_refs"] + sigs["cross_refs"] + sigs["text_refs"] + cfm["total_mentions"] = total + if total == 0: + cfm["signal_strength"] = "orphan" + elif sigs["db_chunks"] >= 3 or sigs["page_refs"] >= 3 or (sigs["db_chunks"] >= 1 and sigs["page_refs"] >= 1) or sigs["text_refs"] >= 5: + cfm["signal_strength"] = "strong" + else: + cfm["signal_strength"] = "weak" + + +def choose_canonical(a: dict, b: dict) -> tuple[dict, dict]: + """Return (canonical, duplicate). 
Prefer one with curated narrative, + then longer aliases list, then higher total_mentions.""" + def score(e: dict) -> tuple: + fm = e["fm"] + return ( + 1 if fm.get("summary_status") == "curated" else 0, + len(fm.get("aliases") or []), + fm.get("total_mentions") or 0, + len(fm.get("canonical_name") or ""), + ) + if score(a) >= score(b): return a, b + return b, a + + +def main() -> int: + ap = argparse.ArgumentParser() + ap.add_argument("--dry-run", action="store_true") + ap.add_argument("--threshold", type=float, default=0.85, + help="trigram similarity threshold (0..1)") + ap.add_argument("--threshold-short", type=float, default=0.92, + help="higher threshold for single-token names") + ap.add_argument("--limit", type=int, default=None, + help="apply at most N merges (for cautious runs)") + args = ap.parse_args() + + dburl = os.environ.get("DATABASE_URL") or os.environ.get("SUPABASE_DB_URL") + if not dburl: sys.exit("DATABASE_URL not set") + + with psycopg.connect(dburl) as conn: + with conn.cursor() as cur: + cur.execute(f"SET pg_trgm.similarity_threshold = {args.threshold}") + # All entity pairs above threshold in the SAME class, where a > b (avoid duplicates) + cur.execute(f""" + SELECT e1.entity_class, + e1.entity_id, e1.canonical_name, + e2.entity_id, e2.canonical_name, + similarity(e1.canonical_name, e2.canonical_name) AS sim + FROM entities e1 + JOIN entities e2 + ON e1.entity_class = e2.entity_class + AND e1.entity_id < e2.entity_id + AND e1.canonical_name % e2.canonical_name + ORDER BY sim DESC + """) + pairs = cur.fetchall() + + print(f"Trigram candidate pairs (sim >= {args.threshold}): {len(pairs)}") + + # Filter pairs by: + # - share at least 1 significant core token (avoids "United States" matching "United Kingdom") + # - if both names are single-token AFTER role strip, require higher threshold + accepted = [] + rejected_short = 0 + rejected_no_overlap = 0 + rejected_numeric = 0 + for cls, id_a, name_a, id_b, name_b, sim in pairs: + toks_a = 
core_tokens(name_a or "") + toks_b = core_tokens(name_b or "") + if not toks_a or not toks_b: + rejected_no_overlap += 1; continue + # Must share at least one significant token + if not (toks_a & toks_b): + rejected_no_overlap += 1; continue + # If one side is single-token, require stricter threshold + if (len(toks_a) <= 1 or len(toks_b) <= 1) and sim < args.threshold_short: + rejected_short += 1; continue + # NUMERIC SAFEGUARD: if numeric signatures differ, the names refer to + # different objects (NAVSTAR II-2 vs II-24, Mode 3 vs Mode 4, + # 17th District vs 13th District, etc). Reject. + sig_a = numeric_signature(name_a or "") + sig_b = numeric_signature(name_b or "") + if sig_a != sig_b: + rejected_numeric += 1; continue + # CODE SUFFIX SAFEGUARD: if EITHER name has a short code suffix + # (1-3 uppercase letters), they must have IDENTICAL suffixes. + # 'INTERNAL SECURITY - Z' ≠ 'INTERNAL SECURITY - X' ≠ 'INTERNAL SECURITY' (base). + cs_a = code_suffix(name_a or "") + cs_b = code_suffix(name_b or "") + if (cs_a or cs_b) and cs_a != cs_b: + rejected_numeric += 1; continue + # SINGLE-LETTER VARIANT TOKEN: 'PIONEER A Launch' vs 'PIONEER-B Launch' + # vs 'Pioneer Launch' are distinct missions of the same program. 
+ if single_letter_token_diff(name_a or "", name_b or ""): + rejected_numeric += 1; continue + accepted.append((cls, id_a, name_a, id_b, name_b, sim)) + + print(f" rejected (no token overlap): {rejected_no_overlap}") + print(f" rejected (single-token below {args.threshold_short}): {rejected_short}") + print(f" rejected (numeric signature mismatch): {rejected_numeric}") + print(f" ACCEPTED for merge: {len(accepted)}") + + # Build a union-find over accepted pairs so transitive clusters merge correctly + parent: dict[tuple[str, str], tuple[str, str]] = {} + def find(x): + while parent.get(x, x) != x: + parent[x] = parent.get(parent[x], parent[x]) + x = parent[x] + return x + def union(x, y): + rx, ry = find(x), find(y) + if rx != ry: parent[ry] = rx + + for cls, id_a, _, id_b, _, _ in accepted: + a = (cls, id_a); b = (cls, id_b) + parent.setdefault(a, a); parent.setdefault(b, b) + union(a, b) + + clusters: dict[tuple[str, str], list[tuple[str, str]]] = defaultdict(list) + for node in list(parent.keys()): + clusters[find(node)].append(node) + clusters = {k: v for k, v in clusters.items() if len(v) > 1} + + print(f"\nClusters after union-find: {len(clusters)}") + print(f"Entities to remove: {sum(len(v) - 1 for v in clusters.values())}\n") + + # Sample biggest + biggest = sorted(clusters.values(), key=lambda c: -len(c))[:15] + print("=== Top 15 biggest fuzzy clusters ===") + for cluster in biggest: + # Load names for display + names = [] + for cls, eid in cluster: + p = entity_path_for(cls, eid) + if p: + ent = load_entity(p) + if ent: names.append(ent["fm"].get("canonical_name") or eid) + if not names: continue + cls = cluster[0][0] + print(f" [{cls}] {len(cluster)} entities:") + for n in names[:6]: print(f" - {n}") + if len(names) > 6: print(f" ... 
+{len(names)-6}") + + if args.dry_run: + print("\n(dry-run; nothing written)") + return 0 + + # Apply merges + print("\nApplying merges ...") + applied = 0 + archived = 0 + for cluster in clusters.values(): + if args.limit and applied >= args.limit: break + # Load all entities + loaded = [] + for cls, eid in cluster: + p = entity_path_for(cls, eid) + if p: + ent = load_entity(p) + if ent: loaded.append(ent) + if len(loaded) < 2: continue + # Pick canonical: highest score + canonical = max(loaded, key=lambda e: ( + 1 if e["fm"].get("summary_status") == "curated" else 0, + len(e["fm"].get("aliases") or []), + e["fm"].get("total_mentions") or 0, + len(e["fm"].get("canonical_name") or ""), + )) + dupes = [e for e in loaded if e is not canonical] + for d in dupes: + merge_into(canonical, d) + canonical["path"].write_text(dump_entity(canonical), encoding="utf-8") + for d in dupes: + rel = d["path"].relative_to(WIKI_ENT) + arch = ARCHIVED / rel + arch.parent.mkdir(parents=True, exist_ok=True) + shutil.move(str(d["path"]), str(arch)) + archived += 1 + applied += 1 + + print(f" canonicals updated: {applied}") + print(f" duplicates archived: {archived}") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/maintain/51_remap_entity_mentions.py b/scripts/maintain/51_remap_entity_mentions.py new file mode 100644 index 0000000..20fcd39 --- /dev/null +++ b/scripts/maintain/51_remap_entity_mentions.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python3 +""" +After scripts 49/50 archive duplicates, the DB's `entities` + `entity_mentions` +tables still point at the OLD (now-archived) entity_pks. This script: + + 1. Re-reads every active entity YAML from wiki/entities/ (skipping _archived) + 2. For each entity, looks up the corresponding entity_pk in the DB by + (entity_class, entity_id). + 3. Reads the aliases[] from the YAML and finds DB entities with matching + entity_id that no longer exist on disk — those are the merged-away ones. + 4. 
UPDATE entity_mentions SET entity_pk = <canonical_pk> WHERE entity_pk IN (<archived_pks>) + 5. DELETE FROM entities WHERE entity_class||entity_id IS NOT in active set + +Idempotent — re-running is a no-op once converged. +""" +from __future__ import annotations +import os +import sys +from pathlib import Path + +import psycopg +import yaml + +WIKI_ENT = Path("/Users/guto/ufo/wiki/entities") +ARCHIVED = WIKI_ENT / "_archived" + + +def main() -> int: + dburl = os.environ.get("DATABASE_URL") or os.environ.get("SUPABASE_DB_URL") + if not dburl: sys.exit("DATABASE_URL not set") + + # Build active set + the alias→canonical lookup + print("Scanning active YAMLs ...") + active: set[tuple[str, str]] = set() + canonical_by_alias_eid: dict[tuple[str, str], tuple[str, str]] = {} + for f in WIKI_ENT.rglob("*.md"): + if "_archived" in f.parts: continue + try: + text = f.read_text(encoding="utf-8") + if not text.startswith("---"): continue + fm = yaml.safe_load(text.split("---")[1]) or {} + except Exception: continue + cls = fm.get("entity_class") + eid = (fm.get("entity_id") + or fm.get(f"{cls}_id") if cls else None) + if not (cls and eid): continue + active.add((cls, eid)) + # All archived entities that ended up merged into this one likely + # had entity_ids that are now in this entity's aliases list. We can't + # be 100% sure, but a same-class entity with id matching an alias + # slugified is a strong signal. 
+ + print(f" active entities: {len(active)}") + + print("\nScanning archived YAMLs ...") + archived_map: dict[tuple[str, str], tuple[str, str]] = {} + for f in ARCHIVED.rglob("*.md") if ARCHIVED.exists() else []: + try: + text = f.read_text(encoding="utf-8") + if not text.startswith("---"): continue + fm = yaml.safe_load(text.split("---")[1]) or {} + except Exception: continue + cls = fm.get("entity_class") + eid = (fm.get("entity_id") + or (fm.get(f"{cls}_id") if cls else None)) + if not (cls and eid): continue + # Find canonical: an active entity with same class whose aliases contain + # this entity's canonical_name. + dup_name = (fm.get("canonical_name") or "").strip().lower() + if not dup_name: continue + archived_map[(cls, eid)] = (cls, dup_name) + + print(f" archived entities: {len(archived_map)}") + + print("\nConnecting to DB ...") + with psycopg.connect(dburl) as conn: + with conn.cursor() as cur: + # Map active YAML entities → their entity_pk + cur.execute("SELECT entity_pk, entity_class, entity_id, canonical_name FROM entities") + db_rows = cur.fetchall() + db_by_key: dict[tuple[str, str], tuple[int, str]] = { + (cls, eid): (pk, name) for pk, cls, eid, name in db_rows + } + print(f" DB entities: {len(db_rows)}") + + # For each archived (cls, eid), find the canonical active entity in same class + # whose aliases contain the archived's canonical_name OR whose entity_id matches. 
+ # Build an alias index from active YAMLs: + print("\nBuilding alias index from active YAMLs ...") + alias_index: dict[tuple[str, str], tuple[str, str]] = {} + for f in WIKI_ENT.rglob("*.md"): + if "_archived" in f.parts: continue + try: + text = f.read_text(encoding="utf-8") + if not text.startswith("---"): continue + fm = yaml.safe_load(text.split("---")[1]) or {} + except Exception: continue + cls = fm.get("entity_class") + eid = fm.get("entity_id") or (fm.get(f"{cls}_id") if cls else None) + if not (cls and eid): continue + for a in (fm.get("aliases") or []): + if isinstance(a, str): + alias_index[(cls, a.strip().lower())] = (cls, eid) + # Also index canonical_name itself + cn = fm.get("canonical_name") + if isinstance(cn, str): + alias_index[(cls, cn.strip().lower())] = (cls, eid) + print(f" alias index size: {len(alias_index)}") + + # Now: for each archived DB entity, find the active canonical + print("\nResolving remap ...") + remap_pairs: list[tuple[int, int]] = [] # (old_pk, new_pk) + orphan_archived: list[tuple[str, str]] = [] + for (cls, eid), (db_pk, db_name) in db_by_key.items(): + if (cls, eid) in active: continue + # This DB entity is no longer in active YAMLs → archived + target = alias_index.get((cls, db_name.strip().lower())) + if not target: + orphan_archived.append((cls, eid)) + continue + tgt_pk_row = db_by_key.get(target) + if not tgt_pk_row: + orphan_archived.append((cls, eid)); continue + remap_pairs.append((db_pk, tgt_pk_row[0])) + + print(f" remap pairs: {len(remap_pairs)}") + print(f" orphans (archived but no canonical found): {len(orphan_archived)}") + + if remap_pairs: + cur.execute("CREATE TEMP TABLE _remap (old_pk BIGINT, new_pk BIGINT)") + with cur.copy("COPY _remap (old_pk, new_pk) FROM STDIN") as cp: + for old, new in remap_pairs: + cp.write_row((old, new)) + # 1. Insert new rows for the canonical entity (skip if already exists) + # This preserves any non-default columns the table may have. 
+ cur.execute(""" + INSERT INTO entity_mentions (chunk_pk, entity_pk, surface_form) + SELECT em.chunk_pk, r.new_pk, em.surface_form + FROM entity_mentions em + JOIN _remap r ON em.entity_pk = r.old_pk + ON CONFLICT DO NOTHING + """) + inserted = cur.rowcount + print(f" new canonical mentions inserted: {inserted}") + # 2. Delete all old (archived-entity) mentions + cur.execute(""" + DELETE FROM entity_mentions em USING _remap r + WHERE em.entity_pk = r.old_pk + """) + print(f" archived-entity mentions removed: {cur.rowcount}") + # 3. Delete archived entities from `entities` table + cur.execute("DELETE FROM entities WHERE entity_pk IN (SELECT old_pk FROM _remap)") + print(f" archived entities removed: {cur.rowcount}") + conn.commit() + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/maintain/52_mark_generic_entities.py b/scripts/maintain/52_mark_generic_entities.py new file mode 100644 index 0000000..791a373 --- /dev/null +++ b/scripts/maintain/52_mark_generic_entities.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 +""" +Mark entities whose canonical_name is purely conceptual ("Flying disc sighting +reports", "Investigation of Flying Discs", "Document Receipt by FBI"...) with +`is_generic: true`. These are categories the chunker accidentally promoted to +event/operation entities. Hiding them from /e/events, /e/operations, /timeline, +and /graph removes catalog noise without deleting data. + +Decision rule (conservative — only flag obvious noise): + - canonical_name contains GENERIC_PHRASE patterns AND + - has no specific qualifier (no proper noun, no year, no place name). + +We DO NOT touch: + - person entities (always specific) + - location entities (always specific) + - entities with date_start that resolves to a real year + - entities whose canonical_name contains a proper noun (Capitalized + Name not in the generic vocabulary) + +Idempotent. Re-running flags new generics if any. 
+ +Run: + python3 scripts/maintain/52_mark_generic_entities.py --dry-run + python3 scripts/maintain/52_mark_generic_entities.py +""" +from __future__ import annotations +import argparse +import re +import sys +from pathlib import Path + +import yaml + +WIKI_ENT = Path("/Users/guto/ufo/wiki/entities") + +# Phrases that, when forming the BULK of a canonical_name without a specific +# qualifier, indicate the entity is a CATEGORY rather than an instance. +GENERIC_TOKEN_VOCAB = { + # core event/sighting noise + "flying", "disc", "discs", "disk", "disks", "saucer", "saucers", + "sighting", "sightings", "report", "reports", "reporting", "reported", + "investigation", "investigations", "investigative", + "observation", "observations", "observed", "observing", + "unidentified", "object", "objects", "aerial", "phenomena", "phenomenon", + "uap", "ufo", "ufos", + # generic process / bureaucracy + "document", "documents", "receipt", "receipts", "protocol", "protocols", + "summary", "summaries", "review", "reviews", "incident", "incidents", + "case", "cases", "event", "events", "encounter", "encounters", + "evaluation", "analysis", "tracking", + "memo", "memos", "memorandum", "memoranda", "letter", "letters", + "communication", "communications", "correspondence", + "information", "data", "details", "record", "records", + "filing", "file", "files", "section", "subsection", "branch", "department", + "office", "general", "matter", "matters", "subject", "subjects", + # connectors (not significant on their own) + "of", "the", "a", "an", "and", "or", "with", "on", "for", "to", "from", + "by", "at", "in", "as", "is", "are", + # pt-br equivalents (sometimes mixed) + "voador", "voadores", "disco", "discos", "avistamento", "avistamentos", + "relatorio", "relatorios", "investigacao", "investigacoes", + "observacao", "observacoes", "objeto", "objetos", "nao", "identificado", + "documento", "documentos", "recibo", "recibos", "protocolo", "protocolos", + "sumario", "resumo", "incidente", 
"incidentes", + # FBI bureaucratic + "internal", "security", "headquarters", "agent", "agents", +} + +YEAR_RE = re.compile(r"\b(18|19|20)\d{2}\b") +TOKEN_RE = re.compile(r"\b[\w]+\b") + + +def has_specific_qualifier(name: str) -> bool: + """Return True if name contains a year, a Capitalized proper noun (not in + the generic vocab), or a multi-word proper name suggesting a specific + place/person/case.""" + if YEAR_RE.search(name): + return True + # Look at tokens with non-generic Capitalized words + for tok in TOKEN_RE.findall(name): + # Strict proper-noun check: starts with uppercase, length >= 4, + # not in generic vocab + if tok and tok[0].isupper() and len(tok) >= 4: + if tok.lower() not in GENERIC_TOKEN_VOCAB: + return True + # Check for hyphenated identifiers (EV-..., OBJ-...) — those are codes, + # not specific qualifiers UNLESS they have date fields + return False + + +def is_pure_generic(name: str) -> bool: + """True if canonical_name is entirely composed of generic vocab tokens.""" + if not name: return True + toks = [t.lower() for t in TOKEN_RE.findall(name)] + if not toks: return True + significant = [t for t in toks if len(t) > 1] + if not significant: return True + # Every significant token must be in the generic vocab + return all(t in GENERIC_TOKEN_VOCAB for t in significant) + + +def parse_entity(path: Path): + try: + text = path.read_text(encoding="utf-8") + if not text.startswith("---"): return None + parts = text.split("---", 2) + if len(parts) < 3: return None + fm = yaml.safe_load(parts[1]) or {} + return {"path": path, "fm": fm, "raw": text} + except Exception: + return None + + +def dump_entity(entity): + raw = entity["raw"] + parts = raw.split("---", 2) + if len(parts) < 3: return raw + body = parts[2] + return "---\n" + yaml.safe_dump(entity["fm"], sort_keys=False, allow_unicode=True, width=1000) + "---" + body + + +def main() -> int: + ap = argparse.ArgumentParser() + ap.add_argument("--dry-run", action="store_true") + args = 
ap.parse_args() + + print(f"Scanning {WIKI_ENT} ...") + # Only target entity classes where genericness is meaningful + target_classes = {"event", "operation", "concept", "uap_object"} + + total = 0 + flagged = 0 + already_flagged = 0 + samples = [] + + for f in WIKI_ENT.rglob("*.md"): + if "_archived" in f.parts: continue + ent = parse_entity(f) + if not ent: continue + fm = ent["fm"] + cls = fm.get("entity_class") + if cls not in target_classes: continue + total += 1 + if fm.get("is_generic") is True: + already_flagged += 1 + continue + name = fm.get("canonical_name") or "" + if not name: continue + # If it has a year, named person/place — skip + if has_specific_qualifier(name): continue + if not is_pure_generic(name): continue + # Flag it + fm["is_generic"] = True + if not args.dry_run: + f.write_text(dump_entity(ent), encoding="utf-8") + flagged += 1 + if len(samples) < 25: + samples.append((cls, name)) + + print(f"\nEntities scanned (event/operation/concept/uap_object): {total}") + print(f"Already flagged is_generic: {already_flagged}") + print(f"Newly flagged is_generic: {flagged}") + print(f"\nSample flagged ({min(len(samples), 25)}):") + for cls, name in samples: + print(f" [{cls:<10}] {name}") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/maintain/53_add_is_generic_to_db.sql b/scripts/maintain/53_add_is_generic_to_db.sql new file mode 100644 index 0000000..70e0709 --- /dev/null +++ b/scripts/maintain/53_add_is_generic_to_db.sql @@ -0,0 +1,10 @@ +-- 53_add_is_generic_to_db.sql +-- Add public.entities.is_generic BOOLEAN. Populated by 54_sync_is_generic.py +-- which reads each YAML's is_generic and writes it to the DB. 
+ +BEGIN; +ALTER TABLE public.entities + ADD COLUMN IF NOT EXISTS is_generic BOOLEAN NOT NULL DEFAULT FALSE; +CREATE INDEX IF NOT EXISTS entities_is_generic_idx + ON public.entities (is_generic) WHERE is_generic = TRUE; +COMMIT; diff --git a/scripts/maintain/54_sync_is_generic.py b/scripts/maintain/54_sync_is_generic.py new file mode 100644 index 0000000..1668fc1 --- /dev/null +++ b/scripts/maintain/54_sync_is_generic.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 +""" +Sync `is_generic` flag from each entity YAML to public.entities table. +""" +from __future__ import annotations +import os +import sys +from pathlib import Path + +import psycopg +import yaml + +WIKI_ENT = Path("/Users/guto/ufo/wiki/entities") + + +def main() -> int: + dburl = os.environ.get("DATABASE_URL") or os.environ.get("SUPABASE_DB_URL") + if not dburl: sys.exit("DATABASE_URL not set") + + rows: list[tuple[str, str, bool]] = [] + for f in WIKI_ENT.rglob("*.md"): + if "_archived" in f.parts: continue + try: + text = f.read_text(encoding="utf-8") + if not text.startswith("---"): continue + fm = yaml.safe_load(text.split("---")[1]) or {} + except Exception: continue + cls = fm.get("entity_class") + eid = fm.get("entity_id") or (fm.get(f"{cls}_id") if cls else None) + if not (cls and eid): continue + rows.append((cls, eid, bool(fm.get("is_generic")))) + + print(f"Loaded {len(rows)} entities from YAML") + + with psycopg.connect(dburl) as conn: + with conn.cursor() as cur: + cur.execute("CREATE TEMP TABLE _gen (entity_class TEXT, entity_id TEXT, is_generic BOOL)") + with cur.copy("COPY _gen (entity_class, entity_id, is_generic) FROM STDIN") as cp: + for row in rows: cp.write_row(row) + cur.execute(""" + UPDATE entities e SET is_generic = g.is_generic + FROM _gen g + WHERE e.entity_class = g.entity_class + AND e.entity_id = g.entity_id + AND e.is_generic IS DISTINCT FROM g.is_generic + """) + print(f" rows updated: {cur.rowcount}") + cur.execute("SELECT COUNT(*) FROM entities WHERE is_generic") + 
-- 55_relations_schema.sql
-- Typed relations between entities. Replaces noisy co-mention with semantic
-- edges like (Person, witnessed, Event), (Event, occurred_at, Location),
-- (Person, signed, Document), etc.
--
-- The whole script is safe to re-run: table, constraint and indexes are each
-- created only when missing.

BEGIN;

CREATE TABLE IF NOT EXISTS public.relations (
  relation_pk    BIGSERIAL PRIMARY KEY,
  source_class   TEXT NOT NULL,
  source_id      TEXT NOT NULL,
  relation_type  TEXT NOT NULL,
  target_class   TEXT NOT NULL,
  target_id      TEXT NOT NULL,
  evidence_ref   TEXT,                    -- e.g. '[[doc-id/p007]]' or chunk_id
  confidence     TEXT NOT NULL DEFAULT 'medium',  -- high|medium|low
  extracted_by   TEXT NOT NULL DEFAULT 'yaml',    -- yaml|regex|llm|manual
  created_at     TIMESTAMPTZ NOT NULL DEFAULT NOW(),
  -- NOTE: evidence_ref is nullable and NULLs compare as distinct in a UNIQUE
  -- constraint, so rows with a NULL evidence_ref are not deduplicated by it.
  UNIQUE (source_class, source_id, relation_type, target_class, target_id, evidence_ref)
);

-- Enum check on relation_type (extensible — add new values as we discover them).
-- ALTER TABLE ... ADD CONSTRAINT has no IF NOT EXISTS form, so guard it
-- explicitly; otherwise a second run of this migration aborts the transaction.
DO $$
BEGIN
  IF NOT EXISTS (
    SELECT 1
      FROM pg_constraint
     WHERE conname = 'relations_type_check'
       AND conrelid = 'public.relations'::regclass
  ) THEN
    ALTER TABLE public.relations
      ADD CONSTRAINT relations_type_check
      CHECK (relation_type IN (
        'witnessed',          -- (person, witnessed, event)
        'occurred_at',        -- (event, occurred_at, location)
        'involves_uap',       -- (event, involves_uap, uap_object)
        'documented_in',      -- (event, documented_in, document)
        'authored',           -- (person, authored, document)
        'signed',             -- (person, signed, document)
        'mentioned_by',       -- (person, mentioned_by, document)
        'employed_by',        -- (person, employed_by, organization)
        'operated_by',        -- (operation, operated_by, organization)
        'investigated',       -- (person, investigated, event)
        'commanded',          -- (person, commanded, organization)
        'related_to',         -- generic fallback (lower-quality)
        'similar_to',         -- (event, similar_to, event)
        'precedes',           -- (event, precedes, event)
        'follows'             -- (event, follows, event)
      ));
  END IF;
END
$$;

CREATE INDEX IF NOT EXISTS relations_source_idx
  ON public.relations (source_class, source_id);
CREATE INDEX IF NOT EXISTS relations_target_idx
  ON public.relations (target_class, target_id);
CREATE INDEX IF NOT EXISTS relations_type_idx
  ON public.relations (relation_type);

COMMIT;
WIKI = Path("/Users/guto/ufo/wiki")
PAGES_BASE = WIKI / "pages"

# Ordering used when two extracted rows describe the same edge.
CONFIDENCE_RANK = {"low": 0, "medium": 1, "high": 2}

# A line consisting solely of the frontmatter delimiter.
_FRONTMATTER_FENCE = re.compile(r"^---[ \t]*\r?$", re.MULTILINE)


def ascii_fold(s: str) -> str:
    """Strip combining marks (accents) after NFD decomposition."""
    return "".join(c for c in unicodedata.normalize("NFD", s) if not unicodedata.combining(c))


def slugify(s: str) -> str:
    """Lower-case, accent-fold and collapse every non ``[a-z0-9]`` run to ``-``."""
    s = ascii_fold(s).lower()
    return re.sub(r"[^a-z0-9]+", "-", s).strip("-")


def person_id(name: str) -> str | None:
    """Build a person id from a display name, or None when nothing usable remains.

    One leading title/rank (e.g. "Dr.", "Major", "Special Agent") is removed
    before slugifying.
    """
    name = str(name or "").strip()
    if not name:
        return None
    # Strip role prefixes like "Dr.", "Major", "Special Agent"
    name = re.sub(
        r"^(Mr|Mrs|Ms|Dr|Prof|Sr|Sra|Major|Maj|Col|Colonel|Lt|Lieutenant|Capt|Captain|"
        r"Gen|General|Sgt|Sergeant|Agent|Special Agent|SA|Director|Deputy|Rev|Reverend|"
        r"Inspector|Det|Detective)\.?\s+", "", name, flags=re.IGNORECASE,
    )
    return slugify(name) or None


def event_id(label: str, date: str | None) -> str | None:
    """Build ``EV-YYYY-MM-DD-<slug>`` from an event label and an optional date.

    Unknown date parts are written as ``XXXX`` / ``XX``. Returns None when the
    label is empty or slugifies to nothing, so no id ends in a bare ``-``.
    """
    slug = slugify(str(label or "").strip())
    if not slug:
        return None
    # Parse year-month-day from date or default to XXXX-XX-XX
    y, m, d = "XXXX", "XX", "XX"
    if date:
        ms = re.search(r"(\d{4})(?:-(\d{2}))?(?:-(\d{2}))?", str(date))
        if ms:
            y = ms.group(1)
            m = ms.group(2) or "XX"
            d = ms.group(3) or "XX"
    return f"EV-{y}-{m}-{d}-{slug}"


def location_id(name: str) -> str | None:
    """Slugified location name, or None when the name is empty or has no slug."""
    name = str(name or "").strip()
    if not name:
        return None
    return slugify(name) or None


def parse_page_yaml(path: Path) -> dict | None:
    """Load the YAML frontmatter of a page file.

    The frontmatter ends at the first later line that is exactly ``---``, so a
    ``---`` inside a value no longer truncates it. Returns ``{}`` for an empty
    frontmatter and None when the file is unreadable, has no frontmatter, is
    invalid YAML, or does not parse to a mapping.
    """
    try:
        text = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        return None
    if not text.startswith("---"):
        return None
    closing = _FRONTMATTER_FENCE.search(text, 3)
    raw = text[3:closing.start()] if closing else text[3:]
    try:
        fm = yaml.safe_load(raw)
    except yaml.YAMLError:
        return None
    if not fm:
        return {}
    return fm if isinstance(fm, dict) else None


def _as_list(value) -> list:
    """Normalise a YAML field to a list; a lone scalar/mapping becomes one item."""
    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    return [value]


def _entry_name(entry):
    """Name of a person/location entry given as a plain string or a ``{name: ...}`` dict."""
    if isinstance(entry, str):
        return entry
    if isinstance(entry, dict):
        return entry.get("name")
    return None


def extract_page_relations(fm: dict, doc_id: str, doc_ref: str) -> list[tuple]:
    """Turn one page's frontmatter into relation rows.

    Each row is ``(source_class, source_id, relation_type, target_class,
    target_id, evidence_ref, confidence, extracted_by)``. ``doc_ref`` (the page
    wikilink) is used as evidence; ``doc_id`` is the target of the
    document-level relations.
    """
    ents = fm.get("entities_extracted")
    if not isinstance(ents, dict):
        ents = {}
    events = _as_list(ents.get("events") or fm.get("events"))
    people = _as_list(ents.get("people") or fm.get("people"))
    uap_objs = _as_list(ents.get("uap_objects") or fm.get("uap_objects"))
    primary_loc = fm.get("primary_location")

    rows: list[tuple] = []
    page_event_ids: list[str] = []

    # 1. observers in events → witnessed (event ids are collected in the same pass)
    for ev in events:
        if not isinstance(ev, dict):
            continue
        eid = event_id(ev.get("label") or ev.get("name"),
                       ev.get("date") or ev.get("date_start"))
        if not eid:
            continue
        page_event_ids.append(eid)
        # _as_list keeps a single string observer from being iterated per character.
        for obs in _as_list(ev.get("observers")):
            pid = person_id(_entry_name(obs))
            if pid:
                rows.append(("person", pid, "witnessed", "event", eid, doc_ref, "high", "yaml"))

    # 2. people on page mentioned_by document (target is the document id,
    #    the page reference is kept as evidence)
    for p in people:
        pid = person_id(_entry_name(p))
        if pid:
            rows.append(("person", pid, "mentioned_by", "document", doc_id, doc_ref, "high", "yaml"))

    # 3. primary_location relates page events
    if primary_loc:
        lid = location_id(_entry_name(primary_loc))
        if lid:
            for eid in page_event_ids:
                rows.append(("event", eid, "occurred_at", "location", lid, doc_ref, "medium", "yaml"))

    # 4. uap_objects → involves_uap, all attached to the first event on the page
    if page_event_ids and uap_objs:
        first_event = page_event_ids[0]
        year_match = re.search(r"EV-(\d{4})-", first_event)
        year_token = year_match.group(1) if year_match else "XXXX"
        # "EV-Y-M-D-<slug>" → keep everything after the fourth dash
        event_slug = first_event.split("-", 4)[-1].upper()
        for i, _obj in enumerate(uap_objs, 1):
            obj_id = f"OBJ-EV{year_token}-{event_slug}-{i:02d}"
            rows.append(("event", first_event, "involves_uap", "uap_object", obj_id, doc_ref, "medium", "yaml"))

    # 5. events on page → documented_in
    for eid in page_event_ids:
        rows.append(("event", eid, "documented_in", "document", doc_id, doc_ref, "high", "yaml"))

    return rows


def dedupe_relations(rows: list[tuple]) -> list[tuple]:
    """Collapse rows sharing source/relation/target/evidence, keeping the
    highest-confidence one. Output keeps first-seen order of the keys."""
    best: dict[tuple, tuple] = {}
    for r in rows:
        key = r[:6]
        kept = best.get(key)
        if kept is None or CONFIDENCE_RANK.get(r[6], -1) > CONFIDENCE_RANK.get(kept[6], -1):
            best[key] = r
    return list(best.values())


def main() -> int:
    """Scan every page YAML, extract typed relations and insert them into
    ``public.relations`` (existing rows are left untouched). Returns 0."""
    dburl = os.environ.get("DATABASE_URL") or os.environ.get("SUPABASE_DB_URL")
    if not dburl:
        sys.exit("DATABASE_URL not set")

    print("Scanning page YAMLs ...")
    rows: list[tuple] = []
    pages_processed = 0
    for f in PAGES_BASE.rglob("p*.md"):
        fm = parse_page_yaml(f)
        if not fm:
            continue
        pages_processed += 1
        # Derive doc_id/page_id from path
        try:
            rel = f.relative_to(PAGES_BASE)
        except ValueError:
            continue
        doc_id = rel.parent.as_posix()
        page_id = f"{doc_id}/{f.stem}"
        rows.extend(extract_page_relations(fm, doc_id, f"[[{page_id}]]"))

    print(f"Pages processed: {pages_processed}")
    print(f"Relations extracted: {len(rows)}")

    deduped = dedupe_relations(rows)
    print(f"Relations after dedup: {len(deduped)}")

    if not deduped:
        return 0

    with psycopg.connect(dburl) as conn:
        with conn.cursor() as cur:
            cur.execute("CREATE TEMP TABLE _rel (LIKE public.relations INCLUDING DEFAULTS)")
            with cur.copy("""COPY _rel
                              (source_class, source_id, relation_type,
                               target_class, target_id, evidence_ref,
                               confidence, extracted_by)
                              FROM STDIN""") as cp:
                for r in deduped:
                    cp.write_row(r)
            cur.execute("""
                INSERT INTO public.relations
                  (source_class, source_id, relation_type,
                   target_class, target_id, evidence_ref,
                   confidence, extracted_by)
                SELECT source_class, source_id, relation_type,
                       target_class, target_id, evidence_ref,
                       confidence, extracted_by
                  FROM _rel
                ON CONFLICT DO NOTHING
            """)
            print(f"  inserted (after ON CONFLICT): {cur.rowcount}")
            cur.execute("SELECT relation_type, COUNT(*) FROM public.relations GROUP BY relation_type ORDER BY 2 DESC")
            print("\n=== Relation counts ===")
            for t, n in cur.fetchall():
                print(f"  {n:>7}  {t}")
        conn.commit()
    return 0
#!/usr/bin/env bash
# Full dedup pipeline:
#   1. Layer 1 (deterministic merges)
#   2. Layer 2 (fuzzy trigram with numeric + code-suffix guards)
#   3. DB remap (entity_mentions → canonical entity_pk)
#   4. Re-sync signal_strength + total_mentions
#   5. Re-run text backfill
#
# Run from /Users/guto/ufo. DATABASE_URL is derived from the stack's .env and
# the SSH tunnel on localhost:5433 is opened here when it is not already up.
set -euo pipefail
cd /Users/guto/ufo

source /Users/guto/ufo/infra/disclosure-stack/.env
export DATABASE_URL="postgres://postgres:${POSTGRES_PASSWORD}@localhost:5433/postgres"

# True when something accepts TCP connections on localhost:5433.
# Uses bash's /dev/tcp so it works on both macOS and Linux (the previous
# `ss | grep 5433` probe never succeeds on macOS, where `ss` does not exist,
# and also matched unrelated ports such as 54330).
tunnel_is_up() {
    (exec 3<>/dev/tcp/127.0.0.1/5433) 2>/dev/null
}

# Ensure tunnel
if ! tunnel_is_up; then
    pkill -f "ssh.*5433:172" 2>/dev/null || true
    sleep 1
    # -e reads the password from $SSHPASS so it never appears in the process list.
    SSHPASS="$VPS_PASSWORD" sshpass -e ssh -o StrictHostKeyChecking=accept-new \
        -o ExitOnForwardFailure=yes \
        -p "$VPS_PORT" -fN -L 5433:172.27.0.2:5432 "${VPS_USER}@${VPS_HOST}"
    sleep 2
    if ! tunnel_is_up; then
        echo "SSH tunnel to localhost:5433 could not be established" >&2
        exit 1
    fi
fi

echo "=== [1/5] Layer 1 dedup (deterministic) — already applied; re-running idempotent"
python3 scripts/maintain/49_dedup_aggressive.py 2>&1 | tail -5

echo ""
echo "=== [2/5] Layer 2 dedup (fuzzy trigram) ==="
python3 scripts/maintain/50_dedup_fuzzy_trigram.py 2>&1 | tail -8

echo ""
echo "=== [3/5] Remap entity_mentions in DB ==="
python3 scripts/maintain/51_remap_entity_mentions.py 2>&1 | tail -10

echo ""
echo "=== [4/5] Resync signal_strength ==="
python3 scripts/maintain/42_sync_entity_stats.py --fix-obj-names 2>&1 | tail -10

echo ""
echo "=== [5/5] Re-run text backfill on the new canonical set ==="
python3 scripts/maintain/46_text_backfill_mentions.py 2>&1 | tail -8

echo ""
echo "=== Done. Final entity counts ==="
python3 -c "
from pathlib import Path
p = Path('wiki/entities')
active = sum(1 for f in p.rglob('*.md') if '_archived' not in f.parts)
archived = sum(1 for f in (p / '_archived').rglob('*.md')) if (p / '_archived').exists() else 0
print(f'  active:   {active}')
print(f'  archived: {archived}')
print(f'  total:    {active + archived}')
"
RAW = Path("/Users/guto/ufo/raw")

# A line consisting solely of the frontmatter delimiter.
_FRONTMATTER_FENCE = re.compile(r"^---[ \t]*\r?$", re.MULTILINE)


def split_frontmatter(text: str) -> tuple[dict, str]:
    """Split a chunk file into ``(frontmatter, body)``.

    The frontmatter ends at the first later line that is exactly ``---``;
    a ``---`` inside a free-text value does not end it. Text without a
    complete frontmatter is returned unchanged with an empty dict.

    Only top-level ``key: value`` lines are extracted, and values are kept as
    raw strings (quotes included).
    """
    if not text.startswith("---"):
        return {}, text
    closing = _FRONTMATTER_FENCE.search(text, 3)
    if closing is None:
        return {}, text
    fm_raw = text[3:closing.start()]
    body = text[closing.start() + 3:]
    # Tolerant key:value extraction (chunks have free-text fields that break
    # strict YAML — we only need a handful of keys)
    fm: dict = {}
    for line in fm_raw.splitlines():
        m = re.match(r"^([a-zA-Z_]+):\s*(.*)$", line)
        if not m:
            continue
        fm[m.group(1)] = m.group(2).strip()
    return fm, body


def extract_en_section(body: str) -> str:
    """Pull the EN: paragraph from a bilingual chunk body.

    Falls back to the whole stripped body when there is no ``**EN:**`` marker.
    """
    # Bodies look like:
    #   **EN:** <english text>
    #   **PT-BR:** <portuguese text>
    m = re.search(r"\*\*EN:\*\*\s*(.*?)(?:\n\s*\*\*PT-BR:\*\*|\Z)", body, re.S)
    if m:
        return m.group(1).strip()
    # Some chunks store the text in `extracted_text:` field only (e.g. images)
    return body.strip()


def chunk_text_en(fm: dict, body: str) -> str:
    """English text to emit for one chunk.

    Uses the EN section of the body. When that is empty or very short (under
    5 characters) the frontmatter ``image_description_en`` / ``extracted_text``
    is preferred, but a short body is kept when no such fallback exists, so
    short chunks such as a bare year are not dropped from the document.
    """
    en = extract_en_section(body)
    if not en or len(en) < 5:
        fallback = fm.get("image_description_en") or fm.get("extracted_text") or ""
        fallback = fallback.strip().strip('"\'')
        en = fallback or en
    return en.strip()


def main() -> int:
    """Print the reconstructed EN text of ``<doc-id>`` to stdout.

    Each chunk is preceded by a ``[chunk <id> · page <n> · type:<type>]``
    marker. Chunks listed in ``_index.json`` whose file is missing are skipped
    and reported on stderr. Exits with a message on bad usage or when the
    index file is absent; returns 0 otherwise.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: build_doc_text.py <doc-id>")
    doc_id = sys.argv[1]
    chunks_dir = RAW / f"{doc_id}--subagent" / "chunks"
    idx_path = RAW / f"{doc_id}--subagent" / "_index.json"
    if not idx_path.is_file():
        sys.exit(f"_index.json not found for {doc_id}")

    idx = json.loads(idx_path.read_text(encoding="utf-8"))
    entries = sorted(idx.get("chunks", []), key=lambda x: x.get("order_global", 0))

    out_lines: list[str] = [f"DOCUMENT_ID: {doc_id}",
                            f"TOTAL_PAGES: {idx.get('total_pages')}",
                            f"TOTAL_CHUNKS: {len(entries)}", ""]
    missing: list[str] = []
    for entry in entries:
        cid = entry.get("chunk_id")
        page = entry.get("page")
        ctype = entry.get("type", "?")
        chunk_path = chunks_dir / f"{cid}.md"
        if not chunk_path.is_file():
            missing.append(str(cid))
            continue
        fm, body = split_frontmatter(chunk_path.read_text(encoding="utf-8"))
        en = chunk_text_en(fm, body)
        if not en:
            continue

        out_lines.append(f"[chunk {cid} · page {page} · type:{ctype}]")
        out_lines.append(en)
        out_lines.append("")

    if missing:
        # stderr only: stdout is the document text consumed by the caller.
        print(f"warning: {len(missing)} indexed chunk file(s) missing: "
              f"{', '.join(missing[:10])}", file=sys.stderr)

    print("\n".join(out_lines))
    return 0
+ +doc_classification: + - mission_report + - intelligence_memo + - fbi_internal_correspondence + - press_clipping + - photo_with_caption + - sketch_or_diagram + - witness_statement + - radio_transcript + - foia_release + - operation_log + - policy_document + - administrative_form + - blank_page_or_separator + - investigation_file + - aviation_incident_report + - debriefing_transcript + - other_specify + +noise_emission: + - none # 100% investigativo + - low # >70% investigativo, alguma rota/carimbo + - medium # 40-70% investigativo + - high # <40% investigativo (rotação interna, ementa, índice) + +investigative_value: + - critical + - high + - medium + - low + - none + +primary_topics: + - flying_disc_sightings + - uap_encounter + - radar_visual_correlation + - aviation_incident + - foreign_object_recovery + - operation_paperclip + - cold_war_intelligence + - nuclear_facility_overflight + - astronaut_observation + - photographic_evidence + - contactee_phenomena + - underwater_unidentified_object + - transmedium_observation + - government_disclosure + - debunking_explanation + - witness_interrogation + - policy_directive + - administrative_routing + - other + +event_class: + - uap_encounter + - press_release + - investigation_opened + - investigation_closed + - testimony_recorded + - document_published + - meeting_held + - flight_operation + - radar_detection + - photo_analysis + - personnel_change + - policy_change + - communication_sent + - communication_received + - arrest + - trial + - death + - launch_event + - recovery_operation + - intercept_attempt + - other_specify + +person_class: + - military_officer + - enlisted_personnel + - civilian_witness + - government_official + - law_enforcement + - scientist + - journalist + - pilot + - radar_operator + - intelligence_officer + - foreign_national + - clergy + - civilian + - astronaut + - politician + - lawyer + - business_person + - unknown + +org_class: + - military_unit + - military_branch + - 
government_agency + - civilian_agency + - law_enforcement + - intelligence_agency + - research_institution + - civilian_organization + - foreign_government + - media_organization + - contactee_group + - religious_organization + - corporation + - unknown + +geo_class: + - city + - state + - country + - region + - military_base + - airfield + - building + - waterway + - mountain + - desert + - rural_area + - sea_or_ocean + - coastal + - lake + - river + - submerged + - airspace + - unknown + +uap_shape: + - disc + - cigar + - sphere + - triangle + - rectangle + - cluster + - light_only + - cone + - dome + - irregular + - tic_tac + - cylindrical + - cross + - boomerang + - unknown + +uap_medium: + - air + - sea_surface + - submerged + - transmedium + - space + - ground + - multiple + - unknown + +uap_color: + - white + - silver_metallic + - black + - red + - orange + - yellow + - green + - blue + - multicolored + - dark_unspecified + - bright_unspecified + - unknown + +date_confidence: + - exact + - month + - year + - decade + - unknown + +confidence: + - high + - medium + - low + +relation_type: + - witnessed + - occurred_at + - involves_uap + - documented_in + - authored + - signed + - mentioned_by + - employed_by + - operated_by + - investigated + - commanded + - related_to + - similar_to + - precedes + - follows diff --git a/scripts/reextract/prompt-system.md b/scripts/reextract/prompt-system.md new file mode 100644 index 0000000..f0578a7 --- /dev/null +++ b/scripts/reextract/prompt-system.md @@ -0,0 +1,165 @@ +# Sonnet System Prompt — Re-Extração Investigativa + +Você é um analista forense investigativo do **Disclosure Bureau**. Recebe o **texto completo de um documento declassificado UAP/UFO** (já extraído por OCR/vision em pass anterior) e produz **um único JSON estruturado** com as entidades, eventos e relações investigativas que estão NO TEXTO. + +## Cobertura + +**EXTRAIA TUDO** que o texto deste segmento documenta. Não limite contagens. 
Se o segmento tem 100 eventos distintos, retorne 100. O sistema é alimentado por análise investigativa séria — ausência de evento é perda de evidência. + +Você pode estar recebendo APENAS UM SEGMENTO de um documento maior (ver `SEGMENT N OF M` no início, se presente). Extraia exaustivamente o que está NESTE segmento — outros segmentos cobrirão o resto. + +Critério único de não-inclusão: **falta de evidência textual neste segmento.** + +## Regras invioláveis + +1. **Você está LENDO texto, não vendo imagem.** Cada chunk é precedido por um marcador no formato `[chunk c0042 · page 7 · type:<tipo>]`. Use esse `chunk_id` (`c0042`) em todo campo `evidence_chunks` para apontar de volta ao texto que justifica sua extração. + +2. **NUNCA invente.** Se o nome de uma pessoa está redacted (`[REDACTED]`, `[NAME UNCLEAR]`, `▓▓▓`), retorne `"unknown"` no campo `name` e marque `confidence: "low"`. NÃO complete nomes parciais por inferência (`Mr. [redacted]` ≠ `Mr. Smith`). + +3. **NÃO traduza nomes próprios.** "Roswell" fica "Roswell". "Major Jesse Marcel" fica "Major Jesse Marcel". Locais brasileiros mantêm acentuação ("São Paulo", "Pará"). + +4. **Use APENAS os valores dos enums.** `other_specify` existe somente em `doc_classification` e `event_class`: se o conceito não cabe em nenhum valor desses dois enums, use `other_specify` e adicione campo livre `other_specify_note` com 1 frase. Nos demais enums, escolha o valor mais próximo (ou o fallback que o próprio enum oferece, como `unknown`). NÃO invente novos valores de enum. + +5. **Cada `event`, `person`, `organization`, `location`, `relation` DEVE ter `evidence_chunks: ["c0042", ...]`.** Sem evidência, não inclua. Esse é o teste de Locard: se não há rastro no texto, não é evidência. + +6. **Não duplique entidades.** Se "Major Jesse Marcel" e "J. Marcel" são a mesma pessoa pelo contexto, escolha 1 `name` canônico e liste todas as ocorrências em `aliases_in_doc`. + +7. **Eventos = instâncias específicas.** "Flying disc sighting reports" no plural genérico NÃO é evento. "Sighting of unknown disc on 1947-07-08 over Roswell NM by William Brazel" é evento. Se não há data + local + observer ou objeto, **não é evento** — é tópico (`primary_topics`). + +8. 
**Foco investigativo.** Carimbos de roteamento, listas de distribuição, números de série, banners de classificação — **não são entidades**. Catalogue só o que serve para uma análise de caso real. + +9. **Bilíngue só onde pedido.** `narrative_summary` e `narrative_summary_pt_br`. Resto fica em inglês (chave de schema internacional). + +## Modo de operação + +**Você roda em 5 passes separadas por segmento.** Cada chamada (`OUTPUT MODE` block ao final) pede UM tipo só: +1. events +2. people +3. organizations + locations +4. relations +5. doc-level metadata + +Retorne **apenas o JSON descrito no OUTPUT MODE** — não tente preencher campos de outras passes (eles vêm em chamadas separadas). Não envolva em markdown fence. Não adicione preâmbulo nem postscript. JSON puro. + +## Schema de referência (completo, distribuído entre as 5 passes) + +```json +{ + "doc_id": "<o doc_id que recebeu>", + "doc_classification": "<enum doc_classification>", + "doc_classification_note": "<frase opcional se other_specify>", + "doc_period": "<YYYY ou YYYY-YYYY>", + "primary_topics": ["<enum primary_topics>", "..."], + "noise_emission": "<enum noise_emission>", + "investigative_value": "<enum investigative_value>", + "doc_summary_en": "2-3 sentences English summary of what this document IS and why it matters.", + "doc_summary_pt_br": "2-3 frases em português brasileiro: o que é o documento e por que importa.", + + "events": [ + { + "label": "Roswell debris recovery press release", + "date_start": "1947-07-08", + "date_end": "1947-07-08", + "date_confidence": "<enum date_confidence>", + "event_class": "<enum event_class>", + "primary_location_name": "Roswell Army Air Field, New Mexico, USA", + "primary_location_geo_class": "<enum geo_class>", + "observers": [ + {"name": "Major Jesse Marcel", "role_at_event": "<enum person_class>"} + ], + "uap_objects_observed": [ + { + "shape": "<enum uap_shape>", + "color": "<enum uap_color>", + "medium": "<enum uap_medium>", + "size_estimate_m": null, + 
"altitude_ft": null, + "speed_kts": null, + "maneuver_notes": null + } + ], + "evidence_chunks": ["c0042", "c0043"], + "narrative_summary": "1-2 sentences English.", + "narrative_summary_pt_br": "1-2 frases PT-BR.", + "confidence": "<enum confidence>" + } + ], + + "people": [ + { + "name": "Major Jesse Marcel", + "aliases_in_doc": ["J. Marcel", "Marcel"], + "person_class": "<enum person_class>", + "affiliation": "USAAF 509th Bombardment Group", + "role_at_doc_date": "intelligence officer", + "evidence_chunks": ["c0042", "c0050"], + "confidence": "<enum confidence>" + } + ], + + "organizations": [ + { + "name": "USAAF 509th Bombardment Group", + "aliases_in_doc": ["509th", "509 BG"], + "org_class": "<enum org_class>", + "country": "USA", + "evidence_chunks": ["c0042"], + "confidence": "<enum confidence>" + } + ], + + "locations": [ + { + "name": "Roswell Army Air Field", + "aliases_in_doc": ["RAAF", "Roswell airfield"], + "geo_class": "<enum geo_class>", + "country": "USA", + "region_or_state": "New Mexico", + "evidence_chunks": ["c0042"], + "confidence": "<enum confidence>" + } + ], + + "relations": [ + { + "source_class": "<enum>", // person | event | organization | location | uap_object | document + "source_name": "Major Jesse Marcel", + "type": "<enum relation_type>", + "target_class": "<enum>", + "target_name": "Roswell debris recovery press release", + "evidence_chunks": ["c0042"], + "confidence": "<enum confidence>" + } + ] +} +``` + +## Notas finais + +- Se o documento é puro ruído administrativo (`noise_emission: high`), retorne arrays vazios mas preencha `doc_classification`, `noise_emission`, `doc_summary_*`. Não force a achar eventos onde não há. +- Se observar conexão entre documentos (este memo cita aquele), use `relations` com `target_class: "document"` e `target_name: <doc_id>` quando o doc-id estiver mencionado. 
+- Se observar UAP submerso ou transmedium (objeto entrando/saindo da água), garanta `uap_medium: submerged` ou `transmedium` e seja explícito no `narrative_summary`. + +## Regras de validação invioláveis (causaram rejeição em rodadas anteriores) + +10. **Formatos de data permitidos** — `YYYY` (só ano), `YYYY-MM` (ano + mês), `YYYY-MM-DD` (data completa), `XXXX` / `XXXX-XX` / `XXXX-XX-XX` (totalmente desconhecida). **NÃO use `2023-09-XX` nem `2023-XX-XX`** — se você sabe o mês mas não o dia, escreva `2023-09`. Se você sabe o ano mas não o mês, escreva `2023`. + +11. **`other_specify` só existe em DOIS enums:** `doc_classification` e `event_class`. Para QUALQUER outro enum — `geo_class`, `person_class`, `org_class`, `uap_shape`, `uap_medium`, `uap_color`, `date_confidence`, `noise_emission`, `investigative_value`, `confidence`, `relation_type`, `primary_topics` — escolha o valor do enum que MAIS SE APROXIMA. Se nada se aproxima, use o fallback que o próprio enum oferece: `unknown` (em `geo_class`, `person_class`, `org_class`, `uap_shape`, `uap_medium`, `uap_color`, `date_confidence`), `other` (em `primary_topics`) ou `related_to` (em `relation_type`). `noise_emission`, `investigative_value` e `confidence` não têm valor de fallback — escolha sempre o valor mais próximo. **NUNCA `other_specify` fora dos 2 enums permitidos.** + +12. **`source_class` e `target_class` em `relations`** SÃO RESTRITOS A exatamente esta lista: `person`, `event`, `organization`, `location`, `uap_object`, `document`. **NÃO use `vehicle`** — se o item é uma cápsula/aeronave que é o objeto observado, mapeie para `uap_object`. Se é uma aeronave operada por militar (não-UAP), use `organization` (a unidade militar) ou simplesmente omita a relação. + +13. **`evidence_chunks` deve usar APENAS os IDs de chunks visíveis neste segmento.** O texto traz marcadores no formato `[chunk c0042 · page 7 · type:<tipo>]`. **ANTES de incluir um chunk_id em `evidence_chunks`, confirme que o marcador desse chunk aparece literalmente no texto deste segmento.** Se você acha que viu `c0026` mas não consegue localizá-lo no texto, NÃO use. Inventar chunk_id quebra a procedência (Locard) e é o erro mais grave possível. + +14. 
**Toda entidade extraída TEM que ter `evidence_chunks` não-vazio.** Se você não consegue apontar para um chunk concreto presente no texto, a entidade não está documentada — **NÃO A INCLUA NA SAÍDA**. Melhor 50 entidades bem-evidenciadas do que 60 onde 1 não tem rastro. + +## Fallbacks explícitos quando o conceito não cabe no enum + +Listas completas dos enums críticos (use APENAS estes valores): + +**`geo_class`** = `city | state | country | region | military_base | airfield | building | waterway | mountain | desert | rural_area | sea_or_ocean | coastal | lake | river | submerged | airspace | unknown`. **Não existe `space`** — para órbita lunar / superfície da Lua / observações de astronauta em espaço, use `unknown` e ponha o contexto no campo livre (`primary_location_name` recebe "Lunar orbit" ou "Apollo capsule, translunar coast"). Não force outro enum. + +**`event_class`** = `uap_encounter | press_release | investigation_opened | investigation_closed | testimony_recorded | document_published | meeting_held | flight_operation | radar_detection | photo_analysis | personnel_change | policy_change | communication_sent | communication_received | arrest | trial | death | launch_event | recovery_operation | intercept_attempt | other_specify`. **Não existe `debriefing_transcript` como event_class** — debriefings de astronauta ou militar mapeiam para `testimony_recorded`. Use `other_specify` + `other_specify_note` somente se realmente nenhum dos 20 se aproxima. + +**`relation_type`** = `witnessed | occurred_at | involves_uap | documented_in | authored | signed | mentioned_by | employed_by | operated_by | investigated | commanded | related_to | similar_to | precedes | follows`. Sem `other_specify` — se nada cabe, use `related_to`. + +Retorne **APENAS** o JSON. Sem texto explicativo antes ou depois. Sem markdown fence (```). JSON puro parseável. 
diff --git a/scripts/reextract/run.py b/scripts/reextract/run.py new file mode 100644 index 0000000..ec8796f --- /dev/null +++ b/scripts/reextract/run.py @@ -0,0 +1,417 @@ +#!/usr/bin/env python3 +""" +run.py — Re-extract a document via Claude Code OAuth (Sonnet), with chunked +processing for large docs. + +Strategy: + - Build doc text from already-extracted chunks (build_doc_text.py). + - If text fits in one Sonnet window (default 50k tokens input budget), + run a single call producing the full JSON. + - Otherwise, split the doc into overlapping segments of ~50k input tokens + each, run Sonnet on each segment (preserving chunk_id markers), then + MERGE the JSONs deduping by (name, class) within each entity list and + by (source_name, type, target_name, evidence_chunks) for relations. + +The merged JSON faithfully covers the entire document — no entity is dropped +because the doc was "too big". +""" +from __future__ import annotations +import json +import os +import re +import subprocess +import sys +from pathlib import Path + +import yaml + +REX_DIR = Path("/Users/guto/ufo/scripts/reextract") +RAW = Path("/Users/guto/ufo/raw") +BUILD_DOC = REX_DIR / "build_doc_text.py" +PROMPT_SYSTEM = REX_DIR / "prompt-system.md" +ENUMS_YAML = REX_DIR / "enums.yaml" +VALIDATE = REX_DIR / "validate.py" + +# Token budget per call. +# Both Sonnet and Opus cap output at 32k tokens. We partition the extraction +# into 5 separate calls per segment, each producing a small piece of the JSON. +# Each piece is well under the ceiling. +SEGMENT_INPUT_CHARS = 60_000 # ~15k tokens input per segment +SEGMENT_OVERLAP_CHARS = 3_000 + +# Per-segment extraction is split into 5 passes. Each pass gets the same +# document text (so the claude CLI reuses its prompt cache) but a different +# "output mode" instruction asking for ONE category only. 
+PASSES = [ + ("events", + "Return a JSON object with ONLY the events array, exhaustively extracted " + "from THIS segment:\n\n" + "{\"events\": [{...event objects per the schema...}]}\n\n" + "Use the event schema rules from the system prompt. Include " + "uap_objects_observed, observers (with role_at_event), " + "primary_location_name/geo_class, evidence_chunks, both narratives. " + "Do NOT include people/organizations/locations/relations/doc-level fields."), + ("people", + "Return a JSON object with ONLY the people array, exhaustively extracted " + "from THIS segment:\n\n" + "{\"people\": [{...person objects per the schema...}]}\n\n" + "Use the person schema rules. Each entry: name, aliases_in_doc, " + "person_class, affiliation, role_at_doc_date, evidence_chunks, confidence. " + "Skip pure routing-list entries (FBI distribution slips, etc.) unless they " + "are subjects/witnesses/authors of real content."), + ("orgs_locs", + "Return a JSON object with ONLY organizations[] and locations[], exhaustively " + "extracted from THIS segment:\n\n" + "{\"organizations\": [...], \"locations\": [...]}\n\n" + "Use the schema rules. Include aliases_in_doc, class enums, country, " + "region_or_state (locations), evidence_chunks."), + ("relations", + "Return a JSON object with ONLY the relations array, exhaustively extracted " + "from THIS segment:\n\n" + "{\"relations\": [{...relation objects...}]}\n\n" + "Priority: typed investigative relations (witnessed, occurred_at, signed, " + "involves_uap, investigated, authored, employed_by, commanded). " + "mentioned_by ONLY for clearly investigative people (authors of memos, " + "subjects of investigations); skip mentioned_by for pure routing-list " + "names (Tolson, Ladd routing slips). 
All other types as found."), + ("doc_meta", + "Return a JSON object with ONLY the document-level fields:\n\n" + "{\"doc_classification\": \"...\", \"doc_classification_note\": null|\"...\", " + "\"doc_period\": \"YYYY\" or \"YYYY-YYYY\", " + "\"primary_topics\": [\"...\"], " + "\"noise_emission\": \"...\", \"investigative_value\": \"...\", " + "\"doc_summary_en\": \"2-3 sentences\", \"doc_summary_pt_br\": \"2-3 frases\"}\n\n" + "These reflect the FULL document this segment is part of (you may not see " + "every page in this segment, but classify based on what you do see)."), +] + + +def build_doc_text(doc_id: str) -> str: + r = subprocess.run( + ["python3", str(BUILD_DOC), doc_id], + capture_output=True, text=True, encoding="utf-8", + ) + if r.returncode != 0: + sys.exit(f"build_doc_text failed: {r.stderr}") + return r.stdout + + +def segment_doc(text: str) -> list[str]: + """Split doc text into overlapping segments at chunk-marker boundaries. + + A segment never breaks a chunk — we split at the `[chunk c... · ...]` + line boundaries closest to the char-budget cap. + """ + if len(text) <= SEGMENT_INPUT_CHARS: + return [text] + + # Find all chunk marker line positions (line starts). 
+ marker_re = re.compile(r"^\[chunk c\d+ ·.*$", re.MULTILINE) + starts = [m.start() for m in marker_re.finditer(text)] + if not starts: return [text] + + segments: list[str] = [] + seg_start = 0 + while seg_start < len(text): + cap = seg_start + SEGMENT_INPUT_CHARS + if cap >= len(text): + segments.append(text[seg_start:]) + break + # Pick the LAST chunk marker before cap (so we never break a chunk) + candidates = [s for s in starts if seg_start < s < cap] + if not candidates: + # No chunk marker fits — cut at cap (shouldn't happen with normal data) + seg_end = cap + else: + seg_end = candidates[-1] + segments.append(text[seg_start:seg_end]) + # Next segment starts at the chunk marker that gives ~OVERLAP_CHARS overlap + target_overlap_start = seg_end - SEGMENT_OVERLAP_CHARS + overlap_candidates = [s for s in starts if target_overlap_start <= s < seg_end] + if overlap_candidates: + seg_start = overlap_candidates[0] + else: + seg_start = seg_end + return segments + + +def build_full_prompt(doc_id: str, doc_text: str, segment_meta: str = "", + pass_instruction: str = "", + error_feedback: str = "") -> str: + system = PROMPT_SYSTEM.read_text(encoding="utf-8") + enums = ENUMS_YAML.read_text(encoding="utf-8") + feedback_block = "" + if error_feedback: + feedback_block = ( + "\n\n## PREVIOUS ATTEMPT FAILED VALIDATION\n\n" + "Your previous JSON had these errors. Fix them and return a corrected JSON:\n" + f"```\n{error_feedback}\n```\n\n" + "Re-emit the FULL corrected JSON.\n" + ) + pass_block = "" + if pass_instruction: + pass_block = ( + "\n\n## OUTPUT MODE (THIS CALL ONLY)\n\n" + f"{pass_instruction}\n\n" + "Return ONLY the JSON object described above. No markdown fence, " + "no preamble, no postscript. 
JSON only.\n" + ) + return ( + f"{system}\n\n" + "## CLOSED ENUMS (use ONLY these values)\n\n" + f"```yaml\n{enums}\n```\n\n" + f"## DOCUMENT TO ANALYZE — doc_id: {doc_id}\n" + f"{segment_meta}\n\n" + f"```\n{doc_text}\n```\n" + f"{pass_block}" + f"{feedback_block}\n" + "Return the JSON now." + ) + + +def call_claude(prompt: str) -> tuple[str, dict]: + """Returns (response_text, meta_dict). + + Uses `--output-format text` + `--disallowed-tools` and redirects stdout + DIRECTLY to a temp file (not via subprocess.PIPE, which silently truncates + large outputs). Tools are disabled to prevent the model from initiating + multi-turn calls that fail in `-p` mode. + """ + import tempfile + env = {**os.environ, "CLAUDE_CODE_MAX_OUTPUT_TOKENS": "32000"} + DISALLOWED = ( + "AskUserQuestion,Bash,Edit,Write,Read,Task,Glob,Grep," + "TaskCreate,TaskUpdate,TaskList,TaskGet,TaskStop,TaskOutput," + "Skill,ScheduleWakeup,Monitor,WebSearch,WebFetch,NotebookEdit," + "EnterPlanMode,ExitPlanMode,EnterWorktree,ExitWorktree," + "CronCreate,CronDelete,CronList,RemoteTrigger,ToolSearch," + "PushNotification,ListMcpResourcesTool,ReadMcpResourceTool," + "ShareOnboardingGuide" + ) + with tempfile.NamedTemporaryFile(mode="w+", suffix=".txt", delete=False, encoding="utf-8") as tmp: + tmp_path = tmp.name + try: + with open(tmp_path, "wb") as out_f: + r = subprocess.run( + ["claude", "-p", "--model", "opus", "--output-format", "text", + "--disallowed-tools", DISALLOWED], + input=prompt.encode("utf-8"), + stdout=out_f, + stderr=subprocess.PIPE, + env=env, + ) + if r.returncode != 0: + sys.exit( + f"claude CLI failed (rc={r.returncode})\n" + f"stderr: {r.stderr.decode('utf-8', errors='replace')[:4000]}" + ) + with open(tmp_path, "r", encoding="utf-8") as f: + output = f.read() + return (output, {"stop_reason": "text-format-no-meta", "chars": len(output)}) + finally: + try: os.unlink(tmp_path) + except OSError: pass + + +def extract_json_block(s: str) -> str: + s = s.strip() + if s.startswith("```"): + 
s = "\n".join(line for line in s.splitlines() if not line.startswith("```")) + s = s.strip() + start = s.find("{") + end = s.rfind("}") + if start >= 0 and end > start: + return s[start:end + 1] + return s + + +def merge_extractions(segments: list[dict], doc_id: str) -> dict: + """Merge per-segment JSONs into a single doc-level JSON. + + Dedup rules: + - events: by (label, date_start) + - people: by canonical name (lowercase, role prefixes stripped) + - orgs: by canonical name (lowercase) + - locations: by canonical name (lowercase) + - relations: by (source_name, type, target_name) — union of evidence_chunks + Top-level fields (classification, summary) come from the FIRST segment with + non-null values; primary_topics is union; noise_emission/investigative_value + is the MAX across segments (worst case takes precedence). + """ + out: dict = { + "doc_id": doc_id, + "doc_classification": None, + "doc_classification_note": None, + "doc_period": None, + "primary_topics": [], + "noise_emission": None, + "investigative_value": None, + "doc_summary_en": None, + "doc_summary_pt_br": None, + "events": [], + "people": [], + "organizations": [], + "locations": [], + "relations": [], + } + + def lower_key(s) -> str: + return (s or "").strip().lower() + + # Top-level pickup from first non-null + for seg in segments: + for k in ("doc_classification", "doc_classification_note", "doc_period", + "doc_summary_en", "doc_summary_pt_br"): + if not out.get(k) and seg.get(k): + out[k] = seg[k] + # primary_topics: union, preserve first-seen order + seen = set() + for seg in segments: + for t in seg.get("primary_topics") or []: + if t and t not in seen: + seen.add(t); out["primary_topics"].append(t) + + # noise / investigative — take WORST across segments + NOISE_ORDER = {"none": 0, "low": 1, "medium": 2, "high": 3} + INV_ORDER = {"none": 0, "low": 1, "medium": 2, "high": 3, "critical": 4} + noise_max, inv_max = None, None + for seg in segments: + n = seg.get("noise_emission") + if n in 
NOISE_ORDER and (noise_max is None or NOISE_ORDER[n] > NOISE_ORDER[noise_max]): + noise_max = n + i = seg.get("investigative_value") + if i in INV_ORDER and (inv_max is None or INV_ORDER[i] > INV_ORDER[inv_max]): + inv_max = i + out["noise_emission"] = noise_max + out["investigative_value"] = inv_max + + # Entities — dedup by canonical key + def merge_list(key: str, key_fn): + seen_keys: dict = {} + for seg in segments: + for item in seg.get(key) or []: + if not isinstance(item, dict): continue + k = key_fn(item) + if not k: continue + if k in seen_keys: + # merge: union evidence_chunks, aliases_in_doc; keep highest confidence + existing = seen_keys[k] + ev = set(existing.get("evidence_chunks") or []) | set(item.get("evidence_chunks") or []) + existing["evidence_chunks"] = sorted(ev) + al = set(existing.get("aliases_in_doc") or []) | set(item.get("aliases_in_doc") or []) + al.discard(existing.get("name", "")) + existing["aliases_in_doc"] = sorted(al) + # confidence: keep max + conf_order = {"low": 0, "medium": 1, "high": 2} + if conf_order.get(item.get("confidence"), 0) > conf_order.get(existing.get("confidence"), 0): + existing["confidence"] = item["confidence"] + else: + seen_keys[k] = item + return list(seen_keys.values()) + + out["events"] = merge_list("events", + lambda e: (lower_key(e.get("label")), e.get("date_start") or "")) + out["people"] = merge_list("people", + lambda p: lower_key(p.get("name"))) + out["organizations"] = merge_list("organizations", + lambda o: lower_key(o.get("name"))) + out["locations"] = merge_list("locations", + lambda l: lower_key(l.get("name"))) + + # Relations: dedup by (source_class, source_name_lower, type, target_class, target_name_lower) + rel_seen: dict = {} + for seg in segments: + for r in seg.get("relations") or []: + if not isinstance(r, dict): continue + key = ( + r.get("source_class"), + lower_key(r.get("source_name")), + r.get("type"), + r.get("target_class"), + lower_key(r.get("target_name")), + ) + if key in 
rel_seen: + existing = rel_seen[key] + ev = set(existing.get("evidence_chunks") or []) | set(r.get("evidence_chunks") or []) + existing["evidence_chunks"] = sorted(ev) + else: + rel_seen[key] = r + out["relations"] = list(rel_seen.values()) + return out + + +def main() -> int: + if len(sys.argv) < 2: + sys.exit("usage: run.py <doc-id>") + doc_id = sys.argv[1] + out_path = RAW / f"{doc_id}--subagent" / "_reextract.json" + + print(f"[1/N] Building doc text ...") + doc_text = build_doc_text(doc_id) + print(f" {len(doc_text)} chars (~{len(doc_text) // 4} tokens)") + + segments = segment_doc(doc_text) + n_seg = len(segments) + print(f"[2/N] Splitting into {n_seg} segment(s) of ~{SEGMENT_INPUT_CHARS // 1000}k chars each") + for i, s in enumerate(segments, 1): + print(f" segment {i}: {len(s)} chars") + + from concurrent.futures import ThreadPoolExecutor, as_completed + + def run_pass(seg_idx: int, seg_text: str, pass_name: str, pass_instr: str) -> tuple[int, str, dict | None, str]: + meta_label = f"\n\n[SEGMENT {seg_idx + 1} OF {n_seg}] — extract everything in THIS segment exhaustively.\n" + prompt = build_full_prompt(doc_id, seg_text, segment_meta=meta_label, + pass_instruction=pass_instr) + raw, _meta = call_claude(prompt) + json_text = extract_json_block(raw) + try: + piece = json.loads(json_text) + return (seg_idx, pass_name, piece, "") + except json.JSONDecodeError as e: + return (seg_idx, pass_name, None, f"{e} | raw_len={len(raw)}") + + # Fire ALL (segment, pass) jobs in parallel + extracted: list[dict] = [{"doc_id": doc_id} for _ in range(n_seg)] + n_jobs = n_seg * len(PASSES) + print(f"[3/N] Firing {n_jobs} parallel passes ({n_seg} segments × {len(PASSES)} passes)") + errors: list[str] = [] + completed = 0 + # Cap inner concurrency so outer-WORKERS × inner doesn't fork-bomb the box. + # 5 = one slot per pass; segments process serially within a doc. 
+ INNER_MAX = int(os.environ.get("REEXTRACT_INNER_MAX", "5")) + with ThreadPoolExecutor(max_workers=min(INNER_MAX, n_jobs)) as pool: + futures = [] + for seg_idx, seg in enumerate(segments): + for pass_name, pass_instr in PASSES: + futures.append(pool.submit(run_pass, seg_idx, seg, pass_name, pass_instr)) + for fut in as_completed(futures): + seg_idx, pass_name, piece, err = fut.result() + completed += 1 + tag = f"seg{seg_idx+1}/{pass_name}" + if err: + errors.append(f"{tag}: {err}") + debug = out_path.parent / f"_reextract_raw_seg{seg_idx+1}_{pass_name}.txt" + print(f" [{completed}/{n_jobs}] {tag} FAILED — {err[:120]}") + else: + if isinstance(piece, dict): + for k, v in piece.items(): + extracted[seg_idx][k] = v + print(f" [{completed}/{n_jobs}] {tag} OK") + + print(f"[4/N] Merging {n_seg} extraction(s) ...") + merged = merge_extractions(extracted, doc_id) if n_seg > 1 else {**extracted[0], "doc_id": doc_id} + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_text(json.dumps(merged, indent=2, ensure_ascii=False), encoding="utf-8") + print(f" saved {out_path}") + + print(f"[5/N] Validating ...") + v = subprocess.run( + ["python3", str(VALIDATE), doc_id, str(out_path)], + capture_output=True, text=True, encoding="utf-8", + ) + print(v.stdout.strip()) + return v.returncode + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/reextract/run_parallel.sh b/scripts/reextract/run_parallel.sh new file mode 100755 index 0000000..87f6c88 --- /dev/null +++ b/scripts/reextract/run_parallel.sh @@ -0,0 +1,97 @@ +#!/usr/bin/env bash +# Parallel re-extraction orchestrator. 
+# +# - Lists every doc that has raw/<doc>--subagent/_index.json +# - Skips docs that already have _reextract.json (idempotent) +# - Uses an mkdir-based per-doc lock to prevent two workers from racing +# - Runs N workers in parallel (default 8, override via WORKERS=N) +# - Logs each doc to raw/<doc>--subagent/_reextract.log +# +# Run: +# ./run_parallel.sh # all docs, 8 workers +# WORKERS=4 ./run_parallel.sh # 4 workers +# ./run_parallel.sh DOC1 DOC2 # specific docs only +set -uo pipefail + +UFO="/Users/guto/ufo" +RAW="$UFO/raw" +RUN="$UFO/scripts/reextract/run.py" +WORKERS="${WORKERS:-4}" + +# Build list of doc IDs +if [ "$#" -gt 0 ]; then + DOCS=("$@") +else + DOCS=() + for d in "$RAW"/*--subagent; do + [ -f "$d/_index.json" ] || continue + doc_id=$(basename "$d" | sed 's/--subagent$//') + DOCS+=("$doc_id") + done +fi + +echo "=== Re-extract orchestrator ===" +echo " docs queued: ${#DOCS[@]}" +echo " workers: $WORKERS" +echo "" + +process_one() { + local doc_id="$1" + local sub="$RAW/$doc_id--subagent" + local out="$sub/_reextract.json" + local log="$sub/_reextract.log" + local lock="$sub/.reextract.lock" + + # Skip if already extracted + if [ -f "$out" ]; then + # Quick sanity: must parse as JSON + if python3 -c "import json,sys; json.load(open('$out'))" 2>/dev/null; then + echo "[SKIP] $doc_id (already extracted)" + return 0 + fi + fi + + # Acquire lock via mkdir (atomic) + if ! 
mkdir "$lock" 2>/dev/null; then + echo "[LOCK] $doc_id (another worker has it)" + return 0 + fi + trap "rmdir '$lock' 2>/dev/null || true" EXIT + + local started=$(date +%s) + echo "[BEGIN] $doc_id" + if python3 "$RUN" "$doc_id" > "$log" 2>&1; then + local elapsed=$(($(date +%s) - started)) + echo "[OK] $doc_id (${elapsed}s)" + else + local elapsed=$(($(date +%s) - started)) + echo "[FAIL] $doc_id (${elapsed}s) — see $log" + fi + + rmdir "$lock" 2>/dev/null || true + trap - EXIT +} + +export -f process_one +export RAW RUN + +# Run in parallel via xargs +printf '%s\n' "${DOCS[@]}" | xargs -n 1 -P "$WORKERS" -I {} bash -c 'process_one "$@"' _ {} + +echo "" +echo "=== Done. Summary: ===" +ok=0; skip=0; fail=0 +for d in "${DOCS[@]}"; do + out="$RAW/$d--subagent/_reextract.json" + if [ -f "$out" ]; then + if python3 -c "import json,sys; json.load(open('$out'))" 2>/dev/null; then + ok=$((ok + 1)) + else + fail=$((fail + 1)) + fi + else + fail=$((fail + 1)) + fi +done +echo " OK: $ok" +echo " FAIL: $fail" diff --git a/scripts/reextract/validate.py b/scripts/reextract/validate.py new file mode 100644 index 0000000..10097b0 --- /dev/null +++ b/scripts/reextract/validate.py @@ -0,0 +1,169 @@ +#!/usr/bin/env python3 +""" +validate.py — Validate a Sonnet re-extraction JSON against the closed enums +in enums.yaml. Returns exit 0 if valid; prints errors and exits 1 otherwise. 
+ +Run: + python3 scripts/reextract/validate.py <doc-id> +""" +from __future__ import annotations +import json +import re +import sys +from pathlib import Path + +import yaml + +REX_DIR = Path("/Users/guto/ufo/scripts/reextract") +OUT_DIR = Path("/Users/guto/ufo/raw") + + +def load_enums() -> dict[str, set[str]]: + raw = yaml.safe_load((REX_DIR / "enums.yaml").read_text(encoding="utf-8")) + return {k: set(v) for k, v in raw.items()} + + +def validate(data: dict, enums: dict[str, set[str]], doc_id: str) -> list[str]: + errs: list[str] = [] + + def check_enum(value, enum_name: str, ctx: str): + if value is None: return + if value not in enums.get(enum_name, set()): + errs.append(f"{ctx}: '{value}' not in enum:{enum_name}") + + def check_list_enum(values, enum_name: str, ctx: str): + if not isinstance(values, list): return + for i, v in enumerate(values): + check_enum(v, enum_name, f"{ctx}[{i}]") + + # top-level + if data.get("doc_id") != doc_id: + errs.append(f"top: doc_id mismatch: '{data.get('doc_id')}' != '{doc_id}'") + check_enum(data.get("doc_classification"), "doc_classification", "top.doc_classification") + check_enum(data.get("noise_emission"), "noise_emission", "top.noise_emission") + check_enum(data.get("investigative_value"), "investigative_value", "top.investigative_value") + check_list_enum(data.get("primary_topics"), "primary_topics", "top.primary_topics") + + # known chunk IDs from index — to verify evidence_chunks exist + idx_path = OUT_DIR / f"{doc_id}--subagent" / "_index.json" + known_chunks: set[str] = set() + if idx_path.is_file(): + try: + idx = json.loads(idx_path.read_text(encoding="utf-8")) + known_chunks = {c.get("chunk_id") for c in idx.get("chunks", [])} + except Exception: + pass + + def check_evidence(refs, ctx: str): + if not isinstance(refs, list): + errs.append(f"{ctx}: evidence_chunks must be list") + return + if not refs: + errs.append(f"{ctx}: evidence_chunks empty") + return + for r in refs: + if not isinstance(r, str) or not 
re.match(r"^c\d+$", r): + errs.append(f"{ctx}: bad chunk_id '{r}'") + elif known_chunks and r not in known_chunks: + errs.append(f"{ctx}: unknown chunk_id '{r}' (not in _index.json)") + + # events + for i, ev in enumerate(data.get("events") or []): + ctx = f"events[{i}]" + if not isinstance(ev, dict): + errs.append(f"{ctx}: not object"); continue + check_enum(ev.get("event_class"), "event_class", f"{ctx}.event_class") + check_enum(ev.get("date_confidence"), "date_confidence", f"{ctx}.date_confidence") + check_enum(ev.get("primary_location_geo_class"), "geo_class", f"{ctx}.primary_location_geo_class") + check_enum(ev.get("confidence"), "confidence", f"{ctx}.confidence") + check_evidence(ev.get("evidence_chunks"), ctx) + for j, o in enumerate(ev.get("observers") or []): + check_enum(o.get("role_at_event") if isinstance(o, dict) else None, + "person_class", f"{ctx}.observers[{j}].role_at_event") + for j, u in enumerate(ev.get("uap_objects_observed") or []): + if not isinstance(u, dict): continue + check_enum(u.get("shape"), "uap_shape", f"{ctx}.uap[{j}].shape") + check_enum(u.get("color"), "uap_color", f"{ctx}.uap[{j}].color") + check_enum(u.get("medium"), "uap_medium", f"{ctx}.uap[{j}].medium") + # date format + for k in ("date_start", "date_end"): + v = ev.get(k) + if v and not re.match(r"^\d{4}(-\d{2}(-\d{2})?)?$|^XXXX(-XX(-XX)?)?$", v): + errs.append(f"{ctx}.{k}: bad date format '{v}'") + + # people + for i, p in enumerate(data.get("people") or []): + ctx = f"people[{i}]" + if not isinstance(p, dict): + errs.append(f"{ctx}: not object"); continue + check_enum(p.get("person_class"), "person_class", f"{ctx}.person_class") + check_enum(p.get("confidence"), "confidence", f"{ctx}.confidence") + check_evidence(p.get("evidence_chunks"), ctx) + + # organizations + for i, o in enumerate(data.get("organizations") or []): + ctx = f"organizations[{i}]" + if not isinstance(o, dict): + errs.append(f"{ctx}: not object"); continue + check_enum(o.get("org_class"), "org_class", 
f"{ctx}.org_class") + check_enum(o.get("confidence"), "confidence", f"{ctx}.confidence") + check_evidence(o.get("evidence_chunks"), ctx) + + # locations + for i, l in enumerate(data.get("locations") or []): + ctx = f"locations[{i}]" + if not isinstance(l, dict): + errs.append(f"{ctx}: not object"); continue + check_enum(l.get("geo_class"), "geo_class", f"{ctx}.geo_class") + check_enum(l.get("confidence"), "confidence", f"{ctx}.confidence") + check_evidence(l.get("evidence_chunks"), ctx) + + # relations + valid_classes = {"person", "event", "organization", "location", "uap_object", "document"} + for i, r in enumerate(data.get("relations") or []): + ctx = f"relations[{i}]" + if not isinstance(r, dict): + errs.append(f"{ctx}: not object"); continue + check_enum(r.get("type"), "relation_type", f"{ctx}.type") + check_enum(r.get("confidence"), "confidence", f"{ctx}.confidence") + check_evidence(r.get("evidence_chunks"), ctx) + for k in ("source_class", "target_class"): + v = r.get(k) + if v not in valid_classes: + errs.append(f"{ctx}.{k}: '{v}' not in {valid_classes}") + + return errs + + +def main() -> int: + if len(sys.argv) < 2: + sys.exit("usage: validate.py <doc-id> [<json-path>]") + doc_id = sys.argv[1] + json_path = sys.argv[2] if len(sys.argv) > 2 else str(OUT_DIR / f"{doc_id}--subagent" / "_reextract.json") + p = Path(json_path) + if not p.is_file(): + sys.exit(f"json not found: {p}") + try: + data = json.loads(p.read_text(encoding="utf-8")) + except json.JSONDecodeError as e: + sys.exit(f"JSON parse error: {e}") + enums = load_enums() + errs = validate(data, enums, doc_id) + if errs: + print(f"❌ {len(errs)} validation errors for {doc_id}:") + for e in errs[:50]: + print(f" - {e}") + if len(errs) > 50: + print(f" ... 
+{len(errs) - 50} more") + return 1 + print(f"✓ valid: {doc_id}") + print(f" events: {len(data.get('events') or [])}") + print(f" people: {len(data.get('people') or [])}") + print(f" orgs: {len(data.get('organizations') or [])}") + print(f" locs: {len(data.get('locations') or [])}") + print(f" rels: {len(data.get('relations') or [])}") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/synthesize/30_rebuild_wiki_from_reextract.py b/scripts/synthesize/30_rebuild_wiki_from_reextract.py new file mode 100644 index 0000000..477fac6 --- /dev/null +++ b/scripts/synthesize/30_rebuild_wiki_from_reextract.py @@ -0,0 +1,447 @@ +#!/usr/bin/env python3 +""" +30_rebuild_wiki_from_reextract.py — Rebuild wiki/entities/ from scratch using +the 116 _reextract.json files as the SOLE source of truth. + +Pipeline: + 1. Load every raw/<doc>--subagent/_reextract.json + 2. Load every raw/<doc>--subagent/_index.json (chunk_id → page map) + 3. Cross-doc dedup: + person/org/loc: by canonical_name (lowercase, ASCII-fold) + event: by event_id (EV-YYYY-MM-DD-slug) + uap_object: per (event, observed_index) — never deduped cross-event + 4. Generate IDs per CLAUDE.md regex + 5. Write wiki/entities/{type}/<id>.md (clean frontmatter + EN/PT-BR body stubs) + 6. Print summary + +Does NOT touch DB. DB sync is a separate step. +Idempotent: re-running with same inputs produces same outputs (deterministic). 
+""" +from __future__ import annotations +import json +import re +import sys +import unicodedata +from collections import defaultdict +from datetime import datetime, timezone +from pathlib import Path + +import yaml + +UFO = Path("/Users/guto/ufo") +RAW = UFO / "raw" +ENT = UFO / "wiki" / "entities" + +SCHEMA_VERSION = "0.1.0" +WIKI_VERSION = "0.1.0" +NOW = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + + +def canonicalize_name(name: str) -> str: + """Generic name → kebab-case ASCII-fold id (mirrors scripts/03-dedup-entities.py).""" + if not name: + return "" + nfkd = unicodedata.normalize("NFKD", name) + ascii_str = "".join(c for c in nfkd if not unicodedata.combining(c)) + lower = ascii_str.lower() + replaced = re.sub(r"[^a-z0-9-]", "-", lower) + collapsed = re.sub(r"-+", "-", replaced).strip("-") + if collapsed and collapsed[0].isdigit(): + collapsed = "x-" + collapsed + return collapsed + + +def event_id_from(label: str, date_start: str | None) -> str: + slug = canonicalize_name(label or "")[:40].strip("-") or "unlabeled" + date = date_start or "" + m = re.match(r"^(\d{4})-(\d{2})-(\d{2})$", date) + if m: return f"EV-{m.group(1)}-{m.group(2)}-{m.group(3)}-{slug}" + m = re.match(r"^(\d{4})-(\d{2})$", date) + if m: return f"EV-{m.group(1)}-{m.group(2)}-XX-{slug}" + m = re.match(r"^(\d{4})$", date) + if m: return f"EV-{m.group(1)}-XX-XX-{slug}" + return f"EV-XXXX-XX-XX-{slug}" + + +def uap_object_id(event_id: str, index: int) -> str: + if event_id.startswith("EV-"): + parts = event_id[3:].split("-", 4) + if len(parts) >= 4: + year = parts[0] + slug = "-".join(parts[3:]) + compact = re.sub(r"[^A-Z0-9]", "", slug.upper())[:20] or "UNK" + return f"OBJ-EV{year}-{compact}-{index:02d}" + return f"OBJ-UNK-{index:02d}" + + +def dump_yaml(obj: dict) -> str: + """Stable YAML dump matching existing entity file style.""" + return yaml.safe_dump(obj, sort_keys=False, allow_unicode=True, + default_flow_style=False, width=10_000).strip() + + +def write_entity(path: 
Path, frontmatter: dict, body_title: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + content = ( + f"---\n{dump_yaml(frontmatter)}\n---\n\n" + f"# {body_title}\n\n" + f"## Description (EN)\n\n" + f"## Descrição (PT-BR)\n" + ) + path.write_text(content, encoding="utf-8") + + +def load_chunk_to_page(doc_id: str) -> dict[str, int]: + idx_path = RAW / f"{doc_id}--subagent" / "_index.json" + if not idx_path.is_file(): return {} + try: + idx = json.loads(idx_path.read_text(encoding="utf-8")) + return {c.get("chunk_id"): c.get("page") for c in (idx.get("chunks") or []) + if c.get("chunk_id") and c.get("page") is not None} + except Exception: + return {} + + +def page_refs_for(doc_id: str, chunks: list[str], chunk_to_page: dict[str, int]) -> list[str]: + pages = set() + for c in chunks or []: + p = chunk_to_page.get(c) + if p is not None: pages.add(int(p)) + return [f"[[{doc_id}/p{p:03d}]]" for p in sorted(pages)] + + +# ───────────────────────────────────────────────────────────────────────────── +# AGGREGATION +# ───────────────────────────────────────────────────────────────────────────── + +class EntityBucket: + """Aggregates one entity across multiple documents.""" + __slots__ = ("ent_id", "canonical_name", "aliases", "first_class", + "by_doc", "extra") + + def __init__(self, ent_id: str, canonical_name: str): + self.ent_id = ent_id + self.canonical_name = canonical_name + self.aliases: set[str] = set() + self.first_class: str | None = None + # doc_id → {chunks: list, raw: dict} + self.by_doc: dict[str, dict] = {} + self.extra: dict = {} # type-specific scratch (affiliation, geo_class, etc.) 
+ + def add_occurrence(self, doc_id: str, raw_entity: dict, ent_class: str | None): + if self.first_class is None and ent_class: + self.first_class = ent_class + if raw_entity.get("name") or raw_entity.get("label"): + self.aliases.add((raw_entity.get("name") or raw_entity.get("label")).strip()) + for a in raw_entity.get("aliases_in_doc") or []: + if a and a.strip(): self.aliases.add(a.strip()) + self.by_doc.setdefault(doc_id, {"chunks": [], "raw": raw_entity}) + ev = raw_entity.get("evidence_chunks") or [] + self.by_doc[doc_id]["chunks"] = sorted(set(self.by_doc[doc_id]["chunks"]) | set(ev)) + + +def merge_dates(buckets: dict[str, EntityBucket], get_date) -> dict[str, dict]: + """Aggregate per-bucket dates from per-doc raw_entity. (For events only.)""" + out = {} + for k, b in buckets.items(): + for doc_id, occ in b.by_doc.items(): + d = get_date(occ["raw"]) + if d: + out.setdefault(k, {}).setdefault("dates", set()).add(d) + return out + + +def aggregate_all() -> dict: + """Walk all _reextract.json files. 
Return a structured aggregation.""" + people: dict[str, EntityBucket] = {} + orgs: dict[str, EntityBucket] = {} + locs: dict[str, EntityBucket] = {} + events: dict[str, EntityBucket] = {} + uap_objs: dict[str, EntityBucket] = {} # per (doc, event, idx) — never deduped + relations: list[dict] = [] + docs_processed = 0 + chunk_maps: dict[str, dict[str, int]] = {} + + for jpath in sorted(RAW.glob("*--subagent/_reextract.json")): + doc_id = jpath.parent.name.removesuffix("--subagent") + try: + data = json.loads(jpath.read_text(encoding="utf-8")) + except Exception as e: + print(f" skip {doc_id}: {e}", file=sys.stderr); continue + docs_processed += 1 + chunk_maps[doc_id] = load_chunk_to_page(doc_id) + + # people + for p in data.get("people") or []: + name = (p.get("name") or "").strip() + if not name or name.lower() == "unknown": continue + pid = canonicalize_name(name) + if not pid: continue + bucket = people.setdefault(pid, EntityBucket(pid, name)) + bucket.add_occurrence(doc_id, p, p.get("person_class")) + + # organizations + for o in data.get("organizations") or []: + name = (o.get("name") or "").strip() + if not name or name.lower() == "unknown": continue + oid = canonicalize_name(name) + if not oid: continue + bucket = orgs.setdefault(oid, EntityBucket(oid, name)) + bucket.add_occurrence(doc_id, o, o.get("org_class")) + + # locations + for l in data.get("locations") or []: + name = (l.get("name") or "").strip() + if not name or name.lower() == "unknown": continue + lid = canonicalize_name(name) + if not lid: continue + bucket = locs.setdefault(lid, EntityBucket(lid, name)) + bucket.add_occurrence(doc_id, l, l.get("geo_class")) + + # events + for e in data.get("events") or []: + label = (e.get("label") or "").strip() + if not label: continue + eid = event_id_from(label, e.get("date_start")) + bucket = events.setdefault(eid, EntityBucket(eid, label)) + bucket.add_occurrence(doc_id, e, e.get("event_class")) + + # uap_objects — never cross-event-deduped; inherit 
parent event's evidence_chunks + event_chunks = e.get("evidence_chunks") or [] + for i, u in enumerate(e.get("uap_objects_observed") or [], 1): + if not isinstance(u, dict): continue + uid = uap_object_id(eid, i) + ubucket = uap_objs.setdefault(uid, EntityBucket(uid, f"{label} — object {i}")) + u_with_evidence = {**u, "evidence_chunks": u.get("evidence_chunks") or event_chunks} + ubucket.add_occurrence(doc_id, u_with_evidence, u.get("shape")) + ubucket.extra.setdefault("event_id", eid) + + # relations — collected raw, mapped to canonical IDs later + for r in data.get("relations") or []: + if not isinstance(r, dict): continue + relations.append({"doc_id": doc_id, **r}) + + return { + "docs_processed": docs_processed, + "people": people, "organizations": orgs, "locations": locs, + "events": events, "uap_objects": uap_objs, + "relations": relations, "chunk_maps": chunk_maps, + } + + +# ───────────────────────────────────────────────────────────────────────────── +# WRITERS +# ───────────────────────────────────────────────────────────────────────────── + +def write_person(b: EntityBucket, chunk_maps: dict) -> None: + mentioned_in = sorted({ + ref + for doc_id, occ in b.by_doc.items() + for ref in page_refs_for(doc_id, occ["chunks"], chunk_maps.get(doc_id, {})) + }) + affiliations = sorted({ + (occ["raw"].get("affiliation") or "").strip() + for occ in b.by_doc.values() if occ["raw"].get("affiliation") + } - {""}) + roles = sorted({ + (occ["raw"].get("role_at_doc_date") or "").strip() + for occ in b.by_doc.values() if occ["raw"].get("role_at_doc_date") + } - {""}) + fm = { + "schema_version": SCHEMA_VERSION, + "type": "entity", + "entity_class": "person", + "person_id": b.ent_id, + "canonical_name": b.canonical_name, + "aliases": sorted(b.aliases), + "person_class": b.first_class, + "affiliations": affiliations, + "roles": roles, + "mentioned_in": mentioned_in, + "total_mentions": sum(len(occ["chunks"]) for occ in b.by_doc.values()), + "documents_count": len(b.by_doc), 
+ "enrichment_status": "none", + "last_ingest": NOW, + "wiki_version": WIKI_VERSION, + "source": "reextract-v1", + } + write_entity(ENT / "people" / f"{b.ent_id}.md", fm, b.canonical_name) + + +def write_org(b: EntityBucket, chunk_maps: dict) -> None: + mentioned_in = sorted({ + ref for doc_id, occ in b.by_doc.items() + for ref in page_refs_for(doc_id, occ["chunks"], chunk_maps.get(doc_id, {})) + }) + countries = sorted({ + (occ["raw"].get("country") or "").strip() + for occ in b.by_doc.values() if occ["raw"].get("country") + } - {""}) + fm = { + "schema_version": SCHEMA_VERSION, + "type": "entity", + "entity_class": "organization", + "organization_id": b.ent_id, + "canonical_name": b.canonical_name, + "aliases": sorted(b.aliases), + "org_class": b.first_class, + "countries": countries, + "mentioned_in": mentioned_in, + "total_mentions": sum(len(occ["chunks"]) for occ in b.by_doc.values()), + "documents_count": len(b.by_doc), + "enrichment_status": "none", + "last_ingest": NOW, + "wiki_version": WIKI_VERSION, + "source": "reextract-v1", + } + write_entity(ENT / "organizations" / f"{b.ent_id}.md", fm, b.canonical_name) + + +def write_location(b: EntityBucket, chunk_maps: dict) -> None: + mentioned_in = sorted({ + ref for doc_id, occ in b.by_doc.items() + for ref in page_refs_for(doc_id, occ["chunks"], chunk_maps.get(doc_id, {})) + }) + countries = sorted({ + (occ["raw"].get("country") or "").strip() + for occ in b.by_doc.values() if occ["raw"].get("country") + } - {""}) + regions = sorted({ + (occ["raw"].get("region_or_state") or "").strip() + for occ in b.by_doc.values() if occ["raw"].get("region_or_state") + } - {""}) + fm = { + "schema_version": SCHEMA_VERSION, + "type": "entity", + "entity_class": "location", + "location_id": b.ent_id, + "canonical_name": b.canonical_name, + "aliases": sorted(b.aliases), + "geo_class": b.first_class, + "countries": countries, + "regions_or_states": regions, + "mentioned_in": mentioned_in, + "total_mentions": 
def write_event(b: EntityBucket, chunk_maps: dict) -> None:
    """Emit the wiki entity file for one aggregated event.

    Args:
        b: Bucket carrying the event's id, canonical name, aliases,
            first-seen class and its per-document occurrences (``b.by_doc``).
        chunk_maps: Per-document lookup passed through to ``page_refs_for``;
            a document without an entry is given an empty mapping.

    Per-document values are merged as follows: the smallest ``date_start``
    and the largest ``date_end`` win, location names / geo classes are
    collected as sorted distinct lists, and each narrative field keeps the
    longest stripped text seen in any document (``None`` when all are empty).
    """
    occurrences = b.by_doc

    page_refs = set()
    for doc_id, occ in occurrences.items():
        page_refs.update(page_refs_for(doc_id, occ["chunks"], chunk_maps.get(doc_id, {})))

    starts, ends, location_names, geo_classes = set(), set(), set(), set()
    for occ in occurrences.values():
        raw = occ["raw"]
        if raw.get("date_start"):
            starts.add(raw.get("date_start"))
        if raw.get("date_end"):
            ends.add(raw.get("date_end"))
        if raw.get("primary_location_name"):
            location_names.add(raw.get("primary_location_name").strip())
        if raw.get("primary_location_geo_class"):
            geo_classes.add(raw.get("primary_location_geo_class"))
    location_names.discard("")
    geo_classes.discard(None)

    def longest(field):
        # Longest stripped value across documents; the first one wins a tie.
        texts = [(occ["raw"].get(field) or "").strip() for occ in occurrences.values()]
        return max(texts, key=len, default="") or None

    frontmatter = {
        "schema_version": SCHEMA_VERSION,
        "type": "entity",
        "entity_class": "event",
        "event_id": b.ent_id,
        "canonical_name": b.canonical_name,
        "aliases": sorted(b.aliases),
        "event_class": b.first_class,
        "date_start": min(starts) if starts else None,
        "date_end": max(ends) if ends else None,
        "date_confidence": None,
        "primary_location_names": sorted(location_names),
        "primary_location_geo_classes": sorted(geo_classes),
        "narrative_summary_en": longest("narrative_summary"),
        "narrative_summary_pt_br": longest("narrative_summary_pt_br"),
        "mentioned_in": sorted(page_refs),
        "total_mentions": sum(len(occ["chunks"]) for occ in occurrences.values()),
        "documents_count": len(occurrences),
        "enrichment_status": "none",
        "last_ingest": NOW,
        "wiki_version": WIKI_VERSION,
        "source": "reextract-v1",
    }
    write_entity(ENT / "events" / f"{b.ent_id}.md", frontmatter, b.canonical_name)
agg["organizations"].values(): write_org(b, cmaps); written["organizations"] += 1 + for b in agg["locations"].values(): write_location(b, cmaps); written["locations"] += 1 + for b in agg["events"].values(): write_event(b, cmaps); written["events"] += 1 + for b in agg["uap_objects"].values(): write_uap_object(b, cmaps);written["uap_objects"] += 1 + for k, n in written.items(): print(f" {k}: {n}") + + print(f"\n[3/3] Saving relations index to wiki/entities/_relations.json (for downstream ingest)") + rels_path = ENT / "_relations.json" + rels_path.write_text(json.dumps({ + "schema_version": SCHEMA_VERSION, + "rebuilt_at": NOW, + "count": len(agg["relations"]), + "relations": agg["relations"], + }, indent=2, ensure_ascii=False), encoding="utf-8") + print(f" saved {len(agg['relations'])} relations to {rels_path}") + print(f"\n✓ done.") + + +if __name__ == "__main__": + main() diff --git a/scripts/synthesize/31_aggregate_pages_from_chunks.py b/scripts/synthesize/31_aggregate_pages_from_chunks.py new file mode 100644 index 0000000..1f9dfa2 --- /dev/null +++ b/scripts/synthesize/31_aggregate_pages_from_chunks.py @@ -0,0 +1,212 @@ +#!/usr/bin/env python3 +""" +31_aggregate_pages_from_chunks.py — Generate thin wiki/pages/<doc>/p<NNN>.md +files for pages where the chunks/ already have content but the per-page vision +pipeline (02-vision-page.py) never produced an aggregator file. + +Source of truth: raw/<doc>--subagent/_index.json + chunks/c*.md (Sonnet-extracted) +Output: wiki/pages/<doc>/p<NNN>.md (thin aggregator, tagged source:chunk-aggregator) + +Skips pages that already have a wiki/pages/.md (idempotent). 
def split_frontmatter(text: str) -> tuple[dict, str]:
    """Split a chunk file into (frontmatter dict, body text).

    The frontmatter is the region between an opening ``---`` at the very
    start of ``text`` and the next line that consists of ``---`` only.
    Each ``key: value`` line inside it is stored as a flat string pair; this
    is a deliberately shallow reader, not a YAML parser, so nested values,
    quoting and ``null`` are returned as raw text.

    Args:
        text: Full contents of a chunk ``.md`` file.

    Returns:
        ``(fm, body)``. When ``text`` does not start with ``---`` or has no
        closing delimiter line, ``({}, text)`` is returned unchanged. The
        body starts immediately after the closing ``---`` (leading newline
        included).
    """
    if not text.startswith("---"):
        return {}, text
    first_nl = text.find("\n")
    if first_nl < 0:
        return {}, text

    # The closing delimiter must sit on a line of its own. Splitting on the
    # bare substring "---" would cut the frontmatter short whenever a value
    # (e.g. extracted_text) happens to contain three dashes.
    closing = re.compile(r"^---[ \t]*\r?$", re.M).search(text, first_nl + 1)
    if closing is None:
        return {}, text

    fm: dict = {}
    for line in text[first_nl + 1:closing.start()].splitlines():
        m = re.match(r"^([a-zA-Z_]+):\s*(.*)$", line)
        if not m:
            continue
        fm[m.group(1)] = m.group(2).strip()
    return fm, text[closing.start() + 3:]
def find_missing_pages() -> dict[str, list[int]]:
    """Map each doc id to the sorted page numbers that still lack a page file.

    A page counts as missing when ``PNG_BASE/<doc>/p-<n>.png`` exists but
    ``PAGES_BASE/<doc>/p<NNN>.md`` (page number zero-padded to three digits)
    is not a regular file. Documents with nothing missing are omitted.
    """
    pages_by_doc: dict[str, list[int]] = {}
    for png_file in PNG_BASE.glob("*/p-*.png"):
        hit = re.match(r"p-(\d+)\.png$", png_file.name)
        if hit is None:
            # Name matched the glob but carries no purely numeric page part.
            continue
        page_no = int(hit.group(1))
        owner = png_file.parent.name
        if (PAGES_BASE / owner / f"p{page_no:03d}.md").is_file():
            continue
        pages_by_doc.setdefault(owner, []).append(page_no)
    return {owner: sorted(numbers) for owner, numbers in pages_by_doc.items()}
+ types_seen.add(ctype) + chunk_path = sub / "chunks" / f"{cid}.md" + if not chunk_path.is_file(): continue + text = chunk_path.read_text(encoding="utf-8") + fm, body = split_frontmatter(text) + en, pt = extract_bilingual(body) + + if not en and not pt: + # fall back to extracted_text / image_description fields + en = (fm.get("image_description_en") or fm.get("extracted_text") or "").strip().strip('"\'') + pt = (fm.get("image_description_pt_br") or "").strip().strip('"\'') + + # Heuristic flags + if ctype in ("redaction", "redacted_block"): has_redaction = True + if "image" in ctype or "photo" in ctype or "diagram" in ctype or "sketch" in ctype or "map" in ctype: + has_image = True + if "table" in ctype: has_table = True + if "stamp" in ctype: has_stamp = True + if "signature" in ctype: has_signature = True + cls = fm.get("classification") + if cls and cls != "null": classifications.add(cls) + + # Body block + block = f"### Chunk `{cid}` — type: {ctype}\n" + bbox = c.get("bbox") or {} + if bbox: + block += f"_bbox_: x={bbox.get('x')}, y={bbox.get('y')}, w={bbox.get('w')}, h={bbox.get('h')}\n\n" + if en: block += f"**EN:** {en}\n\n" + if pt: block += f"**PT-BR:** {pt}\n" + body_blocks.append(block.rstrip()) + + # Content classification + content_class = [] + if has_image: content_class.append("contains-photos") + if has_table: content_class.append("contains-tables") + if has_stamp: content_class.append("contains-stamps") + if has_signature: content_class.append("contains-signatures") + if has_redaction: content_class.append("redaction-heavy") + if not content_class: content_class.append("text-only") + + # Page-level inferred type (best-effort) + if "classification_banner" in types_seen and len(types_seen) <= 3: + page_type = "cover" + elif "header" in types_seen and "transcript_block" in types_seen: + page_type = "transcript" + elif has_table and not body_blocks: + page_type = "table_only" + elif "letterhead" in types_seen: + page_type = "memo" + else: + page_type = 
"mixed" + + # Frontmatter + fm = { + "schema_version": SCHEMA_VERSION, + "type": "page", + "page_id": f"{doc_id}/p{page_num:03d}", + "doc_id": doc_id, + "page_number": page_num, + "total_pages": total_pages, + "png_path": rel_png, + "page_type": page_type, + "content_classification": content_class, + "classification_markings": [{"level": c} for c in sorted(classifications)] if classifications else [], + "chunks_on_page": chunk_ids, + "chunk_count": len(chunk_ids), + "source": "chunk-aggregator", + "source_note": "Page-md generated from chunks built by Sonnet vision (raw/<doc>--subagent/chunks/). Per-page vision Haiku pipeline (02-vision-page.py) never produced an output for this page.", + "last_ingest": NOW, + "wiki_version": WIKI_VERSION, + } + import yaml + yaml_block = yaml.safe_dump(fm, sort_keys=False, allow_unicode=True, + default_flow_style=False, width=10_000).rstrip() + body = "\n\n".join(body_blocks) if body_blocks else "_(no extractable text — see chunk files directly)_" + return f"---\n{yaml_block}\n---\n\n# Page {page_num} of {doc_id}\n\n{body}\n" + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--doc-id", default=None) + ap.add_argument("--dry-run", action="store_true") + args = ap.parse_args() + + missing = find_missing_pages() + if args.doc_id: + missing = {args.doc_id: missing.get(args.doc_id, [])} + + total_missing = sum(len(ps) for ps in missing.values()) + print(f"[1/2] Inventory: {sum(1 for d, ps in missing.items() if ps)} docs, {total_missing} missing pages") + if args.dry_run: + for d, ps in sorted(missing.items(), key=lambda kv: -len(kv[1])): + if ps: print(f" {d}: {len(ps)}") + return 0 + + print(f"\n[2/2] Generating thin aggregator page.md files ...") + written = 0 + skipped_no_chunks = 0 + for doc_id, pages in missing.items(): + for n in pages: + md = build_page_md(doc_id, n) + if md is None: + skipped_no_chunks += 1 + continue + out = PAGES_BASE / doc_id / f"p{n:03d}.md" + out.parent.mkdir(parents=True, exist_ok=True) 
+ out.write_text(md, encoding="utf-8") + written += 1 + + print(f" written: {written}") + print(f" skipped (no chunk data): {skipped_no_chunks}") + print(f"\n✓ done.") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/synthesize/32_reprocess_missing_pages.py b/scripts/synthesize/32_reprocess_missing_pages.py new file mode 100644 index 0000000..37ad996 --- /dev/null +++ b/scripts/synthesize/32_reprocess_missing_pages.py @@ -0,0 +1,375 @@ +#!/usr/bin/env python3 +""" +32_reprocess_missing_pages.py — Reprocess pages that the doc-rebuilder agent +silently dropped due to context-window overflow. + +For each doc: + 1. Read raw/<doc>--subagent/_index.json (current chunk inventory) + 2. Find missing pages: PNGs that exist but have no chunks + 3. For each missing page, call `claude -p --model sonnet` with the page PNG + and ask for a chunks JSON (matching the page-rebuilder schema) + 4. Append new chunks to _index.json with continued global IDs (chunk_id, order_global) + 5. Write new chunks/c<NNNN>.md files + +Idempotent — re-running skips pages already processed. +Uses WORKERS=2 to avoid hammering OAuth rate limits. + +Usage: + python3 scripts/synthesize/32_reprocess_missing_pages.py --dry-run + python3 scripts/synthesize/32_reprocess_missing_pages.py --doc-id <id> + WORKERS=2 python3 scripts/synthesize/32_reprocess_missing_pages.py +""" +from __future__ import annotations +import argparse +import json +import os +import re +import subprocess +import sys +import tempfile +from concurrent.futures import ThreadPoolExecutor, as_completed +from datetime import datetime, timezone +from pathlib import Path + +UFO = Path("/Users/guto/ufo") +RAW = UFO / "raw" +PNG_BASE = UFO / "processing" / "png" + +NOW = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") +SONNET_MODEL = "sonnet" +WORKERS = int(os.environ.get("WORKERS", "2")) + +PROMPT_TEMPLATE = """You are a page-rebuilder for a declassified UAP/UFO government document. 
+ +You will receive the path of ONE page image. Read it with the Read tool, then analyze it carefully and extract ALL content as structured chunks. + +DOCUMENT_ID: {doc_id} +PAGE_NUMBER: {page_num} +PNG_PATH: {png_path} + +Return ONE JSON object with this exact structure (no markdown fence, no preamble, no postscript): + +{{ + "page_number": {page_num}, + "chunks": [ + {{ + "order_in_page": 1, + "type": "<chunk_type>", + "content_en": "<English verbatim text or visual description>", + "content_pt_br": "<Brazilian Portuguese translation>", + "bbox": {{"x": 0.0, "y": 0.0, "w": 1.0, "h": 0.1}}, + "classification": null, + "formatting": [], + "cross_page_hint": "self_contained", + "ocr_confidence": 0.85, + "redaction_code": null, + "redaction_inferred_content_type": null, + "image_type": null, + "ufo_anomaly_detected": false, + "ufo_anomaly_type": null, + "ufo_anomaly_rationale": null, + "cryptid_anomaly_detected": false, + "cryptid_anomaly_type": null, + "cryptid_anomaly_rationale": null, + "image_description_en": null, + "image_description_pt_br": null, + "extracted_text": null + }} + ] +}} + +CHUNK TYPES (use only these): letterhead, classification_banner, header, subheader, paragraph, list_item, caption, footnote, page_number, signature_block, stamp, redaction_block, image, table_marker, form_field, watermark, separator, blank, annotation, transcript_block. + +RULES: +1. Extract EVERY element on the page — nothing is skipped. +2. bbox is normalized coords (0.0..1.0) relative to the page image. +3. content_en is verbatim OCR text for text chunks; for images, describe what you see. +4. content_pt_br is Brazilian Portuguese (NOT European Portuguese). Preserve UTF-8 accents. +5. For redactions: type="redaction_block", content_en="[REDACTED]", content_pt_br="[REDACTADO]". +6. For images/photos/diagrams/sketches/maps: type="image", describe in image_description_en/pt_br. +7. For stamps: type="stamp". +8. 
def extract_json_block(s: str) -> str:
    """Pull the JSON object out of a model reply that may carry extra text.

    Handles three kinds of noise around the payload: a surrounding markdown
    fence (```` ``` ```` lines are dropped), a prose preamble before the
    first ``{``, and trailing commentary after the object.

    Args:
        s: Raw text returned by the model.

    Returns:
        The text of the first JSON object when one can be decoded starting
        at the first ``{``. Otherwise the legacy best-effort slice from the
        first ``{`` to the last ``}``, or the stripped input when no such
        pair exists. The result is not guaranteed to be valid JSON; callers
        still need to handle ``json.JSONDecodeError``.
    """
    s = s.strip()
    if s.startswith("```"):
        s = "\n".join(line for line in s.splitlines() if not line.startswith("```"))
        s = s.strip()

    start = s.find("{")
    if start >= 0:
        # Let the decoder find where the object really ends. Slicing up to
        # the LAST "}" breaks as soon as trailing commentary contains a brace.
        # Only the first "{" is tried on purpose: scanning further could
        # return a nested object out of a truncated reply and hide the error.
        try:
            _, stop = json.JSONDecoder().raw_decode(s, start)
            return s[start:stop]
        except ValueError:
            pass  # not decodable from here; fall back to the plain slice

    end = s.rfind("}")
    if start >= 0 and end > start:
        return s[start:end + 1]
    return s
stderr=subprocess.PIPE, env=env, + timeout=300, + ) + if r.returncode != 0: + print(f" [FAIL] {doc_id} p{page_num:03d} — claude rc={r.returncode}: {r.stderr.decode('utf-8', errors='replace')[:300]}", file=sys.stderr) + return None + with open(tmp_path, "r", encoding="utf-8") as f: + raw = f.read() + js = extract_json_block(raw) + try: + return json.loads(js) + except json.JSONDecodeError as e: + print(f" [JSON] {doc_id} p{page_num:03d} — {e} | raw_len={len(raw)}", file=sys.stderr) + return None + finally: + try: os.unlink(tmp_path) + except OSError: pass + + +def find_missing_pages_per_doc() -> dict[str, list[int]]: + """For each doc, find pages that have a PNG but no chunks in _index.json. + Excludes the Poppler-phantom (last) page only if pdf_pages is known and PNG == pdf+1.""" + result: dict[str, list[int]] = {} + import subprocess as sp + # Try to map pdf_pages by exact filename matching + pdf_pages_map: dict[str, int] = {} + for p in RAW.glob("*.pdf"): + try: + out = sp.run(["pdfinfo", str(p)], capture_output=True, text=True, timeout=30).stdout + m = re.search(r"Pages:\s+(\d+)", out) + if m: + # filename → doc_id (same algorithm as page-rebuilder did) + import unicodedata + nfd = unicodedata.normalize("NFD", p.stem) + ascii_str = "".join(c for c in nfd if not unicodedata.combining(c)) + slug = re.sub(r"-+", "-", re.sub(r"[^a-z0-9-]", "-", ascii_str.lower())).strip("-") + if slug and slug[0].isdigit(): slug = "doc-" + slug + pdf_pages_map[slug] = int(m.group(1)) + except Exception: pass + + for png_dir in PNG_BASE.glob("*/"): + doc_id = png_dir.name + pngs = sorted( + int(re.match(r"p-(\d+)\.png", p.name).group(1)) + for p in png_dir.glob("p-*.png") if re.match(r"p-\d+\.png", p.name) + ) + if not pngs: continue + idx_path = RAW / f"{doc_id}--subagent" / "_index.json" + if not idx_path.is_file(): continue + try: + idx = json.loads(idx_path.read_text(encoding="utf-8")) + except Exception: continue + pages_in_chunks = {c.get("page") for c in idx.get("chunks", []) 
def integrate_page_chunks(doc_id: str, page_num: int, page_result: dict, idx: dict) -> int:
    """Append one reprocessed page's chunks to ``idx`` and write their files.

    New chunks continue the document's global numbering: the next
    ``order_global`` is one past the highest value already in ``idx`` and
    the ``chunk_id`` number is kept equal to it. ``idx`` is modified in
    place; persisting it is left to the caller.

    Args:
        doc_id: Document id; chunk files go to ``RAW/<doc_id>--subagent/chunks``.
        page_num: 1-based page number the chunks belong to.
        page_result: Parsed model reply; its ``"chunks"`` list is consumed.
        idx: Parsed ``_index.json`` of the document.

    Returns:
        Number of chunks added (0 when the reply has no chunks).

    Raises:
        ValueError: If ``"chunks"`` is not a list of objects. This is checked
            before anything is written, so a malformed reply leaves no
            orphan chunk files behind.
    """
    chunks = page_result.get("chunks") or []
    if not chunks:
        return 0
    if not isinstance(chunks, list) or not all(isinstance(c, dict) for c in chunks):
        raise ValueError("page result 'chunks' must be a list of objects")

    sub = RAW / f"{doc_id}--subagent"
    chunks_dir = sub / "chunks"
    chunks_dir.mkdir(exist_ok=True)
    # Determine next global order (a null order_global counts as 0)
    next_global = max((c.get("order_global") or 0 for c in idx.get("chunks") or []), default=0) + 1
    # chunk_id numbering runs in lockstep with order_global
    next_id_num = next_global
    rel_png = f"../../processing/png/{doc_id}/p-{page_num:03d}.png"
    total = len(chunks)
    new_index_entries = []
    for i, c in enumerate(chunks, 1):
        cid = f"c{next_id_num:04d}"
        ctype = c.get("type") or "paragraph"
        en = c.get("content_en") or ""
        pt = c.get("content_pt_br") or ""
        # Only a missing confidence gets the default; a reported 0.0 is kept.
        confidence = c.get("ocr_confidence")
        if confidence is None:
            confidence = 0.85
        entry = {
            "chunk_id": cid,
            "type": ctype,
            "page": page_num,
            "order_in_page": c.get("order_in_page") or i,
            "order_global": next_id_num,
            "file": f"chunks/{cid}.md",
            "bbox": c.get("bbox") or {"x": 0.0, "y": 0.0, "w": 1.0, "h": 1.0},
            "preview": (en or pt or "")[:120],
        }
        new_index_entries.append(entry)

        chunk_dict = {
            "chunk_id": cid,
            "type": ctype,
            "page": page_num,
            "order_in_page": entry["order_in_page"],
            "order_global": next_id_num,
            "bbox": entry["bbox"],
            "classification": c.get("classification"),
            "formatting": c.get("formatting") or [],
            "cross_page_hint": c.get("cross_page_hint") or "self_contained",
            "prev_chunk": f"c{next_id_num-1:04d}" if next_id_num > 1 else None,
            # Link forward to the next chunk of this batch; the last one has
            # no successor yet.
            "next_chunk": f"c{next_id_num+1:04d}" if i < total else None,
            "related_image": None,
            "related_table": None,
            "ocr_confidence": confidence,
            "ocr_source_lines": [],
            "redaction_code": c.get("redaction_code"),
            "redaction_inferred_content_type": c.get("redaction_inferred_content_type"),
            "image_type": c.get("image_type"),
            "ufo_anomaly_detected": bool(c.get("ufo_anomaly_detected")),
            "cryptid_anomaly_detected": bool(c.get("cryptid_anomaly_detected")),
            "ufo_anomaly_type": c.get("ufo_anomaly_type"),
            "ufo_anomaly_rationale": c.get("ufo_anomaly_rationale"),
            "cryptid_anomaly_type": c.get("cryptid_anomaly_type"),
            "cryptid_anomaly_rationale": c.get("cryptid_anomaly_rationale"),
            "image_description_en": c.get("image_description_en"),
            "image_description_pt_br": c.get("image_description_pt_br"),
            "extracted_text": c.get("extracted_text"),
            "source_png": rel_png,
            "_body_en": en, "_body_pt": pt,
        }
        (chunks_dir / f"{cid}.md").write_text(render_chunk_md(chunk_dict), encoding="utf-8")
        next_id_num += 1

    idx.setdefault("chunks", []).extend(new_index_entries)
    return len(new_index_entries)
+ Returns (ok, chunks_added).""" + result = call_sonnet_vision(doc_id, page_num) + if not result: + print(f" [SKIP] {doc_id} p{page_num:03d} — no result", flush=True) + return (False, 0) + sub = RAW / f"{doc_id}--subagent" + idx_path = sub / "_index.json" + with _doc_lock(doc_id): + idx = json.loads(idx_path.read_text(encoding="utf-8")) + # Idempotent: if page already integrated meanwhile, skip + if any(c.get("page") == page_num for c in idx.get("chunks") or []): + print(f" [SKIP] {doc_id} p{page_num:03d} — already present", flush=True) + return (False, 0) + try: + n = integrate_page_chunks(doc_id, page_num, result, idx) + except Exception as e: + print(f" [ERR ] {doc_id} p{page_num:03d} — integrate: {e}", flush=True) + return (False, 0) + idx_path.write_text(json.dumps(idx, indent=2, ensure_ascii=False), encoding="utf-8") + print(f" [OK ] {doc_id} p{page_num:03d} — {n} chunks", flush=True) + return (True, n) + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--doc-id", default=None) + ap.add_argument("--page", type=int, default=None, help="single page for testing") + ap.add_argument("--dry-run", action="store_true") + args = ap.parse_args() + + missing = find_missing_pages_per_doc() + if args.doc_id: + missing = {args.doc_id: missing.get(args.doc_id, [])} + if args.page and args.doc_id: + missing = {args.doc_id: [args.page]} + + # Flatten (doc, page) job list — page-level parallelism + jobs: list[tuple[str, int]] = [] + for d, ps in missing.items(): + for p in ps: jobs.append((d, p)) + total = len(jobs) + print(f"[1/2] {len(missing)} docs · {total} page-jobs") + if args.dry_run: + for d, ps in sorted(missing.items(), key=lambda kv: -len(kv[1])): + if ps: print(f" {d}: {len(ps)} pages → {ps[:5]}{'...' 
/**
 * GET /api/relations?class=<entity class>&id=<entity id>
 *
 * Responds with `{ outgoing, incoming }`: the relations in which the entity
 * is the source and those in which it is the target, each capped at 200 rows
 * and ordered by confidence first.
 *
 * @returns 400 when `class` or `id` is missing or empty, 503 with
 *   `error: "db_unavailable"` when either query rejects, 200 otherwise.
 */
export async function GET(req: NextRequest): Promise<Response> {
  const u = new URL(req.url);
  const cls = u.searchParams.get("class") ?? "";
  const id = u.searchParams.get("id") ?? "";
  if (!cls || !id) return json({ error: "class and id required" }, 400);

  try {
    // The two lookups are independent, so issue them together instead of
    // waiting for the first to finish before starting the second.
    const [outgoing, incoming] = await Promise.all([
      pgQuery<Relation>(
        `SELECT source_class, source_id, relation_type, target_class, target_id,
                evidence_ref, confidence
           FROM public.relations
          WHERE source_class = $1 AND source_id = $2
          ORDER BY confidence DESC, relation_type, target_class, target_id
          LIMIT 200`,
        [cls, id],
      ),
      pgQuery<Relation>(
        `SELECT source_class, source_id, relation_type, target_class, target_id,
                evidence_ref, confidence
           FROM public.relations
          WHERE target_class = $1 AND target_id = $2
          ORDER BY confidence DESC, relation_type, source_class, source_id
          LIMIT 200`,
        [cls, id],
      ),
    ]);
    return json({ outgoing, incoming });
  } catch (e: unknown) {
    // A rejection is not guaranteed to be an Error instance; narrow before
    // reading `.message` rather than asserting the type.
    const message = e instanceof Error ? e.message : String(e);
    return json({ error: "db_unavailable", message }, 503);
  }
}
[]; - if (pageChunks.length === 0) notFound(); - const pngUrl = `/api/static/processing/png/${docId}/p-${m[1]}.png`; const totalPages = idx.total_pages; @@ -123,7 +121,18 @@ export default async function DocPageView({ <h2 className="font-mono text-xs uppercase tracking-widest text-[#7fdbff] mb-2"> trechos (ordem de leitura) </h2> - <DocRendererV2 docId={docId} chunksByPage={[[pageNum, pageChunks]]} /> + {pageChunks.length === 0 ? ( + <div className="border border-[rgba(0,255,156,0.15)] bg-[#0a121e] rounded p-6 text-sm text-[#c8d4e6]"> + <p className="font-mono text-[#7fdbff] mb-2">▍ página sem trechos extraídos</p> + <p className="text-[#5a6678] text-xs"> + O scan existe (veja à esquerda) mas o processo de chunking não gerou trechos + para esta página específica. Pode ser página em branco, divisor de seção + ou conteúdo sem texto extraível. Próxima execução do chunker preencherá. + </p> + </div> + ) : ( + <DocRendererV2 docId={docId} chunksByPage={[[pageNum, pageChunks]]} /> + )} </article> </div> diff --git a/web/app/e/[cls]/[id]/page.tsx b/web/app/e/[cls]/[id]/page.tsx index 4a5cc8f..12ee8f0 100644 --- a/web/app/e/[cls]/[id]/page.tsx +++ b/web/app/e/[cls]/[id]/page.tsx @@ -10,6 +10,7 @@ import { MarkdownBody } from "@/components/markdown-body"; import { ChatBubble } from "@/components/chat-bubble"; import { AuthBar } from "@/components/auth-bar"; import { EntityGraphMini } from "@/components/entity-graph-mini"; +import { EntityRelations } from "@/components/entity-relations"; import { getEntityCore, getEntityMentionsByDoc, @@ -101,7 +102,11 @@ export default async function EntityPage({ const totalMentions = core?.total_mentions ?? 0; const documentsCount = core?.documents_count ?? 0; const strength = core?.signal_strength ?? "unverified"; - const sigs = core?.signal_sources ?? { db_chunks: 0, page_refs: 0, cross_refs: 0 }; + const sigs = core?.signal_sources ?? 
{ db_chunks: 0, page_refs: 0, cross_refs: 0, text_refs: 0 }; + // Derived display class: orphan + curated narrative is not noise — it's a + // knowledge-curated entity the corpus simply doesn't mention. Label it apart. + const displayStrength: "strong" | "weak" | "curated" | "orphan" | "unverified" = + strength === "orphan" && core?.summary_status === "curated" ? "curated" : (strength as any); const classColor = CLASS_COLOR[folder as EntityClass]; const classBg = CLASS_BG[folder as EntityClass]; @@ -167,42 +172,54 @@ export default async function EntityPage({ )} <div className={`px-4 py-3 bg-[#0a121e] border rounded ${ - strength === "strong" + displayStrength === "strong" ? "border-[#00ff9c]" - : strength === "weak" + : displayStrength === "weak" ? "border-[#ffa500]" - : strength === "orphan" - ? "border-[#ff6b6b]" - : "border-[#5a6678]" + : displayStrength === "curated" + ? "border-[#a78bfa]" + : displayStrength === "orphan" + ? "border-[#ff6b6b]" + : "border-[#5a6678]" }`} - title="Cruzamento dos 3 sinais que confirmam esta entidade no corpus." + title="Cruzamento dos sinais que confirmam esta entidade no corpus." > <div className="font-mono text-[10px] uppercase tracking-widest text-[#5a6678]"> força do sinal </div> <div className={`font-mono text-sm mt-0.5 ${ - strength === "strong" + displayStrength === "strong" ? "text-[#00ff9c]" - : strength === "weak" + : displayStrength === "weak" ? "text-[#ffa500]" - : strength === "orphan" - ? "text-[#ff6b6b]" - : "text-[#8896aa]" + : displayStrength === "curated" + ? "text-[#a78bfa]" + : displayStrength === "orphan" + ? 
"text-[#ff6b6b]" + : "text-[#8896aa]" }`} > - {strength === "strong" && "forte"} - {strength === "weak" && "fraca"} - {strength === "orphan" && "órfã"} - {strength === "unverified" && "não verificada"} + {displayStrength === "strong" && "forte"} + {displayStrength === "weak" && "fraca"} + {displayStrength === "curated" && "curado"} + {displayStrength === "orphan" && "órfã"} + {displayStrength === "unverified" && "não verificada"} </div> <div className="font-mono text-[9px] text-[#5a6678] mt-1 leading-tight"> - {sigs.db_chunks} chunks · {sigs.page_refs} págs · {sigs.cross_refs} backlinks + {sigs.db_chunks} chunks · {sigs.page_refs} págs · {sigs.cross_refs} backlinks · {sigs.text_refs} textuais </div> </div> </div> - {strength === "orphan" && ( + {strength === "orphan" && core?.summary_status === "curated" && ( + <p className="mt-4 text-xs text-[#a78bfa] font-mono leading-relaxed"> + 📚 conhecimento curado · este evento/entidade faz parte do registro UAP/UFO + mundial mas <strong>não foi mencionado</strong> nos PDFs deste corpus (war.gov/ufo). + Narrativa abaixo vem de fonte curada manualmente, não de extração. + </p> + )} + {strength === "orphan" && core?.summary_status !== "curated" && ( <p className="mt-4 text-xs text-[#ff6b6b] font-mono"> ⚠ entidade não confirmada: nenhuma página, chunk ou outra entidade aponta para ela. Pode ser extração ruidosa do pipeline original. @@ -287,10 +304,17 @@ export default async function EntityPage({ <aside className="lg:sticky lg:top-6 lg:self-start space-y-6"> <section> <h3 className="font-mono text-[10px] text-[#8896aa] uppercase tracking-widest mb-2"> - Aparece em {documentsCount} documento(s) + Aparece em {mentionGroups.length} documento(s) </h3> {mentionGroups.length === 0 ? ( - <p className="text-[#5a6678] text-xs italic">Sem dados de mention ainda.</p> + displayStrength === "curated" ? ( + <p className="text-[#a78bfa] text-xs italic leading-relaxed"> + Não documentado nos PDFs deste corpus. 
Conteúdo abaixo vem de fonte + curada (registro UAP mundial), não de extração de documentos. + </p> + ) : ( + <p className="text-[#5a6678] text-xs italic">Sem dados de mention ainda.</p> + ) ) : ( <ul className="space-y-1 max-h-[50vh] overflow-y-auto pr-1"> {mentionGroups.map((m) => ( @@ -303,6 +327,14 @@ export default async function EntityPage({ <span className="text-[#7fdbff] group-hover:text-[#00ff9c] truncate flex-1"> {m.canonical_title ?? m.doc_id} </span> + {m.text_only && ( + <span + title="Menção textual encontrada via back-fill (alias dentro do corpo narrativo); pipeline estruturado não pegou." + className="text-[9px] text-[#a78bfa] border border-[rgba(167,139,250,0.40)] px-1 rounded shrink-0" + > + texto + </span> + )} <span className="text-[#00ff9c] tabular-nums shrink-0">{m.mention_count}×</span> </div> <div className="flex items-center gap-2 mt-0.5 font-mono text-[10px] text-[#5a6678]"> @@ -318,6 +350,13 @@ export default async function EntityPage({ )} </section> + <section className="border-t border-[rgba(0,255,156,0.12)] pt-4"> + <h3 className="font-mono text-[10px] text-[#8896aa] uppercase tracking-widest mb-3"> + Relações tipadas + </h3> + <EntityRelations entityClass={entityClassSingular} entityId={id} /> + </section> + <EntityGraphMini entityClassSingular={entityClassSingular} entityId={id} diff --git a/web/app/e/[cls]/page.tsx b/web/app/e/[cls]/page.tsx index 115b4f6..11ca351 100644 --- a/web/app/e/[cls]/page.tsx +++ b/web/app/e/[cls]/page.tsx @@ -46,7 +46,7 @@ interface EntityRow { enrichment_status: string | null; } -async function listEntities(cls: EntityClass): Promise<EntityRow[]> { +async function listEntities(cls: EntityClass, includeGeneric = false): Promise<EntityRow[]> { const dir = path.join(WIKI, "entities", cls); let files: string[] = []; try { @@ -59,6 +59,9 @@ async function listEntities(cls: EntityClass): Promise<EntityRow[]> { try { const raw = await fs.readFile(path.join(dir, f), "utf-8"); const fm = matter(raw).data as 
Record<string, unknown>; + // Hide generic concept-entities (e.g. "Flying disc sighting reports") — + // they're categories, not real instances. Opt-in via ?include_generic=1. + if (!includeGeneric && fm.is_generic === true) continue; rows.push({ id: f.replace(/\.md$/, ""), canonical_name: String(fm.canonical_name ?? f.replace(/\.md$/, "")), @@ -77,13 +80,17 @@ async function listEntities(cls: EntityClass): Promise<EntityRow[]> { export default async function EntityListPage({ params, + searchParams, }: { params: Promise<{ cls: string }>; + searchParams?: Promise<{ include_generic?: string }>; }) { const { cls } = await params; + const sp = (await searchParams) ?? {}; + const includeGeneric = sp.include_generic === "1"; const folder = classKeyToFolder(cls); if (!folder) notFound(); - const entities = await listEntities(folder as EntityClass); + const entities = await listEntities(folder as EntityClass, includeGeneric); return ( <main className="min-h-screen p-6 md:p-10 max-w-5xl mx-auto"> diff --git a/web/components/entity-relations.tsx b/web/components/entity-relations.tsx new file mode 100644 index 0000000..5fd1406 --- /dev/null +++ b/web/components/entity-relations.tsx @@ -0,0 +1,152 @@ +/** + * EntityRelations — typed relations panel for an entity page. + * + * Renders semantically-typed edges (Person witnessed Event, Event documented_in + * Document, etc.) grouped by relation_type and direction, instead of the + * noisy co-mention list. 
+ */ +"use client"; + +import { useEffect, useState } from "react"; +import Link from "next/link"; + +interface Relation { + source_class: string; + source_id: string; + relation_type: string; + target_class: string; + target_id: string; + evidence_ref: string | null; + confidence: string; +} + +interface ApiResponse { + outgoing: Relation[]; + incoming: Relation[]; + error?: string; +} + +const TYPE_LABEL_PT: Record<string, { out: string; in: string }> = { + witnessed: { out: "testemunhou", in: "foi testemunhado por" }, + occurred_at: { out: "ocorreu em", in: "foi local de" }, + involves_uap: { out: "envolve UAP", in: "observado em" }, + documented_in: { out: "documentado em", in: "documenta" }, + authored: { out: "autoria de", in: "autor:" }, + signed: { out: "assinou", in: "assinado por" }, + mentioned_by: { out: "mencionado em", in: "menciona" }, + employed_by: { out: "trabalhou em", in: "empregou" }, + operated_by: { out: "operada por", in: "operou" }, + investigated: { out: "investigou", in: "investigado por" }, + commanded: { out: "comandou", in: "comandado por" }, + related_to: { out: "relacionado a", in: "relacionado por" }, + similar_to: { out: "similar a", in: "similar de" }, + precedes: { out: "precede", in: "precedido por" }, + follows: { out: "segue", in: "seguido por" }, +}; + +const ENTITY_FOLDER: Record<string, string> = { + person: "people", + organization: "organizations", + location: "locations", + event: "events", + uap_object: "uap-objects", + vehicle: "vehicles", + operation: "operations", + concept: "concepts", +}; + +function entityHref(cls: string, id: string): string { + if (cls === "document") return `/d/${id}`; + const folder = ENTITY_FOLDER[cls] ?? 
cls; + return `/e/${folder}/${id}`; +} + +export function EntityRelations({ + entityClass, + entityId, +}: { + entityClass: string; + entityId: string; +}) { + const [data, setData] = useState<ApiResponse | null>(null); + const [loading, setLoading] = useState(true); + + useEffect(() => { + let aborted = false; + setLoading(true); + fetch(`/api/relations?class=${entityClass}&id=${encodeURIComponent(entityId)}`) + .then((r) => r.json()) + .then((j) => { if (!aborted) { setData(j); setLoading(false); } }) + .catch(() => { if (!aborted) { setData({ outgoing: [], incoming: [] }); setLoading(false); } }); + return () => { aborted = true; }; + }, [entityClass, entityId]); + + if (loading) { + return <div className="font-mono text-xs text-[#5a6678]">carregando relações…</div>; + } + if (!data || (data.outgoing.length === 0 && data.incoming.length === 0)) { + return ( + <div className="font-mono text-xs text-[#5a6678] italic"> + sem relações tipadas extraídas para esta entidade. + </div> + ); + } + + // Group by relation_type for outgoing and incoming separately + const groupOut: Record<string, Relation[]> = {}; + for (const r of data.outgoing) (groupOut[r.relation_type] ||= []).push(r); + const groupIn: Record<string, Relation[]> = {}; + for (const r of data.incoming) (groupIn[r.relation_type] ||= []).push(r); + + const renderGroup = (type: string, list: Relation[], dir: "out" | "in") => { + const label = TYPE_LABEL_PT[type]?.[dir] ?? type; + return ( + <div key={`${dir}-${type}`} className="mb-3"> + <div className="font-mono text-[10px] uppercase tracking-widest text-[#7fdbff] mb-1"> + {label} · <span className="text-[#5a6678]">{list.length}</span> + </div> + <ul className="space-y-0.5 text-xs"> + {list.slice(0, 12).map((r, i) => { + const otherClass = dir === "out" ? r.target_class : r.source_class; + const otherId = dir === "out" ? 
r.target_id : r.source_id; + return ( + <li key={i} className="font-mono text-[#c8d4e6]"> + <Link + href={entityHref(otherClass, otherId)} + className="hover:text-[#00ff9c] truncate" + title={`${otherClass}/${otherId}`} + > + <span className="text-[#5a6678]">[{otherClass[0]}]</span> {otherId} + </Link> + </li> + ); + })} + {list.length > 12 && ( + <li className="font-mono text-[10px] text-[#5a6678]">… +{list.length - 12}</li> + )} + </ul> + </div> + ); + }; + + return ( + <div className="space-y-4"> + {Object.keys(groupOut).length > 0 && ( + <section> + <h3 className="font-mono text-xs uppercase tracking-widest text-[#00ff9c] mb-2"> + Relações desta entidade → + </h3> + {Object.entries(groupOut).map(([t, list]) => renderGroup(t, list, "out"))} + </section> + )} + {Object.keys(groupIn).length > 0 && ( + <section> + <h3 className="font-mono text-xs uppercase tracking-widest text-[#ffa500] mb-2"> + ← Entidades que apontam para esta + </h3> + {Object.entries(groupIn).map(([t, list]) => renderGroup(t, list, "in"))} + </section> + )} + </div> + ); +} diff --git a/web/lib/retrieval/entity-pages.ts b/web/lib/retrieval/entity-pages.ts index 6fadb89..fa378c7 100644 --- a/web/lib/retrieval/entity-pages.ts +++ b/web/lib/retrieval/entity-pages.ts @@ -43,8 +43,10 @@ export interface EntityCore { db_chunks: number; page_refs: number; cross_refs: number; + text_refs: number; }; - mentioned_in: string[]; // [[doc-id/p007]] + mentioned_in: string[]; // [[doc-id/p007]] — structured page refs (Haiku) + text_mentioned_in: string[]; // [[doc-id/p007]] — text-only matches (back-fill) referenced_by: string[]; // [[class/id]] cross-links enrichment_status: string | null; narrative_summary: string | null; @@ -132,8 +134,10 @@ export async function getEntityCore( db_chunks: num(sigSources.db_chunks, 0), page_refs: num(sigSources.page_refs, 0), cross_refs: num(sigSources.cross_refs, 0), + text_refs: num(sigSources.text_refs, 0), }, mentioned_in: arr(fm.mentioned_in), + text_mentioned_in: 
arr(fm.text_mentioned_in), referenced_by: arr(fm.referenced_by), enrichment_status: strOrNull(fm.enrichment_status), narrative_summary: strOrNull(fm.narrative_summary), @@ -150,6 +154,7 @@ export interface EntityMentionGroup { classification: string | null; mention_count: number; pages: number[]; + text_only: boolean; // true when all refs came from the text back-fill (no structured Haiku evidence) } /** @@ -164,22 +169,25 @@ export async function getEntityMentionsByDoc( ): Promise<EntityMentionGroup[]> { const fm = await readEntityYaml(entityClass, entityId); if (!fm) return []; - const refs = arr(fm.mentioned_in); + const structuredRefs = arr(fm.mentioned_in); + const textRefs = arr(fm.text_mentioned_in); // Each ref looks like "[[doc-id/p007]]". Strip wikilink delimiters. - const byDoc = new Map<string, Set<number>>(); - for (const ref of refs) { + const byDoc = new Map<string, { structured: Set<number>; text: Set<number> }>(); + const addRef = (ref: string, source: "structured" | "text") => { const m = ref.match(/\[\[([^\]|]+?)\]\]/); const target = (m ? m[1] : ref).trim(); const [docId, pageStr] = target.split("/", 2); - if (!docId) continue; + if (!docId) return; const pageNum = pageStr ? 
parseInt(pageStr.replace(/^p/, ""), 10) : NaN; - if (!byDoc.has(docId)) byDoc.set(docId, new Set()); - if (Number.isFinite(pageNum)) byDoc.get(docId)!.add(pageNum); - } + if (!byDoc.has(docId)) byDoc.set(docId, { structured: new Set(), text: new Set() }); + if (Number.isFinite(pageNum)) byDoc.get(docId)![source].add(pageNum); + }; + for (const r of structuredRefs) addRef(r, "structured"); + for (const r of textRefs) addRef(r, "text"); // Hydrate each doc's metadata from wiki/documents/<doc-id>.md const groups: EntityMentionGroup[] = []; - for (const [docId, pages] of byDoc) { + for (const [docId, sets] of byDoc) { let canonical_title: string | null = null; let collection: string | null = null; let page_count: number | null = null; @@ -197,14 +205,16 @@ export async function getEntityMentionsByDoc( } catch { /* doc missing — use raw id */ } + const merged = new Set<number>([...sets.structured, ...sets.text]); groups.push({ doc_id: docId, canonical_title, collection, page_count, classification, - mention_count: pages.size, - pages: Array.from(pages).sort((a, b) => a - b), + mention_count: merged.size, + pages: Array.from(merged).sort((a, b) => a - b), + text_only: sets.structured.size === 0 && sets.text.size > 0, }); } groups.sort((a, b) => b.mention_count - a.mention_count); diff --git a/web/lib/retrieval/graph.ts b/web/lib/retrieval/graph.ts index 7d58031..40e234b 100644 --- a/web/lib/retrieval/graph.ts +++ b/web/lib/retrieval/graph.ts @@ -70,7 +70,7 @@ export async function getNeighbors( e.total_mentions, e.documents_count, c.weight, c.sample_chunks FROM coloc c JOIN public.entities e ON e.entity_pk = c.other_pk - WHERE 1=1 ${classFilter} + WHERE NOT e.is_generic ${classFilter} ORDER BY c.weight DESC LIMIT $${params.length}`, params, @@ -142,6 +142,7 @@ export async function getGraphSeed(opts: { WHERE LENGTH(TRIM(canonical_name)) >= 4 AND canonical_name !~ '^[A-Z]{1,3}$' AND canonical_name !~ '^[0-9.()-]+$' + AND NOT is_generic ${classFilter} ) SELECT entity_pk, 
entity_class, entity_id, canonical_name, total_mentions, documents_count, entity_class_short