/**
 * 00-extract-war-gov.js — Console-based extractor for war.gov/UFO/Release-NN/
 *
 * Works on any release page (Release-01, Release-02, etc.) because it derives
 * everything from the DOM, not from a hardcoded release number.
 *
 * USAGE (Chrome on https://www.war.gov/UFO/Release-NN/):
 *   1. Wait for the page to load — scroll to the bottom to trigger lazy-load
 *      if there are images you haven't scrolled past
 *   2. Open DevTools (Cmd+Option+I / F12) → Console
 *   3. Paste this ENTIRE file. Press Enter.
 *   4. Wait ~3-5 minutes (158 docs × ~2s each click+wait).
 *   5. The JSON is opened in a new tab AND copied to clipboard.
 *      Save it under /Users/guto/ufo/processing/war-gov-metadata/
 *      as all-documents-release-NN.json (or paste it back to me).
 *      If the browser blocks the new tab, the JSON is offered as a download
 *      named all-documents-release-NN.json instead.
 *
 * What it captures per document:
 *   - record_id (record-001..record-NNN — internal id)
 *   - title (as printed in the modal heading; case correct)
 *   - agency
 *   - release_date, incident_date, incident_location, document_type
 *     (one key per <dt>/<dd> fact row; the key is derived from the <dt> text)
 *   - description (the unique paragraph shown in the detail overlay)
 *   - thumbnail_url (Akamai-hosted JPG preview)
 *   - pdf_url_inferred (replaces "/thumbnail/" with "/" and ".jpg" with the
 *     proper extension based on document_type)
 *   - record_kind (value of the modal's data-record-kind attribute)
 *
 * The script is READ-ONLY — it never submits, never modifies the page beyond
 * opening and closing the detail modal.
 */
(async function extractWarGovFull() {
  // Shared selectors, so the "wait for modal" step and the parser always agree
  // on which element is the modal and which element is its title.
  const MODAL_SELECTOR = '.record-modal-shell, [data-record-modal-shell]';
  const MODAL_TITLE_SELECTOR = '[data-record-modal-title], #record-modal-title';
  const ROW_SELECTOR = 'button.record-row';

  // Safety cap on the number of result pages walked.
  const MAX_PAGES = 25;
  // How long to insist on an exact row/modal title match before accepting a
  // modal whose title merely changed since the click.
  const TITLE_FALLBACK_DELAY_MS = 600;

  /** Resolves after `ms` milliseconds. */
  const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  /**
   * Polls `predicate` until it returns a truthy value.
   *
   * @param {() => *} predicate - called immediately, then every `interval` ms.
   * @param {{timeout?: number, interval?: number}} [opts] - both in ms;
   *   defaults are 3000 and 50.
   * @returns {Promise<*>} resolves with the first truthy predicate result;
   *   rejects with Error("timeout") once `timeout` has elapsed, or with
   *   whatever the predicate throws.
   */
  function pollUntil(predicate, opts = {}) {
    const timeout = opts.timeout ?? 3000;
    const interval = opts.interval ?? 50;
    return new Promise((resolve, reject) => {
      const start = Date.now();
      const tick = () => {
        let value;
        try {
          value = predicate();
        } catch (err) {
          // Without this, a throw inside a later setTimeout tick would leave
          // the promise pending forever.
          reject(err);
          return;
        }
        if (value) {
          resolve(value);
          return;
        }
        if (Date.now() - start > timeout) {
          reject(new Error("timeout"));
          return;
        }
        setTimeout(tick, interval);
      };
      tick();
    });
  }

  /** Trims `s` and removes one pair of enclosing square brackets, if present. */
  function strip(s) {
    return (s || "").replace(/^\s*\[(.*)\]\s*$/, "$1").trim();
  }

  /** Collapses whitespace and lower-cases a title so two titles can be compared. */
  function normalizeTitle(s) {
    return (s || "").replace(/\s+/g, " ").trim().toLowerCase();
  }

  /** Returns the trimmed title currently shown in the modal, or null. */
  function readModalTitle() {
    const modal = document.querySelector(MODAL_SELECTOR);
    return modal?.querySelector(MODAL_TITLE_SELECTOR)?.innerText?.trim() || null;
  }

  /**
   * Reads every captured field out of the detail modal currently in the DOM.
   *
   * @returns {object|null} the extracted fields, or null when no modal exists.
   */
  function parseModal() {
    const modal = document.querySelector(MODAL_SELECTOR);
    if (!modal) return null;

    const out = {};

    const titleEl = modal.querySelector(MODAL_TITLE_SELECTOR);
    out.title = titleEl ? titleEl.innerText.trim() : null;

    const agencyEl = modal.querySelector('[data-record-modal-agency]');
    out.agency = strip(agencyEl?.innerText);

    const descEl = modal.querySelector('[data-record-modal-copy], .record-modal-copy');
    out.description = descEl ? descEl.innerText.trim() : null;

    // dl facts: each <dt> label becomes a snake_case key, its <dd> the value.
    for (const fact of modal.querySelectorAll('.record-modal-fact')) {
      const dt = fact.querySelector('dt');
      const dd = fact.querySelector('dd');
      if (!dt || !dd) continue;
      const key = dt.innerText.trim().toLowerCase().replace(/[^a-z0-9]+/g, "_").replace(/^_|_$/g, "");
      // A label with no letters or digits yields "", which is not a usable key.
      if (!key) continue;
      out[key] = strip(dd.innerText);
    }

    // Thumbnail. A selector list returns the first match in DOCUMENT order, so
    // '#record-main-image, img' could pick an unrelated earlier <img>. Query
    // the specific id first and only then fall back to any image.
    const img = modal.querySelector('#record-main-image') ?? modal.querySelector('img');
    if (img?.src) {
      out.thumbnail_url = img.src;
      // Infer PDF/asset url: drop "/thumbnail" segment, restore extension based on document_type
      const ext = (out.document_type || ".pdf").toLowerCase().replace(/^\[?\.?/, ".").replace(/\]$/, "");
      // The lookahead also matches ".jpg" followed by a query string or fragment.
      out.pdf_url_inferred = img.src.replace("/thumbnail/", "/").replace(/\.jpg(?=$|[?#])/i, ext);
    }

    // Record kind from modal data attr (pdf|vid|img)
    out.record_kind = modal.getAttribute("data-record-kind") || null;

    return out;
  }

  /**
   * Opens the detail modal for one listing row, captures it, then closes it.
   *
   * @param {HTMLElement} row - a listing row button.
   * @returns {Promise<object>} `{ record_id, ...modal fields }`; only
   *   `record_id` when the modal never showed content for this row.
   */
  async function clickRowAndCapture(row) {
    const recordId = row.dataset.recordId || row.getAttribute("data-record-id");
    const rowTitle = row.querySelector('.record-title')?.innerText;
    const expected = normalizeTitle(rowTitle);
    // The modal can still show the previous record, so remember its title to
    // detect when it has been refreshed for this click.
    const staleTitle = normalizeTitle(readModalTitle());

    // open modal
    row.click();

    let modalData = null;
    try {
      const openedAt = Date.now();
      let exactMatch = true;
      await pollUntil(() => {
        const shown = normalizeTitle(readModalTitle());
        if (!shown) return null;
        // Preferred: the modal title matches the row's title.
        if (expected && shown === expected) return shown;
        // Fallback after a short delay: the listing title may differ from the
        // modal heading, so accept a modal whose title changed since the click.
        if (shown !== staleTitle && Date.now() - openedAt >= TITLE_FALLBACK_DELAY_MS) {
          exactMatch = false;
          return shown;
        }
        return null;
      }, { timeout: 2500 });
      if (!exactMatch) {
        console.warn(`  ${recordId}: modal title differs from row title "${rowTitle}" — captured anyway`);
      }
      modalData = parseModal();
    } catch (e) {
      console.warn(`  ${recordId}: modal did not load for "${rowTitle}" (${e.message})`);
    }

    // close modal
    const closeBtn = document.querySelector('.record-modal-close, [data-record-modal-close]');
    if (closeBtn) {
      closeBtn.click();
    } else {
      // Press Escape; bubbles so listeners registered on window also see it.
      document.dispatchEvent(new KeyboardEvent("keydown", { key: "Escape", bubbles: true }));
    }
    // Best-effort wait for the modal to leave the DOM; a timeout is ignored on
    // purpose because the next row's open-wait handles a stale modal.
    await pollUntil(() => !document.querySelector('.record-modal-shell'), { timeout: 1500 }).catch(() => {});

    return { record_id: recordId, ...(modalData || {}) };
  }

  /** Captures every row of the listing page currently shown, in order. */
  async function captureCurrentPage() {
    const rows = Array.from(document.querySelectorAll(ROW_SELECTOR));
    const out = [];
    for (const row of rows) {
      out.push(await clickRowAndCapture(row));
      await sleep(120); // small breather between cards
    }
    return out;
  }

  const findNext = () => document.querySelector('button.pagination-next');

  const release = (location.pathname.match(/Release-(\d+)/i)?.[1] ?? "01").padStart(2, "0");
  console.log(`[extract] starting on Release-${release}`);

  const all = [];
  const seen = new Set();
  let pagesVisited = 0;
  let finished = false;

  while (pagesVisited < MAX_PAGES) {
    const firstBefore = document.querySelector(ROW_SELECTOR)?.dataset.recordId;
    const t0 = performance.now();
    const captured = await captureCurrentPage();

    let added = 0;
    for (const r of captured) {
      const key = r.record_id || `${r.title}|${r.incident_date}`;
      if (seen.has(key)) continue;
      seen.add(key);
      all.push(r);
      added++;
    }

    // Every record already seen means the listing did not advance. Stop here
    // instead of re-clicking the same page up to MAX_PAGES times.
    if (captured.length > 0 && added === 0) {
      console.warn("[extract] page yielded no new records — listing did not advance, stopping");
      finished = true;
      break;
    }

    pagesVisited++;
    console.log(`[extract] page ${pagesVisited}: captured ${captured.length} (+${added} new, total ${all.length}, ${Math.trunc(performance.now() - t0)}ms)`);

    const next = findNext();
    if (!next || next.disabled || next.getAttribute("aria-disabled") === "true") {
      console.log("[extract] NEXT disabled — last page reached");
      finished = true;
      break;
    }
    next.click();

    // Wait until row content changes
    try {
      await pollUntil(() => {
        const f = document.querySelector(ROW_SELECTOR)?.dataset.recordId;
        return f && f !== firstBefore ? f : null;
      }, { timeout: 3000 });
    } catch (e) {
      // Not fatal: rows may lack data-record-id. The no-new-records check
      // above ends the loop if the page really is stuck.
      console.warn(`[extract] could not confirm that the page advanced (${e.message})`);
    }
    await sleep(200);
  }

  if (!finished) {
    console.warn(`[extract] stopped at MAX_PAGES=${MAX_PAGES} with NEXT still enabled — output may be incomplete`);
  }

  const result = {
    extracted_at: new Date().toISOString(),
    source_url: location.href,
    release: `Release-${release}`,
    total_documents: all.length,
    pages_visited: pagesVisited,
    documents: all,
  };
  const jsonStr = JSON.stringify(result, null, 2);

  console.log(`[extract] DONE — ${all.length} documents extracted across ${pagesVisited} pages`);
  // Counts records whose modal was actually captured, using fields this script sets.
  console.log(`[extract] full-metadata count: ${all.filter((d) => d.title && d.description).length}`);

  try {
    await navigator.clipboard.writeText(jsonStr);
    console.log("[extract] ✓ JSON copied to clipboard");
  } catch (e) {
    console.warn("[extract] clipboard failed (focus the tab and re-run if needed):", e.message);
  }

  const blobUrl = URL.createObjectURL(new Blob([jsonStr], { type: "application/json" }));
  // window.open returns null when the popup is blocked, which is likely after
  // minutes of asynchronous work with no recent user gesture.
  const tab = window.open(blobUrl, "_blank");
  if (tab) {
    console.log("[extract] ✓ JSON opened in new tab — save with Cmd+S");
  } else {
    const fileName = `all-documents-release-${release}.json`;
    // The anchor is never attached to the document, so the page stays untouched.
    const link = document.createElement("a");
    link.href = blobUrl;
    link.download = fileName;
    link.click();
    console.warn(`[extract] new tab was blocked — downloading ${fileName} instead`);
  }

  console.log("[extract] sample doc:", all[0]);
  return result;
})();