disclosure-bureau/scripts/00-extract-war-gov.js

182 lines
7.4 KiB
JavaScript
Raw Normal View History

/**
* 00-extract-war-gov.js — Console-based extractor for war.gov/UFO/Release-NN/
*
* Works on any release page (Release-01, Release-02, etc.) because it derives
* everything from the DOM, not from a hardcoded release number.
*
* USAGE (Chrome on https://www.war.gov/UFO/Release-NN/):
* 1. Wait for the page to load, then scroll to the bottom to trigger lazy-load
* if there are images you haven't scrolled past.
* 2. Open DevTools (Cmd+Option+I / F12) → Console.
* 3. Paste this ENTIRE file. Press Enter.
* 4. Wait ~3-5 minutes (158 docs × ~2s each click+wait).
* 5. The JSON is opened in a new tab AND copied to clipboard.
* Save it under /Users/guto/ufo/processing/war-gov-metadata/
* as all-documents-release-NN.json (or paste it back to me).
*
* What it captures per document:
* - record_id: record-001..record-NNN (the page's internal id)
* - title: as printed in the modal heading (case correct)
* - agency
* - release_date, incident_date, incident_location, document_type
* (plus any other fact the modal lists, keyed by its snake_cased label)
* - description: the unique paragraph shown in the detail overlay
* - thumbnail_url: Akamai-hosted JPG preview
* - pdf_url_inferred: thumbnail_url with "/thumbnail/" replaced by "/" and
* ".jpg" replaced by the proper extension based on document_type
* - record_kind: the modal's data-record-kind attribute, or null when absent
*
* The script is READ-ONLY: it never submits anything and never modifies the
* page beyond opening and closing the detail modal.
*/
(async function extractWarGovFull() {
// Returns a promise that settles (with no value) once `ms` milliseconds have elapsed.
const sleep = (ms) =>
  new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
/**
 * Poll `predicate` until it returns a truthy value or the timeout elapses.
 *
 * @param {() => *} predicate - Called synchronously on every tick; the first
 *   truthy return value ends polling.
 * @param {{timeout?: number, interval?: number}} [opts] - `timeout` is the
 *   budget in ms (default 3000); `interval` is the delay between ticks in ms
 *   (default 50).
 * @returns {Promise<*>} Resolves with the first truthy predicate result.
 *   Rejects with Error("timeout") once more than `timeout` ms have passed
 *   without one, or with whatever the predicate throws.
 */
function pollUntil(predicate, opts = {}) {
  const timeout = opts.timeout ?? 3000;
  const interval = opts.interval ?? 50;
  return new Promise((resolve, reject) => {
    const start = Date.now();
    const tick = () => {
      let value;
      try {
        value = predicate();
      } catch (err) {
        // Every tick after the first runs from a timer callback, outside the
        // promise executor. Without this catch a throwing predicate would
        // surface as an uncaught error and leave the promise pending forever.
        reject(err);
        return;
      }
      if (value) {
        resolve(value);
        return;
      }
      if (Date.now() - start > timeout) {
        reject(new Error("timeout"));
        return;
      }
      setTimeout(tick, interval);
    };
    tick();
  });
}
/**
 * Trim a text value and, when the whole value is wrapped in a single pair of
 * square brackets (optionally padded with whitespace), drop that wrapper.
 * Nullish or empty input yields "".
 */
function strip(s) {
  const text = s || "";
  const wrapped = /^\s*\[(.*)\]\s*$/.exec(text);
  const inner = wrapped === null ? text : wrapped[1];
  return inner.trim();
}
/**
 * Snapshot the record modal that is currently in the DOM.
 *
 * @returns {Object|null} null when no modal shell is found. Otherwise an
 *   object with `title`, `agency`, `description`, one snake_cased key per
 *   `.record-modal-fact` (its <dt> label mapped to its <dd> text),
 *   `thumbnail_url` / `pdf_url_inferred` when an image with a src is present,
 *   and `record_kind`.
 */
function parseModal() {
  const modal = document.querySelector('.record-modal-shell, [data-record-modal-shell]');
  if (!modal) return null;
  const out = {};

  const titleEl = modal.querySelector('[data-record-modal-title], #record-modal-title');
  out.title = titleEl ? titleEl.innerText.trim() : null;

  const agencyEl = modal.querySelector('[data-record-modal-agency]');
  // Unlike title/description, a missing agency comes out as "" (strip's fallback).
  out.agency = strip(agencyEl?.innerText);

  const descEl = modal.querySelector('[data-record-modal-copy], .record-modal-copy');
  out.description = descEl ? descEl.innerText.trim() : null;

  // Facts: each <dt>/<dd> pair becomes out[snake_cased_label] = value.
  modal.querySelectorAll('.record-modal-fact').forEach((fact) => {
    const dt = fact.querySelector('dt');
    const dd = fact.querySelector('dd');
    if (!dt || !dd) return;
    const key = dt.innerText.trim().toLowerCase().replace(/[^a-z0-9]+/g, "_").replace(/^_|_$/g, "");
    // A label with no [a-z0-9] characters would produce an empty-string key.
    if (!key) return;
    out[key] = strip(dd.innerText);
  });

  // Thumbnail. A selector list ("#record-main-image, img") returns whichever
  // match comes first in document order, so any <img> placed before the main
  // image would win. Try the id first and only then fall back to any <img>.
  const img = modal.querySelector('#record-main-image') || modal.querySelector('img');
  if (img && img.src) {
    out.thumbnail_url = img.src;
    // Infer the asset URL: drop the "/thumbnail" path segment and swap ".jpg"
    // for an extension derived from document_type (".pdf" when that fact is absent).
    const ext = (out.document_type || ".pdf").toLowerCase().replace(/^\[?\.?/, ".").replace(/\]$/, "");
    out.pdf_url_inferred = img.src.replace("/thumbnail/", "/").replace(/\.jpg$/i, ext);
  }

  // Record kind as exposed on the modal's data attribute, null when absent.
  out.record_kind = modal.getAttribute("data-record-kind") || null;
  return out;
}
/**
 * Open the detail modal for one result row, read it, and close it again.
 *
 * @param {HTMLElement} row - A result-row button carrying `data-record-id`.
 * @returns {Promise<Object>} `{ record_id, ...fields }`, where the fields are
 *   whatever `parseModal()` returned. Only `record_id` is present when the
 *   modal never showed the expected content or could not be read.
 */
async function clickRowAndCapture(row) {
  // Same selectors parseModal() accepts, so the wait and the parse agree.
  const MODAL_SELECTOR = '.record-modal-shell, [data-record-modal-shell]';
  const TITLE_SELECTOR = '[data-record-modal-title], #record-modal-title';
  const normalize = (s) => (s || "").replace(/\s+/g, " ").trim().toLowerCase();
  const modalTitle = (m) => normalize(m?.querySelector(TITLE_SELECTOR)?.innerText);
  // "Rendered" = mounted and not obviously hidden. If the page keeps the shell
  // in the DOM after closing, waiting for removal alone would always time out.
  const isRendered = (m) =>
    Boolean(m) &&
    !m.hidden &&
    m.getAttribute("aria-hidden") !== "true" &&
    m.getClientRects().length > 0;

  const recordId = row.dataset.recordId || row.getAttribute("data-record-id");
  const rowTitle = row.querySelector('.record-title')?.innerText;
  const expected = normalize(rowTitle);
  // Title left over from a previous open (if the shell stays mounted).
  const titleBefore = modalTitle(document.querySelector(MODAL_SELECTOR));

  // open modal
  row.click();
  let modalData = null;
  try {
    await pollUntil(() => {
      const m = document.querySelector(MODAL_SELECTOR);
      if (!m) return null;
      const t = modalTitle(m);
      if (!t) return null;
      // Preferred: the modal title matches the row title (ignoring case and
      // whitespace), which proves the content is not stale.
      if (expected) return t === expected ? m : null;
      // The row exposes no title to compare against: accept the modal as soon
      // as its title differs from what was shown before the click.
      return t !== titleBefore ? m : null;
    }, { timeout: 2500 });
    modalData = parseModal();
  } catch (e) {
    if (e?.message === "timeout") {
      console.warn(` ${recordId}: modal did not load for "${rowTitle}"`);
    } else {
      // Keep going with the remaining rows, but do not mislabel a parse
      // failure as a load failure.
      console.warn(` ${recordId}: failed to read modal for "${rowTitle}": ${e?.message ?? e}`);
    }
  }

  // close modal
  const closeBtn = document.querySelector('.record-modal-close, [data-record-modal-close]');
  if (closeBtn) {
    closeBtn.click();
  } else {
    // No close button found: fall back to Escape. `bubbles` lets listeners
    // registered on window see the event too.
    document.dispatchEvent(new KeyboardEvent("keydown", { key: "Escape", bubbles: true }));
  }
  // Best effort: a close that is never observed must not abort the run.
  await pollUntil(() => !isRendered(document.querySelector(MODAL_SELECTOR)), { timeout: 1500 }).catch(() => {});
  return { record_id: recordId, ...(modalData || {}) };
}
/**
 * Capture every `button.record-row` currently in the document, one at a time.
 *
 * @returns {Promise<Object[]>} The captured records, in row order.
 */
async function captureCurrentPage() {
  const rowButtons = Array.from(document.querySelectorAll('button.record-row'));
  const records = [];
  for (let i = 0; i < rowButtons.length; i += 1) {
    records.push(await clickRowAndCapture(rowButtons[i]));
    // Short pause before moving on to the next row.
    await sleep(120);
  }
  return records;
}
const findNext = () => document.querySelector('button.pagination-next');
const release = (location.pathname.match(/Release-(\d+)/i) || [, "01"])[1].padStart(2, "0");
console.log(`[extract] starting on Release-${release}`);
const all = [];
const seen = new Set();
const MAX_PAGES = 25;
let pageIdx = 0;
while (pageIdx < MAX_PAGES) {
pageIdx++;
const firstBefore = document.querySelector('button.record-row')?.dataset.recordId;
const t0 = performance.now();
const captured = await captureCurrentPage();
let added = 0;
for (const r of captured) {
const key = r.record_id || `${r.title}|${r.incident_date}`;
if (seen.has(key)) continue;
seen.add(key);
all.push(r);
added++;
}
console.log(`[extract] page ${pageIdx}: captured ${captured.length} (+${added} new, total ${all.length}, ${(performance.now()-t0|0)}ms)`);
const next = findNext();
if (!next || next.disabled || next.getAttribute("aria-disabled") === "true") {
console.log("[extract] NEXT disabled — last page reached");
break;
}
next.click();
// Wait until row content changes
await pollUntil(() => {
const f = document.querySelector('button.record-row')?.dataset.recordId;
return f && f !== firstBefore ? f : null;
}, { timeout: 3000 }).catch(() => {});
await sleep(200);
}
const result = {
extracted_at: new Date().toISOString(),
source_url: location.href,
release: `Release-${release}`,
total_documents: all.length,
pages_visited: pageIdx,
documents: all,
};
const jsonStr = JSON.stringify(result, null, 2);
console.log(`[extract] DONE — ${all.length} documents extracted across ${pageIdx} pages`);
console.log(`[extract] full-metadata count: ${all.filter(d => d.description && d.asset_file_name).length}`);
try { await navigator.clipboard.writeText(jsonStr); console.log("[extract] ✓ JSON copied to clipboard"); }
catch (e) { console.warn("[extract] clipboard failed (focus the tab and re-run if needed):", e.message); }
const blob = new Blob([jsonStr], { type: "application/json" });
window.open(URL.createObjectURL(blob), "_blank");
console.log("[extract] ✓ JSON opened in new tab — save with Cmd+S");
console.log("[extract] sample doc:", all[0]);
return result;
})();