disclosure-bureau/scripts/00-extract-war-gov.js

181 lines
7.4 KiB
JavaScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* 00-extract-war-gov.js — Console-based extractor for war.gov/UFO/Release-NN/
*
* Works on any release page (Release-01, Release-02, etc.) because it derives
* everything from the DOM, not from a hardcoded release number.
*
* USAGE (Chrome on https://www.war.gov/UFO/Release-NN/):
* 1. Wait for the page to load — scroll to the bottom to trigger lazy-load
* if there are images you haven't scrolled past
* 2. Open DevTools (Cmd+Option+I / F12) → Console
* 3. Paste this ENTIRE file. Press Enter.
* 4. Wait ~3-5 minutes (158 docs × ~2s each click+wait).
* 5. The JSON is opened in a new tab AND copied to clipboard.
* Save it under /Users/guto/ufo/processing/war-gov-metadata/
* as all-documents-release-NN.json (or paste it back to me).
*
* What it captures per document:
* - record_id (record-001..record-NNN — internal id)
* - title (as printed in the modal heading; case correct)
* - agency
* - release_date, incident_date, incident_location, document_type
* - description (the unique paragraph shown in the detail overlay)
* - thumbnail_url (Akamai-hosted JPG preview)
* - pdf_url_inferred (replaces "/thumbnail/" with "/" and ".jpg" with the
* proper extension based on document_type)
* - record_kind (the modal's data-record-kind attribute: pdf|vid|img)
*
* The script is READ-ONLY — it never submits, never modifies the page beyond
* opening and closing the detail modal.
*/
(async function extractWarGovFull() {
// Promise-based delay: settles (with no value) once `ms` milliseconds have elapsed.
const sleep = (ms) =>
  new Promise((wake) => {
    setTimeout(wake, ms);
  });
/**
 * Repeatedly evaluates `predicate` until it returns a truthy value.
 *
 * The predicate runs once synchronously, then again every `interval` ms.
 *
 * @param {() => *} predicate - Condition to poll; any truthy return value ends the poll.
 * @param {{timeout?: number, interval?: number}} [opts] - `timeout` is the
 *   maximum time to keep polling in ms (default 3000); `interval` is the delay
 *   between attempts in ms (default 50).
 * @returns {Promise<*>} Resolves with the first truthy value the predicate
 *   returns. Rejects with `Error("timeout")` once more than `timeout` ms have
 *   elapsed without a truthy value, or with the predicate's own error if it throws.
 */
function pollUntil(predicate, opts = {}) {
  const timeout = opts?.timeout ?? 3000;
  const interval = opts?.interval ?? 50;
  return new Promise((resolve, reject) => {
    const start = Date.now();
    const tick = () => {
      let value;
      try {
        value = predicate();
      } catch (err) {
        // Every tick after the first runs from a timer callback, where a throw
        // would escape as an uncaught error and leave this promise pending forever.
        reject(err);
        return;
      }
      if (value) {
        resolve(value);
        return;
      }
      if (Date.now() - start > timeout) {
        reject(new Error("timeout"));
        return;
      }
      setTimeout(tick, interval);
    };
    tick();
  });
}
/**
 * Normalizes a scraped label: a value that is entirely wrapped in one pair of
 * square brackets (optionally surrounded by whitespace) loses the brackets,
 * and the result is trimmed. Falsy input yields "".
 *
 * @param {string|null|undefined} s - Raw text.
 * @returns {string} The unwrapped, trimmed text.
 */
function strip(s) {
  const raw = s || "";
  const unwrapped = raw.replace(/^\s*\[(.*)\]\s*$/, "$1");
  return unwrapped.trim();
}
/**
 * Scrapes the record-detail modal that is currently in the DOM.
 *
 * Collects the title, agency, description, every `.record-modal-fact` dt/dd
 * pair (keyed by the snake_cased dt label), the preview image URL together
 * with an inferred asset URL, and the modal's `data-record-kind` attribute.
 *
 * @returns {Object|null} Plain object of scraped fields, or null when no
 *   modal element matches.
 */
function parseModal() {
  const modal = document.querySelector('.record-modal-shell, [data-record-modal-shell]');
  if (!modal) return null;
  const out = {};

  const titleEl = modal.querySelector('[data-record-modal-title], #record-modal-title');
  out.title = titleEl ? titleEl.innerText.trim() : null;

  const agencyEl = modal.querySelector('[data-record-modal-agency]');
  out.agency = strip(agencyEl?.innerText);

  const descEl = modal.querySelector('[data-record-modal-copy], .record-modal-copy');
  out.description = descEl ? descEl.innerText.trim() : null;

  // dl facts: the dt label becomes the key — lowercased, each run of
  // non-alphanumerics turned into "_", and a leading/trailing "_" removed
  // (e.g. "Incident Date" -> incident_date).
  modal.querySelectorAll('.record-modal-fact').forEach(fact => {
    const dt = fact.querySelector('dt');
    const dd = fact.querySelector('dd');
    if (dt && dd) {
      const key = dt.innerText.trim().toLowerCase().replace(/[^a-z0-9]+/g, "_").replace(/^_|_$/g, "");
      out[key] = strip(dd.innerText);
    }
  });

  // Thumbnail. A selector list ('#record-main-image, img') returns the first
  // match in document order, so any <img> placed earlier in the modal would be
  // picked over the main preview. Query the id first and only then fall back
  // to the first <img>.
  const img = modal.querySelector('#record-main-image') || modal.querySelector('img');
  if (img && img.src) {
    out.thumbnail_url = img.src;
    // Infer PDF/asset url: drop the "/thumbnail" segment and swap ".jpg" for an
    // extension derived from the document_type fact (".pdf" when that fact is absent).
    const ext = (out.document_type || ".pdf").toLowerCase().replace(/^\[?\.?/, ".").replace(/\]$/, "");
    out.pdf_url_inferred = img.src.replace("/thumbnail/", "/").replace(/\.jpg$/i, ext);
  }

  // Record kind from modal data attr (pdf|vid|img)
  out.record_kind = modal.getAttribute("data-record-kind") || null;
  return out;
}
/**
 * Opens the detail modal for one listing row, scrapes it, and closes it again.
 *
 * @param {HTMLElement} row - A record row; clicking it opens the detail modal.
 * @returns {Promise<Object>} `{ record_id, ...fields }`, where the fields come
 *   from parseModal(). When the modal never shows this row's title within
 *   2.5 s, only `record_id` is returned and a warning is logged.
 */
async function clickRowAndCapture(row) {
  // Same selectors parseModal() uses, so the readiness check and the scrape
  // always agree on which element is the modal and which is its title.
  const modalSelector = '.record-modal-shell, [data-record-modal-shell]';
  const titleSelector = '[data-record-modal-title], #record-modal-title';
  const recordId = row.dataset.recordId || row.getAttribute("data-record-id");
  // open modal
  row.click();
  let modalData = null;
  try {
    await pollUntil(() => {
      const m = document.querySelector(modalSelector);
      if (!m) return null;
      // Wait until the title matches the row's title (modal can be stale from previous open).
      // The comparison is case-insensitive.
      const t = m.querySelector(titleSelector)?.innerText?.trim();
      const expected = row.querySelector('.record-title')?.innerText?.trim();
      if (t && expected && t.toLowerCase() === expected.toLowerCase()) return m;
      // Deliberately no fallback to "any modal": a stale modal would attribute
      // the previous record's data to this row.
      return null;
    }, { timeout: 2500 });
    modalData = parseModal();
  } catch (e) {
    console.warn(` ${recordId}: modal did not load for "${row.querySelector('.record-title')?.innerText}"`);
    // Anything other than the poll timeout is a scrape failure worth surfacing.
    if (e?.message !== "timeout") console.warn(e);
  }
  // close modal
  const closeBtn = document.querySelector('.record-modal-close, [data-record-modal-close]');
  if (closeBtn) {
    closeBtn.click();
  } else {
    // Press Escape
    document.dispatchEvent(new KeyboardEvent("keydown", { key: "Escape" }));
  }
  // Best effort: give the modal up to 1.5 s to leave the DOM, then carry on regardless.
  await pollUntil(() => !document.querySelector(modalSelector), { timeout: 1500 }).catch(() => {});
  return { record_id: recordId, ...(modalData || {}) };
}
/**
 * Captures every record row on the page currently shown.
 *
 * Rows are snapshotted once up front and handled strictly one after another,
 * with a 120 ms pause after each row.
 *
 * @returns {Promise<Object[]>} One captured record per row, in DOM order.
 */
async function captureCurrentPage() {
  const rowButtons = [...document.querySelectorAll('button.record-row')];
  const records = [];
  for (let i = 0; i < rowButtons.length; i += 1) {
    records.push(await clickRowAndCapture(rowButtons[i]));
    // small breather between cards
    await sleep(120);
  }
  return records;
}
const findNext = () => document.querySelector('button.pagination-next');
const release = (location.pathname.match(/Release-(\d+)/i) || [, "01"])[1].padStart(2, "0");
console.log(`[extract] starting on Release-${release}`);
const all = [];
const seen = new Set();
const MAX_PAGES = 25;
let pageIdx = 0;
while (pageIdx < MAX_PAGES) {
pageIdx++;
const firstBefore = document.querySelector('button.record-row')?.dataset.recordId;
const t0 = performance.now();
const captured = await captureCurrentPage();
let added = 0;
for (const r of captured) {
const key = r.record_id || `${r.title}|${r.incident_date}`;
if (seen.has(key)) continue;
seen.add(key);
all.push(r);
added++;
}
console.log(`[extract] page ${pageIdx}: captured ${captured.length} (+${added} new, total ${all.length}, ${(performance.now()-t0|0)}ms)`);
const next = findNext();
if (!next || next.disabled || next.getAttribute("aria-disabled") === "true") {
console.log("[extract] NEXT disabled — last page reached");
break;
}
next.click();
// Wait until row content changes
await pollUntil(() => {
const f = document.querySelector('button.record-row')?.dataset.recordId;
return f && f !== firstBefore ? f : null;
}, { timeout: 3000 }).catch(() => {});
await sleep(200);
}
const result = {
extracted_at: new Date().toISOString(),
source_url: location.href,
release: `Release-${release}`,
total_documents: all.length,
pages_visited: pageIdx,
documents: all,
};
const jsonStr = JSON.stringify(result, null, 2);
console.log(`[extract] DONE — ${all.length} documents extracted across ${pageIdx} pages`);
console.log(`[extract] full-metadata count: ${all.filter(d => d.description && d.asset_file_name).length}`);
try { await navigator.clipboard.writeText(jsonStr); console.log("[extract] ✓ JSON copied to clipboard"); }
catch (e) { console.warn("[extract] clipboard failed (focus the tab and re-run if needed):", e.message); }
const blob = new Blob([jsonStr], { type: "application/json" });
window.open(URL.createObjectURL(blob), "_blank");
console.log("[extract] ✓ JSON opened in new tab — save with Cmd+S");
console.log("[extract] sample doc:", all[0]);
return result;
})();