181 lines
7.4 KiB
JavaScript
181 lines
7.4 KiB
JavaScript
/**
 * 00-extract-war-gov.js — Console-based extractor for war.gov/UFO/Release-NN/
 *
 * Works on any release page (Release-01, Release-02, etc.) because it derives
 * everything from the DOM, not from a hardcoded release number.
 *
 * USAGE (Chrome on https://www.war.gov/UFO/Release-NN/):
 *   1. Wait for the page to load — scroll to the bottom to trigger lazy-load
 *      if there are images you haven't scrolled past
 *   2. Open DevTools (Cmd+Option+I / F12) → Console
 *   3. Paste this ENTIRE file. Press Enter.
 *   4. Wait ~3-5 minutes (158 docs × ~2s each click+wait).
 *   5. The JSON is opened in a new tab AND copied to clipboard.
 *      Save it under /Users/guto/ufo/processing/war-gov-metadata/
 *      as all-documents-release-NN.json (or paste it back to me).
 *
 * What it captures per document:
 *   - record_id        (record-001..record-NNN — internal id)
 *   - title            (as printed in the modal heading; case correct)
 *   - agency
 *   - release_date, incident_date, incident_location, document_type
 *   - description      (the unique paragraph shown in the detail overlay)
 *   - thumbnail_url    (Akamai-hosted JPG preview)
 *   - pdf_url_inferred (replaces "/thumbnail/" with "/" and ".jpg" with the
 *                       proper extension based on document_type)
 *
 * The script is READ-ONLY — it never submits, never modifies the page beyond
 * opening and closing the detail modal.
 */
|
||
|
||
(async function extractWarGovFull() {
|
||
// Promise-based delay: settles (with no value) once `ms` milliseconds have passed.
const sleep = (ms) =>
  new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
|
||
|
||
/**
 * Repeatedly evaluates `predicate` until it returns a truthy value.
 *
 * The predicate is called immediately, then again every `interval` ms.
 * Written as an async loop so that an exception thrown by the predicate on
 * ANY tick rejects the returned promise (a throw inside a bare `setTimeout`
 * callback would otherwise leave the promise pending forever).
 *
 * @param {() => *} predicate - Called on every tick; a truthy return ends the wait.
 * @param {{timeout?: number, interval?: number}} [opts]
 *   `timeout`  — max wait in ms before giving up (default 3000).
 *   `interval` — delay between ticks in ms (default 50).
 * @returns {Promise<*>} Resolves with the first truthy value the predicate returns.
 * @throws {Error} `Error("timeout")` when the deadline passes, or whatever the
 *   predicate itself throws.
 */
async function pollUntil(predicate, opts = {}) {
  const timeout = opts.timeout ?? 3000;
  const interval = opts.interval ?? 50;
  const start = Date.now();

  for (;;) {
    const value = predicate();
    if (value) return value;
    // The deadline is checked after the predicate, so the predicate always runs at least once.
    if (Date.now() - start > timeout) throw new Error("timeout");
    await new Promise((resolve) => setTimeout(resolve, interval));
  }
}
|
||
|
||
/**
 * Normalises a text value: removes one pair of square brackets that wraps the
 * whole value (e.g. "[PDF]" → "PDF") and trims surrounding whitespace.
 *
 * The `s` (dotAll) flag lets the bracketed content span several lines; without
 * it a multi-line "[...]" value would keep its brackets.
 *
 * @param {*} s - Text to normalise; any falsy value is treated as "".
 * @returns {string} The unwrapped, trimmed text.
 */
function strip(s) {
  return String(s || "").replace(/^\s*\[(.*)\]\s*$/s, "$1").trim();
}
|
||
|
||
/**
 * Reads the currently rendered record-detail modal into a plain object.
 *
 * Fields produced:
 *   - title, description : trimmed innerText, or null when the element is absent
 *   - agency             : bracket-stripped innerText ("" when the element is absent)
 *   - one key per `.record-modal-fact` <dt>/<dd> pair; the key is the <dt> label
 *     lower-cased with non-alphanumeric runs collapsed to "_"
 *   - thumbnail_url / pdf_url_inferred : only when an image with a `src` is found
 *   - record_kind        : the modal's `data-record-kind` attribute, or null
 *
 * @returns {Object|null} The parsed record, or null when no modal is in the DOM.
 */
function parseModal() {
  const modal = document.querySelector('.record-modal-shell, [data-record-modal-shell]');
  if (!modal) return null;

  const out = {};

  const titleEl = modal.querySelector('[data-record-modal-title], #record-modal-title');
  out.title = titleEl ? titleEl.innerText.trim() : null;

  const agencyEl = modal.querySelector('[data-record-modal-agency]');
  out.agency = strip(agencyEl?.innerText);

  const descEl = modal.querySelector('[data-record-modal-copy], .record-modal-copy');
  out.description = descEl ? descEl.innerText.trim() : null;

  // dl facts
  modal.querySelectorAll('.record-modal-fact').forEach((fact) => {
    const dt = fact.querySelector('dt');
    const dd = fact.querySelector('dd');
    if (!dt || !dd) return;
    const key = dt.innerText.trim().toLowerCase().replace(/[^a-z0-9]+/g, "_").replace(/^_|_$/g, "");
    // A label with no letters/digits normalises to "" — skip it rather than emit an unnamed field.
    if (!key) return;
    out[key] = strip(dd.innerText);
  });

  // Thumbnail. A selector list ('#record-main-image, img') returns the first
  // match in document order, so the id is tried on its own first; any other
  // <img> in the modal is only a fallback.
  const img = modal.querySelector('#record-main-image') ?? modal.querySelector('img');
  if (img && img.src) {
    out.thumbnail_url = img.src;
    // Infer PDF/asset url: drop "/thumbnail" segment, restore extension based on document_type
    const ext = (out.document_type || ".pdf").toLowerCase().replace(/^\[?\.?/, ".").replace(/\]$/, "");
    out.pdf_url_inferred = img.src.replace("/thumbnail/", "/").replace(/\.jpg$/i, ext);
  }

  // Record kind from modal data attr (pdf|vid|img)
  out.record_kind = modal.getAttribute("data-record-kind") || null;

  return out;
}
|
||
|
||
/**
 * Opens the detail modal for one record row, parses it, then closes it again.
 *
 * The modal is only parsed once its title equals the row's title
 * (case-insensitive); this guards against reading content left over from a
 * previously opened record. If that never happens within 2.5 s the record is
 * returned with just its id.
 *
 * @param {HTMLElement} row - A record row button (reads `data-record-id` and `.record-title`).
 * @returns {Promise<Object>} `{ record_id, ...fields from parseModal() }`.
 */
async function clickRowAndCapture(row) {
  // Same selectors parseModal() accepts, so both agree on what counts as "the modal".
  const MODAL_SELECTOR = '.record-modal-shell, [data-record-modal-shell]';
  const TITLE_SELECTOR = '[data-record-modal-title], #record-modal-title';

  const recordId = row.dataset.recordId || row.getAttribute("data-record-id");
  const expected = row.querySelector('.record-title')?.innerText?.trim();

  // open modal
  row.click();

  let modalData = null;
  try {
    await pollUntil(() => {
      const m = document.querySelector(MODAL_SELECTOR);
      if (!m) return null;
      // Wait until the title matches the row's title (modal can be stale from previous open)
      const t = m.querySelector(TITLE_SELECTOR)?.innerText?.trim();
      return t && expected && t.toLowerCase() === expected.toLowerCase() ? m : null;
    }, { timeout: 2500 });
    modalData = parseModal();
  } catch (e) {
    // Best-effort: keep going with the next record, but say why this one failed.
    console.warn(` ${recordId}: modal did not load for "${row.querySelector('.record-title')?.innerText}" (${e.message})`);
  }

  // close modal
  const closeBtn = document.querySelector('.record-modal-close, [data-record-modal-close]');
  if (closeBtn) {
    closeBtn.click();
  } else {
    // Press Escape; `bubbles` lets handlers registered on window see the event too.
    document.dispatchEvent(new KeyboardEvent("keydown", { key: "Escape", bubbles: true }));
  }

  // Waiting for the modal to leave the DOM is best-effort: a timeout here is ignored.
  await pollUntil(() => !document.querySelector(MODAL_SELECTOR), { timeout: 1500 }).catch(() => {});

  return { record_id: recordId, ...(modalData || {}) };
}
|
||
|
||
/**
 * Captures every record row on the page currently shown.
 *
 * Rows are processed strictly one after another, with a short pause between
 * them, because each capture opens and closes the detail modal.
 *
 * @returns {Promise<Object[]>} One record per `button.record-row`, in DOM order.
 */
async function captureCurrentPage() {
  const rows = Array.from(document.querySelectorAll('button.record-row'));
  const out = [];
  for (const row of rows) {
    out.push(await clickRowAndCapture(row));
    await sleep(120); // small breather between cards
  }
  return out;
}
|
||
|
||
// Looks up the pagination "next" button; yields null when the page has none.
const findNext = () => {
  return document.querySelector('button.pagination-next');
};
|
||
|
||
// Two-digit release number taken from the URL path (".../Release-NN/");
// falls back to "01" when the path has no Release-NN segment.
const release = (location.pathname.match(/Release-(\d+)/i)?.[1] ?? "01").padStart(2, "0");
console.log(`[extract] starting on Release-${release}`);
|
||
|
||
// Accumulated records across all pages, de-duplicated by record id
// (or by title + incident date when a row carries no id).
const all = [];
const seen = new Set();
const MAX_PAGES = 25; // hard stop so a broken pager cannot loop forever
let pageIdx = 0;

while (pageIdx < MAX_PAGES) {
  pageIdx++;
  const firstBefore = document.querySelector('button.record-row')?.dataset.recordId;
  const t0 = performance.now();
  const captured = await captureCurrentPage();

  let added = 0;
  for (const r of captured) {
    const key = r.record_id || `${r.title}|${r.incident_date}`;
    if (seen.has(key)) continue;
    seen.add(key);
    all.push(r);
    added++;
  }
  console.log(`[extract] page ${pageIdx}: captured ${captured.length} (+${added} new, total ${all.length}, ${(performance.now()-t0|0)}ms)`);

  // A later page that yields nothing new means the pager did not advance;
  // stop instead of re-scraping the same rows until MAX_PAGES is exhausted.
  if (pageIdx > 1 && captured.length > 0 && added === 0) {
    console.warn(`[extract] page ${pageIdx} produced no new records — pagination appears stuck, stopping`);
    break;
  }

  const next = findNext();
  if (!next || next.disabled || next.getAttribute("aria-disabled") === "true") {
    console.log("[extract] NEXT disabled — last page reached");
    break;
  }
  if (pageIdx >= MAX_PAGES) {
    console.warn(`[extract] stopped after MAX_PAGES (${MAX_PAGES}) with more pages remaining — results are incomplete`);
    break;
  }

  next.click();
  // Wait until row content changes (first row's record id differs from the previous page's).
  await pollUntil(() => {
    const f = document.querySelector('button.record-row')?.dataset.recordId;
    return f && f !== firstBefore ? f : null;
  }, { timeout: 3000 }).catch(() => {
    // Not fatal: rows without a record id never satisfy the check above.
    console.warn(`[extract] could not confirm that page ${pageIdx + 1} loaded within 3s — continuing`);
  });
  await sleep(200);
}
|
||
|
||
const result = {
|
||
extracted_at: new Date().toISOString(),
|
||
source_url: location.href,
|
||
release: `Release-${release}`,
|
||
total_documents: all.length,
|
||
pages_visited: pageIdx,
|
||
documents: all,
|
||
};
|
||
|
||
const jsonStr = JSON.stringify(result, null, 2);
|
||
console.log(`[extract] DONE — ${all.length} documents extracted across ${pageIdx} pages`);
|
||
console.log(`[extract] full-metadata count: ${all.filter(d => d.description && d.asset_file_name).length}`);
|
||
try { await navigator.clipboard.writeText(jsonStr); console.log("[extract] ✓ JSON copied to clipboard"); }
|
||
catch (e) { console.warn("[extract] clipboard failed (focus the tab and re-run if needed):", e.message); }
|
||
const blob = new Blob([jsonStr], { type: "application/json" });
|
||
window.open(URL.createObjectURL(blob), "_blank");
|
||
console.log("[extract] ✓ JSON opened in new tab — save with Cmd+S");
|
||
console.log("[extract] sample doc:", all[0]);
|
||
return result;
|
||
})();
|