/**
 * sitemap.xml — built on the server from the filesystem and the database.
 *
 * Aggregates, in this order:
 *  - Static pages (home, bureau, sub-pages)
 *  - Every declassified document (/d/<id>)
 *  - Every case report (/c/<slug>)
 *  - Up to 500 entities per class (/e/<folder>/<entity_id>). Entities whose
 *    summary_status is 'ai_generated' or 'curated' are ranked first and get
 *    a lastModified from summary_generated_at, which helps crawlers
 *    re-index when we re-enrich. The remaining slots go to the most
 *    mentioned entities without a summary. The cap applies to the class as
 *    a whole so the sitemap doesn't balloon past Google's 50k limit.
 *
 * Per Next.js the file must export a default function returning a flat
 * MetadataRoute.Sitemap array. ChangeFreq/priority are honoured by most
 * crawlers as hints.
 */
import type { MetadataRoute } from "next";
import { listDocuments } from "@/lib/wiki";
import { pgQuery } from "@/lib/retrieval/db";
import { readdir } from "node:fs/promises";
import path from "node:path";

// Without these, Next.js statically generates the sitemap at build time
// when the DB is unreachable from the build container — which is why we
// were getting only 9 static URLs in production.
// NOTE(review): "force-dynamic" and a positive `revalidate` look like they
// pull in different directions — confirm against the Next.js route segment
// config docs which of the two is actually intended here.
export const dynamic = "force-dynamic";
export const revalidate = 3600;

// Trailing slashes are stripped so that `${SITE_URL}/d/...` never contains
// "//" when the env var is configured as "https://example.com/".
const SITE_URL = (
  process.env.NEXT_PUBLIC_SITE_URL ?? "https://disclosure.top"
).replace(/\/+$/, "");
const CASE_ROOT = process.env.CASE_ROOT ?? "/data/ufo/case";

type Url = MetadataRoute.Sitemap[number];

/** A static page entry; `url` is a site-relative path, not an absolute URL. */
type StaticPage = Required<Pick<Url, "url" | "priority" | "changeFrequency">>;

/** Maps `entities.entity_class` values to the URL folder used under /e/. */
const ENTITY_FOLDER_BY_CLASS: Record<string, string> = {
  event: "events",
  person: "people",
  uap_object: "uap-objects",
  location: "locations",
  organization: "organizations",
};

const STATIC_PAGES: readonly StaticPage[] = [
  { url: "/", priority: 1.0, changeFrequency: "daily" },
  { url: "/bureau", priority: 0.9, changeFrequency: "weekly" },
  { url: "/sightings", priority: 0.9, changeFrequency: "weekly" },
  { url: "/witnesses", priority: 0.8, changeFrequency: "weekly" },
  { url: "/objects", priority: 0.8, changeFrequency: "weekly" },
  { url: "/locations", priority: 0.8, changeFrequency: "weekly" },
  { url: "/operations", priority: 0.8, changeFrequency: "weekly" },
  { url: "/documents", priority: 0.8, changeFrequency: "weekly" },
  { url: "/search", priority: 0.5, changeFrequency: "monthly" },
];

/**
 * Logs why a sitemap section was skipped. Every dynamic section is
 * best-effort: a failure must never take the whole sitemap down, but it
 * should not go unnoticed either.
 */
function warnSkipped(section: string, err: unknown): void {
  const reason = err instanceof Error ? err.message : String(err);
  console.warn(`[sitemap] skipping ${section}: ${reason}`);
}

/**
 * Converts a timestamp to a Date, returning `fallback` when the value is
 * missing, empty, or does not parse to a valid date.
 */
function toValidDate(
  value: string | Date | null | undefined,
  fallback: Date,
): Date {
  if (value == null || value === "") return fallback;
  const parsed = value instanceof Date ? value : new Date(value);
  return Number.isNaN(parsed.getTime()) ? fallback : parsed;
}

/** 1. Top-level pages. */
function staticUrls(now: Date): Url[] {
  return STATIC_PAGES.map(
    (page): Url => ({
      url: `${SITE_URL}${page.url}`,
      lastModified: now,
      changeFrequency: page.changeFrequency,
      priority: page.priority,
    }),
  );
}

/** 2. Documents — one /d/<id> entry per id returned by listDocuments(). */
async function documentUrls(now: Date): Promise<Url[]> {
  try {
    const docIds = await listDocuments();
    const urls: Url[] = [];
    for (const id of docIds) {
      urls.push({
        url: `${SITE_URL}/d/${id}`,
        lastModified: now,
        changeFrequency: "monthly",
        priority: 0.7,
      });
    }
    return urls;
  } catch (err) {
    warnSkipped("documents", err);
    return [];
  }
}

/** 3. Case reports — one /c/<slug> entry per *.md file in CASE_ROOT/reports. */
async function caseUrls(now: Date): Promise<Url[]> {
  try {
    const files = await readdir(path.join(CASE_ROOT, "reports"));
    return files
      .filter((file) => file.endsWith(".md"))
      .map(
        (file): Url => ({
          url: `${SITE_URL}/c/${file.replace(/\.md$/, "")}`,
          lastModified: now,
          changeFrequency: "monthly",
          priority: 0.95,
        }),
      );
  } catch (err) {
    warnSkipped("case reports", err);
    return [];
  }
}

/**
 * 4. Entities of one class — those with summaries first (higher priority),
 * then the top by mention count, capped at 500 rows for the class. The cap
 * avoids blowing past sitemap size limits (Google: 50k urls, 50MB).
 */
async function entityUrls(
  klass: string,
  folder: string,
  now: Date,
): Promise<Url[]> {
  try {
    const rows = await pgQuery<{
      entity_id: string;
      summary_generated_at: string | null;
      summary_status: string | null;
      total_mentions: number;
    }>(
      `SELECT entity_id, summary_generated_at, summary_status, total_mentions
         FROM public.entities
        WHERE entity_class = $1
          AND total_mentions >= 1
        ORDER BY (summary_status IN ('ai_generated','curated')) DESC,
                 total_mentions DESC,
                 entity_id ASC
        LIMIT 500`,
      [klass],
    );
    const urls: Url[] = [];
    for (const row of rows) {
      const hasSummary =
        row.summary_status === "ai_generated" ||
        row.summary_status === "curated";
      urls.push({
        url: `${SITE_URL}/e/${folder}/${row.entity_id}`,
        // An unparseable timestamp falls back to `now` rather than emitting
        // an Invalid Date into the sitemap.
        lastModified: toValidDate(row.summary_generated_at, now),
        changeFrequency: "monthly",
        priority: hasSummary ? 0.7 : 0.4,
      });
    }
    return urls;
  } catch (err) {
    warnSkipped(`entities of class "${klass}"`, err);
    return [];
  }
}

/**
 * Builds the flat sitemap array that Next.js serialises to sitemap.xml.
 *
 * The dynamic sections (documents, case reports, each entity class) are
 * independent, so they are fetched concurrently. Each one handles its own
 * failure and yields an empty list, so the returned promise does not reject
 * because a single source is unavailable — the static pages are always
 * present.
 *
 * @returns Static pages, then documents, then case reports, then entities
 *   grouped by class in the key order of ENTITY_FOLDER_BY_CLASS.
 */
export default async function sitemap(): Promise<MetadataRoute.Sitemap> {
  const now = new Date();

  const [docs, cases, entityGroups] = await Promise.all([
    documentUrls(now),
    caseUrls(now),
    Promise.all(
      Object.entries(ENTITY_FOLDER_BY_CLASS).map(([klass, folder]) =>
        entityUrls(klass, folder, now),
      ),
    ),
  ]);

  return [...staticUrls(now), ...docs, ...cases, ...entityGroups.flat()];
}