136 lines
5.5 KiB
Python
136 lines
5.5 KiB
Python
|
|
#!/usr/bin/env python3
"""
40_reading_version.py — Generate a clean, webdesigner-grade reading version of a
scanned document from its already-extracted chunks.

The scanned PDFs are messy: duplicate transcriptions (typed + handwritten),
OCR noise, repeated headers/banners, two classification variants of the same
narrative. This pass uses an LLM to produce ONE clean, deduplicated, well-
structured bilingual Markdown document for reading — faithful to the content
(invents nothing) but merging duplicate versions and dropping page furniture.

Output: raw/<doc>--subagent/reading.md (frontmatter + EN + PT-BR sections)

The raw chunks and per-page scan view stay untouched ("ver scan original").

Run:
    python3 scripts/synthesize/40_reading_version.py <doc-id>
"""
|
||
|
|
from __future__ import annotations
|
||
|
|
import os
|
||
|
|
import subprocess
|
||
|
|
import sys
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
# Filesystem layout used by this script.
# NOTE(review): UFO is a machine-specific absolute path — confirm it matches
# the checkout location before running on another host.
UFO = Path("/Users/guto/ufo")
# Per-document output folders live under raw/ (see main(): raw/<doc>--subagent/).
RAW = UFO / "raw"
# Helper script invoked by build_doc_text() to assemble a document's text.
BUILD_DOC = UFO / "scripts" / "reextract" / "build_doc_text.py"
|
||
|
|
|
||
|
|
# Prompt template sent to the LLM. It is rendered with str.format(), so the
# only placeholders are {doc_id} and {doc_text}; any literal brace added to
# this text in the future must be doubled ({{ / }}).
PROMPT = """You are a meticulous archivist-typographer for The Disclosure Bureau, an
investigative wiki of declassified UAP/UFO documents. You receive the raw
machine-extracted text of ONE scanned document (chunk by chunk, with page
markers). The scan is messy: it often contains the SAME content twice (e.g. a
typed transcript followed by a handwritten re-transcription, or a SECRET//NOFORN
narrative immediately followed by a near-identical SECRET//REL version), plus
OCR noise, repeated letterheads, classification banners, page numbers and
routing stamps.

Produce ONE clean, faithful, beautifully structured reading version in Markdown.

RULES (non-negotiable):
1. FAITHFUL — never invent facts, names, dates, codes, or quotes. Use only what
is in the text. If something is redacted/illegible, keep a marker like
[redacted] / [ilegível].
2. DEDUPLICATE — when the same content appears more than once (typed vs
handwritten, NOFORN vs REL), MERGE into a single best version. Prefer the
most complete/legible wording. Never drop unique details that appear in only
one version (e.g. a line spoken by a different person).
3. DROP PAGE FURNITURE — remove repeated letterheads, classification banners,
bare page numbers, routing stamps, "DISPATCHED" stamps, distribution lists,
and OCR garbage. Keep ONE classification line at the top if present.
4. STRUCTURE — use clear Markdown: a top H1 title, short intro line, logical
headings (## sections), and for transcripts use a clean dialogue format
(**SPEAKER:** line). Preserve chronological/communication order.
5. BILINGUAL — output BOTH languages. First the full English reading version
under "## Reading (EN)", then the full Brazilian-Portuguese version under
"## Leitura (PT-BR)". PT-BR must be natural Brazilian Portuguese with correct
accents.
6. PRESERVE INVESTIGATIVE SUBSTANCE — every sighting detail, coordinate, time,
witness name, object description, and quote that matters to an investigation
must survive the cleanup.

Return ONLY the Markdown body (no code fence, no preamble). Start directly with
the H1 title line.

DOCUMENT (doc_id: {doc_id}) — raw extracted chunks follow:

{doc_text}
"""
|
||
|
|
|
||
|
|
# Comma-separated tool names handed to `claude --disallowed-tools` in
# call_llm(). Adjacent string literals are concatenated by Python, so each
# fragment except the last must end with a comma.
DISALLOWED = (
    "AskUserQuestion,Bash,Edit,Write,Read,Task,Glob,Grep,"
    "TaskCreate,TaskUpdate,TaskList,TaskGet,TaskStop,TaskOutput,"
    "Skill,ScheduleWakeup,Monitor,WebSearch,WebFetch,NotebookEdit,"
    "EnterPlanMode,ExitPlanMode,EnterWorktree,ExitWorktree,"
    "CronCreate,CronDelete,CronList,RemoteTrigger,ToolSearch,"
    "PushNotification,ListMcpResourcesTool,ReadMcpResourceTool,"
    "ShareOnboardingGuide"
)
|
||
|
|
|
||
|
|
|
||
|
|
def build_doc_text(doc_id: str) -> str:
    """Return the extracted text for *doc_id* as printed by the BUILD_DOC script.

    Runs ``python3 <BUILD_DOC> <doc_id>`` as a child process and returns its
    standard output decoded as UTF-8.

    Args:
        doc_id: Document identifier, passed through as the script's only
            argument.

    Returns:
        The script's stdout, unmodified.

    Raises:
        SystemExit: If the script exits with a non-zero status; the message
            carries the script's stderr.
    """
    r = subprocess.run(
        ["python3", str(BUILD_DOC), doc_id],
        capture_output=True,
        text=True,
        encoding="utf-8",
    )
    if r.returncode != 0:
        sys.exit(f"build_doc_text failed: {r.stderr}")
    return r.stdout
|
||
|
|
|
||
|
|
|
||
|
|
def call_llm(prompt: str) -> str:
    """Send *prompt* to the ``claude`` CLI and return what it wrote to stdout.

    The CLI is started with ``-p --model sonnet --output-format text`` and the
    tool list in ``DISALLOWED`` passed to ``--disallowed-tools``. The prompt
    goes in on stdin as UTF-8. Stdout is spooled to a temporary file, read
    back as UTF-8 once the process has finished, and the file is removed
    before returning (also on failure).

    Args:
        prompt: Fully rendered prompt text.

    Returns:
        The CLI's stdout as a string.

    Raises:
        SystemExit: If the ``claude`` executable cannot be found, the call
            exceeds the timeout, or the CLI exits with a non-zero status
            (message includes the first 500 characters of stderr).
    """
    import tempfile  # local import: only this function needs it

    timeout_s = 600
    # Inherit the caller's environment and set the CLI's output-token variable.
    env = {**os.environ, "CLAUDE_CODE_MAX_OUTPUT_TOKENS": "32000"}
    cmd = [
        "claude", "-p", "--model", "sonnet", "--output-format", "text",
        "--disallowed-tools", DISALLOWED,
    ]

    fd, tmp = tempfile.mkstemp(suffix=".txt")
    try:
        try:
            # The file is closed on leaving the `with`, before it is read back.
            with os.fdopen(fd, "wb") as out:
                r = subprocess.run(
                    cmd,
                    input=prompt.encode("utf-8"),
                    stdout=out,
                    stderr=subprocess.PIPE,
                    env=env,
                    timeout=timeout_s,
                )
        except FileNotFoundError:
            sys.exit("claude CLI not found on PATH")
        except subprocess.TimeoutExpired:
            sys.exit(f"claude timed out after {timeout_s}s")
        if r.returncode != 0:
            sys.exit(f"claude failed rc={r.returncode}: {r.stderr.decode('utf-8','replace')[:500]}")
        return Path(tmp).read_text(encoding="utf-8")
    finally:
        # Best-effort cleanup: a leftover temp file must not mask the real
        # outcome (return value or SystemExit) of the call above.
        try:
            os.unlink(tmp)
        except OSError:
            pass
|
||
|
|
|
||
|
|
|
||
|
|
def _strip_outer_fence(md: str) -> str:
    """Remove one wrapping Markdown code fence from *md*, if there is one.

    Only the opening fence line and a matching final fence line are dropped;
    fenced blocks inside the document are left intact. Text that does not
    start with a fence is returned stripped but otherwise unchanged.
    """
    text = md.strip()
    if not text.startswith("```"):
        return text
    lines = text.splitlines()[1:]  # drop the opening fence (and its info string)
    if lines and lines[-1].startswith("```"):
        lines.pop()  # drop the closing fence
    return "\n".join(lines).strip()


def main() -> int:
    """Build the reading version for the doc id given on the command line.

    Steps:
      1. Collect the document text via build_doc_text().
      2. Render PROMPT with it and ask the LLM for the Markdown body.
      3. Write frontmatter + body to ``RAW/<doc-id>--subagent/reading.md``.

    Returns:
        0 on success.

    Raises:
        SystemExit: On a missing argument, on empty model output, or when a
            helper step exits.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: 40_reading_version.py <doc-id>")
    doc_id = sys.argv[1]
    out_path = RAW / f"{doc_id}--subagent" / "reading.md"

    print(f"[1/3] building doc text for {doc_id} ...")
    doc_text = build_doc_text(doc_id)
    # Rough size estimate only: about four characters per token.
    print(f" {len(doc_text)} chars (~{len(doc_text)//4} tokens)")

    print("[2/3] generating reading version (Sonnet) ...")
    raw_md = call_llm(PROMPT.format(doc_id=doc_id, doc_text=doc_text))
    # The prompt asks for no code fence; if one was added anyway, remove only
    # that wrapper so fenced blocks inside the document survive.
    md = _strip_outer_fence(raw_md)
    if not md:
        # Do not replace an existing reading.md with a frontmatter-only file.
        sys.exit("claude returned empty output; nothing written")

    front = (
        f"---\nschema_version: \"0.1.0\"\ntype: reading\ndoc_id: {doc_id}\n"
        f"generator: sonnet-reading-v1\n---\n\n"
    )
    # Make sure the target folder exists so the write cannot fail on it.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(front + md + "\n", encoding="utf-8")
    print(f"[3/3] saved {out_path} ({len(md)} chars)")
    return 0
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())
|