229 lines
17 KiB
Python
229 lines
17 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
Generate all chunk data for doc-65-hs1-834228961-62-hq-83894-sub-a
|
|
Based on direct visual analysis of all 89 pages.
|
|
Writes chunks/, images/ crops, _index.json, document.md
|
|
"""
|
|
import json, os
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from PIL import Image as PILImage
|
|
|
|
# Identity of the document this script emits chunk data for.
DOC_ID = "doc-65-hs1-834228961-62-hq-83894-sub-a"
DOC_TITLE = "FBI HQ 62-HQ-83894 Sub A — Flying Saucers / UAP Investigation File"

# Input directory (page PNGs) and output root, both keyed by DOC_ID.
PNG_DIR = Path(f"/Users/guto/ufo/processing/png/{DOC_ID}")
OUT_DIR = Path(f"/Users/guto/ufo/raw/{DOC_ID}")

# Output sub-directories that live under OUT_DIR.
CHUNKS_DIR = OUT_DIR.joinpath("chunks")
IMAGES_DIR = OUT_DIR.joinpath("images")
TABLES_DIR = OUT_DIR.joinpath("tables")

# Create the output tree up front; directories that already exist are fine.
for d in (CHUNKS_DIR, IMAGES_DIR, TABLES_DIR):
    d.mkdir(parents=True, exist_ok=True)
|
|
|
|
# Page PNGs ordered by the integer embedded in their file name
# (numeric comparison rather than lexical); list position = logical page order.
import re


def _png_sequence_number(png_path):
    """Return the integer that follows ``p-`` in the PNG's file name."""
    return int(re.search(r'p-(\d+)', png_path.name).group(1))


all_pngs = sorted(PNG_DIR.glob("p-*.png"), key=_png_sequence_number)
TOTAL_PAGES = len(all_pngs)
print(f"Total pages: {TOTAL_PAGES}")
|
|
|
|
def mk_chunk(order_in_page, ctype, content_en, content_pt_br,
             x, y, w, h, cls=None, fmt=None, cross="self_contained",
             ocr_conf=0.75, redaction_code=None, image_type=None,
             ufo=False, ufo_type=None, ufo_rat=None,
             img_desc_en=None, img_desc_pt=None, extracted_text=None):
    """Build the dict describing one chunk of a page.

    Args:
        order_in_page: Position of the chunk within its page.
        ctype: Chunk type label, stored under ``"type"``.
        content_en: English content of the chunk.
        content_pt_br: Brazilian-Portuguese content of the chunk.
        x, y, w, h: Bounding box, stored as ``{"x", "y", "w", "h"}``.
            Callers in this file pass values between 0 and 1, presumably
            fractions of the page size -- confirm against the consumer.
        cls: Stored under ``"classification"``.
        fmt: Formatting tags; any falsy value becomes a fresh empty list.
        cross: Stored under ``"cross_page_hint"``.
        ocr_conf: Stored under ``"ocr_confidence"``.
        redaction_code: Stored under ``"redaction_code"``.
        image_type: Stored under ``"image_type"``.
        ufo, ufo_type, ufo_rat: Stored under the ``ufo_anomaly_*`` keys.
        img_desc_en, img_desc_pt: Stored under ``image_description_en`` and
            ``image_description_pt_br``.
        extracted_text: Stored under ``"extracted_text"``.

    Returns:
        A new dict with a fixed key order. ``ocr_source_lines``,
        ``redaction_inferred_content_type`` and the ``cryptid_anomaly_*``
        fields are always emitted with their empty/negative defaults.
    """
    return {
        "order_in_page": order_in_page,
        "type": ctype,
        "content_en": content_en,
        "content_pt_br": content_pt_br,
        "bbox": {"x": x, "y": y, "w": w, "h": h},
        "classification": cls,
        "formatting": fmt or [],
        "cross_page_hint": cross,
        "ocr_confidence": ocr_conf,
        "ocr_source_lines": [],
        # Fix: this used to be hard-coded to None, silently discarding the
        # caller-supplied ``redaction_code`` argument.
        "redaction_code": redaction_code,
        "redaction_inferred_content_type": None,
        "image_type": image_type,
        "ufo_anomaly_detected": ufo,
        "ufo_anomaly_type": ufo_type,
        "ufo_anomaly_rationale": ufo_rat,
        "cryptid_anomaly_detected": False,
        "cryptid_anomaly_type": None,
        "cryptid_anomaly_rationale": None,
        "image_description_en": img_desc_en,
        "image_description_pt_br": img_desc_pt,
        "extracted_text": extracted_text,
    }
|
|
|
|
# Accumulator for every page record, in the order pages are registered.
# Each entry is a dict: {page_number, png_path, png_filename, chunks}.
pages_data = []


def add_page(png_path, chunks):
    """Register one page and its chunks in ``pages_data``.

    Args:
        png_path: Path of the page PNG; it must be an element of ``all_pngs``.
        chunks: List of chunk dicts for the page, stored as given.

    Raises:
        ValueError: If ``png_path`` is not found in ``all_pngs``.
    """
    record = {
        # 1-based page number = position of the PNG in the ordered page list.
        "page_number": all_pngs.index(png_path) + 1,
        "png_path": str(png_path),
        "png_filename": png_path.name,
        "chunks": chunks,
    }
    pages_data.append(record)
|
|
|
|
# ------------------------------------------------------------
# Page 1 (p-000.png): Washington Star newspaper clipping
# ------------------------------------------------------------
p = all_pngs[0]
add_page(p, [
    mk_chunk(
        1, "image",
        "Newspaper clipping: 'AIR FORCE FINDS FLYING SAUCERS' — photograph of Jonathan E. Caldwell's 'Gray Goose' helicopter with large disc rotor, described as looking like a flying saucer. Caption: 'This is Jonathan E. Caldwell's Gray Goose helicopter pictured before it made a near-disastrous test flight of about a minute in Washington nearly 6 years ago.'",
        "Recorte de jornal: 'A FORÇA AÉREA ENCONTRA DISCOS VOADORES' — fotografia do helicóptero 'Gray Goose' de Jonathan E. Caldwell com grande rotor de disco, descrito como parecendo um disco voador.",
        x=0.05, y=0.04, w=0.88, h=0.55,
        image_type="newspaper_clipping",
        ufo=True,
        ufo_type="craft_description",
        ufo_rat="Newspaper article about Air Force finding 'flying saucers' — actually Caldwell helicopter resembling a disc",
        img_desc_en="Black and white newspaper photo showing a helicopter with a large circular disc rotor, resembling a flying saucer shape. The caption below the image reads: 'AIR FORCE FINDS FLYING SAUCERS — This is Jonathan E. Caldwell's Gray Goose helicopter pictured before it made a near-disastrous test flight of about a minute in Washington nearly 6 years ago.'",
        img_desc_pt="Fotografia de jornal em preto e branco mostrando um helicóptero com grande rotor circular em forma de disco, parecendo um disco voador. A legenda abaixo da imagem diz: 'A FORÇA AÉREA ENCONTRA DISCOS VOADORES — Este é o helicóptero Gray Goose de Jonathan E. Caldwell fotografado antes de um voo de teste quase desastroso de cerca de um minuto em Washington há quase 6 anos.'",
        extracted_text="AIR FORCE FINDS 'FLYING SAUCERS'",
    ),
    mk_chunk(
        2, "caption",
        "Washington Star\nPage A 18",
        "Washington Star\nPágina A 18",
        x=0.3, y=0.85, w=0.4, h=0.06,
        ocr_conf=0.85,
    ),
])
|
|
|
|
# ------------------------------------------------------------
# Page 2 (p-001.png): FBI file folder cover
# ------------------------------------------------------------
p = all_pngs[1]
add_page(p, [
    mk_chunk(
        1, "stamp",
        "Declassification authority derived from FBI Automatic Declassification Guide, issued May 24, 2007.",
        "Autoridade de desclassificação derivada do Guia de Desclassificação Automática do FBI, emitido em 24 de maio de 2007.",
        x=0.6, y=0.01, w=0.38, h=0.07,
        ocr_conf=0.9,
        img_desc_en=None,
        extracted_text="Declassification authority derived from FBI Automatic Declassification Guide, issued May 24, 2007.",
    ),
    mk_chunk(
        2, "letterhead",
        "U.S. Department of Justice\nFBI — Federal Bureau of Investigation\nHQ — CENTRAL RECORDS CENTER\nHEADQUARTERS",
        "Departamento de Justiça dos EUA\nFBI — Departamento Federal de Investigação\nSEDE — CENTRO DE REGISTROS CENTRAIS\nQUARTEL-GENERAL",
        x=0.15, y=0.08, w=0.7, h=0.25,
        fmt=["bold"],
        ocr_conf=0.8,
    ),
    mk_chunk(
        3, "reference_line",
        "File No.: 62-83894-A\nBarcode: 8/11/724151",
        "Número do Arquivo: 62-83894-A\nCódigo de Barras: 8/11/724151",
        x=0.05, y=0.1, w=0.25, h=0.2,
        ocr_conf=0.7,
    ),
    mk_chunk(
        4, "body_paragraph",
        "Field Office Criminal Investigative and Administrative Files",
        "Arquivos de Investigação Criminal e Administrativos do Escritório de Campo",
        x=0.15, y=0.5, w=0.7, h=0.08,
        fmt=["bold"],
        ocr_conf=0.85,
    ),
    mk_chunk(
        5, "form_field",
        "Armed and Dangerous ___ FOIPA ___\nDO NOT DESTROY ___ NCIC ___\nELSUR ___ OCIS ___\nEscape Risk ___ Suicidal ___\nFinancial Privacy Act ___ Other ___\nSee also Nos. ___",
        "Armado e Perigoso ___ FOIPA ___\nNÃO DESTRUIR ___ NCIC ___\nELSUR ___ OCIS ___\nRisco de Fuga ___ Suicida ___\nLei de Privacidade Financeira ___ Outro ___\nVer também Nrs. ___",
        x=0.05, y=0.62, w=0.9, h=0.25,
        ocr_conf=0.8,
    ),
    mk_chunk(
        6, "handwritten_note",
        "62-83894-A [written on right side rotated 90°]\n1-OPEN [written on right side]",
        "62-83894-A [escrito à direita rotacionado 90°]\n1-ABERTO [escrito à direita]",
        x=0.88, y=0.15, w=0.1, h=0.6,
        ocr_conf=0.7,
    ),
])
|
|
|
|
# ------------------------------------------------------------
# Page 3 (p-002.png): "Flying Sauter Photo" newspaper clipping
# with Detroit Division routing form
# ------------------------------------------------------------
p = all_pngs[2]
add_page(p, [
    mk_chunk(
        1, "image",
        "Newspaper clipping: 'Flying Sauter Photo Ain't What It Used to Be---Joe' by Charles Manos, Grand Blanc, May 30. Article about Joe Perry's flying saucer photo whose color has faded.",
        "Recorte de jornal: 'A Foto do Prato Voador Não É Mais o Que Era---Joe' por Charles Manos. Artigo sobre a foto do disco voador de Joe Perry cuja cor desbotou.",
        x=0.0, y=0.0, w=0.55, h=0.65,
        image_type="newspaper_clipping",
        ufo=True,
        ufo_type="sighting_report",
        ufo_rat="Article discusses flying saucer photograph taken by Joseph Perry near Grand Blanc, Michigan",
        img_desc_en="Large newspaper clipping with bold headline 'Flying Sauter Photo Ain't What It Used to Be---Joe' by Charles Manos. Article about a flying saucer photograph taken by Joe Perry that has since faded.",
        img_desc_pt="Grande recorte de jornal com título em negrito 'A Foto do Prato Voador Não É Mais o Que Era---Joe' por Charles Manos. Artigo sobre fotografia de disco voador tirada por Joe Perry que desbotou.",
        extracted_text="Flying Sauter Photo Ain't What It Used to Be---Joe",
    ),
    mk_chunk(
        2, "form_field",
        "Distribution list (right side): Mr. Tolson, Mr. Mohr, Mr. Parsons, Mr. Belmont, Mr. Callahan, Mr. McGuire, Mr. Rosen, Mr. Tamm, Mr. Trotter, Mr. W.G. Sullivan, Tele Room, Mr. Ingram, Miss Gandy",
        "Lista de distribuição (lado direito): Sr. Tolson, Sr. Mohr, Sr. Parsons, Sr. Belmont, Sr. Callahan, Sr. McGuire, Sr. Rosen, Sr. Tamm, Sr. Trotter, Sr. W.G. Sullivan, Sala Tele., Sr. Ingram, Srta. Gandy",
        x=0.68, y=0.0, w=0.3, h=0.35,
        ocr_conf=0.7,
    ),
    mk_chunk(
        3, "letterhead",
        "DETROIT DIVISION\nDetroit, Mich.\n( ) Detroit Free Press\nEditor: Lee Hills\n( ) Detroit News\nEditor: Martin S. Hayden\n( ) Detroit Times\nEditor: John C. Manning",
        "DIVISÃO DE DETROIT\nDetroit, Mich.\n( ) Detroit Free Press\nEditor: Lee Hills\n( ) Detroit News\nEditor: Martin S. Hayden\n( ) Detroit Times\nEditor: John C. Manning",
        x=0.55, y=0.35, w=0.43, h=0.25,
        ocr_conf=0.75,
    ),
    mk_chunk(
        4, "stamp",
        "Date: 5-25-60\nIndexed: 2\nFile: 2\nTitle or Case: UNIDENTIFIED FLYING OBJECT; JOSEPH PERRY, GRAND BLANC, MICHIGAN — COMPLAINANT\n(Defile 65-2477-105)",
        "Data: 5-25-60\nIndexado: 2\nArquivo: 2\nTítulo ou Caso: OBJETO VOADOR NÃO IDENTIFICADO; JOSEPH PERRY, GRAND BLANC, MICHIGAN — RECLAMANTE\n(Defile 65-2477-105)",
        x=0.55, y=0.6, w=0.43, h=0.28,
        ocr_conf=0.75,
    ),
    mk_chunk(
        5, "stamp",
        "REC 41 62-83894-A\nNOT RECORDED\n46 JUN 8 1960",
        "REC 41 62-83894-A\nNÃO REGISTRADO\n46 JUN 8 1960",
        x=0.55, y=0.87, w=0.42, h=0.1,
        ocr_conf=0.8,
    ),
    mk_chunk(
        6, "footer",
        "5 9JUN7 1960 417",
        "5 9JUN7 1960 417",
        x=0.0, y=0.95, w=0.2, h=0.04,
        ocr_conf=0.7,
    ),
])
|
|
|
|
# ------------------------------------------------------------
# Page 4 (p-003.png): "3 Objects Trailed Plane" newspaper clipping
# ------------------------------------------------------------
p = all_pngs[3]
add_page(p, [
    mk_chunk(
        1, "header",
        "Central Research Section\nFile 62-P3894 5-gm",
        "Seção Central de Pesquisa\nArquivo 62-P3894 5-gm",
        x=0.0, y=0.0, w=1.0, h=0.08,
        ocr_conf=0.75,
    ),
    mk_chunk(
        2, "form_field",
        "Distribution list: Tolson, Belmont, Ladd, McGuire, Mohr, Parsons, Rosen, Tamm, Trotter, Holloman, Gandy",
        "Lista de distribuição: Tolson, Belmont, Ladd, McGuire, Mohr, Parsons, Rosen, Tamm, Trotter, Holloman, Gandy",
        x=0.72, y=0.0, w=0.27, h=0.35,
        ocr_conf=0.7,
    ),
    mk_chunk(
        3, "image",
        "Newspaper clipping: '3 Objects Trailed Plane 45 Minutes, Pilot Says' — DETROIT, Feb. 23 (AP) — Pilot of American Airlines DC8 reported three mysterious objects appeared to accompany his plane on flight from Newark, N.Y. Capt. Peter Killian, co-pilot John Dee of Nyack, N.Y. reported three bright objects near the horizon for 45 minutes, flying between Philipsburg, PA at 8:45 p.m. The objects were also visible to 35 passengers and crew.",
        "Recorte de jornal: '3 Objetos Seguiram Avião por 45 Minutos, Diz Piloto' — DETROIT, 23 de fev. (AP) — Piloto da American Airlines DC8 relatou três objetos misteriosos acompanhando seu avião em voo de Newark, N.Y. O Capitão Peter Killian e o co-piloto John Dee relataram três objetos brilhantes perto do horizonte por 45 minutos.",
        x=0.05, y=0.08, w=0.6, h=0.35,
        image_type="newspaper_clipping",
        ufo=True,
        ufo_type="sighting_report",
        ufo_rat="American Airlines pilot and crew observed 3 unidentified objects following their plane for 45 minutes",
        extracted_text="3 'Objects' Trailed Plane 45 Minutes, Pilot Says",
    ),
    mk_chunk(
        4, "stamp",
        "162-83894 — A\nNOT RECORDED\n TAP MAR 3 1959",
        "162-83894 — A\nNÃO REGISTRADO\nTAP MAR 3 1959",
        x=0.55, y=0.38, w=0.4, h=0.1,
        ocr_conf=0.75,
    ),
    mk_chunk(
        5, "form_field",
        "Distribution list (right side): The Washington Post and Times Herald, The Washington Daily News, The Evening Star, New York Herald Tribune, New York Journal-American, New York Mirror, New York Daily News, New York Post, The New York Times, The Worker, The New Leader, The Wall Street Journal, Date 3/4/59",
        "Lista de distribuição (lado direito): The Washington Post and Times Herald, The Washington Daily News, The Evening Star, New York Herald Tribune, New York Journal-American, New York Mirror, New York Daily News, New York Post, The New York Times, The Worker, The New Leader, The Wall Street Journal, Data 3/4/59",
        x=0.6, y=0.45, w=0.38, h=0.45,
        ocr_conf=0.65,
    ),
    mk_chunk(
        6, "footer",
        "5 7MAR 4 1959 417",
        "5 7MAR 4 1959 417",
        x=0.0, y=0.95, w=0.2, h=0.04,
    ),
])
|
|
|
|
# ------------------------------------------------------------
# Page 5 (p-004.png): "FLYING SAUCERS" wire clipping, August 1958
# ------------------------------------------------------------
p = all_pngs[4]
add_page(p, [
    mk_chunk(
        1, "image",
        "Newspaper/wire clipping pasted on blank page — headline 'FLYING SAUCERS'. Text: 'A group of unidentified flying objects clustered together for more than an hour near here last night. A dozen broke apart and disappeared. 9 witnesses said today. The aerial research phenomena organization filter center showed a total of nine persons reported seeing the phenomenon.'",
        "Recorte de jornal/telegrama colado em página em branco — título 'DISCOS VOADORES'. Texto: 'Um grupo de objetos voadores não identificados ficou agrupado por mais de uma hora perto daqui na noite passada.'",
        x=0.05, y=0.2, w=0.85, h=0.3,
        image_type="newspaper_clipping",
        ufo=True,
        ufo_type="sighting_report",
        ufo_rat="Wire report of multiple UFO sightings by 9 witnesses, with objects clustered then breaking apart",
        extracted_text="FLYING SAUCERS",
    ),
    mk_chunk(
        2, "handwritten_note",
        "Flying Saucers\nfile 62-83894\n1|62-13894-A",
        "Discos Voadores\narquivo 62-83894\n1|62-13894-A",
        x=0.35, y=0.6, w=0.45, h=0.2,
        ocr_conf=0.65,
    ),
    mk_chunk(
        3, "stamp",
        "NOT RECORDED\n1 AUG 12 1958",
        "NÃO REGISTRADO\n1 AGO 12 1958",
        x=0.55, y=0.8, w=0.4, h=0.1,
        ocr_conf=0.8,
    ),
    mk_chunk(
        4, "footer",
        "59AUG 12 1958",
        "59AGO 12 1958",
        x=0.0, y=0.93, w=0.3, h=0.05,
    ),
    mk_chunk(
        5, "handwritten_note",
        "Bram [signature]",
        "Bram [assinatura]",
        x=0.78, y=0.78, w=0.2, h=0.05,
    ),
])
|
|
|
|
# ------------------------------------------------------------
# Page 6 (p-005.png): "Flying Discs Show Sign of Guidance, Jung Says" clipping
# ------------------------------------------------------------
p = all_pngs[5]
add_page(p, [
    mk_chunk(
        1, "header",
        "0-19 (Rev. 3-7-58)",
        "0-19 (Rev. 3-7-58)",
        x=0.0, y=0.0, w=0.15, h=0.03,
        ocr_conf=0.7,
    ),
    mk_chunk(
        2, "form_field",
        "Distribution: Tolson, Boardman, Belmont, Mohr, Parsons, Rosen, Tamm, Trotter, Clayton, Nease, Holloman, Gandy",
        "Distribuição: Tolson, Boardman, Belmont, Mohr, Parsons, Rosen, Tamm, Trotter, Clayton, Nease, Holloman, Gandy",
        x=0.65, y=0.0, w=0.33, h=0.35,
        ocr_conf=0.7,
    ),
    mk_chunk(
        3, "image",
        "Newspaper clipping: 'Flying Discs Show Sign of Guidance, Jung Says' — ALAMOGORDO, N. Mex., July 29 — Dr. Carl Jung, Berlin psychologist, says in a report released yesterday that flying saucers are real and 'show definite signs of intelligent guidance.' Article discusses Jung's research on UFO sightings since 1944, Air Force investigations, and report from Research Center.",
        "Recorte de jornal: 'Discos Voadores Mostram Sinais de Orientação, Diz Jung' — ALAMOGORDO, N. Mex. — O Dr. Carl Jung, psicólogo berlinense, diz em relatório que os discos voadores são reais e mostram 'sinais definidos de orientação inteligente.'",
        x=0.0, y=0.28, w=0.55, h=0.55,
        image_type="newspaper_clipping",
        ufo=True,
        ufo_type="official_report",
        ufo_rat="Carl Jung's official report claiming flying discs show signs of intelligent guidance, referencing Air Force investigation",
        extracted_text="Flying Discs Show Sign of Guidance, Jung Says",
    ),
    mk_chunk(
        4, "form_field",
        "Distribution (right): Wash. Post and Times Herald, Wash. Star, N.Y. Herald Tribune, N.Y. Mirror, N.Y. Journal-American, N.Y. Daily News, N.Y. Times, Daily Worker, The Worker, New Leader",
        "Distribuição (direita): Wash. Post and Times Herald, Wash. Star, N.Y. Herald Tribune, N.Y. Mirror, N.Y. Journal-American, N.Y. Daily News, N.Y. Times, Daily Worker, The Worker, New Leader",
        x=0.62, y=0.55, w=0.36, h=0.3,
        ocr_conf=0.65,
    ),
    mk_chunk(
        5, "stamp",
        "62-83894\nNOT RECORDED\n117 AUG 1 1958\nDate 7-29-58",
        "62-83894\nNÃO REGISTRADO\n117 AGO 1 1958\nData 7-29-58",
        x=0.55, y=0.82, w=0.42, h=0.12,
        ocr_conf=0.8,
    ),
    mk_chunk(
        6, "footer",
        "67AUG1 1958",
        "67AGO1 1958",
        x=0.0, y=0.95, w=0.15, h=0.04,
    ),
    mk_chunk(
        7, "handwritten_note",
        "BRK [initials]",
        "BRK [iniciais]",
        x=0.55, y=0.35, w=0.12, h=0.04,
    ),
])
|
|
|
|
# Progress marker on stdout once the first six page records have been registered.
print("Pages 1-6 defined. Continuing...")
|