Add reextract pipeline (scripts/reextract/) that rebuilds doc-level entity JSON from Sonnet-vision chunks via Opus, replacing the noisy per-page extraction. Add synthesize scripts to regenerate wiki/entities from the 116 _reextract.json (30), aggregate missing page.md from chunks (31), and reprocess 805 pages the doc-rebuilder agent dropped on context overflow (32). Add maintain scripts 43-56 for chunk-page sync, dedup, generic-entity marking, and typed relation extraction. Web: wire relations API + entity-relations component; entity/timeline/doc pages consume the rebuilt layer. Note: raw/, processing/, wiki/ remain gitignored (bulk data managed separately); the 116 reextract JSONs and 7,798 rebuilt entity files live on disk only. The 27 curated anchor events under wiki/entities/events/ are preserved. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
205 lines
3.4 KiB
YAML
205 lines
3.4 KiB
YAML
# Closed enums for Sonnet re-extraction.
|
|
# After extraction, the validator rejects any JSON containing a value outside these lists.
|
|
# Add new values here — NEVER let Sonnet invent its own.
|
|
|
|
doc_classification:
|
|
- mission_report
|
|
- intelligence_memo
|
|
- fbi_internal_correspondence
|
|
- press_clipping
|
|
- photo_with_caption
|
|
- sketch_or_diagram
|
|
- witness_statement
|
|
- radio_transcript
|
|
- foia_release
|
|
- operation_log
|
|
- policy_document
|
|
- administrative_form
|
|
- blank_page_or_separator
|
|
- investigation_file
|
|
- aviation_incident_report
|
|
- debriefing_transcript
|
|
- other_specify
|
|
|
|
noise_emission:
|
|
- none # 100% investigative content
|
|
- low # >70% investigative content, some routing marks/stamps
|
|
- medium # 40-70% investigative content
|
|
- high # <40% investigative content (internal routing, summary, index)
|
|
|
|
investigative_value:
|
|
- critical
|
|
- high
|
|
- medium
|
|
- low
|
|
- none
|
|
|
|
primary_topics:
|
|
- flying_disc_sightings
|
|
- uap_encounter
|
|
- radar_visual_correlation
|
|
- aviation_incident
|
|
- foreign_object_recovery
|
|
- operation_paperclip
|
|
- cold_war_intelligence
|
|
- nuclear_facility_overflight
|
|
- astronaut_observation
|
|
- photographic_evidence
|
|
- contactee_phenomena
|
|
- underwater_unidentified_object
|
|
- transmedium_observation
|
|
- government_disclosure
|
|
- debunking_explanation
|
|
- witness_interrogation
|
|
- policy_directive
|
|
- administrative_routing
|
|
- other
|
|
|
|
event_class:
|
|
- uap_encounter
|
|
- press_release
|
|
- investigation_opened
|
|
- investigation_closed
|
|
- testimony_recorded
|
|
- document_published
|
|
- meeting_held
|
|
- flight_operation
|
|
- radar_detection
|
|
- photo_analysis
|
|
- personnel_change
|
|
- policy_change
|
|
- communication_sent
|
|
- communication_received
|
|
- arrest
|
|
- trial
|
|
- death
|
|
- launch_event
|
|
- recovery_operation
|
|
- intercept_attempt
|
|
- other_specify
|
|
|
|
person_class:
|
|
- military_officer
|
|
- enlisted_personnel
|
|
- civilian_witness
|
|
- government_official
|
|
- law_enforcement
|
|
- scientist
|
|
- journalist
|
|
- pilot
|
|
- radar_operator
|
|
- intelligence_officer
|
|
- foreign_national
|
|
- clergy
|
|
- civilian
|
|
- astronaut
|
|
- politician
|
|
- lawyer
|
|
- business_person
|
|
- unknown
|
|
|
|
org_class:
|
|
- military_unit
|
|
- military_branch
|
|
- government_agency
|
|
- civilian_agency
|
|
- law_enforcement
|
|
- intelligence_agency
|
|
- research_institution
|
|
- civilian_organization
|
|
- foreign_government
|
|
- media_organization
|
|
- contactee_group
|
|
- religious_organization
|
|
- corporation
|
|
- unknown
|
|
|
|
geo_class:
|
|
- city
|
|
- state
|
|
- country
|
|
- region
|
|
- military_base
|
|
- airfield
|
|
- building
|
|
- waterway
|
|
- mountain
|
|
- desert
|
|
- rural_area
|
|
- sea_or_ocean
|
|
- coastal
|
|
- lake
|
|
- river
|
|
- submerged
|
|
- airspace
|
|
- unknown
|
|
|
|
uap_shape:
|
|
- disc
|
|
- cigar
|
|
- sphere
|
|
- triangle
|
|
- rectangle
|
|
- cluster
|
|
- light_only
|
|
- cone
|
|
- dome
|
|
- irregular
|
|
- tic_tac
|
|
- cylindrical
|
|
- cross
|
|
- boomerang
|
|
- unknown
|
|
|
|
uap_medium:
|
|
- air
|
|
- sea_surface
|
|
- submerged
|
|
- transmedium
|
|
- space
|
|
- ground
|
|
- multiple
|
|
- unknown
|
|
|
|
uap_color:
|
|
- white
|
|
- silver_metallic
|
|
- black
|
|
- red
|
|
- orange
|
|
- yellow
|
|
- green
|
|
- blue
|
|
- multicolored
|
|
- dark_unspecified
|
|
- bright_unspecified
|
|
- unknown
|
|
|
|
date_confidence:
|
|
- exact
|
|
- month
|
|
- year
|
|
- decade
|
|
- unknown
|
|
|
|
confidence:
|
|
- high
|
|
- medium
|
|
- low
|
|
|
|
relation_type:
|
|
- witnessed
|
|
- occurred_at
|
|
- involves_uap
|
|
- documented_in
|
|
- authored
|
|
- signed
|
|
- mentioned_by
|
|
- employed_by
|
|
- operated_by
|
|
- investigated
|
|
- commanded
|
|
- related_to
|
|
- similar_to
|
|
- precedes
|
|
- follows
|