From 5e18df63b111e219c444b79376fdbbf97dbf3654 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 11 Jun 2026 12:12:02 +0200 Subject: [PATCH] feat(iace): ESAW accident-stats RAG pipeline + real 2023 risk anchors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Executes the accident-statistics pipeline for the risk anchors: - Refresh contactModeEvidence with real Eurostat ESAW figures (dataset hsw_ph3_08, reference year 2023; shares given as non-fatal/fatal): impact 24.0%/21.4%, struck-by 13.0%/23.8%, sharp 14.5% (non-fatal), trapped/crushed 13.8% (fatal), + new physical/mental-stress mode 24.7% (non-fatal) → ergonomic. GT-calibrated tier VALUES unchanged; the real data confirms the ordering. - Add the versioned source document (datasources/esaw_accident_stats_2023.md, ESAW CC BY 4.0 + OSHA public-domain context) that is ingested into the core RAG collection bp_iace_accident_stats for searchable evidence. - Whitelist bp_iace_accident_stats in the RAG search handler so seeding can full-text search the statistics and cite them. Two-layer design: the small license-tagged code table stays the deterministic tier/citation lookup; the RAG holds the searchable source evidence. 
Co-Authored-By: Claude Opus 4.7 --- .../internal/api/handlers/rag_handlers.go | 19 ++++--- .../internal/iace/DATA_SOURCES.md | 13 ++++- .../datasources/esaw_accident_stats_2023.md | 57 +++++++++++++++++++ .../internal/iace/risk_data_sources.go | 27 ++++++--- 4 files changed, 95 insertions(+), 21 deletions(-) create mode 100644 ai-compliance-sdk/internal/iace/datasources/esaw_accident_stats_2023.md diff --git a/ai-compliance-sdk/internal/api/handlers/rag_handlers.go b/ai-compliance-sdk/internal/api/handlers/rag_handlers.go index 5ae880f7..2d8ddde3 100644 --- a/ai-compliance-sdk/internal/api/handlers/rag_handlers.go +++ b/ai-compliance-sdk/internal/api/handlers/rag_handlers.go @@ -24,15 +24,16 @@ func NewRAGHandlers(corpusVersionStore *ucca.CorpusVersionStore) *RAGHandlers { // AllowedCollections is the whitelist of Qdrant collections that can be queried. var AllowedCollections = map[string]bool{ - "bp_compliance_ce": true, - "bp_compliance_gesetze": true, - "bp_compliance_datenschutz": true, - "bp_compliance_gdpr": true, - "bp_dsfa_corpus": true, - "bp_dsfa_templates": true, - "bp_dsfa_risks": true, - "bp_legal_templates": true, - "bp_iace_libraries": true, + "bp_compliance_ce": true, + "bp_compliance_gesetze": true, + "bp_compliance_datenschutz": true, + "bp_compliance_gdpr": true, + "bp_dsfa_corpus": true, + "bp_dsfa_templates": true, + "bp_dsfa_risks": true, + "bp_legal_templates": true, + "bp_iace_libraries": true, + "bp_iace_accident_stats": true, } // SearchRequest represents a RAG search request. diff --git a/ai-compliance-sdk/internal/iace/DATA_SOURCES.md b/ai-compliance-sdk/internal/iace/DATA_SOURCES.md index 0b77c0d9..2fc1096e 100644 --- a/ai-compliance-sdk/internal/iace/DATA_SOURCES.md +++ b/ai-compliance-sdk/internal/iace/DATA_SOURCES.md @@ -27,9 +27,16 @@ do **not** reproduce any standard's risk-graph table, decision tree or matrix. any generated risk-assessment export that shows engine risk numbers. 
- **URL:** https://ec.europa.eu/eurostat/statistics-explained/index.php/Accidents_at_work_-_statistics_on_causes_and_circumstances - **Aggregate facts used (anchor only):** contact-mode shares of accidents at - work, e.g. impact with stationary object ~24%, struck by moving object ~13% - (non-fatal) / ~24% (fatal), trapped/crushed ~14% (fatal), contact with sharp - agent ~15%. Retrieved 2026-06. + work. **Dataset `hsw_ph3_08`, reference year 2023** (Figure 7, "contact — + mode of injury"), EU shares: + - Physical/mental stress: 24.7% (non-fatal) + - Impact with stationary object (victim in motion): 24.0% (non-fatal) / 21.4% (fatal) + - Contact with sharp/pointed/rough agent: 14.5% (non-fatal) + - Struck by object in motion / collision: 13.0% (non-fatal) / 23.8% (fatal) + - Trapped / crushed: 13.8% (fatal) + + Retrieved 2026-06. The source document is also ingested into the core RAG + collection `bp_iace_accident_stats` for searchable evidence at seeding time. ## Acceptable supplements diff --git a/ai-compliance-sdk/internal/iace/datasources/esaw_accident_stats_2023.md b/ai-compliance-sdk/internal/iace/datasources/esaw_accident_stats_2023.md new file mode 100644 index 00000000..539c1548 --- /dev/null +++ b/ai-compliance-sdk/internal/iace/datasources/esaw_accident_stats_2023.md @@ -0,0 +1,57 @@ +# Accidents at work — contact mode of injury (EU, 2023) + +Canonical, citable source document for the IACE risk-frequency/severity anchors. +This file is the versioned artifact that is ingested into the core RAG +collection `bp_iace_accident_stats` so seeding can full-text search the evidence +and surface the figure with its citation. 
+ +## Primary source — Eurostat ESAW + +- **Source:** Eurostat — European Statistics on Accidents at Work (ESAW) +- **Dataset:** `hsw_ph3_08` — accidents at work by contact / mode of injury +- **Reference year:** 2023 (Statistics Explained, Figure 7) +- **License:** CC BY 4.0 (reuse permitted, source acknowledgement required) +- **Attribution:** `Quelle: Eurostat (ESAW) hsw_ph3_08, Bezugsjahr 2023, CC BY 4.0` +- **Retrieved:** 2026-06 +- **URL:** https://ec.europa.eu/eurostat/statistics-explained/index.php?title=Accidents_at_work_-_statistics_on_causes_and_circumstances + +### Contact mode of injury — EU shares, 2023 + +| Contact mode | Non-fatal | Fatal | +|---|---|---| +| Physical or mental stress | 24.7 % | — | +| Impact with a stationary object (victim in motion) | 24.0 % | 21.4 % | +| Contact with a sharp / pointed / rough-coarse agent | 14.5 % | — | +| Being struck by an object in motion / collision | 13.0 % | 23.8 % | +| Being trapped or crushed | — | 13.8 % | +| No contact / no information | 9.6 % | 15.1 % | + +Reading: the non-fatal column anchors the **frequency / probability tier (W)** +of a contact mode; the fatal column (and the fatal-vs-non-fatal gap) anchors its +typical **severity (S)**. Struck-by and trapped/crushed are comparatively rare +among non-fatal but over-represented among fatal accidents — i.e. lower +frequency, higher severity. + +## Supplementary context — US OSHA (public domain) + +- **Source:** OSHA — Commonly Used Statistics (U.S. Government work, public domain) +- **Retrieved:** 2026-06 · **URL:** https://www.osha.gov/data/commonstats +- 2023: **5,283** fatal work injuries in the US (3.5 per 100,000 FTE workers). +- Most frequently violated standards: Fall Protection, Hazard Communication, + Control of Hazardous Energy (Lockout/Tagout). 
+ +US BLS CFOI/SOII event-level tables (public domain) are an intended further +supplement; the BLS site blocks automated retrieval, so those figures are to be +added from a manually downloaded release. + +## How these numbers are used + +1. **Anchor (ordering):** the relative frequency/severity ordering of contact + modes above sets the *direction* of the W and S tiers in `risk_estimation.go` + (`contactModeTable`). +2. **Calibrate (values):** tier *values* are adjusted to BreakPilot ground truth; + well-sampled modes use the GT mean, sparse modes use conservative defaults — + no overfitting to a small GT sample. + +No standard's risk-graph table, decision tree or SIL/PL matrix is reproduced. +Excluded by license: DGUV statistics, DIN/Beuth/ISO/IEC tables. diff --git a/ai-compliance-sdk/internal/iace/risk_data_sources.go b/ai-compliance-sdk/internal/iace/risk_data_sources.go index fe7e0a25..d0ad029f 100644 --- a/ai-compliance-sdk/internal/iace/risk_data_sources.go +++ b/ai-compliance-sdk/internal/iace/risk_data_sources.go @@ -9,9 +9,17 @@ import "sort" // this table only carries the provenance so generated risk numbers are // auditable and correctly attributed. No raw dataset is vendored; only these // aggregate facts. Excluded by license: DGUV, DIN/Beuth/ISO/IEC. See -// DATA_SOURCES.md. RAG/Qdrant ingestion is deliberately NOT used here: ~a dozen -// stable aggregate facts are better served by a license-tagged code table than -// by vector retrieval. +// DATA_SOURCES.md. +// +// Two-layer design: this small license-tagged CODE table is the deterministic +// tier/citation lookup (fast, stable, reproducible). The underlying SOURCE +// documents are additionally ingested into the core RAG collection +// `bp_iace_accident_stats` so the seeding UI / an auditor can full-text search +// the evidence and pull the original figure — the RAG is the evidence/search +// layer, not the tier lookup. 
+// +// Figures below are the EU aggregate shares from Eurostat ESAW dataset +// hsw_ph3_08, reference year 2023 (Figure 7, "contact - mode of injury"). // RiskEvidence is the public-statistics provenance for one contact mode. type RiskEvidence struct { @@ -25,9 +33,9 @@ type RiskEvidence struct { } const ( - esawSource = "Eurostat (ESAW)" + esawSource = "Eurostat (ESAW, hsw_ph3_08, 2023)" esawLicense = "CC BY 4.0" - esawAttribution = "Quelle: Eurostat (ESAW), CC BY 4.0" + esawAttribution = "Quelle: Eurostat (ESAW) hsw_ph3_08, Bezugsjahr 2023, CC BY 4.0" esawRetrieved = "2026-06" ) @@ -40,10 +48,11 @@ func esawEvidence(mode, label, stat string) RiskEvidence { // figure is documented; other modes are anchored by the ESAW ordering and // GT-calibrated without a single citable share, so they carry no fabricated stat. var contactModeEvidence = map[string]RiskEvidence{ - "impact_stationary": esawEvidence("impact_stationary", "Anstoßen an ruhendem Objekt", "~24 % der Arbeitsunfälle"), - "struck_by": esawEvidence("struck_by", "Getroffen von bewegtem Objekt", "~13 % (nicht-tödlich) / ~24 % (tödlich)"), - "crushing": esawEvidence("crushing", "Quetschen / Einklemmen", "~14 % der tödlichen Arbeitsunfälle"), - "cutting": esawEvidence("cutting", "Kontakt mit scharfem Gegenstand", "~15 % der Arbeitsunfälle"), + "impact_stationary": esawEvidence("impact_stationary", "Anstoßen an ruhendem Objekt", "24,0 % (nicht-tödlich) / 21,4 % (tödlich)"), + "struck_by": esawEvidence("struck_by", "Getroffen von bewegtem Objekt", "13,0 % (nicht-tödlich) / 23,8 % (tödlich)"), + "crushing": esawEvidence("crushing", "Eingeklemmt / zerquetscht", "13,8 % (tödlich)"), + "cutting": esawEvidence("cutting", "Kontakt mit scharfem/spitzem Agens", "14,5 % (nicht-tödlich)"), + "ergonomic": esawEvidence("ergonomic", "Physische/psychische Belastung", "24,7 % (nicht-tödlich)"), } // RiskEvidenceFor returns the documented public statistic for a contact mode.