Files
breakpilot-compliance/ai-compliance-sdk/internal/iace/gt_risk_benchmark_test.go
T
Benjamin Admin 02a31b711c
CI / detect-changes (push) Successful in 6s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-backend (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
CI / build-sha-integrity (push) Failing after 5s
CI / validate-canonical-controls (push) Successful in 11s
CI / loc-budget (push) Successful in 14s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Failing after 37s
CI / iace-gt-coverage (push) Successful in 23s
fix(iace): remove EN ISO 13849-1 risk-graph reproduction; own risk model
IP/copyright fix: ComputePLr reproduced the EN ISO 13849-1 Anhang A risk-graph
decision table (S/F/P -> PLr a..e) and SeverityToS/ExposureToF its parameter
binning, emitted into every hazard description. Removed — we may not reproduce
DIN/Beuth norm logic.

Replaced with BreakPilot's OWN risk model:
- risk_estimation.go: probability (W) + avoidance (P) estimated from public,
  permissively-licensed accident statistics (Eurostat ESAW, CC BY 4.0) by
  contact mode, calibrated to our ground-truth corpus; own risk index + bands.
- iace_handler_init.go now emits "Risikoeinschaetzung (BreakPilot-Modell):
  S F W P -> Risiko: <level>" instead of the norm PLr string.
- DATA_SOURCES.md: data provenance + license register (ESAW CC BY 4.0; BLS/OSHA
  public domain; HSE OGL; DGUV + DIN/Beuth explicitly excluded).
- gt_risk_benchmark_test.go: first GT validation of risk numbers — W within +-1
  99%, P 93% vs the professional across both ground truths.

Removed risk_graph_test.go (pinned the reproduced norm table).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-09 13:10:53 +02:00

266 lines
9.6 KiB
Go

package iace
import (
	"math"
	"sort"
	"testing"
)
// ============================================================================
// Risk benchmark: engine risk parameters vs. the professional's (Fachmann) GT.
//
// The risk numbers have never been validated. This test measures — for the
// first time — how far the engine's per-pattern risk defaults are from the
// professional's EN-62061-style assessment in the ground truth, for every
// matched hazard across both GTs.
//
// COPYRIGHT NOTE: this test only COMPARES numbers (our defaults vs the GT's
// values) and computes agreement statistics. It does NOT reproduce any DIN/
// Beuth/ISO risk-graph table, parameter decision tree, or normative formula.
// The GT values are the professional's assessment of a specific machine, not
// the standard's text. Any future estimator must likewise derive parameters
// from OUR own model + PUBLIC, permissively-licensed accident data (e.g.
// Eurostat ESAW; see DATA_SOURCES.md for the license register), never from a
// transcribed norm table.
//
// Parameter mapping (engine default -> GT column, EN-62061 naming):
//
// DefaultSeverity <-> GT.S (Se, severity)
// DefaultExposure <-> GT.F (Fr, frequency / duration of exposure)
// DefaultAvoidability <-> GT.P (Av, possibility of avoidance)
// (no pattern default) <-> GT.W (Pr, probability of occurrence) <-- the gap;
//   the test compares GT.W against EstimateProbabilityW instead
//
// Run with:
//
// go test -v -vet=off -run TestGT_RiskBenchmark ./internal/iace/
// ============================================================================
// riskParams is the per-pattern record the risk tests keep for every matched
// pattern: the engine's default risk parameters plus the inputs handed to the
// W/P estimators.
type riskParams struct {
	s        int      // severity (the pattern's DefaultSeverity)
	f        int      // frequency/exposure (the pattern's DefaultExposure)
	a        int      // avoidability (the pattern's DefaultAvoidability)
	cats     []string // the pattern's HazardCats
	scenario string   // the pattern's ScenarioDE
}
// axisStats accumulates agreement statistics for one risk axis: how far the
// engine value is from the ground-truth value over all recorded samples.
type axisStats struct {
	n         int     // number of samples recorded
	absErrSum float64 // sum of |engine - gt| over all samples
	exact     int     // samples where engine == gt
	within1   int     // samples where |engine - gt| <= 1 (includes exact hits)
}

// add records one engine/ground-truth pair on this axis.
func (a *axisStats) add(engine, gt int) {
	delta := math.Abs(float64(engine - gt))
	a.n++
	a.absErrSum += delta
	switch {
	case delta == 0:
		// An exact hit also counts as "within one".
		a.exact++
		a.within1++
	case delta <= 1:
		a.within1++
	}
}

// mae returns the mean absolute error over all samples, or 0 when no sample
// has been recorded.
func (a axisStats) mae() float64 {
	if a.n != 0 {
		return a.absErrSum / float64(a.n)
	}
	return 0
}

// pct returns x as a percentage of the sample count, or 0 when no sample has
// been recorded.
func (a axisStats) pct(x int) float64 {
	if a.n != 0 {
		return 100 * float64(x) / float64(a.n)
	}
	return 0
}
// kendallConcordance returns the fraction of comparable hazard pairs that the
// engine orders the same way the professional does (rank agreement, scale-
// invariant), together with the number of comparable pairs that fraction was
// computed over. 1.0 = identical ordering, 0.5 = random, 0.0 = inverted.
//
// engine and gt are parallel slices: index i in both refers to the same
// hazard. If their lengths differ, only the common prefix is compared, so a
// mismatch cannot panic with an out-of-range index. A pair tied on either side
// is not comparable and is skipped. When no comparable pair exists the result
// is (0, 0); check the pair count before interpreting the fraction.
func kendallConcordance(engine, gt []float64) (float64, int) {
	// Bound both loops by the shorter slice; indexing gt by len(engine) alone
	// would panic whenever gt is the shorter one.
	n := len(engine)
	if len(gt) < n {
		n = len(gt)
	}
	concordant, discordant := 0, 0
	for i := 0; i < n; i++ {
		for j := i + 1; j < n; j++ {
			de := engine[i] - engine[j]
			dg := gt[i] - gt[j]
			if de == 0 || dg == 0 {
				continue // tie on one side: not comparable
			}
			if (de > 0) == (dg > 0) {
				concordant++
			} else {
				discordant++
			}
		}
	}
	total := concordant + discordant
	if total == 0 {
		return 0, 0
	}
	return float64(concordant) / float64(total), total
}
// riskAgg collects the risk-benchmark results for one scope: a single
// ground-truth case, or the aggregate across all cases.
type riskAgg struct {
	// Engine pattern defaults compared with the GT value on the same axis.
	sev   axisStats // DefaultSeverity vs GT S
	freq  axisStats // DefaultExposure vs GT F
	avoid axisStats // DefaultAvoidability vs GT P

	// Estimator output compared with the GT value.
	wEst axisStats // EstimateProbabilityW vs GT W
	pEst axisStats // EstimateAvoidabilityP vs GT P

	// Matched hazards whose pattern carries no avoidability default (a == 0).
	noAvoidDefault int

	// Parallel slices with one entry per matched hazard, fed to the rank
	// concordance: old proxy, new proxy and the GT risk value R.
	engineRisk    []float64
	newEngineRisk []float64
	gtRisk        []float64

	matched int // matched pairs for which engine params were found
	noParam int // matched pairs with no engine params under the hazard's name
}
// TestGT_RiskCalibrationData logs, per contact mode, the professional's mean
// W and P vs our current estimate — the input for calibrating contactModeTable.
//
// This is a reporting test: it only logs and never fails on the numbers.
// Modes are logged in sorted order so that successive runs can be diffed (Go
// map iteration order is unspecified). GT entries whose W or P is unset (not
// > 0) are left out of the respective mean, matching the "> 0" guard that
// TestGT_RiskBenchmark applies before comparing against a GT value.
func TestGT_RiskCalibrationData(t *testing.T) {
	type acc struct {
		n              int // matched hazards seen for this mode
		nGTW, nGTP     int // hazards whose GT entry carries a W / P value (> 0)
		sumGTW, sumGTP int // sums of those GT W / P values
		estW, estP     int // our estimates, taken from the first hazard seen for the mode
	}
	// mean returns sum/n, or 0 for an empty sample instead of NaN.
	mean := func(sum, n int) float64 {
		if n == 0 {
			return 0
		}
		return float64(sum) / float64(n)
	}
	byMode := map[string]*acc{}
	for _, c := range gtBenchmarkCases {
		gtData, narrative, _ := readGTNarrative(t, c.path)
		if c.narrativeOverride != "" {
			narrative = c.narrativeOverride
		}
		pr := ParseNarrative(narrative, c.machineType)
		out := NewPatternEngine().Match(parseResultToMatchInput(pr, c.machineType))
		// Index the estimator inputs by normalized ScenarioDE, falling back to
		// PatternName when that is empty, so matched pairs can be looked up by
		// their engine hazard name below.
		byName := map[string]riskParams{}
		for _, pm := range out.MatchedPatterns {
			key := normalizeDE(pm.ScenarioDE)
			if key == "" {
				key = normalizeDE(pm.PatternName)
			}
			byName[key] = riskParams{cats: pm.HazardCats, scenario: pm.ScenarioDE}
		}
		hazards, mitigations := patternsToHazardsAndMitigations(out)
		res := CompareBenchmark(&gtData, hazards, mitigations)
		for _, mp := range res.MatchedPairs {
			rp, ok := byName[normalizeDE(mp.EngineHazard.Name)]
			if !ok {
				continue
			}
			mode := DetectContactMode(rp.cats, rp.scenario)
			if mode == "" {
				mode = "(none)"
			}
			a := byMode[mode]
			if a == nil {
				// NOTE(review): the estimate is recorded once per mode, from the
				// first hazard seen; this presumes the estimators depend only on
				// the contact mode — confirm against risk_estimation.go.
				a = &acc{estW: EstimateProbabilityW(rp.cats, rp.scenario), estP: EstimateAvoidabilityP(rp.cats, rp.scenario)}
				byMode[mode] = a
			}
			a.n++
			// An unset GT value must not drag the mean towards zero.
			if w := mp.GTEntry.RiskIn.W; w > 0 {
				a.nGTW++
				a.sumGTW += w
			}
			if p := mp.GTEntry.RiskIn.P; p > 0 {
				a.nGTP++
				a.sumGTP += p
			}
		}
	}
	// Sort the modes: ranging over the map directly would shuffle the report
	// rows from run to run.
	modes := make([]string, 0, len(byMode))
	for mode := range byMode {
		modes = append(modes, mode)
	}
	sort.Strings(modes)
	t.Logf("=== Per-contact-mode calibration data (GT mean vs our tier) ===")
	t.Logf(" %-18s %4s | %7s %7s | %7s %7s", "mode", "n", "estW", "gtW̄", "estP", "gtP̄")
	for _, mode := range modes {
		a := byMode[mode]
		t.Logf(" %-18s %4d | %7d %7.1f | %7d %7.1f",
			mode, a.n, a.estW, mean(a.sumGTW, a.nGTW), a.estP, mean(a.sumGTP, a.nGTP))
	}
}
// TestGT_RiskBenchmark compares, for every ground-truth case, the engine's
// risk parameters and W/P estimates with the professional's values and logs
// agreement statistics per case and across all cases. It is a reporting test:
// it logs the numbers and makes no assertions on them.
func TestGT_RiskBenchmark(t *testing.T) {
	var total riskAgg
	for _, tc := range gtBenchmarkCases {
		gtData, text, _ := readGTNarrative(t, tc.path)
		if tc.narrativeOverride != "" {
			text = tc.narrativeOverride
		}
		parsed := ParseNarrative(text, tc.machineType)
		matchOut := NewPatternEngine().Match(parseResultToMatchInput(parsed, tc.machineType))

		// Engine risk params keyed by the hazard name the matcher will see
		// (patternsToHazardsAndMitigations sets Hazard.Name = ScenarioDE, else PatternName).
		paramsByName := map[string]riskParams{}
		for _, pat := range matchOut.MatchedPatterns {
			name := normalizeDE(pat.ScenarioDE)
			if name == "" {
				name = normalizeDE(pat.PatternName)
			}
			paramsByName[name] = riskParams{
				s:        pat.DefaultSeverity,
				f:        pat.DefaultExposure,
				a:        pat.DefaultAvoidability,
				cats:     pat.HazardCats,
				scenario: pat.ScenarioDE,
			}
		}

		hazards, mitigations := patternsToHazardsAndMitigations(matchOut)
		result := CompareBenchmark(&gtData, hazards, mitigations)

		// Every observation is recorded twice: once for this case and once for
		// the cross-GT aggregate.
		var perCase riskAgg
		sinks := [2]*riskAgg{&perCase, &total}

		for _, pair := range result.MatchedPairs {
			params, found := paramsByName[normalizeDE(pair.EngineHazard.Name)]
			if !found {
				for _, agg := range sinks {
					agg.noParam++
				}
				continue
			}
			gt := pair.GTEntry.RiskIn

			// Data-anchored estimates for the two axes without a usable pattern default.
			estW := EstimateProbabilityW(params.cats, params.scenario)
			estP := EstimateAvoidabilityP(params.cats, params.scenario)

			// Two risk proxies for RANK comparison (our own aggregates, NOT a
			// norm formula): OLD = today's engine (severity x exposure, with
			// avoidability mostly unset); NEW = severity scaled by summed
			// likelihood factors incl. the estimated W and P.
			sev := maxInt(params.s, 1)
			oldProxy := float64(sev * maxInt(params.f, 1) * maxInt(params.a, 1))
			newProxy := float64(sev * (maxInt(params.f, 1) + estW + estP))

			for _, agg := range sinks {
				agg.matched++
				// An axis is only compared when both sides carry a value.
				if params.s > 0 && gt.S > 0 {
					agg.sev.add(params.s, gt.S)
				}
				if params.f > 0 && gt.F > 0 {
					agg.freq.add(params.f, gt.F)
				}
				if params.a > 0 && gt.P > 0 {
					agg.avoid.add(params.a, gt.P)
				}
				if params.a == 0 {
					agg.noAvoidDefault++
				}
				if gt.W > 0 {
					agg.wEst.add(estW, gt.W)
				}
				if gt.P > 0 {
					agg.pEst.add(estP, gt.P)
				}
				agg.engineRisk = append(agg.engineRisk, oldProxy)
				agg.newEngineRisk = append(agg.newEngineRisk, newProxy)
				agg.gtRisk = append(agg.gtRisk, float64(gt.R))
			}
		}

		caseOld, _ := kendallConcordance(perCase.engineRisk, perCase.gtRisk)
		caseNew, casePairs := kendallConcordance(perCase.newEngineRisk, perCase.gtRisk)
		t.Logf("=== %s — Risk benchmark ===", tc.name)
		t.Logf(" Matched hazards w/ engine params: %d (%d pairs had no pattern param)", perCase.matched, perCase.noParam)
		t.Logf(" Severity S: MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", perCase.sev.mae(), perCase.sev.pct(perCase.sev.within1), perCase.sev.pct(perCase.sev.exact), perCase.sev.n)
		t.Logf(" Frequency F: MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", perCase.freq.mae(), perCase.freq.pct(perCase.freq.within1), perCase.freq.pct(perCase.freq.exact), perCase.freq.n)
		t.Logf(" Probability W (NEW estimate): MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", perCase.wEst.mae(), perCase.wEst.pct(perCase.wEst.within1), perCase.wEst.pct(perCase.wEst.exact), perCase.wEst.n)
		t.Logf(" Avoidance P (NEW estimate): MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", perCase.pEst.mae(), perCase.pEst.pct(perCase.pEst.within1), perCase.pEst.pct(perCase.pEst.exact), perCase.pEst.n)
		t.Logf(" Risk RANK concordance: OLD %.1f%% -> NEW %.1f%% (over %d comparable pairs)", caseOld*100, caseNew*100, casePairs)
	}

	allOld, _ := kendallConcordance(total.engineRisk, total.gtRisk)
	allNew, allPairs := kendallConcordance(total.newEngineRisk, total.gtRisk)
	t.Logf("\n=== Cross-GT aggregate ===")
	t.Logf(" Severity S: MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", total.sev.mae(), total.sev.pct(total.sev.within1), total.sev.pct(total.sev.exact), total.sev.n)
	t.Logf(" Frequency F: MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", total.freq.mae(), total.freq.pct(total.freq.within1), total.freq.pct(total.freq.exact), total.freq.n)
	t.Logf(" Probability W (NEW): MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", total.wEst.mae(), total.wEst.pct(total.wEst.within1), total.wEst.pct(total.wEst.exact), total.wEst.n)
	t.Logf(" Avoidance P (NEW): MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", total.pEst.mae(), total.pEst.pct(total.pEst.within1), total.pEst.pct(total.pEst.exact), total.pEst.n)
	t.Logf(" Risk RANK concordance: OLD %.1f%% -> NEW %.1f%% (%d pairs)", allOld*100, allNew*100, allPairs)
}