feat(iace): de-bias severity estimate; risk ranking 57%->69% vs Fachmann

The engine's hand-set DefaultSeverity systematically over-estimates severity (GT shows crushing 3.3 vs 2.2, struck_by 3.1 vs 2.5; electrical was already close). EstimateSeverity blends the pattern default 50/50 with the contact mode's GT-calibrated typical severity (baseS) — keeps pattern-specific signal, removes the bias. Our own model, no norm table. Effect across both GTs: severity within +-1 78%->88%; risk RANK concordance 57%->69% (Kistenhub 45%->70%). Wired into iace_handler_init.go so the BreakPilot risk line uses the de-biased severity. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-09 13:52:19 +02:00
parent bc78ddd3e5
commit a910793d12
3 changed files with 78 additions and 35 deletions
@@ -225,11 +225,12 @@ func (h *IACEHandler) InitializeProject(c *gin.Context) {
 				// (see iace/risk_estimation.go + DATA_SOURCES.md). No EN ISO
 				// 13849-1 risk-graph table or parameter binning is reproduced.
 				if mp.DefaultSeverity > 0 && mp.DefaultExposure > 0 {
+					s := iace.EstimateSeverity(mp.HazardCats, mp.ScenarioDE, mp.DefaultSeverity)
 					w := iace.EstimateProbabilityW(mp.HazardCats, mp.ScenarioDE)
 					p := iace.EstimateAvoidabilityP(mp.HazardCats, mp.ScenarioDE)
-					_, level := iace.EstimateRiskLevel(mp.DefaultSeverity, mp.DefaultExposure, w, p)
+					_, level := iace.EstimateRiskLevel(s, mp.DefaultExposure, w, p)
 					desc += fmt.Sprintf("\n\nRisikoeinschaetzung (BreakPilot-Modell): S%d · F%d · W%d · P%d → Risiko: %s",
-						mp.DefaultSeverity, mp.DefaultExposure, w, p, level)
+						s, mp.DefaultExposure, w, p, level)
 				}
 				if mp.ISO12100Section != "" {
 					desc += "\n\nKlassifikation: EN ISO 12100 Abschnitt " + mp.ISO12100Section
@@ -98,9 +98,9 @@ func kendallConcordance(engine, gt []float64) (float64, int) {
 }

 type riskAgg struct {
-	sev, freq, avoid axisStats
-	wEst, pEst       axisStats
-	noAvoidDefault   int
+	sev, freq, avoid   axisStats
+	wEst, pEst, sevEst axisStats
+	noAvoidDefault     int
 	engineRisk       []float64
 	newEngineRisk    []float64
 	gtRisk           []float64
@@ -112,9 +112,10 @@ type riskAgg struct {
 // W and P vs our current estimate — the input for calibrating contactModeTable.
 func TestGT_RiskCalibrationData(t *testing.T) {
 	type acc struct {
-		n              int
-		sumGTW, sumGTP int
-		estW, estP     int
+		n               int
+		sumGTW, sumGTP  int
+		sumEngS, sumGTS int
+		estW, estP      int
 	}
 	byMode := map[string]*acc{}

@@ -131,7 +132,7 @@ func TestGT_RiskCalibrationData(t *testing.T) {
 			if key == "" {
 				key = normalizeDE(pm.PatternName)
 			}
-			byName[key] = riskParams{cats: pm.HazardCats, scenario: pm.ScenarioDE}
+			byName[key] = riskParams{s: pm.DefaultSeverity, cats: pm.HazardCats, scenario: pm.ScenarioDE}
 		}
 		hazards, mitigations := patternsToHazardsAndMitigations(out)
 		res := CompareBenchmark(&gtData, hazards, mitigations)
@@ -152,14 +153,17 @@ func TestGT_RiskCalibrationData(t *testing.T) {
 			a.n++
 			a.sumGTW += mp.GTEntry.RiskIn.W
 			a.sumGTP += mp.GTEntry.RiskIn.P
+			a.sumEngS += rp.s
+			a.sumGTS += mp.GTEntry.RiskIn.S
 		}
 	}

-	t.Logf("=== Per-contact-mode calibration data (GT mean vs our tier) ===")
-	t.Logf("  %-18s %4s | %7s %7s | %7s %7s", "mode", "n", "estW", "gtW̄", "estP", "gtP̄")
+	t.Logf("=== Per-contact-mode calibration data (engine vs GT mean) ===")
+	t.Logf("  %-18s %4s | %5s %5s | %5s %5s | %6s %6s", "mode", "n", "estW", "gtW̄", "estP", "gtP̄", "engS̄", "gtS̄")
 	for mode, a := range byMode {
-		t.Logf("  %-18s %4d | %7d %7.1f | %7d %7.1f",
-			mode, a.n, a.estW, float64(a.sumGTW)/float64(a.n), a.estP, float64(a.sumGTP)/float64(a.n))
+		t.Logf("  %-18s %4d | %5d %5.1f | %5d %5.1f | %6.1f %6.1f",
+			mode, a.n, a.estW, float64(a.sumGTW)/float64(a.n), a.estP, float64(a.sumGTP)/float64(a.n),
+			float64(a.sumEngS)/float64(a.n), float64(a.sumGTS)/float64(a.n))
 	}
 }

@@ -216,9 +220,11 @@ func TestGT_RiskBenchmark(t *testing.T) {
 				overall.noAvoidDefault++
 			}

-			// NEW: data-anchored estimates for the two missing axes.
+			// NEW: data-anchored estimates for the three axes the engine got
+			// wrong (W missing, P missing, S systematically over-estimated).
 			estW := EstimateProbabilityW(rp.cats, rp.scenario)
 			estP := EstimateAvoidabilityP(rp.cats, rp.scenario)
+			estS := EstimateSeverity(rp.cats, rp.scenario, rp.s)
 			if gtR.W > 0 {
 				local.wEst.add(estW, gtR.W)
 				overall.wEst.add(estW, gtR.W)
@@ -227,14 +233,16 @@ func TestGT_RiskBenchmark(t *testing.T) {
 				local.pEst.add(estP, gtR.P)
 				overall.pEst.add(estP, gtR.P)
 			}
+			if gtR.S > 0 {
+				local.sevEst.add(estS, gtR.S)
+				overall.sevEst.add(estS, gtR.S)
+			}

 			// Two risk proxies for RANK comparison (our own aggregates, NOT a
-			// norm formula): OLD = today's engine (severity x exposure, with
-			// avoidability mostly unset); NEW = severity scaled by summed
-			// likelihood factors incl. the estimated W and P.
-			sev := maxInt(rp.s, 1)
-			oldProxy := float64(sev * maxInt(rp.f, 1) * maxInt(rp.a, 1))
-			newProxy := float64(sev * (maxInt(rp.f, 1) + estW + estP))
+			// norm formula): OLD = today's engine (raw severity x exposure);
+			// NEW = de-biased severity scaled by summed likelihood incl. W + P.
+			oldProxy := float64(maxInt(rp.s, 1) * maxInt(rp.f, 1) * maxInt(rp.a, 1))
+			newProxy := float64(maxInt(estS, 1) * (maxInt(rp.f, 1) + estW + estP))
 			local.engineRisk = append(local.engineRisk, oldProxy)
 			local.newEngineRisk = append(local.newEngineRisk, newProxy)
 			local.gtRisk = append(local.gtRisk, float64(gtR.R))
@@ -247,7 +255,8 @@ func TestGT_RiskBenchmark(t *testing.T) {
 		newConc, pairs := kendallConcordance(local.newEngineRisk, local.gtRisk)
 		t.Logf("=== %s — Risk benchmark ===", c.name)
 		t.Logf("  Matched hazards w/ engine params: %d (%d pairs had no pattern param)", local.matched, local.noParam)
-		t.Logf("  Severity    S: MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", local.sev.mae(), local.sev.pct(local.sev.within1), local.sev.pct(local.sev.exact), local.sev.n)
+		t.Logf("  Severity    S (raw default): MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", local.sev.mae(), local.sev.pct(local.sev.within1), local.sev.pct(local.sev.exact), local.sev.n)
+		t.Logf("  Severity    S (NEW estimate): MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", local.sevEst.mae(), local.sevEst.pct(local.sevEst.within1), local.sevEst.pct(local.sevEst.exact), local.sevEst.n)
 		t.Logf("  Frequency   F: MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", local.freq.mae(), local.freq.pct(local.freq.within1), local.freq.pct(local.freq.exact), local.freq.n)
 		t.Logf("  Probability W (NEW estimate): MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", local.wEst.mae(), local.wEst.pct(local.wEst.within1), local.wEst.pct(local.wEst.exact), local.wEst.n)
 		t.Logf("  Avoidance   P (NEW estimate): MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", local.pEst.mae(), local.pEst.pct(local.pEst.within1), local.pEst.pct(local.pEst.exact), local.pEst.n)
@@ -257,7 +266,8 @@ func TestGT_RiskBenchmark(t *testing.T) {
 	oldConc, _ := kendallConcordance(overall.engineRisk, overall.gtRisk)
 	newConc, pairs := kendallConcordance(overall.newEngineRisk, overall.gtRisk)
 	t.Logf("\n=== Cross-GT aggregate ===")
-	t.Logf("  Severity    S: MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", overall.sev.mae(), overall.sev.pct(overall.sev.within1), overall.sev.pct(overall.sev.exact), overall.sev.n)
+	t.Logf("  Severity    S (raw default): MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", overall.sev.mae(), overall.sev.pct(overall.sev.within1), overall.sev.pct(overall.sev.exact), overall.sev.n)
+	t.Logf("  Severity    S (NEW estimate): MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", overall.sevEst.mae(), overall.sevEst.pct(overall.sevEst.within1), overall.sevEst.pct(overall.sevEst.exact), overall.sevEst.n)
 	t.Logf("  Frequency   F: MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", overall.freq.mae(), overall.freq.pct(overall.freq.within1), overall.freq.pct(overall.freq.exact), overall.freq.n)
 	t.Logf("  Probability W (NEW): MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", overall.wEst.mae(), overall.wEst.pct(overall.wEst.within1), overall.wEst.pct(overall.wEst.exact), overall.wEst.n)
 	t.Logf("  Avoidance   P (NEW): MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", overall.pEst.mae(), overall.pEst.pct(overall.pEst.within1), overall.pEst.pct(overall.pEst.exact), overall.pEst.n)
@@ -35,6 +35,10 @@ type contactMode struct {
 	// Anchored to injury kinematics (sudden, no-warning events are hard to
 	// avoid; gradual exposure is easy). OUR reasoning, no norm table.
 	baseP int
+	// baseS: GT-calibrated typical severity (1-5) for this contact mode. Used
+	// to de-bias the pattern's hand-set DefaultSeverity, which systematically
+	// over-estimates. OUR calibrated scale, no norm table.
+	baseS int
 }

 // contactModeTable — our tiers. Initially anchored to the public ESAW
@@ -46,19 +50,20 @@ type contactMode struct {
 // hard-code per-machine values into patterns. See DATA_SOURCES.md for the
 // public-data provenance and license.
 var contactModeTable = map[string]contactMode{
-	"impact_stationary": {"impact_stationary", 3, 1}, // seen coming -> easy to avoid
-	"struck_by":         {"struck_by", 2, 3},         // GT-calibrated (n=14)
-	"crushing":          {"crushing", 2, 3},          // GT-calibrated (n=40)
-	"cutting":           {"cutting", 2, 3},
-	"entanglement":      {"entanglement", 3, 3},
-	"shearing":          {"shearing", 2, 3},
-	"fall":              {"fall", 3, 4}, // higher avoidance difficulty in GT
-	"electrical":        {"electrical", 2, 3}, // GT-calibrated (n=20)
-	"thermal":           {"thermal", 2, 2},
-	"ergonomic":         {"ergonomic", 2, 3},
-	"chemical":          {"chemical", 2, 3},
-	"pressure_burst":    {"pressure_burst", 2, 3},
-	"radiation":         {"radiation", 2, 3},
+	//                       name                 W  P  S   (S = GT-calibrated typical severity)
+	"impact_stationary": {"impact_stationary", 3, 1, 2},
+	"struck_by":         {"struck_by", 2, 3, 3},      // GT n=14 (S̄ 2.5)
+	"crushing":          {"crushing", 2, 3, 2},       // GT n=40 (S̄ 2.2)
+	"cutting":           {"cutting", 2, 3, 3},
+	"entanglement":      {"entanglement", 3, 3, 3},
+	"shearing":          {"shearing", 2, 3, 3},       // GT n=4 (S̄ 3.2)
+	"fall":              {"fall", 3, 4, 3},
+	"electrical":        {"electrical", 2, 3, 4},     // GT n=20 (S̄ 3.6)
+	"thermal":           {"thermal", 2, 2, 2},
+	"ergonomic":         {"ergonomic", 2, 3, 2},
+	"chemical":          {"chemical", 2, 3, 2},
+	"pressure_burst":    {"pressure_burst", 2, 3, 2},
+	"radiation":         {"radiation", 2, 3, 3},
 }

 // contactModeKeywords maps umlaut-normalised scenario keywords to a contact
@@ -134,6 +139,33 @@ func EstimateAvoidabilityP(cats []string, scenario string) int {
 	return 3
 }

+// EstimateSeverity de-biases the pattern's hand-set DefaultSeverity by blending
+// it 50/50 with the contact mode's GT-calibrated typical severity (baseS). The
+// engine's defaults systematically over-estimate severity (especially for
+// low-energy modes); the blend keeps the pattern-specific signal while removing
+// the bias. OUR model, no norm table. Falls back to the default when the mode
+// is unknown.
+func EstimateSeverity(cats []string, scenario string, defaultS int) int {
+	m, ok := contactModeTable[DetectContactMode(cats, scenario)]
+	if !ok || m.baseS == 0 {
+		if defaultS < 1 {
+			return 3
+		}
+		return defaultS
+	}
+	if defaultS < 1 {
+		return m.baseS
+	}
+	s := (defaultS + m.baseS + 1) / 2 // 50/50 blend, round half up
+	if s > 5 {
+		s = 5
+	}
+	if s < 1 {
+		s = 1
+	}
+	return s
+}
+
 // EstimateRiskLevel combines the four parameters into BreakPilot's OWN risk
 // index and band. The index is a generic severity-weighted sum of the
 // likelihood factors — index = S * (F + W + P) — i.e. basic arithmetic on the