feat(iace): de-bias severity estimate; risk ranking 57%->69% vs Fachmann
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / detect-changes (push) Successful in 8s
CI / sbom-scan (push) Has been skipped
CI / build-sha-integrity (push) Failing after 4s
CI / validate-canonical-controls (push) Successful in 11s
CI / loc-budget (push) Successful in 15s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Failing after 44s
CI / iace-gt-coverage (push) Successful in 22s
CI / test-python-backend (push) Has been skipped
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped

The engine's hand-set DefaultSeverity systematically over-estimates severity
(engine mean vs GT mean: crushing 3.3 vs 2.2, struck_by 3.1 vs 2.5; electrical
was already close). EstimateSeverity blends the pattern default 50/50 with the
contact mode's GT-calibrated typical severity (baseS): this keeps the
pattern-specific signal and removes the bias. Our own model, no norm table.

Effect across both GTs: severity within ±1 78%->88%; risk rank concordance
57%->69% (Kistenhub 45%->70%). Wired into iace_handler_init.go so the
BreakPilot risk line uses the de-biased severity.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-06-09 13:52:19 +02:00
parent bc78ddd3e5
commit a910793d12
3 changed files with 78 additions and 35 deletions
@@ -98,9 +98,9 @@ func kendallConcordance(engine, gt []float64) (float64, int) {
}
type riskAgg struct {
sev, freq, avoid axisStats
wEst, pEst axisStats
noAvoidDefault int
sev, freq, avoid axisStats
wEst, pEst, sevEst axisStats
noAvoidDefault int
engineRisk []float64
newEngineRisk []float64
gtRisk []float64
@@ -112,9 +112,10 @@ type riskAgg struct {
// W and P vs our current estimate — the input for calibrating contactModeTable.
func TestGT_RiskCalibrationData(t *testing.T) {
type acc struct {
n int
sumGTW, sumGTP int
estW, estP int
n int
sumGTW, sumGTP int
sumEngS, sumGTS int
estW, estP int
}
byMode := map[string]*acc{}
@@ -131,7 +132,7 @@ func TestGT_RiskCalibrationData(t *testing.T) {
if key == "" {
key = normalizeDE(pm.PatternName)
}
byName[key] = riskParams{cats: pm.HazardCats, scenario: pm.ScenarioDE}
byName[key] = riskParams{s: pm.DefaultSeverity, cats: pm.HazardCats, scenario: pm.ScenarioDE}
}
hazards, mitigations := patternsToHazardsAndMitigations(out)
res := CompareBenchmark(&gtData, hazards, mitigations)
@@ -152,14 +153,17 @@ func TestGT_RiskCalibrationData(t *testing.T) {
a.n++
a.sumGTW += mp.GTEntry.RiskIn.W
a.sumGTP += mp.GTEntry.RiskIn.P
a.sumEngS += rp.s
a.sumGTS += mp.GTEntry.RiskIn.S
}
}
t.Logf("=== Per-contact-mode calibration data (GT mean vs our tier) ===")
t.Logf(" %-18s %4s | %7s %7s | %7s %7s", "mode", "n", "estW", "gtW̄", "estP", "gtP̄")
t.Logf("=== Per-contact-mode calibration data (engine vs GT mean) ===")
t.Logf(" %-18s %4s | %5s %5s | %5s %5s | %6s %6s", "mode", "n", "estW", "gtW̄", "estP", "gtP̄", "engS̄", "gtS̄")
for mode, a := range byMode {
t.Logf(" %-18s %4d | %7d %7.1f | %7d %7.1f",
mode, a.n, a.estW, float64(a.sumGTW)/float64(a.n), a.estP, float64(a.sumGTP)/float64(a.n))
t.Logf(" %-18s %4d | %5d %5.1f | %5d %5.1f | %6.1f %6.1f",
mode, a.n, a.estW, float64(a.sumGTW)/float64(a.n), a.estP, float64(a.sumGTP)/float64(a.n),
float64(a.sumEngS)/float64(a.n), float64(a.sumGTS)/float64(a.n))
}
}
@@ -216,9 +220,11 @@ func TestGT_RiskBenchmark(t *testing.T) {
overall.noAvoidDefault++
}
// NEW: data-anchored estimates for the two missing axes.
// NEW: data-anchored estimates for the three axes the engine got
// wrong (W missing, P missing, S systematically over-estimated).
estW := EstimateProbabilityW(rp.cats, rp.scenario)
estP := EstimateAvoidabilityP(rp.cats, rp.scenario)
estS := EstimateSeverity(rp.cats, rp.scenario, rp.s)
if gtR.W > 0 {
local.wEst.add(estW, gtR.W)
overall.wEst.add(estW, gtR.W)
@@ -227,14 +233,16 @@ func TestGT_RiskBenchmark(t *testing.T) {
local.pEst.add(estP, gtR.P)
overall.pEst.add(estP, gtR.P)
}
if gtR.S > 0 {
local.sevEst.add(estS, gtR.S)
overall.sevEst.add(estS, gtR.S)
}
// Two risk proxies for RANK comparison (our own aggregates, NOT a
// norm formula): OLD = today's engine (severity x exposure, with
// avoidability mostly unset); NEW = severity scaled by summed
// likelihood factors incl. the estimated W and P.
sev := maxInt(rp.s, 1)
oldProxy := float64(sev * maxInt(rp.f, 1) * maxInt(rp.a, 1))
newProxy := float64(sev * (maxInt(rp.f, 1) + estW + estP))
// norm formula): OLD = today's engine (raw severity x exposure);
// NEW = de-biased severity scaled by summed likelihood incl. W + P.
oldProxy := float64(maxInt(rp.s, 1) * maxInt(rp.f, 1) * maxInt(rp.a, 1))
newProxy := float64(maxInt(estS, 1) * (maxInt(rp.f, 1) + estW + estP))
local.engineRisk = append(local.engineRisk, oldProxy)
local.newEngineRisk = append(local.newEngineRisk, newProxy)
local.gtRisk = append(local.gtRisk, float64(gtR.R))
@@ -247,7 +255,8 @@ func TestGT_RiskBenchmark(t *testing.T) {
newConc, pairs := kendallConcordance(local.newEngineRisk, local.gtRisk)
t.Logf("=== %s — Risk benchmark ===", c.name)
t.Logf(" Matched hazards w/ engine params: %d (%d pairs had no pattern param)", local.matched, local.noParam)
t.Logf(" Severity S: MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", local.sev.mae(), local.sev.pct(local.sev.within1), local.sev.pct(local.sev.exact), local.sev.n)
t.Logf(" Severity S (raw default): MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", local.sev.mae(), local.sev.pct(local.sev.within1), local.sev.pct(local.sev.exact), local.sev.n)
t.Logf(" Severity S (NEW estimate): MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", local.sevEst.mae(), local.sevEst.pct(local.sevEst.within1), local.sevEst.pct(local.sevEst.exact), local.sevEst.n)
t.Logf(" Frequency F: MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", local.freq.mae(), local.freq.pct(local.freq.within1), local.freq.pct(local.freq.exact), local.freq.n)
t.Logf(" Probability W (NEW estimate): MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", local.wEst.mae(), local.wEst.pct(local.wEst.within1), local.wEst.pct(local.wEst.exact), local.wEst.n)
t.Logf(" Avoidance P (NEW estimate): MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", local.pEst.mae(), local.pEst.pct(local.pEst.within1), local.pEst.pct(local.pEst.exact), local.pEst.n)
@@ -257,7 +266,8 @@ func TestGT_RiskBenchmark(t *testing.T) {
oldConc, _ := kendallConcordance(overall.engineRisk, overall.gtRisk)
newConc, pairs := kendallConcordance(overall.newEngineRisk, overall.gtRisk)
t.Logf("\n=== Cross-GT aggregate ===")
t.Logf(" Severity S: MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", overall.sev.mae(), overall.sev.pct(overall.sev.within1), overall.sev.pct(overall.sev.exact), overall.sev.n)
t.Logf(" Severity S (raw default): MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", overall.sev.mae(), overall.sev.pct(overall.sev.within1), overall.sev.pct(overall.sev.exact), overall.sev.n)
t.Logf(" Severity S (NEW estimate): MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", overall.sevEst.mae(), overall.sevEst.pct(overall.sevEst.within1), overall.sevEst.pct(overall.sevEst.exact), overall.sevEst.n)
t.Logf(" Frequency F: MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", overall.freq.mae(), overall.freq.pct(overall.freq.within1), overall.freq.pct(overall.freq.exact), overall.freq.n)
t.Logf(" Probability W (NEW): MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", overall.wEst.mae(), overall.wEst.pct(overall.wEst.within1), overall.wEst.pct(overall.wEst.exact), overall.wEst.n)
t.Logf(" Avoidance P (NEW): MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", overall.pEst.mae(), overall.pEst.pct(overall.pEst.within1), overall.pEst.pct(overall.pEst.exact), overall.pEst.n)