package iace import ( "math" "testing" ) // ============================================================================ // Risk benchmark: engine risk parameters vs. the professional's (Fachmann) GT. // // The risk numbers have never been validated. This test measures — for the // first time — how far the engine's per-pattern risk defaults are from the // professional's EN-62061-style assessment in the ground truth, for every // matched hazard across both GTs. // // COPYRIGHT NOTE: this test only COMPARES numbers (our defaults vs the GT's // values) and computes agreement statistics. It does NOT reproduce any DIN/ // Beuth/ISO risk-graph table, parameter decision tree, or normative formula. // The GT values are the professional's assessment of a specific machine, not // the standard's text. Any future estimator must likewise derive parameters // from OUR own model + PUBLIC accident data (ESAW/DGUV), never from a // transcribed norm table. // // Parameter mapping (engine default -> GT column, EN-62061 naming): // // DefaultSeverity <-> GT.S (Se, severity) // DefaultExposure <-> GT.F (Fr, frequency / duration of exposure) // DefaultAvoidability <-> GT.P (Av, possibility of avoidance) // (none) <-> GT.W (Pr, probability of occurrence) <-- the gap // // Run with: // // go test -v -vet=off -run TestGT_RiskBenchmark ./internal/iace/ // ============================================================================ type riskParams struct { s, f, a int // severity, frequency/exposure, avoidability (engine defaults) cats []string scenario string } type axisStats struct { n int absErrSum float64 exact int within1 int } func (a *axisStats) add(engine, gt int) { a.n++ d := math.Abs(float64(engine - gt)) a.absErrSum += d if d == 0 { a.exact++ } if d <= 1 { a.within1++ } } func (a axisStats) mae() float64 { if a.n == 0 { return 0 } return a.absErrSum / float64(a.n) } func (a axisStats) pct(x int) float64 { if a.n == 0 { return 0 } return 100 * float64(x) / float64(a.n) } // kendallConcordance returns the fraction of comparable hazard pairs that the // engine orders the same way the professional does (rank agreement, scale- // invariant). 1.0 = identical ordering, 0.5 = random, 0.0 = inverted. func kendallConcordance(engine, gt []float64) (float64, int) { concordant, discordant := 0, 0 for i := 0; i < len(engine); i++ { for j := i + 1; j < len(engine); j++ { de := engine[i] - engine[j] dg := gt[i] - gt[j] if de == 0 || dg == 0 { continue // tie on one side: not comparable } if (de > 0) == (dg > 0) { concordant++ } else { discordant++ } } } total := concordant + discordant if total == 0 { return 0, 0 } return float64(concordant) / float64(total), total } type riskAgg struct { sev, freq, avoid axisStats wEst, pEst, sevEst axisStats noAvoidDefault int engineRisk []float64 newEngineRisk []float64 fkRisk []float64 gtRisk []float64 matched int noParam int } // TestGT_RiskCalibrationData logs, per contact mode, the professional's mean // W and P vs our current estimate — the input for calibrating contactModeTable. func TestGT_RiskCalibrationData(t *testing.T) { type acc struct { n int sumGTW, sumGTP int sumEngS, sumGTS int estW, estP int } byMode := map[string]*acc{} for _, c := range gtBenchmarkCases { gtData, narrative, _ := readGTNarrative(t, c.path) if c.narrativeOverride != "" { narrative = c.narrativeOverride } pr := ParseNarrative(narrative, c.machineType) out := NewPatternEngine().Match(parseResultToMatchInput(pr, c.machineType)) byName := map[string]riskParams{} for _, pm := range out.MatchedPatterns { key := normalizeDE(pm.ScenarioDE) if key == "" { key = normalizeDE(pm.PatternName) } byName[key] = riskParams{s: pm.DefaultSeverity, cats: pm.HazardCats, scenario: pm.ScenarioDE} } hazards, mitigations := patternsToHazardsAndMitigations(out) res := CompareBenchmark(>Data, hazards, mitigations) for _, mp := range res.MatchedPairs { rp, ok := byName[normalizeDE(mp.EngineHazard.Name)] if !ok { continue } mode := DetectContactMode(rp.cats, rp.scenario) if mode == "" { mode = "(none)" } a := byMode[mode] if a == nil { a = &acc{estW: EstimateProbabilityW(rp.cats, rp.scenario), estP: EstimateAvoidabilityP(rp.cats, rp.scenario)} byMode[mode] = a } a.n++ a.sumGTW += mp.GTEntry.RiskIn.W a.sumGTP += mp.GTEntry.RiskIn.P a.sumEngS += rp.s a.sumGTS += mp.GTEntry.RiskIn.S } } t.Logf("=== Per-contact-mode calibration data (engine vs GT mean) ===") t.Logf(" %-18s %4s | %5s %5s | %5s %5s | %6s %6s", "mode", "n", "estW", "gtW̄", "estP", "gtP̄", "engS̄", "gtS̄") for mode, a := range byMode { t.Logf(" %-18s %4d | %5d %5.1f | %5d %5.1f | %6.1f %6.1f", mode, a.n, a.estW, float64(a.sumGTW)/float64(a.n), a.estP, float64(a.sumGTP)/float64(a.n), float64(a.sumEngS)/float64(a.n), float64(a.sumGTS)/float64(a.n)) } } // TestGT_RiskComparison_CrossGT runs the EXACT production risk comparison // (ComputeRiskComparison) on BOTH ground truths, so any estimator change is // validated generically across two different machines (robot cell + lift), // not tuned to one. func TestGT_RiskComparison_CrossGT(t *testing.T) { for _, c := range gtBenchmarkCases { gtData, narrative, _ := readGTNarrative(t, c.path) if c.narrativeOverride != "" { narrative = c.narrativeOverride } pr := ParseNarrative(narrative, c.machineType) out := NewPatternEngine().Match(parseResultToMatchInput(pr, c.machineType)) hazards, mitigations := patternsToHazardsAndMitigations(out) res := CompareBenchmark(>Data, hazards, mitigations) _, agg := ComputeRiskComparison(res.MatchedPairs) t.Logf("=== %s — ComputeRiskComparison (production) ===", c.name) t.Logf(" n=%d | S±1 %.0f%% | F±1 %.0f%% | W±1 %.0f%% | P±1 %.0f%% | Ranking %.0f%%", agg.N, agg.SeverityWithin1, agg.FrequencyWithin1, agg.ProbabilityWithin1, agg.AvoidanceWithin1, agg.RankConcordance) } } func TestGT_RiskBenchmark(t *testing.T) { overall := riskAgg{} for _, c := range gtBenchmarkCases { gtData, narrative, _ := readGTNarrative(t, c.path) if c.narrativeOverride != "" { narrative = c.narrativeOverride } pr := ParseNarrative(narrative, c.machineType) out := NewPatternEngine().Match(parseResultToMatchInput(pr, c.machineType)) // Index engine risk params by the hazard name the matcher will see // (patternsToHazardsAndMitigations sets Hazard.Name = ScenarioDE, else PatternName). byName := map[string]riskParams{} for _, pm := range out.MatchedPatterns { key := normalizeDE(pm.ScenarioDE) if key == "" { key = normalizeDE(pm.PatternName) } byName[key] = riskParams{s: pm.DefaultSeverity, f: pm.DefaultExposure, a: pm.DefaultAvoidability, cats: pm.HazardCats, scenario: pm.ScenarioDE} } hazards, mitigations := patternsToHazardsAndMitigations(out) res := CompareBenchmark(>Data, hazards, mitigations) local := riskAgg{} for _, mp := range res.MatchedPairs { rp, ok := byName[normalizeDE(mp.EngineHazard.Name)] if !ok { local.noParam++ overall.noParam++ continue } gtR := mp.GTEntry.RiskIn local.matched++ overall.matched++ if rp.s > 0 && gtR.S > 0 { local.sev.add(rp.s, gtR.S) overall.sev.add(rp.s, gtR.S) } if rp.f > 0 && gtR.F > 0 { local.freq.add(rp.f, gtR.F) overall.freq.add(rp.f, gtR.F) } if rp.a > 0 && gtR.P > 0 { local.avoid.add(rp.a, gtR.P) overall.avoid.add(rp.a, gtR.P) } if rp.a == 0 { local.noAvoidDefault++ overall.noAvoidDefault++ } // NEW: data-anchored estimates for the three axes the engine got // wrong (W missing, P missing, S systematically over-estimated). estW := EstimateProbabilityW(rp.cats, rp.scenario) estP := EstimateAvoidabilityP(rp.cats, rp.scenario) estS := EstimateSeverity(rp.cats, rp.scenario, rp.s) if gtR.W > 0 { local.wEst.add(estW, gtR.W) overall.wEst.add(estW, gtR.W) } if gtR.P > 0 { local.pEst.add(estP, gtR.P) overall.pEst.add(estP, gtR.P) } if gtR.S > 0 { local.sevEst.add(estS, gtR.S) overall.sevEst.add(estS, gtR.S) } // Two risk proxies for RANK comparison (our own aggregates, NOT a // norm formula): OLD = today's engine (raw severity x exposure); // NEW = de-biased severity scaled by summed likelihood incl. W + P. oldProxy := float64(maxInt(rp.s, 1) * maxInt(rp.f, 1) * maxInt(rp.a, 1)) newProxy := float64(maxInt(estS, 1) * (maxInt(rp.f, 1) + estW + estP)) // Fine-Kinney score (our citable backbone) for rank comparison. fk := SuggestFineKinney(rp.cats, rp.scenario, pr.LifecyclePhases, rp.s) local.engineRisk = append(local.engineRisk, oldProxy) local.newEngineRisk = append(local.newEngineRisk, newProxy) local.fkRisk = append(local.fkRisk, fk.Score) local.gtRisk = append(local.gtRisk, float64(gtR.R)) overall.engineRisk = append(overall.engineRisk, oldProxy) overall.newEngineRisk = append(overall.newEngineRisk, newProxy) overall.fkRisk = append(overall.fkRisk, fk.Score) overall.gtRisk = append(overall.gtRisk, float64(gtR.R)) } oldConc, _ := kendallConcordance(local.engineRisk, local.gtRisk) newConc, pairs := kendallConcordance(local.newEngineRisk, local.gtRisk) t.Logf("=== %s — Risk benchmark ===", c.name) t.Logf(" Matched hazards w/ engine params: %d (%d pairs had no pattern param)", local.matched, local.noParam) t.Logf(" Severity S (raw default): MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", local.sev.mae(), local.sev.pct(local.sev.within1), local.sev.pct(local.sev.exact), local.sev.n) t.Logf(" Severity S (NEW estimate): MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", local.sevEst.mae(), local.sevEst.pct(local.sevEst.within1), local.sevEst.pct(local.sevEst.exact), local.sevEst.n) t.Logf(" Frequency F: MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", local.freq.mae(), local.freq.pct(local.freq.within1), local.freq.pct(local.freq.exact), local.freq.n) t.Logf(" Probability W (NEW estimate): MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", local.wEst.mae(), local.wEst.pct(local.wEst.within1), local.wEst.pct(local.wEst.exact), local.wEst.n) t.Logf(" Avoidance P (NEW estimate): MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", local.pEst.mae(), local.pEst.pct(local.pEst.within1), local.pEst.pct(local.pEst.exact), local.pEst.n) fkConc, _ := kendallConcordance(local.fkRisk, local.gtRisk) t.Logf(" Risk RANK concordance: OLD %.1f%% -> NEW %.1f%% | Fine-Kinney %.1f%% (over %d pairs)", oldConc*100, newConc*100, fkConc*100, pairs) } oldConc, _ := kendallConcordance(overall.engineRisk, overall.gtRisk) newConc, pairs := kendallConcordance(overall.newEngineRisk, overall.gtRisk) t.Logf("\n=== Cross-GT aggregate ===") t.Logf(" Severity S (raw default): MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", overall.sev.mae(), overall.sev.pct(overall.sev.within1), overall.sev.pct(overall.sev.exact), overall.sev.n) t.Logf(" Severity S (NEW estimate): MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", overall.sevEst.mae(), overall.sevEst.pct(overall.sevEst.within1), overall.sevEst.pct(overall.sevEst.exact), overall.sevEst.n) t.Logf(" Frequency F: MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", overall.freq.mae(), overall.freq.pct(overall.freq.within1), overall.freq.pct(overall.freq.exact), overall.freq.n) t.Logf(" Probability W (NEW): MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", overall.wEst.mae(), overall.wEst.pct(overall.wEst.within1), overall.wEst.pct(overall.wEst.exact), overall.wEst.n) t.Logf(" Avoidance P (NEW): MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)", overall.pEst.mae(), overall.pEst.pct(overall.pEst.within1), overall.pEst.pct(overall.pEst.exact), overall.pEst.n) fkConc, _ := kendallConcordance(overall.fkRisk, overall.gtRisk) t.Logf(" Risk RANK concordance: OLD %.1f%% -> NEW %.1f%% | Fine-Kinney %.1f%% (%d pairs)", oldConc*100, newConc*100, fkConc*100, pairs) }