breakpilot-compliance/ai-compliance-sdk/internal/iace/gt_risk_benchmark_test.go

package iace

import (
	"math"
	"sort"
	"testing"
)

// ============================================================================
// Risk benchmark: engine risk parameters vs. the professional's (Fachmann) GT.
//
// The risk numbers have never been validated. This test measures — for the
// first time — how far the engine's per-pattern risk defaults are from the
// professional's EN-62061-style assessment in the ground truth, for every
// matched hazard across both GTs.
//
// COPYRIGHT NOTE: this test only COMPARES numbers (our defaults vs the GT's
// values) and computes agreement statistics. It does NOT reproduce any DIN/
// Beuth/ISO risk-graph table, parameter decision tree, or normative formula.
// The GT values are the professional's assessment of a specific machine, not
// the standard's text. Any future estimator must likewise derive parameters
// from OUR own model + PUBLIC accident data (ESAW/DGUV), never from a
// transcribed norm table.
//
// Parameter mapping (engine default -> GT column, EN-62061 naming):
//
//	DefaultSeverity     <-> GT.S  (Se,  severity)
//	DefaultExposure     <-> GT.F  (Fr,  frequency / duration of exposure)
//	DefaultAvoidability <-> GT.P  (Av,  possibility of avoidance)
//	(none)              <-> GT.W  (Pr,  probability of occurrence)  <-- the gap
//
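// Run with:
//
//	go test -v -vet=off -run TestGT_RiskBenchmark ./internal/iace/
//
// To also run the calibration and cross-GT tests defined in this file:
//
//	go test -v -vet=off -run 'TestGT_Risk' ./internal/iace/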
// ============================================================================

// riskParams holds what the benchmark needs to know about one matched
// pattern: its numeric risk defaults plus the inputs handed to the estimators.
type riskParams struct {
	s        int      // severity default of the pattern
	f        int      // frequency / exposure default of the pattern
	a        int      // avoidability default of the pattern
	cats     []string // hazard categories of the pattern
	scenario string   // scenario text of the pattern
}

// axisStats accumulates agreement statistics between an engine value and the
// ground-truth value for a single risk axis.
type axisStats struct {
	n         int     // number of compared value pairs
	absErrSum float64 // running sum of |engine - gt|
	exact     int     // pairs with identical values
	within1   int     // pairs differing by at most 1 (includes exact ones)
}

// add records one engine/ground-truth pair.
func (a *axisStats) add(engine, gt int) {
	delta := math.Abs(float64(engine - gt))
	a.n++
	a.absErrSum += delta
	switch {
	case delta == 0:
		// An exact hit is also a within-one hit.
		a.exact++
		a.within1++
	case delta <= 1:
		a.within1++
	}
}

// mae returns the mean absolute error over all recorded pairs, or 0 when
// nothing has been recorded.
func (a axisStats) mae() float64 {
	if a.n != 0 {
		return a.absErrSum / float64(a.n)
	}
	return 0
}

// pct expresses x as a percentage of the recorded pair count, or 0 when
// nothing has been recorded.
func (a axisStats) pct(x int) float64 {
	if a.n != 0 {
		return 100 * float64(x) / float64(a.n)
	}
	return 0
}

// kendallConcordance returns the fraction of comparable hazard pairs that the
// engine orders the same way the professional does (rank agreement, scale-
// invariant), together with the number of comparable pairs.
// 1.0 = identical ordering, 0.5 = random, 0.0 = inverted.
//
// engine and gt are parallel slices (index i describes the same hazard in
// both). If their lengths differ, only the common prefix is compared, so a
// short slice never causes an out-of-range panic. A pair tied in either
// series is not comparable and is skipped. When no comparable pair exists the
// result is (0, 0); check the pair count before interpreting the fraction.
func kendallConcordance(engine, gt []float64) (float64, int) {
	n := len(engine)
	if len(gt) < n {
		n = len(gt)
	}

	concordant, discordant := 0, 0
	for i := 0; i < n; i++ {
		for j := i + 1; j < n; j++ {
			de := engine[i] - engine[j]
			dg := gt[i] - gt[j]
			if de == 0 || dg == 0 {
				continue // tie on one side: not comparable
			}
			if (de > 0) == (dg > 0) {
				concordant++
			} else {
				discordant++
			}
		}
	}

	total := concordant + discordant
	if total == 0 {
		return 0, 0
	}
	return float64(concordant) / float64(total), total
}

// riskAgg collects the benchmark statistics for one ground truth, or for all
// ground truths combined.
type riskAgg struct {
	// Agreement of the raw per-pattern defaults with the GT columns S, F, P.
	sev, freq, avoid axisStats

	// Agreement of the estimator outputs with the GT columns W, P, S.
	wEst, pEst, sevEst axisStats

	// Matched hazards whose pattern carries no avoidability default (a == 0).
	noAvoidDefault int

	// Parallel slices with one entry per matched hazard; each engine-side
	// series is rank-compared against gtRisk.
	engineRisk    []float64 // proxy built from the raw pattern defaults
	newEngineRisk []float64 // proxy built from the estimator outputs
	fkRisk        []float64 // Fine-Kinney score
	gtRisk        []float64 // risk value R from the ground truth

	matched int // matched pairs for which engine parameters were found
	noParam int // matched pairs skipped because no engine parameters were found
}

// TestGT_RiskCalibrationData logs, per contact mode, the professional's mean
// W and P vs our current estimate — the input for calibrating contactModeTable.
// It also logs the mean engine severity default next to the mean GT severity.
// The test only reports; it asserts nothing. Rows are printed in sorted mode
// order so the table is stable from run to run.
func TestGT_RiskCalibrationData(t *testing.T) {
	type acc struct {
		n               int // matched hazards seen for this mode
		sumGTW, sumGTP  int // sums of the GT's W and P values
		sumEngS, sumGTS int // sums of the engine severity default and the GT's S
		estW, estP      int // engine estimates, taken from the first hazard seen for the mode
	}
	byMode := map[string]*acc{}

	for _, c := range gtBenchmarkCases {
		gtData, narrative, _ := readGTNarrative(t, c.path)
		if c.narrativeOverride != "" {
			narrative = c.narrativeOverride
		}
		pr := ParseNarrative(narrative, c.machineType)
		out := NewPatternEngine().Match(parseResultToMatchInput(pr, c.machineType))

		// Index the pattern data under the same normalized name that is used
		// below to look up each matched engine hazard.
		byName := map[string]riskParams{}
		for _, pm := range out.MatchedPatterns {
			key := normalizeDE(pm.ScenarioDE)
			if key == "" {
				key = normalizeDE(pm.PatternName)
			}
			byName[key] = riskParams{s: pm.DefaultSeverity, cats: pm.HazardCats, scenario: pm.ScenarioDE}
		}

		hazards, mitigations := patternsToHazardsAndMitigations(out)
		res := CompareBenchmark(&gtData, hazards, mitigations)
		for _, mp := range res.MatchedPairs {
			rp, ok := byName[normalizeDE(mp.EngineHazard.Name)]
			if !ok {
				continue
			}
			mode := DetectContactMode(rp.cats, rp.scenario)
			if mode == "" {
				mode = "(none)"
			}
			a := byMode[mode]
			if a == nil {
				// NOTE(review): the estimates are captured once per mode, which
				// presumes they depend on the contact mode only — confirm.
				a = &acc{estW: EstimateProbabilityW(rp.cats, rp.scenario), estP: EstimateAvoidabilityP(rp.cats, rp.scenario)}
				byMode[mode] = a
			}
			a.n++
			a.sumGTW += mp.GTEntry.RiskIn.W
			a.sumGTP += mp.GTEntry.RiskIn.P
			a.sumEngS += rp.s
			a.sumGTS += mp.GTEntry.RiskIn.S
		}
	}

	// Go randomizes map iteration order; sort the keys so the table rows come
	// out in the same order on every run and logs can be diffed.
	modes := make([]string, 0, len(byMode))
	for mode := range byMode {
		modes = append(modes, mode)
	}
	sort.Strings(modes)

	t.Logf("=== Per-contact-mode calibration data (engine vs GT mean) ===")
	t.Logf("  %-18s %4s | %5s %5s | %5s %5s | %6s %6s", "mode", "n", "estW", "gtW̄", "estP", "gtP̄", "engS̄", "gtS̄")
	for _, mode := range modes {
		a := byMode[mode]
		t.Logf("  %-18s %4d | %5d %5.1f | %5d %5.1f | %6.1f %6.1f",
			mode, a.n, a.estW, float64(a.sumGTW)/float64(a.n), a.estP, float64(a.sumGTP)/float64(a.n),
			float64(a.sumEngS)/float64(a.n), float64(a.sumGTS)/float64(a.n))
	}
}

// TestGT_RiskComparison_CrossGT feeds every ground truth through the same
// pipeline (parse narrative -> match patterns -> compare with the GT) and then
// through the production ComputeRiskComparison, logging its aggregate per
// ground truth. Running it on all GTs (robot cell + lift) means an estimator
// change is judged across different machines instead of being tuned to one.
// The test only reports; it asserts nothing.
func TestGT_RiskComparison_CrossGT(t *testing.T) {
	for _, tc := range gtBenchmarkCases {
		groundTruth, text, _ := readGTNarrative(t, tc.path)
		if override := tc.narrativeOverride; override != "" {
			text = override
		}

		parsed := ParseNarrative(text, tc.machineType)
		engine := NewPatternEngine()
		matchOut := engine.Match(parseResultToMatchInput(parsed, tc.machineType))

		engineHazards, engineMitigations := patternsToHazardsAndMitigations(matchOut)
		comparison := CompareBenchmark(&groundTruth, engineHazards, engineMitigations)

		// Only the aggregate is reported; the first result is not used here.
		_, summary := ComputeRiskComparison(comparison.MatchedPairs)

		t.Logf("=== %s — ComputeRiskComparison (production) ===", tc.name)
		t.Logf("  n=%d | S±1 %.0f%% | F±1 %.0f%% | W±1 %.0f%% | P±1 %.0f%% | Ranking %.0f%%",
			summary.N,
			summary.SeverityWithin1,
			summary.FrequencyWithin1,
			summary.ProbabilityWithin1,
			summary.AvoidanceWithin1,
			summary.RankConcordance)
	}
}

// TestGT_RiskBenchmark compares, for every matched hazard of every ground
// truth, the engine's risk parameters with the professional's values and logs
// agreement statistics per ground truth and across all of them. It is a
// measurement, not a gate: nothing is asserted.
func TestGT_RiskBenchmark(t *testing.T) {
	// logAxes prints the per-axis agreement lines of one aggregate. estTag is
	// the caption suffix used on the W and P estimate lines.
	logAxes := func(agg *riskAgg, estTag string) {
		t.Helper()
		const axisFmt = "  %s: MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d)"
		sev, sevEst, freq := agg.sev, agg.sevEst, agg.freq
		wEst, avoid, pEst := agg.wEst, agg.avoid, agg.pEst
		t.Logf(axisFmt, "Severity    S (raw default)", sev.mae(), sev.pct(sev.within1), sev.pct(sev.exact), sev.n)
		t.Logf(axisFmt, "Severity    S (NEW estimate)", sevEst.mae(), sevEst.pct(sevEst.within1), sevEst.pct(sevEst.exact), sevEst.n)
		t.Logf(axisFmt, "Frequency   F", freq.mae(), freq.pct(freq.within1), freq.pct(freq.exact), freq.n)
		t.Logf(axisFmt, "Probability W "+estTag, wEst.mae(), wEst.pct(wEst.within1), wEst.pct(wEst.exact), wEst.n)
		// Raw avoidability defaults: reported so the collected avoid and
		// noAvoidDefault statistics are visible in the output.
		t.Logf("  Avoidance   P (raw default): MAE %.2f | within±1 %.0f%% | exact %.0f%% (n=%d, %d matched hazards without default)",
			avoid.mae(), avoid.pct(avoid.within1), avoid.pct(avoid.exact), avoid.n, agg.noAvoidDefault)
		t.Logf(axisFmt, "Avoidance   P "+estTag, pEst.mae(), pEst.pct(pEst.within1), pEst.pct(pEst.exact), pEst.n)
	}

	overall := riskAgg{}

	for _, c := range gtBenchmarkCases {
		gtData, narrative, _ := readGTNarrative(t, c.path)
		if c.narrativeOverride != "" {
			narrative = c.narrativeOverride
		}
		pr := ParseNarrative(narrative, c.machineType)
		out := NewPatternEngine().Match(parseResultToMatchInput(pr, c.machineType))

		// Index engine risk params by the hazard name the matcher will see
		// (patternsToHazardsAndMitigations sets Hazard.Name = ScenarioDE, else PatternName).
		byName := map[string]riskParams{}
		for _, pm := range out.MatchedPatterns {
			key := normalizeDE(pm.ScenarioDE)
			if key == "" {
				key = normalizeDE(pm.PatternName)
			}
			byName[key] = riskParams{s: pm.DefaultSeverity, f: pm.DefaultExposure, a: pm.DefaultAvoidability, cats: pm.HazardCats, scenario: pm.ScenarioDE}
		}

		hazards, mitigations := patternsToHazardsAndMitigations(out)
		res := CompareBenchmark(&gtData, hazards, mitigations)

		local := riskAgg{}
		for _, mp := range res.MatchedPairs {
			rp, ok := byName[normalizeDE(mp.EngineHazard.Name)]
			if !ok {
				local.noParam++
				overall.noParam++
				continue
			}
			gtR := mp.GTEntry.RiskIn

			// Data-anchored estimates for the three axes the engine got
			// wrong (W missing, P missing, S systematically over-estimated).
			estW := EstimateProbabilityW(rp.cats, rp.scenario)
			estP := EstimateAvoidabilityP(rp.cats, rp.scenario)
			estS := EstimateSeverity(rp.cats, rp.scenario, rp.s)

			// Two risk proxies for RANK comparison (our own aggregates, NOT a
			// norm formula):
			//   OLD = product of the raw defaults severity x exposure x
			//         avoidability, each passed through maxInt(v, 1)
			//         (presumably so a missing, zero default does not zero
			//         the product — confirm against maxInt);
			//   NEW = estimated severity scaled by the summed likelihood
			//         terms exposure + estW + estP.
			oldProxy := float64(maxInt(rp.s, 1) * maxInt(rp.f, 1) * maxInt(rp.a, 1))
			newProxy := float64(maxInt(estS, 1) * (maxInt(rp.f, 1) + estW + estP))
			// Fine-Kinney score (our citable backbone) for rank comparison.
			fk := SuggestFineKinney(rp.cats, rp.scenario, pr.LifecyclePhases, rp.s)

			// Book the hazard into the per-GT and the cross-GT aggregate in
			// one place so the two can never drift apart.
			for _, agg := range []*riskAgg{&local, &overall} {
				agg.matched++
				// An axis is only compared when both sides carry a value.
				if rp.s > 0 && gtR.S > 0 {
					agg.sev.add(rp.s, gtR.S)
				}
				if rp.f > 0 && gtR.F > 0 {
					agg.freq.add(rp.f, gtR.F)
				}
				if rp.a > 0 && gtR.P > 0 {
					agg.avoid.add(rp.a, gtR.P)
				}
				if rp.a == 0 {
					agg.noAvoidDefault++
				}
				if gtR.W > 0 {
					agg.wEst.add(estW, gtR.W)
				}
				if gtR.P > 0 {
					agg.pEst.add(estP, gtR.P)
				}
				if gtR.S > 0 {
					agg.sevEst.add(estS, gtR.S)
				}
				agg.engineRisk = append(agg.engineRisk, oldProxy)
				agg.newEngineRisk = append(agg.newEngineRisk, newProxy)
				agg.fkRisk = append(agg.fkRisk, fk.Score)
				agg.gtRisk = append(agg.gtRisk, float64(gtR.R))
			}
		}

		// pairs is the comparable-pair count of the NEW proxy; OLD and
		// Fine-Kinney can have a different count because ties are dropped
		// per series.
		oldConc, _ := kendallConcordance(local.engineRisk, local.gtRisk)
		newConc, pairs := kendallConcordance(local.newEngineRisk, local.gtRisk)
		fkConc, _ := kendallConcordance(local.fkRisk, local.gtRisk)
		t.Logf("=== %s — Risk benchmark ===", c.name)
		t.Logf("  Matched hazards w/ engine params: %d (%d pairs had no pattern param)", local.matched, local.noParam)
		logAxes(&local, "(NEW estimate)")
		t.Logf("  Risk RANK concordance: OLD %.1f%% -> NEW %.1f%% | Fine-Kinney %.1f%% (over %d pairs)", oldConc*100, newConc*100, fkConc*100, pairs)
	}

	oldConc, _ := kendallConcordance(overall.engineRisk, overall.gtRisk)
	newConc, pairs := kendallConcordance(overall.newEngineRisk, overall.gtRisk)
	fkConc, _ := kendallConcordance(overall.fkRisk, overall.gtRisk)
	t.Logf("\n=== Cross-GT aggregate ===")
	logAxes(&overall, "(NEW)")
	t.Logf("  Risk RANK concordance: OLD %.1f%% -> NEW %.1f%% | Fine-Kinney %.1f%% (%d pairs)", oldConc*100, newConc*100, fkConc*100, pairs)
}