From c4d9b1426f94452af4f2efedca79feeaa1e5756a Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Tue, 9 Jun 2026 19:02:18 +0200 Subject: [PATCH] =?UTF-8?q?fix(iace):=20lower=20EstimateFrequency=20tiers?= =?UTF-8?q?=20=E2=80=94=20engine=20F=20was=20~1=20too=20high=20vs=20the=20?= =?UTF-8?q?GT?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Diagnosis: engine F mean 3.56 vs professional 2.56; the dominant disagreement was normal-operation hazards getting F=4 where the professional assigned 2. Lowered the lifecycle→F mapping (normal operation 4→3, occasional phases 3→2). New TestGT_RiskComparison_CrossGT runs the exact production comparison on BOTH GTs: F within±1 rose to 95% (robot cell) and 94% (lift) — generic, not lift-tuned. Co-Authored-By: Claude Opus 4.7 --- .../internal/iace/gt_risk_benchmark_test.go | 22 +++++++++++++++++++ .../internal/iace/risk_estimation.go | 10 +++++---- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/ai-compliance-sdk/internal/iace/gt_risk_benchmark_test.go b/ai-compliance-sdk/internal/iace/gt_risk_benchmark_test.go index 5fa09828..565de0e7 100644 --- a/ai-compliance-sdk/internal/iace/gt_risk_benchmark_test.go +++ b/ai-compliance-sdk/internal/iace/gt_risk_benchmark_test.go @@ -168,6 +168,28 @@ func TestGT_RiskCalibrationData(t *testing.T) { } } +// TestGT_RiskComparison_CrossGT runs the EXACT production risk comparison +// (ComputeRiskComparison) on BOTH ground truths, so any estimator change is +// validated generically across two different machines (robot cell + lift), +// not tuned to one. +func TestGT_RiskComparison_CrossGT(t *testing.T) { + for _, c := range gtBenchmarkCases { + gtData, narrative, _ := readGTNarrative(t, c.path) + if c.narrativeOverride != "" { + narrative = c.narrativeOverride + } + pr := ParseNarrative(narrative, c.machineType) + out := NewPatternEngine().Match(parseResultToMatchInput(pr, c.machineType)) + hazards, mitigations := patternsToHazardsAndMitigations(out) + res := CompareBenchmark(>Data, hazards, mitigations) + _, agg := ComputeRiskComparison(res.MatchedPairs) + t.Logf("=== %s — ComputeRiskComparison (production) ===", c.name) + t.Logf(" n=%d | S±1 %.0f%% | F±1 %.0f%% | W±1 %.0f%% | P±1 %.0f%% | Ranking %.0f%%", + agg.N, agg.SeverityWithin1, agg.FrequencyWithin1, agg.ProbabilityWithin1, + agg.AvoidanceWithin1, agg.RankConcordance) + } +} + func TestGT_RiskBenchmark(t *testing.T) { overall := riskAgg{} diff --git a/ai-compliance-sdk/internal/iace/risk_estimation.go b/ai-compliance-sdk/internal/iace/risk_estimation.go index f4e44c1e..604375c0 100644 --- a/ai-compliance-sdk/internal/iace/risk_estimation.go +++ b/ai-compliance-sdk/internal/iace/risk_estimation.go @@ -178,15 +178,17 @@ func EstimateFrequency(phases []string) int { } return false } + // Calibrated to the professional's scale: the GT assigns lower exposure + // frequencies than a naive "operating = high" mapping. Normal operation is + // 3 (regular exposure), occasional phases (setup/maintenance/cleaning) 2, + // otherwise 2. (Engine F was systematically ~1 too high vs the GT.) switch { case has("normal_operation") || has("auto_operation") || has("manual_operation"): - return 4 - case has("setup") || has("maintenance") || has("cleaning") || has("changeover"): return 3 - case len(phases) > 0: + case has("setup") || has("maintenance") || has("cleaning") || has("changeover"): return 2 default: - return 3 + return 2 } }