fix(iace): lower EstimateFrequency tiers — engine F was ~1 too high vs the GT
CI / build-sha-integrity (push) Failing after 4s
CI / validate-canonical-controls (push) Successful in 11s
CI / loc-budget (push) Failing after 15s
CI / go-lint (push) Has been skipped
CI / detect-changes (push) Successful in 6s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Failing after 37s
CI / iace-gt-coverage (push) Successful in 23s
CI / test-python-backend (push) Has been skipped
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
CI / build-sha-integrity (push) Failing after 4s
CI / validate-canonical-controls (push) Successful in 11s
CI / loc-budget (push) Failing after 15s
CI / go-lint (push) Has been skipped
CI / detect-changes (push) Successful in 6s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Failing after 37s
CI / iace-gt-coverage (push) Successful in 23s
CI / test-python-backend (push) Has been skipped
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
Diagnosis: engine F mean 3.56 vs professional 2.56; the dominant disagreement was normal-operation hazards getting F=4 where the professional assigned 2. Lowered the lifecycle→F mapping (normal operation 4→3, occasional phases 3→2). New TestGT_RiskComparison_CrossGT runs the exact production comparison on BOTH GTs: F within ±1 rose to 95% (robot cell) and 94% (lift) — generic, not lift-tuned. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -168,6 +168,28 @@ func TestGT_RiskCalibrationData(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestGT_RiskComparison_CrossGT runs the EXACT production risk comparison
|
||||
// (ComputeRiskComparison) on BOTH ground truths, so any estimator change is
|
||||
// validated generically across two different machines (robot cell + lift),
|
||||
// not tuned to one.
|
||||
func TestGT_RiskComparison_CrossGT(t *testing.T) {
|
||||
for _, c := range gtBenchmarkCases {
|
||||
gtData, narrative, _ := readGTNarrative(t, c.path)
|
||||
if c.narrativeOverride != "" {
|
||||
narrative = c.narrativeOverride
|
||||
}
|
||||
pr := ParseNarrative(narrative, c.machineType)
|
||||
out := NewPatternEngine().Match(parseResultToMatchInput(pr, c.machineType))
|
||||
hazards, mitigations := patternsToHazardsAndMitigations(out)
|
||||
res := CompareBenchmark(>Data, hazards, mitigations)
|
||||
_, agg := ComputeRiskComparison(res.MatchedPairs)
|
||||
t.Logf("=== %s — ComputeRiskComparison (production) ===", c.name)
|
||||
t.Logf(" n=%d | S±1 %.0f%% | F±1 %.0f%% | W±1 %.0f%% | P±1 %.0f%% | Ranking %.0f%%",
|
||||
agg.N, agg.SeverityWithin1, agg.FrequencyWithin1, agg.ProbabilityWithin1,
|
||||
agg.AvoidanceWithin1, agg.RankConcordance)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGT_RiskBenchmark(t *testing.T) {
|
||||
overall := riskAgg{}
|
||||
|
||||
|
||||
@@ -178,15 +178,17 @@ func EstimateFrequency(phases []string) int {
|
||||
}
|
||||
return false
|
||||
}
|
||||
// Calibrated to the professional's scale: the GT assigns lower exposure
|
||||
// frequencies than a naive "operating = high" mapping. Normal operation is
|
||||
// 3 (regular exposure), occasional phases (setup/maintenance/cleaning) 2,
|
||||
// otherwise 2. (Engine F was systematically ~1 too high vs the GT.)
|
||||
switch {
|
||||
case has("normal_operation") || has("auto_operation") || has("manual_operation"):
|
||||
return 4
|
||||
case has("setup") || has("maintenance") || has("cleaning") || has("changeover"):
|
||||
return 3
|
||||
case len(phases) > 0:
|
||||
case has("setup") || has("maintenance") || has("cleaning") || has("changeover"):
|
||||
return 2
|
||||
default:
|
||||
return 3
|
||||
return 2
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user