Files
breakpilot-compliance/ai-compliance-sdk/internal/iace/risk_benchmark.go
T
Benjamin Admin 2677bca9ca
CI / detect-changes (push) Successful in 7s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / build-sha-integrity (push) Failing after 4s
CI / validate-canonical-controls (push) Successful in 11s
CI / loc-budget (push) Failing after 14s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 2m23s
CI / test-go (push) Failing after 37s
CI / iace-gt-coverage (push) Successful in 24s
CI / test-python-backend (push) Has been skipped
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
feat(iace): benchmark risk comparison (traffic lights) + misuse pattern + 1:n matcher
#1 Risk-number comparison in the benchmark: ComputeRiskComparison derives the
tool's S/F/W/P + Fine-Kinney per matched hazard and compares to the GT values;
exposed on the benchmark response and rendered in a new RiskComparison table
with GREEN/YELLOW/RED traffic lights on the risk number R (like the Excel),
plus per-axis within-1 agreement cards.

#2 Generic misuse pattern HP2103 "Personenbefoerderung auf Hebezeug" — gated to
lift-family machine types, fires for ANY lifting device (not machine-specific).

#3 Benchmark matcher is now 1:n — one broad engine hazard may cover several
fine-grained GT sub-scenarios (foot/hand/leg crush), so coverage reflects real
risk coverage rather than 1:1 wording matches.

Validated on BOTH ground truths (robot cell + lift): leakage 0, ghosts 0,
coverage held.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-09 17:24:52 +02:00

130 lines
3.9 KiB
Go

package iace
// Risk-number comparison for the benchmark: for every matched hazard, this file
// places the tool's risk parameters (EN-62061-style S/F/W/P plus Fine-Kinney)
// next to the professional's ground-truth (GT) values and computes aggregate
// agreement figures. The benchmark endpoint uses it so that the risk-assessment
// ("Risikobewertung") comparison is visible in the tab.
// RiskComparisonPair holds, for a single matched hazard, the professional's
// ground-truth (GT) risk values side by side with the values the tool
// estimated for the same hazard. The GT* fields mirror the GT columns
// S/F/W/P/R; the Eng* and FK* fields are the tool's own numbers.
type RiskComparisonPair struct {
	// HazardName is the hazard label; ComputeRiskComparison takes it from the
	// GT entry's hazard type.
	HazardName string `json:"hazard_name"`

	// GTSeverity is GT column S.
	GTSeverity int `json:"gt_severity"`

	// GTFrequency is GT column F.
	GTFrequency int `json:"gt_frequency"`

	// GTProbability is GT column W.
	GTProbability int `json:"gt_probability"`

	// GTAvoidance is GT column P.
	GTAvoidance int `json:"gt_avoidance"`

	// GTRisk is GT column R, the professional's risk number.
	GTRisk int `json:"gt_risk"`

	// EngSeverity is the tool's severity estimate (counterpart of S).
	EngSeverity int `json:"eng_severity"`

	// EngFrequency is the tool's frequency estimate (counterpart of F).
	EngFrequency int `json:"eng_frequency"`

	// EngProbability is the tool's probability estimate (counterpart of W).
	EngProbability int `json:"eng_probability"`

	// EngAvoidance is the tool's avoidability estimate (counterpart of P).
	EngAvoidance int `json:"eng_avoidance"`

	// FKScore is the score of the tool's Fine-Kinney suggestion.
	FKScore float64 `json:"fk_score"`

	// FKBand is the band of the tool's Fine-Kinney suggestion.
	FKBand string `json:"fk_band"`
}
// RiskAgreement summarises how closely the tool's risk numbers track the
// ground-truth (GT) values across all matched hazards that carry a GT
// severity. All *Within1 fields and RankConcordance are percentages (0-100).
type RiskAgreement struct {
	// N is the number of matched hazards with a GT severity > 0; it is the
	// denominator of every *Within1 percentage.
	N int `json:"n"`

	// SeverityWithin1 is the share of the N hazards whose tool severity is at
	// most one step away from the GT severity.
	SeverityWithin1 float64 `json:"severity_within1"`

	// FrequencyWithin1 is the share of the N hazards that have a GT frequency
	// > 0 and a tool frequency at most one step away from it.
	FrequencyWithin1 float64 `json:"frequency_within1"`

	// ProbabilityWithin1 is the share of the N hazards that have a GT
	// probability > 0 and a tool probability at most one step away from it.
	ProbabilityWithin1 float64 `json:"probability_within1"`

	// AvoidanceWithin1 is the share of the N hazards that have a GT avoidance
	// > 0 and a tool avoidability at most one step away from it.
	AvoidanceWithin1 float64 `json:"avoidance_within1"`

	// RankConcordance compares the ordering of the tool's Fine-Kinney scores
	// with the ordering of the GT risk numbers R.
	RankConcordance float64 `json:"rank_concordance"`
}
// ComputeRiskComparison builds the tool-vs-ground-truth (GT) risk comparison
// for the matched hazards.
//
// For every element of matched it estimates the tool's S/F/W/P values and a
// Fine-Kinney suggestion from the engine hazard's category, its scenario
// (falling back to the hazard name when the scenario is empty) and its
// lifecycle phases, and records them next to the GT entry's RiskIn values.
// The returned slice holds one pair per element of matched, in input order.
//
// The returned RiskAgreement only considers matches whose GT severity is > 0;
// N counts those matches and is the denominator of every *Within1 percentage.
//
// NOTE(review): a match that has a GT severity but a zero GT F, W or P counts
// as a disagreement on that axis, and a zero GT R still takes part in the rank
// concordance — confirm that this treatment of missing GT values is intended.
func ComputeRiskComparison(matched []HazardMatchPair) ([]RiskComparisonPair, RiskAgreement) {
	// closeToGT reports whether the engine value lies within one step of a GT
	// value that is actually present (> 0).
	closeToGT := func(eng, ref int) bool {
		return ref > 0 && abs(eng-ref) <= 1
	}

	result := make([]RiskComparisonPair, len(matched))
	fkScores := make([]float64, 0, len(matched))
	gtRisks := make([]float64, 0, len(matched))
	var rated, sevHits, freqHits, probHits, avoidHits int

	for i, match := range matched {
		hazard := match.EngineHazard

		// The estimators work on free text; use the name when no scenario
		// description is available.
		text := hazard.Scenario
		if text == "" {
			text = hazard.Name
		}
		categories := []string{hazard.Category}
		phases := splitLifecyclePhases(hazard.LifecyclePhase)

		sev := EstimateSeverity(categories, text, 0)
		freq := EstimateFrequency(phases)
		prob := EstimateProbabilityW(categories, text)
		avoid := EstimateAvoidabilityP(categories, text)
		fineKinney := SuggestFineKinney(categories, text, phases, 0)

		ref := match.GTEntry.RiskIn
		result[i] = RiskComparisonPair{
			HazardName:     match.GTEntry.HazardType,
			GTSeverity:     ref.S,
			GTFrequency:    ref.F,
			GTProbability:  ref.W,
			GTAvoidance:    ref.P,
			GTRisk:         ref.R,
			EngSeverity:    sev,
			EngFrequency:   freq,
			EngProbability: prob,
			EngAvoidance:   avoid,
			FKScore:        fineKinney.Score,
			FKBand:         fineKinney.Band,
		}

		// Matches without a GT severity are listed but excluded from the
		// aggregate figures.
		if ref.S <= 0 {
			continue
		}
		rated++
		if closeToGT(sev, ref.S) {
			sevHits++
		}
		if closeToGT(freq, ref.F) {
			freqHits++
		}
		if closeToGT(prob, ref.W) {
			probHits++
		}
		if closeToGT(avoid, ref.P) {
			avoidHits++
		}
		fkScores = append(fkScores, fineKinney.Score)
		gtRisks = append(gtRisks, float64(ref.R))
	}

	if rated == 0 {
		return result, RiskAgreement{}
	}
	return result, RiskAgreement{
		N:                  rated,
		SeverityWithin1:    pct(sevHits, rated),
		FrequencyWithin1:   pct(freqHits, rated),
		ProbabilityWithin1: pct(probHits, rated),
		AvoidanceWithin1:   pct(avoidHits, rated),
		RankConcordance:    rankConcordance(fkScores, gtRisks),
	}
}
// abs returns the magnitude of x: x itself when it is non-negative, otherwise
// its negation.
func abs(x int) int {
	if x >= 0 {
		return x
	}
	return -x
}
// pct expresses x as a percentage of total (100 * x / total, computed in
// floating point). A total of zero yields 0 instead of dividing by zero.
func pct(x, total int) float64 {
	if total != 0 {
		return 100 * float64(x) / float64(total)
	}
	return 0
}
// rankConcordance returns the percentage (0-100) of comparable index pairs
// (i, j) that the parallel series a and b order the same way.
//
// A pair is comparable only when neither series is tied on it; tied pairs are
// skipped. Only the sign of each difference is used, so the result does not
// depend on the scale of either series. 100 means every comparable pair is
// ordered identically, 0 means every comparable pair is ordered oppositely,
// and unrelated orderings average out around 50.
//
// The function also returns 0 when there is no comparable pair at all (fewer
// than two elements, or every pair tied). If a and b differ in length, only
// their common prefix is compared, so the shorter slice is never indexed past
// its end.
func rankConcordance(a, b []float64) float64 {
	// Compare only indices that exist in both series.
	n := len(a)
	if len(b) < n {
		n = len(b)
	}

	concordant, discordant := 0, 0
	for i := 0; i < n; i++ {
		for j := i + 1; j < n; j++ {
			da, db := a[i]-a[j], b[i]-b[j]
			if da == 0 || db == 0 {
				// A tie in either series carries no ordering information.
				continue
			}
			if (da > 0) == (db > 0) {
				concordant++
			} else {
				discordant++
			}
		}
	}

	total := concordant + discordant
	if total == 0 {
		return 0
	}
	return 100 * float64(concordant) / float64(total)
}