Files
breakpilot-compliance/ai-compliance-sdk/internal/iace/benchmark_matcher.go
T
Benjamin Admin 2677bca9ca
CI / detect-changes (push) Successful in 7s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / build-sha-integrity (push) Failing after 4s
CI / validate-canonical-controls (push) Successful in 11s
CI / loc-budget (push) Failing after 14s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 2m23s
CI / test-go (push) Failing after 37s
CI / iace-gt-coverage (push) Successful in 24s
CI / test-python-backend (push) Has been skipped
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
feat(iace): benchmark risk comparison (traffic lights) + misuse pattern + 1:n matcher
#1 Risk-number comparison in the benchmark: ComputeRiskComparison derives the
tool's S/F/W/P + Fine-Kinney per matched hazard and compares to the GT values;
exposed on the benchmark response and rendered in a new RiskComparison table
with GREEN/YELLOW/RED traffic lights on the risk number R (like the Excel),
plus per-axis within-1 agreement cards.

#2 Generic misuse pattern HP2103 "Personenbefoerderung auf Hebezeug" — gated to
lift-family machine types, fires for ANY lifting device (not machine-specific).

#3 Benchmark matcher is now 1:n — one broad engine hazard may cover several
fine-grained GT sub-scenarios (foot/hand/leg crush), so coverage reflects real
risk coverage rather than 1:1 wording matches.

Validated on BOTH ground truths (robot cell + lift): leakage 0, ghosts 0,
coverage held.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-09 17:24:52 +02:00

487 lines
13 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package iace
import (
"sort"
"strings"
)
// ============================================================================
// Fuzzy matching: Ground Truth entries ↔ Engine hazards
// ============================================================================
// matchThreshold is the minimum fuzzyMatchScore a (Ground Truth entry, engine
// hazard) pair must reach to be kept as a match candidate in CompareBenchmark.
const matchThreshold = 0.20
// categoryMap, synonymSets and wrongMachineTerms are defined in benchmark_synonyms.go.
// CompareBenchmark runs the full comparison between a Ground Truth and the
// engine output.
//
// Placeholder rows are removed from the Ground Truth first. Every remaining
// (GT entry, engine hazard) pair is then scored with fuzzyMatchScore, and each
// GT entry is assigned to its best-ranked candidate whose score reaches
// matchThreshold. The result carries overall coverage, measure coverage, the
// matched pairs, the unmatched rows on both sides, a per-category breakdown
// and the GT risk ranking.
//
// A nil or empty Ground Truth yields an empty BenchmarkResult.
func CompareBenchmark(gt *GroundTruth, hazards []Hazard, mitigations []Mitigation) *BenchmarkResult {
	if gt == nil || len(gt.Entries) == 0 {
		return &BenchmarkResult{}
	}
	gt = filterPlaceholderEntries(gt)

	// Group mitigation names by the ID of the hazard they belong to.
	mitNamesByHazard := make(map[string][]string)
	for _, m := range mitigations {
		id := m.HazardID.String()
		mitNamesByHazard[id] = append(mitNamesByHazard[id], m.Name)
	}

	engineSummaries := make([]HazardSummary, len(hazards))
	for i, h := range hazards {
		engineSummaries[i] = HazardSummary{
			ID:             h.ID.String(),
			Name:           h.Name,
			Category:       h.Category,
			Zone:           h.HazardousZone,
			Description:    h.Description,
			Scenario:       h.Scenario,
			PossibleHarm:   h.PossibleHarm,
			TriggerEvent:   h.TriggerEvent,
			AffectedPerson: h.AffectedPerson,
			LifecyclePhase: h.LifecyclePhase,
			Mitigations:    mitNamesByHazard[h.ID.String()],
		}
	}

	// Score matrix gt[i] × engine[j], keeping only candidates at or above the
	// threshold.
	type scoredPair struct {
		gtIdx, engIdx int
		score         float64
		reason        string
		hasZone       bool // the zone signal contributed to this match
	}
	var pairs []scoredPair
	for i := range gt.Entries {
		for j := range hazards {
			score, reason := fuzzyMatchScore(&gt.Entries[i], &hazards[j])
			if score >= matchThreshold {
				pairs = append(pairs, scoredPair{
					gtIdx:  i,
					engIdx: j,
					score:  score,
					reason: reason,
					// fuzzyMatchScore puts "Zone" into the reason exactly when
					// the zone signal fired; evaluate that once per pair
					// instead of on every comparison.
					hasZone: strings.Contains(reason, "Zone"),
				})
			}
		}
	}

	// Greedy assignment order: zone-backed (more specific) matches first, then
	// by descending score. The sort is stable so that candidates with equal
	// priority keep their (GT index, engine index) creation order; the chosen
	// engine hazard for a GT entry is therefore well defined.
	sort.SliceStable(pairs, func(a, b int) bool {
		if pairs[a].hasZone != pairs[b].hasZone {
			return pairs[a].hasZone
		}
		return pairs[a].score > pairs[b].score
	})

	// 1:n matching: a single broad engine hazard may legitimately cover several
	// fine-grained GT sub-scenarios (e.g. one "crush under descending load"
	// pattern covers the GT's separate foot / hand / leg crush rows). We only
	// block a GT entry from matching twice; an engine hazard may match several.
	usedGT := make([]bool, len(gt.Entries))
	usedEng := make([]bool, len(hazards))
	var matched []HazardMatchPair
	for _, p := range pairs {
		if usedGT[p.gtIdx] {
			continue
		}
		usedGT[p.gtIdx] = true
		usedEng[p.engIdx] = true
		matched = append(matched, HazardMatchPair{
			GTEntry:      gt.Entries[p.gtIdx],
			EngineHazard: engineSummaries[p.engIdx],
			MatchScore:   p.score,
			MatchReason:  p.reason,
		})
	}

	// Collect the rows that found no partner on either side.
	var missing []GroundTruthEntry
	for i, e := range gt.Entries {
		if !usedGT[i] {
			missing = append(missing, e)
		}
	}
	var extra []HazardSummary
	for i, s := range engineSummaries {
		if !usedEng[i] {
			extra = append(extra, s)
		}
	}

	// Category breakdown: GT rows and matched rows per normalized hazard group.
	catGT := map[string]int{}
	catMatch := map[string]int{}
	for _, e := range gt.Entries {
		catGT[normalizeCategoryDE(e.HazardGroup)]++
	}
	for _, m := range matched {
		catMatch[normalizeCategoryDE(m.GTEntry.HazardGroup)]++
	}
	var breakdown []CategoryScore
	for cat, total := range catGT {
		cov := 0.0
		if total > 0 {
			cov = float64(catMatch[cat]) / float64(total)
		}
		breakdown = append(breakdown, CategoryScore{
			Category: cat, GTCount: total, MatchCount: catMatch[cat], Coverage: cov,
		})
	}
	// Map iteration order is random, so break GTCount ties by category name;
	// otherwise the breakdown order would change from run to run.
	sort.Slice(breakdown, func(i, j int) bool {
		if breakdown[i].GTCount != breakdown[j].GTCount {
			return breakdown[i].GTCount > breakdown[j].GTCount
		}
		return breakdown[i].Category < breakdown[j].Category
	})

	// Measure coverage (simplified): share of matched pairs whose GT measures
	// share at least one significant word with any engine mitigation. Note
	// that the full mitigation list is checked, not only the mitigations of
	// the matched hazard.
	measMatched := 0
	for _, m := range matched {
		if measureOverlap(m.GTEntry.Measures, mitigations) {
			measMatched++
		}
	}
	measCov := 0.0
	if len(matched) > 0 {
		measCov = float64(measMatched) / float64(len(matched))
	}

	// Risk rank comparison
	rankPairs := buildRiskRankPairs(matched)

	coverage := 0.0
	if len(gt.Entries) > 0 {
		coverage = float64(len(matched)) / float64(len(gt.Entries))
	}

	return &BenchmarkResult{
		CoverageScore:     coverage,
		MeasureCoverage:   measCov,
		TotalGT:           len(gt.Entries),
		TotalEngine:       len(hazards),
		MatchedPairs:      matched,
		MissingFromEngine: missing,
		ExtraInEngine:     extra,
		CategoryBreakdown: breakdown,
		RiskRankPairs:     rankPairs,
	}
}
// fuzzyMatchScore rates how well an engine hazard matches a Ground Truth entry.
//
// The raw score is a weighted sum of four signals, each in [0,1]: category
// (0.2), keyword/synonym sets (0.2), component/zone (0.3) and scenario
// similarity (0.3). Two multiplicative penalties may then lower it. The second
// return value names every signal that contributed and every penalty that was
// applied, joined by "+".
func fuzzyMatchScore(gt *GroundTruthEntry, h *Hazard) (float64, string) {
	category := categoryMatchScore(gt.HazardGroup, h.Category)
	keywords := keywordMatchScore(gt.HazardType, gt.HazardCause, h.Name, h.Description, h.Scenario)
	zone := zoneMatchScore(gt.ComponentZone, gt.HazardSubgroup, h.HazardousZone, h.MachineModule)
	scenario := scenarioSimilarity(gt.HazardCause, h.Scenario, h.Name)

	signals := []struct {
		weight float64
		value  float64
		label  string
	}{
		{0.2, category, "Kategorie"},
		{0.2, keywords, "Keywords"},
		{0.3, zone, "Zone"},
		{0.3, scenario, "Szenario"},
	}

	var total float64
	var labels []string
	for _, s := range signals {
		total += s.weight * s.value
		if s.value > 0 {
			labels = append(labels, s.label)
		}
	}

	// Penalty: the engine hazard mentions a machine term the GT row does not.
	if hasWrongMachineTerm(h.Name, h.Scenario, gt.HazardCause, gt.ComponentZone) {
		total *= 0.3
		labels = append(labels, "Strafabzug:FremdMaschine")
	}

	// Penalty: neither keywords nor scenario overlap and the zone match is
	// weak (< 0.5), so the pairing rests on too little content to be reliable.
	noContent := keywords == 0 && scenario == 0
	if noContent && zone < 0.5 {
		total *= 0.4
		labels = append(labels, "Strafabzug:KeinInhalt")
	}

	return total, strings.Join(labels, "+")
}
// scenarioSimilarity returns the fraction of action words in the GT cause that
// are also found on the engine side (scenario text plus hazard name).
//
// A GT action word counts as found when it equals or shares a prefix with an
// engine action word, or when it falls into a synonym set that the engine
// actions or the full engine text also hit. If the GT cause has no action
// words, the result falls back to significantWordOverlap.
func scenarioSimilarity(gtCause, engScenario, engName string) float64 {
	gtNorm := normalizeDE(gtCause)
	engNorm := normalizeDE(engScenario + " " + engName)

	gtVerbs := extractActionWords(gtNorm)
	engVerbs := extractActionWords(engNorm)
	if len(gtVerbs) == 0 {
		// No action words to compare: use plain significant-word overlap.
		return significantWordOverlap(gtNorm, engNorm)
	}

	hits := 0
actions:
	for _, gv := range gtVerbs {
		// Direct match: identical, or one word is a prefix of the other.
		for _, ev := range engVerbs {
			if gv == ev || strings.HasPrefix(ev, gv) || strings.HasPrefix(gv, ev) {
				hits++
				continue actions
			}
		}

		// Synonym match: only sets that the GT action word belongs to count.
		for _, set := range synonymSets {
			member := false
			for _, syn := range set {
				if strings.Contains(gv, syn) || strings.Contains(syn, gv) {
					member = true
					break
				}
			}
			if !member {
				continue
			}

			// Does any engine action word fall into the same set?
			for _, ev := range engVerbs {
				for _, syn := range set {
					if strings.Contains(ev, syn) || strings.Contains(syn, ev) {
						hits++
						continue actions
					}
				}
			}

			// Otherwise accept a synonym anywhere in the full engine text.
			for _, syn := range set {
				if strings.Contains(engNorm, syn) {
					hits++
					continue actions
				}
			}
		}
	}

	return float64(hits) / float64(len(gtVerbs))
}
// significantWordOverlap returns the share of significant words in gtText that
// occur as substrings of engText. It is the fallback similarity used when no
// action words are available, and yields 0 when gtText has no significant
// words.
func significantWordOverlap(gtText, engText string) float64 {
	candidates := extractSignificantWords(gtText)
	total := len(candidates)
	if total == 0 {
		return 0
	}

	hits := 0
	for i := range candidates {
		if !strings.Contains(engText, candidates[i]) {
			continue
		}
		hits++
	}
	return float64(hits) / float64(total)
}
// hasWrongMachineTerm reports whether the engine hazard (name + scenario)
// contains a term from wrongMachineTerms that the Ground Truth row (cause +
// zone) does not contain. Both sides are compared in normalizeDE form.
func hasWrongMachineTerm(engName, engScenario, gtCause, gtZone string) bool {
	engine := normalizeDE(engName + " " + engScenario)
	truth := normalizeDE(gtCause + " " + gtZone)

	for _, foreign := range wrongMachineTerms {
		if !strings.Contains(engine, foreign) {
			continue
		}
		// The engine mentions the term; it is only "wrong" if the GT does not.
		if !strings.Contains(truth, foreign) {
			return true
		}
	}
	return false
}
func categoryMatchScore(gtGroup, engCategory string) float64 {
normalized := normalizeDE(gtGroup)
prefixes, ok := categoryMap[normalized]
if !ok {
return 0
}
engLower := strings.ToLower(engCategory)
for _, p := range prefixes {
if strings.Contains(engLower, p) {
return 1.0
}
}
return 0
}
// keywordMatchScore compares the GT text (type + cause) with the engine text
// (name + description + scenario) synonym set by synonym set. A set is
// relevant when any of its terms occurs in the GT text; the score is the share
// of relevant sets that also occur in the engine text, or 0 when no set is
// relevant.
func keywordMatchScore(gtType, gtCause, engName, engDesc, engScenario string) float64 {
	gtNorm := normalizeDE(gtType + " " + gtCause)
	engNorm := normalizeDE(engName + " " + engDesc + " " + engScenario)

	var relevant, shared int
	for _, set := range synonymSets {
		inGT, inEngine := false, false
		for _, term := range set {
			inGT = inGT || strings.Contains(gtNorm, term)
			inEngine = inEngine || strings.Contains(engNorm, term)
			if inGT && inEngine {
				break // both sides already confirmed for this set
			}
		}
		if !inGT {
			continue
		}
		relevant++
		if inEngine {
			shared++
		}
	}

	if relevant == 0 {
		return 0
	}
	return float64(shared) / float64(relevant)
}
// zoneMatchScore returns the share of significant GT words (component zone +
// hazard subgroup) that contain, or are contained in, a significant engine
// word (hazardous zone + machine module). It yields 0 when either side is
// empty after normalization or the GT side has no significant words.
func zoneMatchScore(gtZone, gtSubgroup, engZone, engModule string) float64 {
	gtNorm := normalizeDE(gtZone + " " + gtSubgroup)
	engNorm := normalizeDE(engZone + " " + engModule)
	if gtNorm == "" || engNorm == "" {
		return 0
	}

	gtWords := extractSignificantWords(gtNorm)
	if len(gtWords) == 0 {
		return 0
	}
	engWords := extractSignificantWords(engNorm)

	hits := 0
	for _, want := range gtWords {
		for _, have := range engWords {
			// Substring match in either direction counts; each GT word is
			// counted at most once.
			if strings.Contains(have, want) || strings.Contains(want, have) {
				hits++
				break
			}
		}
	}
	return float64(hits) / float64(len(gtWords))
}
// extractSignificantWords splits text on whitespace and returns, in order, the
// words that are at least three bytes long and are not German stop words. The
// stop words are listed in umlaut-free spelling ("fuer", "ueber"). The result
// is nil when no word qualifies.
func extractSignificantWords(text string) []string {
	var significant []string
	for _, word := range strings.Fields(text) {
		if len(word) < 3 {
			continue
		}
		switch word {
		case "der", "die", "das", "und", "oder",
			"von", "in", "an", "am", "im",
			"zu", "bei", "mit", "des", "den",
			"dem", "ein", "eine", "einer", "einem",
			"fuer", "auf", "aus", "um", "nach",
			"ueber", "unter", "vor", "durch":
			continue // stop word
		}
		significant = append(significant, word)
	}
	return significant
}
// NormalizeDEPublic exposes normalizeDE to callers outside this package. It
// returns exactly what normalizeDE returns for s.
func NormalizeDEPublic(s string) string {
	normalized := normalizeDE(s)
	return normalized
}
// normalizeDE trims surrounding whitespace, lower-cases s and transliterates
// German umlauts and sharp s to ASCII (ä→ae, ö→oe, ü→ue, ß→ss). The original
// note states this is the same normalization as in narrative_parser.
func normalizeDE(s string) string {
	lowered := strings.ToLower(strings.TrimSpace(s))

	var out strings.Builder
	out.Grow(len(lowered))
	for _, r := range lowered {
		switch r {
		case 'ä':
			out.WriteString("ae")
		case 'ö':
			out.WriteString("oe")
		case 'ü':
			out.WriteString("ue")
		case 'ß':
			out.WriteString("ss")
		default:
			out.WriteRune(r)
		}
	}
	return out.String()
}
// normalizeCategoryDE normalizes a GT hazard group with normalizeDE and strips
// the generic lead-ins "gefaehrdungen durch " and "gefaehrdungen im
// zusammenhang mit " (in that order) to get a shorter display label.
func normalizeCategoryDE(group string) string {
	label := normalizeDE(group)
	for _, leadIn := range [...]string{
		"gefaehrdungen durch ",
		"gefaehrdungen im zusammenhang mit ",
	} {
		label = strings.TrimPrefix(label, leadIn)
	}
	return label
}
// measureOverlap reports whether any significant word of any GT measure occurs
// as a substring in the normalized "name description" text of any engine
// mitigation. It returns false when either list is empty.
func measureOverlap(gtMeasures []string, mitigations []Mitigation) bool {
	if len(gtMeasures) == 0 || len(mitigations) == 0 {
		return false
	}

	// Normalized mitigation texts, built once on first use and shared by all
	// GT measures instead of being recomputed for every (measure, mitigation)
	// combination.
	var mitTexts []string

	for _, gm := range gtMeasures {
		// The significant words depend only on the GT measure, so extract
		// them once per measure rather than once per mitigation.
		words := extractSignificantWords(normalizeDE(gm))
		if len(words) == 0 {
			continue
		}

		if mitTexts == nil {
			mitTexts = make([]string, len(mitigations))
			for i := range mitigations {
				mitTexts[i] = normalizeDE(mitigations[i].Name + " " + mitigations[i].Description)
			}
		}

		for _, text := range mitTexts {
			for _, w := range words {
				if strings.Contains(text, w) {
					return true
				}
			}
		}
	}
	return false
}
// buildRiskRankPairs ranks the matched pairs by their Ground Truth risk number
// (RiskIn.R), highest first, and returns one RiskRankPair per match with a
// 1-based GTRank. Entries with equal GT risk keep the order they have in
// matched. EngineRank and EngineRisk are left at 0 here. It returns nil for an
// empty input.
func buildRiskRankPairs(matched []HazardMatchPair) []RiskRankPair {
	if len(matched) == 0 {
		return nil
	}

	type ranked struct {
		gtRisk int
		name   string
	}
	items := make([]ranked, len(matched))
	for i := range matched {
		items[i] = ranked{
			gtRisk: matched[i].GTEntry.RiskIn.R,
			name:   matched[i].GTEntry.HazardType,
		}
	}

	// Stable sort: ties on the GT risk must not be reordered arbitrarily,
	// otherwise the rank of equal-risk hazards is unspecified.
	sort.SliceStable(items, func(a, b int) bool { return items[a].gtRisk > items[b].gtRisk })

	pairs := make([]RiskRankPair, len(items))
	for rank, item := range items {
		pairs[rank] = RiskRankPair{
			GTRank:      rank + 1,
			EngineRank:  0, // Engine has no assessment yet for auto-generated hazards
			HazardName:  item.name,
			GTRiskScore: item.gtRisk,
			EngineRisk:  0,
		}
	}
	return pairs
}
// filterPlaceholderEntries returns a shallow copy of gt whose Entries exclude
// rows that are not real hazards. A row is a placeholder when its cause is
// blank and its normalized type is empty, starts with "[", or contains
// "allgemeine" or "weitere risikominderung" (e.g. "[weitere Risikominderung]"
// or "Allgemeine ... Anforderungen aus der MaschinenRiL"). Such rows are not
// engine-matchable and would unfairly depress the coverage metric, so they are
// kept out of TotalGT. The input gt is not modified.
func filterPlaceholderEntries(gt *GroundTruth) *GroundTruth {
	rows := make([]GroundTruthEntry, 0, len(gt.Entries))
	for _, entry := range gt.Entries {
		// Any row with a non-blank cause is a real hazard.
		if strings.TrimSpace(entry.HazardCause) != "" {
			rows = append(rows, entry)
			continue
		}

		typ := normalizeDE(entry.HazardType)
		switch {
		case typ == "",
			strings.HasPrefix(typ, "["),
			strings.Contains(typ, "allgemeine"),
			strings.Contains(typ, "weitere risikominderung"):
			continue // placeholder or section heading
		}
		rows = append(rows, entry)
	}

	filtered := *gt
	filtered.Entries = rows
	return &filtered
}