package iace import ( "sort" "strings" ) // ============================================================================ // Fuzzy matching: Ground Truth entries ↔ Engine hazards // ============================================================================ const matchThreshold = 0.35 // categoryMap maps GT hazard_group (German) to engine category prefixes. var categoryMap = map[string][]string{ "mechanische gefaehrdungen": {"mechanical"}, "elektrische gefaehrdungen": {"electrical"}, "thermische gefaehrdungen": {"thermal"}, "gefaehrdungen durch laerm": {"noise", "ergonomic"}, "gefaehrdungen durch vibration": {"noise", "vibration"}, "gefaehrdungen durch strahlung": {"radiation", "emc"}, "gefaehrdungen durch materialien und substanzen": {"material", "environmental"}, "ergonomische gefaehrdungen": {"ergonomic"}, "gefaehrdungen im zusammenhang mit der einsatzumgebung": {"environmental"}, } // synonymSets groups equivalent hazard terms for keyword matching. var synonymSets = [][]string{ {"quetsch", "crush", "einklemm", "klemm"}, {"scher", "shear", "absch"}, {"schneid", "cut", "schnitt"}, {"stoss", "schlag", "impact", "treff", "aufprall"}, {"einzug", "fang", "erfass", "entangle", "wickel"}, {"elektrisch", "stromschlag", "electric", "beruehr", "spannungsfuehr", "koerperdurchstroemung"}, {"brand", "feuer", "fire", "kabelbrand", "kurzschluss", "ueberlast", "ueberstrom"}, {"verbrenn", "burn", "heiss", "thermisch", "lichtbogen"}, {"laerm", "noise", "gehoer", "schall", "dezibel"}, {"vibration", "schwing"}, {"ergonom", "haltung", "handhabung", "bedien", "bewegungsapparat"}, {"kuehlschmierstoff", "kss", "aerosol", "coolant"}, {"pneumat", "druckluft", "compressed"}, {"hydraul", "druck", "pressure"}, {"roboter", "robot", "roboterarm"}, {"greifer", "gripper", "schunk"}, {"foerderband", "transport", "conveyor"}, {"schutzzaun", "schutzgitter", "fence", "guard"}, {"werkzeugmaschine", "robodrill", "bearbeitungszentrum", "wzm"}, {"stolper", "rutsch", "slip", "trip"}, {"leckage", "austreten", "leak"}, {"einstich", "puncture", "spritz"}, {"isolat", "kriechstrom", "schutzleiter", "erdung", "indirekt"}, {"luft", "kriechstreck", "beruehrer", "oberflaeche", "leitfaehig"}, {"emv", "strahlung", "radiation", "elektromagnet", "stoereinfluss"}, } // CompareBenchmark runs the full comparison between Ground Truth and engine output. func CompareBenchmark(gt *GroundTruth, hazards []Hazard, mitigations []Mitigation) *BenchmarkResult { if gt == nil || len(gt.Entries) == 0 { return &BenchmarkResult{} } // Build mitigation names per hazard mitNamesByHazard := make(map[string][]string) for _, m := range mitigations { mitNamesByHazard[m.HazardID.String()] = append(mitNamesByHazard[m.HazardID.String()], m.Name) } engineSummaries := make([]HazardSummary, len(hazards)) for i, h := range hazards { engineSummaries[i] = HazardSummary{ ID: h.ID.String(), Name: h.Name, Category: h.Category, Zone: h.HazardousZone, Description: h.Description, Scenario: h.Scenario, PossibleHarm: h.PossibleHarm, TriggerEvent: h.TriggerEvent, Mitigations: mitNamesByHazard[h.ID.String()], } } // Build score matrix: gt[i] × engine[j] type scoredPair struct { gtIdx, engIdx int score float64 reason string } var pairs []scoredPair for i := range gt.Entries { for j := range hazards { score, reason := fuzzyMatchScore(>.Entries[i], &hazards[j]) if score >= matchThreshold { pairs = append(pairs, scoredPair{i, j, score, reason}) } } } // Greedy best-first 1:1 assignment sort.Slice(pairs, func(a, b int) bool { return pairs[a].score > pairs[b].score }) usedGT := make(map[int]bool) usedEng := make(map[int]bool) var matched []HazardMatchPair for _, p := range pairs { if usedGT[p.gtIdx] || usedEng[p.engIdx] { continue } usedGT[p.gtIdx] = true usedEng[p.engIdx] = true matched = append(matched, HazardMatchPair{ GTEntry: gt.Entries[p.gtIdx], EngineHazard: engineSummaries[p.engIdx], MatchScore: p.score, MatchReason: p.reason, }) } // Collect unmatched var missing []GroundTruthEntry for i, e := range gt.Entries { if !usedGT[i] { missing = append(missing, e) } } var extra []HazardSummary for i, s := range engineSummaries { if !usedEng[i] { extra = append(extra, s) } } // Category breakdown catGT := map[string]int{} catMatch := map[string]int{} for _, e := range gt.Entries { cat := normalizeCategoryDE(e.HazardGroup) catGT[cat]++ } for _, m := range matched { cat := normalizeCategoryDE(m.GTEntry.HazardGroup) catMatch[cat]++ } var breakdown []CategoryScore for cat, total := range catGT { cov := 0.0 if total > 0 { cov = float64(catMatch[cat]) / float64(total) } breakdown = append(breakdown, CategoryScore{ Category: cat, GTCount: total, MatchCount: catMatch[cat], Coverage: cov, }) } sort.Slice(breakdown, func(i, j int) bool { return breakdown[i].GTCount > breakdown[j].GTCount }) // Measure coverage (simplified: count GT entries where at least 1 measure keyword matches) measMatched := 0 for _, m := range matched { if measureOverlap(m.GTEntry.Measures, mitigations) { measMatched++ } } measCov := 0.0 if len(matched) > 0 { measCov = float64(measMatched) / float64(len(matched)) } // Risk rank comparison rankPairs := buildRiskRankPairs(matched) coverage := 0.0 if len(gt.Entries) > 0 { coverage = float64(len(matched)) / float64(len(gt.Entries)) } return &BenchmarkResult{ CoverageScore: coverage, MeasureCoverage: measCov, TotalGT: len(gt.Entries), TotalEngine: len(hazards), MatchedPairs: matched, MissingFromEngine: missing, ExtraInEngine: extra, CategoryBreakdown: breakdown, RiskRankPairs: rankPairs, } } // fuzzyMatchScore computes a 0-1 similarity between a GT entry and an engine hazard. func fuzzyMatchScore(gt *GroundTruthEntry, h *Hazard) (float64, string) { var score float64 var reasons []string // 1. Category match (weight 0.3) catScore := categoryMatchScore(gt.HazardGroup, h.Category) score += 0.3 * catScore if catScore > 0 { reasons = append(reasons, "Kategorie") } // 2. Keyword/synonym match on hazard TYPE (weight 0.3) kwScore := keywordMatchScore(gt.HazardType, gt.HazardCause, h.Name, h.Description, h.Scenario) score += 0.3 * kwScore if kwScore > 0 { reasons = append(reasons, "Keywords") } // 3. Component/zone match (weight 0.4 — most important for specificity) zoneScore := zoneMatchScore(gt.ComponentZone, gt.HazardSubgroup, h.HazardousZone, h.MachineModule) score += 0.4 * zoneScore if zoneScore > 0 { reasons = append(reasons, "Zone") } // Penalty: if engine hazard mentions a machine-specific term not in the GT context, // it's likely a wrong-machine match (e.g. "Spielplatz" for a robot cell GT entry) if hasWrongMachineTerm(h.Name, h.Scenario, gt.HazardCause, gt.ComponentZone) { score *= 0.3 // Heavy penalty reasons = append(reasons, "Strafabzug:FremdMaschine") } return score, strings.Join(reasons, "+") } // wrongMachineTerms are words in an engine hazard that indicate it's about // a completely different machine type. If the GT entry doesn't mention these, // the match is penalized. var wrongMachineTerms = []string{ "spielplatz", "fahrtreppe", "trommelwaschmaschine", "umreifungsband", "drehteller", "rundtaktanlage", "exzentrisch", "webstuhl", "aufzug", "rolltreppe", "bagger", "kettensaege", "kreissaege", "druckmaschine", "zentrifuge", "autoklav", "hobel", "naehmaschine", "strickmaschine", "schleifmaschine", "gabelstapler", "flurfoerder", "erntemaschine", "kollision zweier roboter", } func hasWrongMachineTerm(engName, engScenario, gtCause, gtZone string) bool { engText := normalizeDE(engName + " " + engScenario) gtText := normalizeDE(gtCause + " " + gtZone) for _, term := range wrongMachineTerms { if strings.Contains(engText, term) && !strings.Contains(gtText, term) { return true } } return false } func categoryMatchScore(gtGroup, engCategory string) float64 { normalized := normalizeDE(gtGroup) prefixes, ok := categoryMap[normalized] if !ok { return 0 } engLower := strings.ToLower(engCategory) for _, p := range prefixes { if strings.Contains(engLower, p) { return 1.0 } } return 0 } func keywordMatchScore(gtType, gtCause, engName, engDesc, engScenario string) float64 { gtText := normalizeDE(gtType + " " + gtCause) engText := normalizeDE(engName + " " + engDesc + " " + engScenario) matchedSets := 0 totalRelevant := 0 for _, synSet := range synonymSets { gtHas := false engHas := false for _, syn := range synSet { if strings.Contains(gtText, syn) { gtHas = true } if strings.Contains(engText, syn) { engHas = true } } if gtHas { totalRelevant++ if engHas { matchedSets++ } } } if totalRelevant == 0 { return 0 } return float64(matchedSets) / float64(totalRelevant) } func zoneMatchScore(gtZone, gtSubgroup, engZone, engModule string) float64 { gtText := normalizeDE(gtZone + " " + gtSubgroup) engText := normalizeDE(engZone + " " + engModule) if gtText == "" || engText == "" { return 0 } // Check for significant word overlap gtWords := extractSignificantWords(gtText) engWords := extractSignificantWords(engText) if len(gtWords) == 0 { return 0 } matched := 0 for _, gw := range gtWords { for _, ew := range engWords { if strings.Contains(ew, gw) || strings.Contains(gw, ew) { matched++ break } } } return float64(matched) / float64(len(gtWords)) } func extractSignificantWords(text string) []string { stopWords := map[string]bool{ "der": true, "die": true, "das": true, "und": true, "oder": true, "von": true, "in": true, "an": true, "am": true, "im": true, "zu": true, "bei": true, "mit": true, "des": true, "den": true, "dem": true, "ein": true, "eine": true, "einer": true, "einem": true, "fuer": true, "auf": true, "aus": true, "um": true, "nach": true, "ueber": true, "unter": true, "vor": true, "durch": true, } words := strings.Fields(text) var sig []string for _, w := range words { if len(w) < 3 || stopWords[w] { continue } sig = append(sig, w) } return sig } // NormalizeDEPublic is the exported version of normalizeDE for use outside this package. func NormalizeDEPublic(s string) string { return normalizeDE(s) } // normalizeDE lowercases and replaces umlauts (same as narrative_parser). func normalizeDE(s string) string { s = strings.ToLower(strings.TrimSpace(s)) s = strings.ReplaceAll(s, "ä", "ae") s = strings.ReplaceAll(s, "ö", "oe") s = strings.ReplaceAll(s, "ü", "ue") s = strings.ReplaceAll(s, "ß", "ss") return s } func normalizeCategoryDE(group string) string { n := normalizeDE(group) // Shorten for display n = strings.TrimPrefix(n, "gefaehrdungen durch ") n = strings.TrimPrefix(n, "gefaehrdungen im zusammenhang mit ") return n } func measureOverlap(gtMeasures []string, mitigations []Mitigation) bool { for _, gm := range gtMeasures { gmNorm := normalizeDE(gm) for _, m := range mitigations { mNorm := normalizeDE(m.Name + " " + m.Description) // Check if any significant word from GT measure appears in engine mitigation words := extractSignificantWords(gmNorm) for _, w := range words { if strings.Contains(mNorm, w) { return true } } } } return false } func buildRiskRankPairs(matched []HazardMatchPair) []RiskRankPair { if len(matched) == 0 { return nil } // Sort by GT risk descending to get GT rank type ranked struct { idx int gtRisk int name string } items := make([]ranked, len(matched)) for i, m := range matched { items[i] = ranked{i, m.GTEntry.RiskIn.R, m.GTEntry.HazardType} } sort.Slice(items, func(a, b int) bool { return items[a].gtRisk > items[b].gtRisk }) pairs := make([]RiskRankPair, len(items)) for rank, item := range items { pairs[rank] = RiskRankPair{ GTRank: rank + 1, EngineRank: 0, // Engine has no assessment yet for auto-generated hazards HazardName: item.name, GTRiskScore: item.gtRisk, EngineRisk: 0, } } return pairs }