From b82853a95b08d49b3530d0fe1b70c4ca4df55447 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Fri, 15 May 2026 09:58:12 +0200 Subject: [PATCH] feat(iace): scenario-based matching + split benchmark_synonyms.go MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 4-signal matcher: category (0.2), keywords (0.2), zone (0.3), scenario similarity (0.3). Scenario signal extracts action words (eingeklemmt vs herabfallend vs durchschlaegt) to differentiate similar-looking hazards at the same component. Split benchmark_synonyms.go (70 lines) from benchmark_matcher.go (516→450 lines) to stay under 500-line cap. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../internal/iace/benchmark_matcher.go | 165 +++++++++--------- .../internal/iace/benchmark_synonyms.go | 68 ++++++++ 2 files changed, 155 insertions(+), 78 deletions(-) create mode 100644 ai-compliance-sdk/internal/iace/benchmark_synonyms.go diff --git a/ai-compliance-sdk/internal/iace/benchmark_matcher.go b/ai-compliance-sdk/internal/iace/benchmark_matcher.go index 54ad39a0..55783df7 100644 --- a/ai-compliance-sdk/internal/iace/benchmark_matcher.go +++ b/ai-compliance-sdk/internal/iace/benchmark_matcher.go @@ -11,60 +11,7 @@ import ( const matchThreshold = 0.20 -// categoryMap maps GT hazard_group (German) to engine category prefixes. -var categoryMap = map[string][]string{ - "mechanische gefaehrdungen": {"mechanical"}, - "elektrische gefaehrdungen": {"electrical"}, - "thermische gefaehrdungen": {"thermal"}, - "gefaehrdungen durch laerm": {"noise", "ergonomic"}, - "gefaehrdungen durch vibration": {"noise", "vibration"}, - "gefaehrdungen durch strahlung": {"radiation", "emc"}, - "gefaehrdungen durch materialien und substanzen": {"material", "environmental"}, - "ergonomische gefaehrdungen": {"ergonomic"}, - "gefaehrdungen im zusammenhang mit der einsatzumgebung": {"environmental"}, -} - -// synonymSets groups equivalent hazard terms for keyword matching. -var synonymSets = [][]string{ - {"quetsch", "crush", "einklemm", "klemm"}, - {"scher", "shear", "absch"}, - {"schneid", "cut", "schnitt"}, - {"stoss", "schlag", "impact", "treff", "aufprall"}, - {"einzug", "fang", "erfass", "entangle", "wickel"}, - {"elektrisch", "stromschlag", "electric", "beruehr", "spannungsfuehr", "koerperdurchstroemung"}, - {"brand", "feuer", "fire", "kabelbrand", "kurzschluss", "ueberlast", "ueberstrom"}, - {"verbrenn", "burn", "heiss", "thermisch", "lichtbogen"}, - {"laerm", "noise", "gehoer", "schall", "dezibel"}, - {"vibration", "schwing"}, - {"ergonom", "haltung", "handhabung", "bedien", "bewegungsapparat"}, - {"kuehlschmierstoff", "kss", "aerosol", "coolant"}, - {"pneumat", "druckluft", "compressed"}, - {"hydraul", "druck", "pressure"}, - {"roboter", "robot", "roboterarm"}, - {"greifer", "gripper", "schunk"}, - {"foerderband", "transport", "conveyor"}, - {"schutzzaun", "schutzgitter", "fence", "guard"}, - {"werkzeugmaschine", "robodrill", "bearbeitungszentrum", "wzm"}, - {"stolper", "rutsch", "slip", "trip"}, - {"leckage", "austreten", "leak"}, - {"einstich", "puncture", "spritz"}, - {"isolat", "kriechstrom", "schutzleiter", "erdung", "indirekt"}, - {"luft", "kriechstreck", "beruehrer", "oberflaeche", "leitfaehig"}, - {"emv", "strahlung", "radiation", "elektromagnet", "stoereinfluss"}, - {"eingeschlossen", "eingesperrt", "wiederanlauf", "quittier"}, - {"zentriergreifer", "zentriereinheit", "zentrieren"}, - {"beladetuer", "schutztuer", "zugangstuer", "tuerposition"}, - {"werkstueck", "rohteil", "rohling"}, - {"ergonom", "einlege", "bedienelemente", "arbeitshoehe", "haltung"}, - {"boden", "tragfaehig", "einbrech", "fundamentierr"}, - {"spritzer", "auge", "augenverletz"}, - {"bersten", "platzen", "abspring"}, - {"durchschlag", "durchbrech", "begrenz", "bewegungsbereich"}, - {"potentialausgleich", "potentialunter", "bezugspotential"}, - {"kriechstreck", "luft-", "kriechst", "dimensionie"}, - {"kuehlschmierstoff", "kss", "bettspuel", "kuehlung"}, - {"rutsch", "ausrutsch", "stolper", "gleiten", "nassrutsch"}, -} +// categoryMap, synonymSets, wrongMachineTerms → benchmark_synonyms.go // CompareBenchmark runs the full comparison between Ground Truth and engine output. func CompareBenchmark(gt *GroundTruth, hazards []Hazard, mitigations []Mitigation) *BenchmarkResult { @@ -211,59 +158,121 @@ func CompareBenchmark(gt *GroundTruth, hazards []Hazard, mitigations []Mitigatio } // fuzzyMatchScore computes a 0-1 similarity between a GT entry and an engine hazard. +// 4 signals: category (0.2), keywords (0.2), zone (0.3), scenario similarity (0.3). func fuzzyMatchScore(gt *GroundTruthEntry, h *Hazard) (float64, string) { var score float64 var reasons []string - // 1. Category match (weight 0.3) + // 1. Category match (weight 0.2) catScore := categoryMatchScore(gt.HazardGroup, h.Category) - score += 0.3 * catScore + score += 0.2 * catScore if catScore > 0 { reasons = append(reasons, "Kategorie") } - // 2. Keyword/synonym match on hazard TYPE (weight 0.3) + // 2. Keyword/synonym match on hazard TYPE (weight 0.2) kwScore := keywordMatchScore(gt.HazardType, gt.HazardCause, h.Name, h.Description, h.Scenario) - score += 0.3 * kwScore + score += 0.2 * kwScore if kwScore > 0 { reasons = append(reasons, "Keywords") } - // 3. Component/zone match (weight 0.4 — most important for specificity) + // 3. Component/zone match (weight 0.3) zoneScore := zoneMatchScore(gt.ComponentZone, gt.HazardSubgroup, h.HazardousZone, h.MachineModule) - score += 0.4 * zoneScore + score += 0.3 * zoneScore if zoneScore > 0 { reasons = append(reasons, "Zone") } - // Penalty: if engine hazard mentions a machine-specific term not in the GT context, - // it's likely a wrong-machine match (e.g. "Spielplatz" for a robot cell GT entry) + // 4. Scenario similarity (weight 0.3) — compares the actual event description + scenScore := scenarioSimilarity(gt.HazardCause, h.Scenario, h.Name) + score += 0.3 * scenScore + if scenScore > 0 { + reasons = append(reasons, "Szenario") + } + + // Penalty: wrong machine term if hasWrongMachineTerm(h.Name, h.Scenario, gt.HazardCause, gt.ComponentZone) { score *= 0.3 reasons = append(reasons, "Strafabzug:FremdMaschine") } - // Minimum keyword overlap required: if GT and Engine share no hazard-type - // keywords at all, the match is unreliable regardless of category/zone score - if kwScore == 0 && zoneScore < 0.5 { - score *= 0.5 - reasons = append(reasons, "Strafabzug:KeineKeywords") + // Penalty: no keyword AND no scenario overlap → unreliable + if kwScore == 0 && scenScore == 0 && zoneScore < 0.5 { + score *= 0.4 + reasons = append(reasons, "Strafabzug:KeinInhalt") } return score, strings.Join(reasons, "+") } -// wrongMachineTerms are words in an engine hazard that indicate it's about -// a completely different machine type. If the GT entry doesn't mention these, -// the match is penalized. -var wrongMachineTerms = []string{ - "spielplatz", "fahrtreppe", "trommelwaschmaschine", "umreifungsband", - "drehteller", "rundtaktanlage", "exzentrisch", "webstuhl", - "aufzug", "rolltreppe", "bagger", "kettensaege", "kreissaege", - "druckmaschine", "zentrifuge", "autoklav", "hobel", - "naehmaschine", "strickmaschine", "schleifmaschine", - "gabelstapler", "flurfoerder", "erntemaschine", - "kollision zweier roboter", +// scenarioSimilarity compares the GT cause description with the engine scenario. +// It extracts "action words" (verbs/descriptors that define WHAT happens) and +// checks overlap. This differentiates "eingeklemmt" from "herabfallend" from "durchschlägt". +func scenarioSimilarity(gtCause, engScenario, engName string) float64 { + gtText := normalizeDE(gtCause) + engText := normalizeDE(engScenario + " " + engName) + + // Extract action/event words that describe the specific scenario + gtActions := extractActionWords(gtText) + engActions := extractActionWords(engText) + + if len(gtActions) == 0 { + return 0 + } + + matched := 0 + for _, ga := range gtActions { + for _, ea := range engActions { + if ga == ea || strings.HasPrefix(ea, ga) || strings.HasPrefix(ga, ea) { + matched++ + break + } + } + } + return float64(matched) / float64(len(gtActions)) +} + +// extractActionWords pulls out verbs and descriptors that define the hazard event. +func extractActionWords(text string) []string { + // These are the differentiating words between similar-looking hazards + actionTerms := []string{ + "eingeklemmt", "einklemm", "eingeschlossen", "eingesperrt", + "herabfall", "herunterfal", "faellt", + "durchschlaegt", "durchbrech", "durchschlag", + "springt ab", "abspring", "bersten", "platzen", + "weggeschleudert", "schleuder", + "getroffen", "treff", + "greift", "eingreif", "durchgreif", "uebergreif", + "beruehrt", "beruehr", "kontakt", + "einzug", "erfass", "aufwickel", + "stolper", "rutsch", "ausrutsch", "gleiten", + "verbren", "heiss", + "spritzer", "augenver", + "kurzschluss", "ueberstrom", "ueberlast", + "isolat", "schutzleiter", "kriechstrom", "kriechstreck", + "potentialausgleich", "potentialunter", + "emv", "stoereinfluss", "elektromagnet", + "leckage", "austret", "undicht", + "schutzzaun", "einhausung", "schutztuer", + "wiederanlauf", "anlauf", "startet", + "teach", "einricht", "programmier", + "spannvorricht", "spannfutter", "greiferbacken", + "druckluft", "pneumatik", "restdruck", + "beladetuer", "werkzeugmaschine", "bearbeitungszelle", + "ergonom", "einlege", "bedienelement", + "tragfaehig", "boden", "einbrech", + } + + var found []string + seen := make(map[string]bool) + for _, term := range actionTerms { + if strings.Contains(text, term) && !seen[term] { + seen[term] = true + found = append(found, term) + } + } + return found } func hasWrongMachineTerm(engName, engScenario, gtCause, gtZone string) bool { diff --git a/ai-compliance-sdk/internal/iace/benchmark_synonyms.go b/ai-compliance-sdk/internal/iace/benchmark_synonyms.go new file mode 100644 index 00000000..d7402f02 --- /dev/null +++ b/ai-compliance-sdk/internal/iace/benchmark_synonyms.go @@ -0,0 +1,68 @@ +package iace + +// synonymSets groups equivalent hazard terms for keyword matching. +var synonymSets = [][]string{ + {"quetsch", "crush", "einklemm", "klemm"}, + {"scher", "shear", "absch"}, + {"schneid", "cut", "schnitt"}, + {"stoss", "schlag", "impact", "treff", "aufprall"}, + {"einzug", "fang", "erfass", "entangle", "wickel"}, + {"elektrisch", "stromschlag", "electric", "beruehr", "spannungsfuehr", "koerperdurchstroemung"}, + {"brand", "feuer", "fire", "kabelbrand", "kurzschluss", "ueberlast", "ueberstrom"}, + {"verbrenn", "burn", "heiss", "thermisch", "lichtbogen"}, + {"laerm", "noise", "gehoer", "schall", "dezibel"}, + {"vibration", "schwing"}, + {"ergonom", "haltung", "handhabung", "bedien", "bewegungsapparat"}, + {"kuehlschmierstoff", "kss", "aerosol", "coolant"}, + {"pneumat", "druckluft", "compressed"}, + {"hydraul", "druck", "pressure"}, + {"roboter", "robot", "roboterarm"}, + {"greifer", "gripper", "schunk"}, + {"foerderband", "transport", "conveyor"}, + {"schutzzaun", "schutzgitter", "fence", "guard"}, + {"werkzeugmaschine", "robodrill", "bearbeitungszentrum", "wzm"}, + {"stolper", "rutsch", "slip", "trip"}, + {"leckage", "austreten", "leak"}, + {"einstich", "puncture", "spritz"}, + {"isolat", "kriechstrom", "schutzleiter", "erdung", "indirekt"}, + {"luft", "kriechstreck", "beruehrer", "oberflaeche", "leitfaehig"}, + {"emv", "strahlung", "radiation", "elektromagnet", "stoereinfluss"}, + {"eingeschlossen", "eingesperrt", "wiederanlauf", "quittier"}, + {"zentriergreifer", "zentriereinheit", "zentrieren"}, + {"beladetuer", "schutztuer", "zugangstuer", "tuerposition"}, + {"werkstueck", "rohteil", "rohling"}, + {"ergonom", "einlege", "bedienelemente", "arbeitshoehe", "haltung"}, + {"boden", "tragfaehig", "einbrech", "fundamentierr"}, + {"spritzer", "auge", "augenverletz"}, + {"bersten", "platzen", "abspring"}, + {"durchschlag", "durchbrech", "begrenz", "bewegungsbereich"}, + {"potentialausgleich", "potentialunter", "bezugspotential"}, + {"kriechstreck", "luft-", "kriechst", "dimensionie"}, + {"kuehlschmierstoff", "kss", "bettspuel", "kuehlung"}, + {"rutsch", "ausrutsch", "stolper", "gleiten", "nassrutsch"}, +} + +// wrongMachineTerms are words in an engine hazard that indicate it's about +// a completely different machine type. +var wrongMachineTerms = []string{ + "spielplatz", "fahrtreppe", "trommelwaschmaschine", "umreifungsband", + "drehteller", "rundtaktanlage", "exzentrisch", "webstuhl", + "aufzug", "rolltreppe", "bagger", "kettensaege", "kreissaege", + "druckmaschine", "zentrifuge", "autoklav", "hobel", + "naehmaschine", "strickmaschine", "schleifmaschine", + "gabelstapler", "flurfoerder", "erntemaschine", + "kollision zweier roboter", +} + +// categoryMap maps GT hazard_group (German) to engine category prefixes. +var categoryMap = map[string][]string{ + "mechanische gefaehrdungen": {"mechanical"}, + "elektrische gefaehrdungen": {"electrical"}, + "thermische gefaehrdungen": {"thermal"}, + "gefaehrdungen durch laerm": {"noise", "ergonomic"}, + "gefaehrdungen durch vibration": {"noise", "vibration"}, + "gefaehrdungen durch strahlung": {"radiation", "emc"}, + "gefaehrdungen durch materialien und substanzen": {"material", "environmental"}, + "ergonomische gefaehrdungen": {"ergonomic"}, + "gefaehrdungen im zusammenhang mit der einsatzumgebung": {"environmental"}, +}