feat(iace): scenario-based matching + split benchmark_synonyms.go

4-signal matcher: category (0.2), keywords (0.2), zone (0.3),
scenario similarity (0.3). Scenario signal extracts action words
(eingeklemmt vs herabfallend vs durchschlaegt) to differentiate
similar-looking hazards at the same component.

Move categoryMap, synonymSets and wrongMachineTerms out of
benchmark_matcher.go (516→450 lines) into the new benchmark_synonyms.go
(68 lines) so the matcher stays under the 500-line cap.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-15 09:58:12 +02:00
parent c060ac222a
commit b82853a95b
2 changed files with 155 additions and 78 deletions
@@ -11,60 +11,7 @@ import (
// matchThreshold is a score cutoff of 0.20.
// NOTE(review): its use is not visible in this hunk — presumably a
// fuzzyMatchScore result must reach it for a GT entry and an engine hazard to
// count as matched; confirm in CompareBenchmark.
const matchThreshold = 0.20
// categoryMap translates a ground-truth hazard_group into the engine category
// prefixes that are accepted as a category match for it.
//
// Keys are the German group names in lower case with umlauts transliterated
// (ae/oe/ue); each value lists one or more engine category prefixes.
var categoryMap = map[string][]string{
	// Groups named "<adjective> gefaehrdungen".
	"mechanische gefaehrdungen":  {"mechanical"},
	"elektrische gefaehrdungen":  {"electrical"},
	"thermische gefaehrdungen":   {"thermal"},
	"ergonomische gefaehrdungen": {"ergonomic"},

	// Groups named "gefaehrdungen durch/im ...".
	"gefaehrdungen durch laerm":                             {"noise", "ergonomic"},
	"gefaehrdungen durch vibration":                         {"noise", "vibration"},
	"gefaehrdungen durch strahlung":                         {"radiation", "emc"},
	"gefaehrdungen durch materialien und substanzen":        {"material", "environmental"},
	"gefaehrdungen im zusammenhang mit der einsatzumgebung": {"environmental"},
}
// synonymSets groups equivalent hazard terms for keyword matching.
//
// Each inner slice is one synonym group. Entries are lower-case stems with
// umlauts transliterated (ae/oe/ue), mixing German and English terms. Some
// stems deliberately or historically appear in more than one group (for
// example "ergonom", "kss", "rutsch", "kriechstreck").
// NOTE(review): the stems are presumably matched as substrings by
// keywordMatchScore (not visible here) — confirm before adding very short
// stems such as "luft" or "druck", which would also hit "druckluft".
var synonymSets = [][]string{
	{"quetsch", "crush", "einklemm", "klemm"},
	{"scher", "shear", "absch"},
	{"schneid", "cut", "schnitt"},
	{"stoss", "schlag", "impact", "treff", "aufprall"},
	{"einzug", "fang", "erfass", "entangle", "wickel"},
	{"elektrisch", "stromschlag", "electric", "beruehr", "spannungsfuehr", "koerperdurchstroemung"},
	{"brand", "feuer", "fire", "kabelbrand", "kurzschluss", "ueberlast", "ueberstrom"},
	{"verbrenn", "burn", "heiss", "thermisch", "lichtbogen"},
	{"laerm", "noise", "gehoer", "schall", "dezibel"},
	{"vibration", "schwing"},
	{"ergonom", "haltung", "handhabung", "bedien", "bewegungsapparat"},
	{"kuehlschmierstoff", "kss", "aerosol", "coolant"},
	{"pneumat", "druckluft", "compressed"},
	{"hydraul", "druck", "pressure"},
	{"roboter", "robot", "roboterarm"},
	{"greifer", "gripper", "schunk"},
	{"foerderband", "transport", "conveyor"},
	{"schutzzaun", "schutzgitter", "fence", "guard"},
	{"werkzeugmaschine", "robodrill", "bearbeitungszentrum", "wzm"},
	{"stolper", "rutsch", "slip", "trip"},
	{"leckage", "austreten", "leak"},
	{"einstich", "puncture", "spritz"},
	{"isolat", "kriechstrom", "schutzleiter", "erdung", "indirekt"},
	{"luft", "kriechstreck", "beruehrer", "oberflaeche", "leitfaehig"},
	{"emv", "strahlung", "radiation", "elektromagnet", "stoereinfluss"},
	{"eingeschlossen", "eingesperrt", "wiederanlauf", "quittier"},
	{"zentriergreifer", "zentriereinheit", "zentrieren"},
	{"beladetuer", "schutztuer", "zugangstuer", "tuerposition"},
	{"werkstueck", "rohteil", "rohling"},
	{"ergonom", "einlege", "bedienelemente", "arbeitshoehe", "haltung"},
	// "fundamentier" (single r) is the stem of "fundamentierung"/"fundamentiert";
	// the former "fundamentierr" could never occur in real text.
	{"boden", "tragfaehig", "einbrech", "fundamentier"},
	{"spritzer", "auge", "augenverletz"},
	{"bersten", "platzen", "abspring"},
	{"durchschlag", "durchbrech", "begrenz", "bewegungsbereich"},
	{"potentialausgleich", "potentialunter", "bezugspotential"},
	{"kriechstreck", "luft-", "kriechst", "dimensionie"},
	{"kuehlschmierstoff", "kss", "bettspuel", "kuehlung"},
	{"rutsch", "ausrutsch", "stolper", "gleiten", "nassrutsch"},
}
// categoryMap, synonymSets, wrongMachineTerms → benchmark_synonyms.go
// CompareBenchmark runs the full comparison between Ground Truth and engine output.
func CompareBenchmark(gt *GroundTruth, hazards []Hazard, mitigations []Mitigation) *BenchmarkResult {
@@ -211,59 +158,121 @@ func CompareBenchmark(gt *GroundTruth, hazards []Hazard, mitigations []Mitigatio
}
// fuzzyMatchScore computes a 0-1 similarity between a GT entry and an engine hazard.
// 4 signals: category (0.2), keywords (0.2), zone (0.3), scenario similarity (0.3).
func fuzzyMatchScore(gt *GroundTruthEntry, h *Hazard) (float64, string) {
var score float64
var reasons []string
// 1. Category match (weight 0.3)
// 1. Category match (weight 0.2)
catScore := categoryMatchScore(gt.HazardGroup, h.Category)
score += 0.3 * catScore
score += 0.2 * catScore
if catScore > 0 {
reasons = append(reasons, "Kategorie")
}
// 2. Keyword/synonym match on hazard TYPE (weight 0.3)
// 2. Keyword/synonym match on hazard TYPE (weight 0.2)
kwScore := keywordMatchScore(gt.HazardType, gt.HazardCause, h.Name, h.Description, h.Scenario)
score += 0.3 * kwScore
score += 0.2 * kwScore
if kwScore > 0 {
reasons = append(reasons, "Keywords")
}
// 3. Component/zone match (weight 0.4 — most important for specificity)
// 3. Component/zone match (weight 0.3)
zoneScore := zoneMatchScore(gt.ComponentZone, gt.HazardSubgroup, h.HazardousZone, h.MachineModule)
score += 0.4 * zoneScore
score += 0.3 * zoneScore
if zoneScore > 0 {
reasons = append(reasons, "Zone")
}
// Penalty: if engine hazard mentions a machine-specific term not in the GT context,
// it's likely a wrong-machine match (e.g. "Spielplatz" for a robot cell GT entry)
// 4. Scenario similarity (weight 0.3) — compares the actual event description
scenScore := scenarioSimilarity(gt.HazardCause, h.Scenario, h.Name)
score += 0.3 * scenScore
if scenScore > 0 {
reasons = append(reasons, "Szenario")
}
// Penalty: wrong machine term
if hasWrongMachineTerm(h.Name, h.Scenario, gt.HazardCause, gt.ComponentZone) {
score *= 0.3
reasons = append(reasons, "Strafabzug:FremdMaschine")
}
// Minimum keyword overlap required: if GT and Engine share no hazard-type
// keywords at all, the match is unreliable regardless of category/zone score
if kwScore == 0 && zoneScore < 0.5 {
score *= 0.5
reasons = append(reasons, "Strafabzug:KeineKeywords")
// Penalty: no keyword AND no scenario overlap → unreliable
if kwScore == 0 && scenScore == 0 && zoneScore < 0.5 {
score *= 0.4
reasons = append(reasons, "Strafabzug:KeinInhalt")
}
return score, strings.Join(reasons, "+")
}
// wrongMachineTerms are words in an engine hazard that indicate it's about
// a completely different machine type. If the GT entry doesn't mention these,
// the match is penalized.
var wrongMachineTerms = []string{
"spielplatz", "fahrtreppe", "trommelwaschmaschine", "umreifungsband",
"drehteller", "rundtaktanlage", "exzentrisch", "webstuhl",
"aufzug", "rolltreppe", "bagger", "kettensaege", "kreissaege",
"druckmaschine", "zentrifuge", "autoklav", "hobel",
"naehmaschine", "strickmaschine", "schleifmaschine",
"gabelstapler", "flurfoerder", "erntemaschine",
"kollision zweier roboter",
// scenarioSimilarity scores how well the engine's scenario text covers the
// event described by the ground-truth cause.
//
// Both sides are normalized with normalizeDE and reduced to their action words
// via extractActionWords; the engine side is the scenario and the name joined
// by a single space. A GT action word counts as covered when some engine
// action word equals it or one of the two is a prefix of the other. The result
// is the covered fraction of GT action words, so it lies in [0, 1]; it is 0
// when the GT cause yields no action words at all. This is the signal that
// separates "eingeklemmt" from "herabfallend" from "durchschlägt" for hazards
// that otherwise look alike.
func scenarioSimilarity(gtCause, engScenario, engName string) float64 {
	gtNorm := normalizeDE(gtCause)
	engNorm := normalizeDE(engScenario + " " + engName)

	wanted := extractActionWords(gtNorm)
	offered := extractActionWords(engNorm)
	if len(wanted) == 0 {
		return 0
	}

	// HasPrefix also holds for equal strings, so one symmetric prefix test
	// covers both the exact match and the stem-vs-longer-form case.
	related := func(a, b string) bool {
		return strings.HasPrefix(a, b) || strings.HasPrefix(b, a)
	}

	covered := 0
nextWanted:
	for _, w := range wanted {
		for _, o := range offered {
			if related(w, o) {
				// Count each GT word at most once, then move on to the next.
				covered++
				continue nextWanted
			}
		}
	}
	return float64(covered) / float64(len(wanted))
}
// extractActionWords returns every entry of a fixed vocabulary of hazard
// "action" stems that occurs as a substring of text.
//
// The vocabulary entries are lower-case with transliterated umlauts, so text
// should be normalized the same way (scenarioSimilarity passes normalizeDE
// output). Hits are returned in vocabulary order, each at most once; the
// result is nil when nothing matches. Overlapping stems are reported
// independently, e.g. a text containing "ausrutsch" yields both "rutsch" and
// "ausrutsch".
func extractActionWords(text string) []string {
	// The words that tell similar-looking hazards apart. Every entry is
	// unique, so a single pass cannot report the same stem twice.
	vocabulary := [...]string{
		"eingeklemmt", "einklemm", "eingeschlossen", "eingesperrt",
		"herabfall", "herunterfal", "faellt",
		"durchschlaegt", "durchbrech", "durchschlag",
		"springt ab", "abspring", "bersten", "platzen",
		"weggeschleudert", "schleuder",
		"getroffen", "treff",
		"greift", "eingreif", "durchgreif", "uebergreif",
		"beruehrt", "beruehr", "kontakt",
		"einzug", "erfass", "aufwickel",
		"stolper", "rutsch", "ausrutsch", "gleiten",
		"verbren", "heiss",
		"spritzer", "augenver",
		"kurzschluss", "ueberstrom", "ueberlast",
		"isolat", "schutzleiter", "kriechstrom", "kriechstreck",
		"potentialausgleich", "potentialunter",
		"emv", "stoereinfluss", "elektromagnet",
		"leckage", "austret", "undicht",
		"schutzzaun", "einhausung", "schutztuer",
		"wiederanlauf", "anlauf", "startet",
		"teach", "einricht", "programmier",
		"spannvorricht", "spannfutter", "greiferbacken",
		"druckluft", "pneumatik", "restdruck",
		"beladetuer", "werkzeugmaschine", "bearbeitungszelle",
		"ergonom", "einlege", "bedienelement",
		"tragfaehig", "boden", "einbrech",
	}

	var hits []string
	for i := range vocabulary {
		if stem := vocabulary[i]; strings.Contains(text, stem) {
			hits = append(hits, stem)
		}
	}
	return hits
}
func hasWrongMachineTerm(engName, engScenario, gtCause, gtZone string) bool {
@@ -0,0 +1,68 @@
package iace
// synonymSets groups equivalent hazard terms for keyword matching.
//
// Each inner slice is one synonym group. Entries are lower-case stems with
// umlauts transliterated (ae/oe/ue), mixing German and English terms. Some
// stems deliberately or historically appear in more than one group (for
// example "ergonom", "kss", "rutsch", "kriechstreck").
// NOTE(review): the stems are presumably matched as substrings by
// keywordMatchScore (not visible here) — confirm before adding very short
// stems such as "luft" or "druck", which would also hit "druckluft".
var synonymSets = [][]string{
	{"quetsch", "crush", "einklemm", "klemm"},
	{"scher", "shear", "absch"},
	{"schneid", "cut", "schnitt"},
	{"stoss", "schlag", "impact", "treff", "aufprall"},
	{"einzug", "fang", "erfass", "entangle", "wickel"},
	{"elektrisch", "stromschlag", "electric", "beruehr", "spannungsfuehr", "koerperdurchstroemung"},
	{"brand", "feuer", "fire", "kabelbrand", "kurzschluss", "ueberlast", "ueberstrom"},
	{"verbrenn", "burn", "heiss", "thermisch", "lichtbogen"},
	{"laerm", "noise", "gehoer", "schall", "dezibel"},
	{"vibration", "schwing"},
	{"ergonom", "haltung", "handhabung", "bedien", "bewegungsapparat"},
	{"kuehlschmierstoff", "kss", "aerosol", "coolant"},
	{"pneumat", "druckluft", "compressed"},
	{"hydraul", "druck", "pressure"},
	{"roboter", "robot", "roboterarm"},
	{"greifer", "gripper", "schunk"},
	{"foerderband", "transport", "conveyor"},
	{"schutzzaun", "schutzgitter", "fence", "guard"},
	{"werkzeugmaschine", "robodrill", "bearbeitungszentrum", "wzm"},
	{"stolper", "rutsch", "slip", "trip"},
	{"leckage", "austreten", "leak"},
	{"einstich", "puncture", "spritz"},
	{"isolat", "kriechstrom", "schutzleiter", "erdung", "indirekt"},
	{"luft", "kriechstreck", "beruehrer", "oberflaeche", "leitfaehig"},
	{"emv", "strahlung", "radiation", "elektromagnet", "stoereinfluss"},
	{"eingeschlossen", "eingesperrt", "wiederanlauf", "quittier"},
	{"zentriergreifer", "zentriereinheit", "zentrieren"},
	{"beladetuer", "schutztuer", "zugangstuer", "tuerposition"},
	{"werkstueck", "rohteil", "rohling"},
	{"ergonom", "einlege", "bedienelemente", "arbeitshoehe", "haltung"},
	// "fundamentier" (single r) is the stem of "fundamentierung"/"fundamentiert";
	// the former "fundamentierr" could never occur in real text.
	{"boden", "tragfaehig", "einbrech", "fundamentier"},
	{"spritzer", "auge", "augenverletz"},
	{"bersten", "platzen", "abspring"},
	{"durchschlag", "durchbrech", "begrenz", "bewegungsbereich"},
	{"potentialausgleich", "potentialunter", "bezugspotential"},
	{"kriechstreck", "luft-", "kriechst", "dimensionie"},
	{"kuehlschmierstoff", "kss", "bettspuel", "kuehlung"},
	{"rutsch", "ausrutsch", "stolper", "gleiten", "nassrutsch"},
}
// wrongMachineTerms lists lower-case terms whose presence in an engine hazard
// signals that the hazard belongs to a completely different machine type.
// NOTE(review): presumably consulted by hasWrongMachineTerm, whose body is not
// shown here — confirm how GT text exempts a term from the penalty.
var wrongMachineTerms = []string{
	"spielplatz",
	"fahrtreppe",
	"trommelwaschmaschine",
	"umreifungsband",
	"drehteller",
	"rundtaktanlage",
	"exzentrisch",
	"webstuhl",
	"aufzug",
	"rolltreppe",
	"bagger",
	"kettensaege",
	"kreissaege",
	"druckmaschine",
	"zentrifuge",
	"autoklav",
	"hobel",
	"naehmaschine",
	"strickmaschine",
	"schleifmaschine",
	"gabelstapler",
	"flurfoerder",
	"erntemaschine",
	// The only multi-word entry: a phrase rather than a machine name.
	"kollision zweier roboter",
}
// categoryMap translates a ground-truth hazard_group into the engine category
// prefixes that are accepted as a category match for it.
//
// Keys are the German group names in lower case with umlauts transliterated
// (ae/oe/ue); each value lists one or more engine category prefixes.
var categoryMap = map[string][]string{
	// Groups named "<adjective> gefaehrdungen".
	"mechanische gefaehrdungen":  {"mechanical"},
	"elektrische gefaehrdungen":  {"electrical"},
	"thermische gefaehrdungen":   {"thermal"},
	"ergonomische gefaehrdungen": {"ergonomic"},

	// Groups named "gefaehrdungen durch/im ...".
	"gefaehrdungen durch laerm":                             {"noise", "ergonomic"},
	"gefaehrdungen durch vibration":                         {"noise", "vibration"},
	"gefaehrdungen durch strahlung":                         {"radiation", "emc"},
	"gefaehrdungen durch materialien und substanzen":        {"material", "environmental"},
	"gefaehrdungen im zusammenhang mit der einsatzumgebung": {"environmental"},
}