test(ai-sdk): GT #3 (commercial dishwasher) + fix Drehtisch keyword mislabel

Add ground_truth_warewashing.json + TestWarewashing_GTCoverage. The test runs the UC-M narrative through the SAME chain as production (ParseNarrative -> engine -> relevance + cyber filter), so keyword/gating fixes are measured on the real hazard set, and false positives show up as "extra". Class A (generic keyword hygiene): spuelarm/spuelfeld no longer map to library component C004 ("Drehtisch" / rotary table) — that mislabelled the spray arm. Keep the rotating_part tag. Removes the bogus "Drehtisch" hazard. GT #3 baseline -> after Class A: recall 80% (unchanged), one false positive (Drehtisch) removed. Kistenhub 97.1% and Bremse pinned mappings unchanged. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-24 21:51:26 +02:00
parent 79ad95e244
commit cf86dc241b
3 changed files with 384 additions and 1 deletions
@@ -0,0 +1,146 @@
+package iace
+
+import (
+	"encoding/json"
+	"os"
+	"path/filepath"
+	"sort"
+	"testing"
+)
+
+// GT #3 — commercial UNDERCOUNTER dishwasher (Winterhalter UC-M). The ground
+// truth is self-assessed: a dishwasher is familiar enough that we can judge its
+// hazards ourselves. The test runs the narrative through the SAME chain as
+// production (ParseNarrative -> engine -> relevance filter + cyber-skip), so
+// keyword/gating fixes are measured on the hazard set the user actually sees —
+// not on the raw pattern flood.
+
+// warewashingNarrative is the condensed UC-M limits_form narrative used as the
+// single input of this test: it is handed to ParseNarrative and again to
+// IsPatternRelevant. The text is German with umlauts transliterated (ue/ae/oe).
+//
+// It deliberately includes "Cool-Ausfuehrung" and "Filter" so that the known
+// false components (Kuehlaggregat, Absauganlage) are reproduced and stay
+// visible in the baseline. Any change to the wording can change the detected
+// components and therefore the measured recall/precision.
+const warewashingNarrative = `Gewerbliche Untertisch-Geschirrspuelmaschine fuer Gastronomie-Kueche, ` +
+	`vernetzt ueber LAN und WLAN (Connected Wash Internetportal). Heisswasser-Boiler mit ` +
+	`Nachspueltemperatur ca. 85 Grad C, Tank mit Hygiene-Tankheizkoerper. Spuelpumpe 150-200 l/min ` +
+	`mit rotierenden Spuelfeldern und Spuelarmen, Ablaufpumpe. Eingebautes Dosiergeraet fuer Reiniger ` +
+	`und Klarspueler (aetzende Konzentrate). 4-fach-Laugenfiltration mit Filter. Doppelwandige Tuer ` +
+	`mit Sicherheitsschalter und Rastposition (Thermostopp). Elektromotor (Drehstrom) 400 V. ` +
+	`Touch-Steuerung (SPS) mit Bedienfeld und HMI, USB-Schnittstelle fuer Softwareupdates, ` +
+	`PIN-geschuetzter Servicetechniker-Fernzugriff. Cool-Ausfuehrung mit kalter Nachspuelung. ` +
+	`Untertischmontage. Eingreifen in die Spuelkammer moeglich. Aerosole und Daempfe der ` +
+	`Reinigungschemie gelangen in die Atemzone. Manuelles Be- und Entladen der Spuelkoerbe von Hand. ` +
+	`Reinigung und Wartung durch Servicetechniker. Branche Lebensmittel und Getraenke.`
+
+// warewashingCyberCategories is the set of hazard categories treated as native
+// cyber/AI hazards by this test. It is meant to mirror
+// handlers.nativeCyberSecurityCategories: such hazards are routed to the CRA
+// module rather than the CE hazard log, so patterns carrying only these
+// categories are skipped.
+var warewashingCyberCategories = map[string]bool{
+	// Cyber security.
+	"unauthorized_access":   true,
+	"firmware_corruption":   true,
+	"cyber_resilience":      true,
+	"logging_audit_failure": true,
+	"cyber_network":         true,
+	"sensor_spoofing":       true,
+
+	// AI-specific.
+	"ai_specific":          true,
+	"ai_misclassification": true,
+	"false_classification": true,
+	"model_drift":          true,
+	"data_poisoning":       true,
+	"unintended_bias":      true,
+}
+
+// warewashingEngineOutput runs warewashingNarrative through the chain
+// ParseNarrative -> PatternEngine.Match -> relevance filter + cyber-skip, which
+// is intended to mirror what production does for the UC-M.
+//
+// It returns the hazards and mitigations derived from the patterns that
+// survive filtering, plus the number of surviving patterns. A pattern is
+// dropped when IsPatternRelevant rejects it, or when it has at least one hazard
+// category and every one of them is in warewashingCyberCategories.
+func warewashingEngineOutput() ([]Hazard, []Mitigation, int) {
+	res := ParseNarrative(warewashingNarrative, "Gewerbliche Untertisch-Geschirrspuelmaschine (vernetzt)")
+
+	// Negated components do not feed the engine or the relevance filter.
+	var compIDs, compNames []string
+	for _, c := range res.Components {
+		if c.Negated {
+			continue
+		}
+		compIDs = append(compIDs, c.LibraryID)
+		compNames = append(compNames, c.NameDE)
+	}
+	var energyIDs []string
+	for _, e := range res.EnergySources {
+		energyIDs = append(energyIDs, e.SourceID)
+	}
+
+	// Copy before adding the defaults: appending directly onto a slice owned by
+	// res could write into spare capacity of its backing array.
+	lifecycles := append([]string{}, res.LifecyclePhases...)
+	lifecycles = append(lifecycles, "normal_operation", "maintenance", "cleaning", "setup", "fault_clearing")
+	opStates := append([]string{}, res.OperationalStates...)
+	opStates = append(opStates, "normal_operation", "cleaning", "maintenance")
+
+	input := MatchInput{
+		ComponentLibraryIDs: compIDs,
+		EnergySourceIDs:     energyIDs,
+		LifecyclePhases:     lifecycles,
+		CustomTags:          res.CustomTags,
+		OperationalStates:   opStates,
+		HumanRoles:          res.Roles,
+		MachineTypes:        []string{"food_processing", "Gewerbliche Untertisch-Geschirrspuelmaschine (vernetzt)"},
+	}
+
+	out := NewPatternEngine().Match(input)
+
+	var kept []PatternMatch
+	for _, pm := range out.MatchedPatterns {
+		if !IsPatternRelevant(pm, warewashingNarrative, compNames) {
+			continue
+		}
+		// A pattern without categories is never considered cyber-only.
+		allCyber := len(pm.HazardCats) > 0
+		for _, c := range pm.HazardCats {
+			if !warewashingCyberCategories[c] {
+				allCyber = false
+				break
+			}
+		}
+		if allCyber {
+			continue
+		}
+		kept = append(kept, pm)
+	}
+
+	// Work on a shallow copy so the engine's own result keeps its full,
+	// unfiltered pattern list.
+	filtered := *out
+	filtered.MatchedPatterns = kept
+	hazards, mitigations := patternsToHazardsAndMitigations(&filtered)
+	return hazards, mitigations, len(kept)
+}
+
+// TestWarewashing_GTCoverage compares the filtered engine output for the UC-M
+// narrative against testdata/ground_truth_warewashing.json. It logs recall,
+// precision, every missing ground-truth entry and every extra engine hazard,
+// and fails only when recall drops below a loose 40% smoke floor.
+func TestWarewashing_GTCoverage(t *testing.T) {
+	// Loose smoke floor for the baseline — fixes should push recall up, not down.
+	const recallFloor = 0.4
+
+	data, err := os.ReadFile(filepath.Join("testdata", "ground_truth_warewashing.json"))
+	if err != nil {
+		t.Fatalf("read GT: %v", err)
+	}
+	var truth GroundTruth
+	if err = json.Unmarshal(data, &truth); err != nil {
+		t.Fatalf("parse GT: %v", err)
+	}
+
+	hazards, mitigations, keptCount := warewashingEngineOutput()
+	t.Logf("Engine: %d patterns kept (relevance+cyber filter) -> %d hazards", keptCount, len(hazards))
+
+	bench := CompareBenchmark(&truth, hazards, mitigations)
+	matched := len(bench.MatchedPairs)
+
+	// Precision is matched pairs over all engine hazards; it stays 0 when the
+	// engine produced nothing.
+	var precision float64
+	if bench.TotalEngine > 0 {
+		precision = float64(matched) / float64(bench.TotalEngine)
+	}
+
+	t.Logf("=== Warewashing-GT (GT #3) Baseline ===")
+	t.Logf("Recall (Coverage): %.1f%% (%d/%d matched, %d missing)",
+		bench.CoverageScore*100, matched, bench.TotalGT, len(bench.MissingFromEngine))
+	t.Logf("Precision:         %.1f%% (%d engine hazards, %d extra)",
+		precision*100, bench.TotalEngine, len(bench.ExtraInEngine))
+
+	if len(bench.MissingFromEngine) > 0 {
+		t.Logf("--- MISSING (recall gaps) ---")
+		for _, gap := range bench.MissingFromEngine {
+			t.Logf("  MISS %s: %s", gap.Nr, abbrev(gap.HazardType, 60))
+		}
+	}
+
+	if len(bench.ExtraInEngine) > 0 {
+		t.Logf("--- EXTRA (false positives / precision loss) ---")
+		labels := make([]string, 0, len(bench.ExtraInEngine))
+		for _, extra := range bench.ExtraInEngine {
+			// Fall back to the scenario text when the hazard has no name.
+			title := extra.Name
+			if title == "" {
+				title = extra.Scenario
+			}
+			labels = append(labels, "["+extra.Category+"] "+title)
+		}
+		// Sorted so the log is stable from run to run.
+		sort.Strings(labels)
+		for _, label := range labels {
+			t.Logf("  EXTRA %s", abbrev(label, 85))
+		}
+	}
+
+	if bench.CoverageScore < recallFloor {
+		t.Errorf("warewashing recall below 40%% floor: %.1f%%", bench.CoverageScore*100)
+	}
+}