test(ai-sdk): GT #3 (commercial dishwasher) + fix Drehtisch keyword mislabel
Add ground_truth_warewashing.json + TestWarewashing_GTCoverage. The test runs
the UC-M narrative through the SAME chain as production (ParseNarrative ->
engine -> relevance + cyber filter), so keyword/gating fixes are measured on
the real hazard set, and false positives show up as "extra".
Class A (generic keyword hygiene): spuelarm/spuelfeld no longer map to library
component C004 ("Drehtisch" / rotary table) — that mislabelled the spray arm.
Keep the rotating_part tag. Removes the bogus "Drehtisch" hazard.
GT #3 baseline -> after Class A: recall 80% (unchanged), one false positive
(Drehtisch) removed. Kistenhub 97.1% and Bremse pinned mappings unchanged.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,146 @@
|
||||
package iace
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// GT #3 — commercial UNDERCOUNTER dishwasher (Winterhalter UC-M). Self-assessed
// ground truth: we can judge what a dishwasher is. The test runs the narrative
// through the SAME chain as production (ParseNarrative -> engine -> relevance
// filter + cyber-skip), so keyword/gating fixes are measured on the hazard set
// the user actually sees — not the raw pattern flood.
|
||||
|
||||
// warewashingNarrative is the condensed UC-M limits_form narrative that the
// ground-truth test feeds into ParseNarrative. It deliberately includes
// "Cool-Ausfuehrung" and "Filter" so the known false components
// (Kuehlaggregat, Absauganlage) are reproduced and visible in the baseline.
const warewashingNarrative = `Gewerbliche Untertisch-Geschirrspuelmaschine fuer Gastronomie-Kueche, ` +
	`vernetzt ueber LAN und WLAN (Connected Wash Internetportal). Heisswasser-Boiler mit ` +
	`Nachspueltemperatur ca. 85 Grad C, Tank mit Hygiene-Tankheizkoerper. Spuelpumpe 150-200 l/min ` +
	`mit rotierenden Spuelfeldern und Spuelarmen, Ablaufpumpe. Eingebautes Dosiergeraet fuer Reiniger ` +
	`und Klarspueler (aetzende Konzentrate). 4-fach-Laugenfiltration mit Filter. Doppelwandige Tuer ` +
	`mit Sicherheitsschalter und Rastposition (Thermostopp). Elektromotor (Drehstrom) 400 V. ` +
	`Touch-Steuerung (SPS) mit Bedienfeld und HMI, USB-Schnittstelle fuer Softwareupdates, ` +
	`PIN-geschuetzter Servicetechniker-Fernzugriff. Cool-Ausfuehrung mit kalter Nachspuelung. ` +
	`Untertischmontage. Eingreifen in die Spuelkammer moeglich. Aerosole und Daempfe der ` +
	`Reinigungschemie gelangen in die Atemzone. Manuelles Be- und Entladen der Spuelkoerbe von Hand. ` +
	`Reinigung und Wartung durch Servicetechniker. Branche Lebensmittel und Getraenke.`
|
||||
|
||||
// warewashingCyberCategories mirrors handlers.nativeCyberSecurityCategories —
// native cyber/AI hazards are routed to the CRA module, not the CE hazard log.
// A pattern whose hazard categories are ALL in this set is dropped by
// warewashingEngineOutput.
var warewashingCyberCategories = map[string]bool{
	"unauthorized_access":   true,
	"firmware_corruption":   true,
	"cyber_resilience":      true,
	"logging_audit_failure": true,
	"cyber_network":         true,
	"sensor_spoofing":       true,
	"ai_specific":           true,
	"ai_misclassification":  true,
	"false_classification":  true,
	"model_drift":           true,
	"data_poisoning":        true,
	"unintended_bias":       true,
}
|
||||
|
||||
// warewashingEngineOutput runs the production chain and returns the filtered
|
||||
// hazards/mitigations the user would see for the UC-M.
|
||||
func warewashingEngineOutput() ([]Hazard, []Mitigation, int) {
|
||||
res := ParseNarrative(warewashingNarrative, "Gewerbliche Untertisch-Geschirrspuelmaschine (vernetzt)")
|
||||
|
||||
var compIDs, compNames []string
|
||||
for _, c := range res.Components {
|
||||
if c.Negated {
|
||||
continue
|
||||
}
|
||||
compIDs = append(compIDs, c.LibraryID)
|
||||
compNames = append(compNames, c.NameDE)
|
||||
}
|
||||
var energyIDs []string
|
||||
for _, e := range res.EnergySources {
|
||||
energyIDs = append(energyIDs, e.SourceID)
|
||||
}
|
||||
lifecycles := append([]string{}, res.LifecyclePhases...)
|
||||
lifecycles = append(lifecycles, "normal_operation", "maintenance", "cleaning", "setup", "fault_clearing")
|
||||
|
||||
input := MatchInput{
|
||||
ComponentLibraryIDs: compIDs,
|
||||
EnergySourceIDs: energyIDs,
|
||||
LifecyclePhases: lifecycles,
|
||||
CustomTags: res.CustomTags,
|
||||
OperationalStates: append(res.OperationalStates, "normal_operation", "cleaning", "maintenance"),
|
||||
HumanRoles: res.Roles,
|
||||
MachineTypes: []string{"food_processing", "Gewerbliche Untertisch-Geschirrspuelmaschine (vernetzt)"},
|
||||
}
|
||||
|
||||
out := NewPatternEngine().Match(input)
|
||||
|
||||
var kept []PatternMatch
|
||||
for _, pm := range out.MatchedPatterns {
|
||||
if !IsPatternRelevant(pm, warewashingNarrative, compNames) {
|
||||
continue
|
||||
}
|
||||
allCyber := len(pm.HazardCats) > 0
|
||||
for _, c := range pm.HazardCats {
|
||||
if !warewashingCyberCategories[c] {
|
||||
allCyber = false
|
||||
}
|
||||
}
|
||||
if allCyber {
|
||||
continue
|
||||
}
|
||||
kept = append(kept, pm)
|
||||
}
|
||||
filtered := *out
|
||||
filtered.MatchedPatterns = kept
|
||||
hazards, mitigations := patternsToHazardsAndMitigations(&filtered)
|
||||
return hazards, mitigations, len(kept)
|
||||
}
|
||||
|
||||
func TestWarewashing_GTCoverage(t *testing.T) {
|
||||
gtPath := filepath.Join("testdata", "ground_truth_warewashing.json")
|
||||
raw, err := os.ReadFile(gtPath)
|
||||
if err != nil {
|
||||
t.Fatalf("read GT: %v", err)
|
||||
}
|
||||
var gt GroundTruth
|
||||
if err := json.Unmarshal(raw, >); err != nil {
|
||||
t.Fatalf("parse GT: %v", err)
|
||||
}
|
||||
|
||||
hazards, mitigations, nPatterns := warewashingEngineOutput()
|
||||
t.Logf("Engine: %d patterns kept (relevance+cyber filter) -> %d hazards", nPatterns, len(hazards))
|
||||
|
||||
result := CompareBenchmark(>, hazards, mitigations)
|
||||
precision := 0.0
|
||||
if result.TotalEngine > 0 {
|
||||
precision = float64(len(result.MatchedPairs)) / float64(result.TotalEngine)
|
||||
}
|
||||
t.Logf("=== Warewashing-GT (GT #3) Baseline ===")
|
||||
t.Logf("Recall (Coverage): %.1f%% (%d/%d matched, %d missing)",
|
||||
result.CoverageScore*100, len(result.MatchedPairs), result.TotalGT, len(result.MissingFromEngine))
|
||||
t.Logf("Precision: %.1f%% (%d engine hazards, %d extra)",
|
||||
precision*100, result.TotalEngine, len(result.ExtraInEngine))
|
||||
|
||||
if len(result.MissingFromEngine) > 0 {
|
||||
t.Logf("--- MISSING (recall gaps) ---")
|
||||
for _, m := range result.MissingFromEngine {
|
||||
t.Logf(" MISS %s: %s", m.Nr, abbrev(m.HazardType, 60))
|
||||
}
|
||||
}
|
||||
if len(result.ExtraInEngine) > 0 {
|
||||
t.Logf("--- EXTRA (false positives / precision loss) ---")
|
||||
names := make([]string, 0, len(result.ExtraInEngine))
|
||||
for _, e := range result.ExtraInEngine {
|
||||
n := e.Name
|
||||
if n == "" {
|
||||
n = e.Scenario
|
||||
}
|
||||
names = append(names, "["+e.Category+"] "+n)
|
||||
}
|
||||
sort.Strings(names)
|
||||
for _, n := range names {
|
||||
t.Logf(" EXTRA %s", abbrev(n, 85))
|
||||
}
|
||||
}
|
||||
|
||||
// Loose smoke floor for the baseline — fixes should push recall up, not down.
|
||||
if result.CoverageScore < 0.4 {
|
||||
t.Errorf("warewashing recall below 40%% floor: %.1f%%", result.CoverageScore*100)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user