Files
breakpilot-compliance/ai-compliance-sdk/internal/iace/gt_warewashing_test.go
T
Benjamin Admin 8674b2cd9a feat(ai-sdk): offline dedup-candidate proposer + deterministic GT wall (P2 slice 1)
First thin slice of the offline library-improvement proposer. DEV-TIME ONLY,
propose-only — it never mutates the pattern library or the runtime.

- FindDedupCandidates (proposer_dedup.go): structural near-duplicate detection
  over the fired patterns (category + measure/zone/scenario overlap). Bakes in
  the P1 lesson: only same-category pairs compare, and pairs with different
  operational states are never proposed (normal-operation vs maintenance are
  legitimately distinct, e.g. HP011 vs HP077).
- ScreenSupersession (proposer_screen.go): the wall. A proposal is safe only if
  (1) dropping the hazard does not reduce GT recall AND (2) keep/drop do not
  credit DIFFERENT GT entries. Check 2 catches distinct hazards that merely share
  measures (HP2201 hot surface GT 1.3 vs HP2202 hot ware GT 1.4) which recall
  alone would wave through.

On real warewashing output: 3 candidates -> 1 BLOCKED (distinct GT), 2
RECALL-SAFE for human/LLM review (the update + winding/friction near-dupes).
Nothing auto-applied. All 3 GTs unaffected (read-only). The LLM judgement and a
CLI/file queue are slice 2.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-26 10:27:01 +02:00

229 lines
9.0 KiB
Go

package iace
import (
"encoding/json"
"os"
"path/filepath"
"sort"
"testing"
)
// GT #3 — commercial UNDERCOUNTER dishwasher (Winterhalter UC-M). Self-assessed
// ground truth: we can judge what a dishwasher is. The test runs the narrative
// through the SAME chain as production (ParseNarrative -> engine -> relevance
// filter + cyber-skip), so keyword/gating fixes are measured on the hazard set
// the user actually sees — not the raw pattern flood.
// warewashingNarrative is the condensed UC-M limits_form narrative used as the
// single input text for every test in this file. It is written in ASCII
// transliteration (ue/ae/oe instead of umlauts). The wording deliberately
// includes "Cool-Ausfuehrung" and "Filter" so that the known false components
// (Kuehlaggregat, Absauganlage) are reproduced and stay visible in the
// baseline. Do not reword it casually: the same text is also passed to the
// relevance filter, so edits can change which patterns are kept.
const warewashingNarrative = `Gewerbliche Untertisch-Geschirrspuelmaschine fuer Gastronomie-Kueche, ` +
`vernetzt ueber LAN und WLAN (Connected Wash Internetportal). Heisswasser-Boiler mit ` +
`Nachspueltemperatur ca. 85 Grad C, Tank mit Hygiene-Tankheizkoerper. Spuelpumpe 150-200 l/min ` +
`mit rotierenden Spuelfeldern und Spuelarmen, Ablaufpumpe. Eingebautes Dosiergeraet fuer Reiniger ` +
`und Klarspueler (aetzende Konzentrate). 4-fach-Laugenfiltration mit Filter. Doppelwandige Tuer ` +
`mit Sicherheitsschalter und Rastposition (Thermostopp). Elektromotor (Drehstrom) 400 V. ` +
`Touch-Steuerung (SPS) mit Bedienfeld und HMI, USB-Schnittstelle fuer Softwareupdates, ` +
`PIN-geschuetzter Servicetechniker-Fernzugriff. Cool-Ausfuehrung mit kalter Nachspuelung. ` +
`Untertischmontage. Eingreifen in die Spuelkammer moeglich. Aerosole und Daempfe der ` +
`Reinigungschemie gelangen in die Atemzone. Manuelles Be- und Entladen der Spuelkoerbe von Hand. ` +
`Reinigung und Wartung durch Servicetechniker. Branche Lebensmittel und Getraenke. ` +
`Siebe und scharfe Blechkanten in der Spuelkammer. Boiler kann bei Wassermangel trockenlaufen. ` +
`Frequenzumrichter und Elektronik mit Restspannung nach dem Abschalten. Wartung nur im ` +
`freigeschalteten Zustand; Gefahr des unerwarteten Wiederanlaufs. Frischwasseranschluss mit ` +
`Rueckflussverhinderer gegen Ruecksaugen in das Trinkwassernetz. Stehwasser im Boiler ` +
`(Hygiene/Legionellen). Standsicherheit bei Untertischmontage.`
// warewashingCyberCategories is the set of hazard categories treated as native
// cyber/AI in these tests. It mirrors handlers.nativeCyberSecurityCategories:
// such hazards are routed to the CRA module rather than the CE hazard log.
// NOTE(review): this is a hand-maintained copy — confirm it still matches the
// handlers list whenever that list changes.
var warewashingCyberCategories = map[string]bool{
	"unauthorized_access":   true,
	"firmware_corruption":   true,
	"cyber_resilience":      true,
	"logging_audit_failure": true,
	"cyber_network":         true,
	"sensor_spoofing":       true,
	"ai_specific":           true,
	"ai_misclassification":  true,
	"false_classification":  true,
	"model_drift":           true,
	"data_poisoning":        true,
	"unintended_bias":       true,
}
// warewashingEngineOutput pushes the UC-M narrative through the chain this file
// treats as production-equivalent: ParseNarrative, PatternEngine.Match, the
// IsPatternRelevant filter and the native-cyber skip. It returns the hazards and
// mitigations built from the surviving patterns, followed by those patterns.
func warewashingEngineOutput() ([]Hazard, []Mitigation, []PatternMatch) {
	const machineType = "Gewerbliche Untertisch-Geschirrspuelmaschine (vernetzt)"

	res := ParseNarrative(warewashingNarrative, machineType)

	// Negated components are left out of both the engine input and the name
	// list handed to the relevance filter.
	var compIDs, compNames []string
	for _, c := range res.Components {
		if c.Negated {
			continue
		}
		compIDs = append(compIDs, c.LibraryID)
		compNames = append(compNames, c.NameDE)
	}

	var energyIDs []string
	for _, e := range res.EnergySources {
		energyIDs = append(energyIDs, e.SourceID)
	}

	// The parsed phase/state lists are extended with fixed extras. Both are
	// extended on private storage so the append can never write into a backing
	// array still owned by the parse result.
	lifecycles := append([]string{}, res.LifecyclePhases...)
	lifecycles = append(lifecycles, "normal_operation", "maintenance", "cleaning", "setup", "fault_clearing")

	// Capping capacity at the current length forces append to allocate a new
	// array instead of reusing spare capacity of res.OperationalStates.
	states := res.OperationalStates
	states = append(states[:len(states):len(states)], "normal_operation", "cleaning", "maintenance")

	input := MatchInput{
		ComponentLibraryIDs: compIDs,
		EnergySourceIDs:     energyIDs,
		LifecyclePhases:     lifecycles,
		CustomTags:          res.CustomTags,
		OperationalStates:   states,
		HumanRoles:          res.Roles,
		MachineTypes:        []string{"food_processing", machineType},
	}
	out := NewPatternEngine().Match(input)

	var kept []PatternMatch
	for _, pm := range out.MatchedPatterns {
		if !IsPatternRelevant(pm, warewashingNarrative, compNames) {
			continue
		}
		// Skip a pattern only when it has at least one hazard category and
		// every one of them is native-cyber; mixed patterns are kept.
		allCyber := len(pm.HazardCats) > 0
		for _, c := range pm.HazardCats {
			if !warewashingCyberCategories[c] {
				allCyber = false
				break
			}
		}
		if allCyber {
			continue
		}
		kept = append(kept, pm)
	}

	// Work on a shallow copy so the engine's own result keeps its full
	// MatchedPatterns list.
	filtered := *out
	filtered.MatchedPatterns = kept
	hazards, mitigations := patternsToHazardsAndMitigations(&filtered)
	return hazards, mitigations, kept
}
// TestWarewashing_GTCoverage compares the filtered engine output for the UC-M
// narrative with testdata/ground_truth_warewashing.json. It logs recall,
// precision, recall gaps, hazards that have no mitigation attached and the
// sorted list of extras. The only hard assertion is a 40% recall floor.
func TestWarewashing_GTCoverage(t *testing.T) {
	data, err := os.ReadFile(filepath.Join("testdata", "ground_truth_warewashing.json"))
	if err != nil {
		t.Fatalf("read GT: %v", err)
	}
	var gt GroundTruth
	if err = json.Unmarshal(data, &gt); err != nil {
		t.Fatalf("parse GT: %v", err)
	}

	// Diagnostic only: show which non-negated components the parser found.
	var parsedNames []string
	for _, comp := range ParseNarrative(warewashingNarrative, "Gewerbliche Untertisch-Geschirrspuelmaschine (vernetzt)").Components {
		if comp.Negated {
			continue
		}
		parsedNames = append(parsedNames, comp.NameDE)
	}
	t.Logf("Parsed components: %v", parsedNames)

	hazards, mitigations, keptPatterns := warewashingEngineOutput()
	t.Logf("Engine: %d patterns kept (relevance+cyber filter) -> %d hazards", len(keptPatterns), len(hazards))

	result := CompareBenchmark(&gt, hazards, mitigations)
	matched := len(result.MatchedPairs)

	// Precision stays 0 when the engine produced nothing (avoids 0/0).
	var precision float64
	if result.TotalEngine > 0 {
		precision = float64(matched) / float64(result.TotalEngine)
	}

	t.Logf("=== Warewashing-GT (GT #3) Baseline ===")
	t.Logf("Recall (Coverage): %.1f%% (%d/%d matched, %d missing)",
		result.CoverageScore*100, matched, result.TotalGT, len(result.MissingFromEngine))
	t.Logf("Precision: %.1f%% (%d engine hazards, %d extra)",
		precision*100, result.TotalEngine, len(result.ExtraInEngine))

	// The section header is emitted once, just before the first gap.
	for i, miss := range result.MissingFromEngine {
		if i == 0 {
			t.Logf("--- MISSING (recall gaps) ---")
		}
		t.Logf(" MISS %s: %s", miss.Nr, abbrev(miss.HazardType, 60))
	}

	// Measure completeness: which generated hazards have NO protective measure?
	t.Logf("--- Measure completeness ---")
	t.Logf("Measure coverage (GT-matched): %.0f%%", result.MeasureCoverage*100)
	covered := make(map[string]bool, len(mitigations))
	for _, mit := range mitigations {
		covered[mit.HazardID.String()] = true
	}
	uncovered := 0
	for _, hz := range hazards {
		if covered[hz.ID.String()] {
			continue
		}
		uncovered++
		// Prefer the hazard name; fall back to its scenario when unnamed.
		label := hz.Scenario
		if hz.Name != "" {
			label = hz.Name
		}
		t.Logf(" NO-MEASURE: [%s] %s", hz.Category, abbrev(label, 60))
	}
	t.Logf("Hazards without any measure: %d/%d", uncovered, len(hazards))

	if extras := result.ExtraInEngine; len(extras) > 0 {
		t.Logf("--- EXTRA (false positives / precision loss) ---")
		labels := make([]string, len(extras))
		for i, extra := range extras {
			label := extra.Scenario
			if extra.Name != "" {
				label = extra.Name
			}
			labels[i] = "[" + extra.Category + "] " + label
		}
		// Sorted so the log is stable and diffable between runs.
		sort.Strings(labels)
		for _, label := range labels {
			t.Logf(" EXTRA %s", abbrev(label, 85))
		}
	}

	// Loose smoke floor for the baseline — fixes should push recall up, not down.
	const recallFloor = 0.4
	if result.CoverageScore < recallFloor {
		t.Errorf("warewashing recall below 40%% floor: %.1f%%", result.CoverageScore*100)
	}
}
// TestWarewashing_DedupProposer runs the offline dedup-candidate proposer on the
// real warewashing engine output: it collects candidates, screens each one
// against the ground truth and logs the resulting review queue. Its single
// assertion is that the screen is self-consistent: a result flagged Safe must
// not also show a recall drop or a distinct-GT conflict.
func TestWarewashing_DedupProposer(t *testing.T) {
	gtPath := filepath.Join("testdata", "ground_truth_warewashing.json")
	data, err := os.ReadFile(gtPath)
	if err != nil {
		t.Fatalf("read GT: %v", err)
	}
	var gt GroundTruth
	if err = json.Unmarshal(data, &gt); err != nil {
		t.Fatalf("parse GT: %v", err)
	}

	hazards, mits, kept := warewashingEngineOutput()

	// The 0.25 threshold is intentionally permissive. The proposer should
	// over-surface; precision comes from the deterministic GT screen below and
	// from later human (and, in slice 2, LLM) review — not from the detector.
	candidates := FindDedupCandidates(kept, 0.25)
	t.Logf("Proposer: %d dedup candidate(s) from %d fired patterns", len(candidates), len(kept))

	var safe, blocked int
	for _, cand := range candidates {
		sr := ScreenSupersession(&gt, hazards, mits, cand.KeepHazardName, cand.DropName)
		recallDropped := sr.RecallAfter < sr.RecallBefore

		// A recall drop takes precedence over a distinct-GT conflict when
		// labelling the verdict.
		var verdict string
		if recallDropped {
			verdict = "BLOCK (recall-load-bearing)"
			blocked++
		} else if sr.DistinctGT {
			verdict = "BLOCK (distinct GT " + sr.KeepGT + " vs " + sr.DropGT + ")"
			blocked++
		} else {
			verdict = "RECALL-SAFE (needs semantic review)"
			safe++
		}

		t.Logf("[%s] keep %s / drop %s score=%.2f recall %.1f%%->%.1f%% | %s",
			verdict, cand.KeepPattern, cand.DropPattern, cand.Score,
			sr.RecallBefore*100, sr.RecallAfter*100, cand.Rationale)

		// The wall must be sound: Safe implies recall preserved AND not distinct.
		violatesWall := recallDropped || sr.DistinctGT
		if sr.Safe && violatesWall {
			t.Errorf("screen inconsistent for drop %s: Safe but recall dropped or distinct GT", cand.DropPattern)
		}
	}

	t.Logf("Proposer summary: %d RECALL-SAFE candidate(s) for human/LLM review, %d BLOCKED by the GT wall — propose-only, nothing auto-applied",
		safe, blocked)
}