breakpilot-compliance/ai-compliance-sdk/internal/iace/gt_warewashing_test.go

package iace

import (
	"context"
	"encoding/json"
	"os"
	"path/filepath"
	"sort"
	"testing"
)

// GT #3 — commercial UNDERCOUNTER dishwasher (Winterhalter UC-M). Self-assessed
// ground truth: we can judge what a dishwasher is. The test runs the narrative
// through the SAME chain as production (ParseNarrative -> engine -> relevance
// filter + cyber-skip), so keyword/gating fixes are measured on the hazard set
// the user actually sees — not the raw pattern flood.

// warewashingNarrative is the condensed UC-M limits_form narrative used as the
// fixed input for the tests in this file. It is handed to ParseNarrative and,
// together with the parsed component names, to IsPatternRelevant.
//
// The text is German, written in plain ASCII with umlauts transliterated
// (ae/oe/ue). It deliberately includes "Cool-Ausfuehrung" and "Filter" so the
// known false components (Kuehlaggregat, Absauganlage) are reproduced and
// visible in the baseline.
const warewashingNarrative = `Gewerbliche Untertisch-Geschirrspuelmaschine fuer Gastronomie-Kueche, ` +
	`vernetzt ueber LAN und WLAN (Connected Wash Internetportal). Heisswasser-Boiler mit ` +
	`Nachspueltemperatur ca. 85 Grad C, Tank mit Hygiene-Tankheizkoerper. Spuelpumpe 150-200 l/min ` +
	`mit rotierenden Spuelfeldern und Spuelarmen, Ablaufpumpe. Eingebautes Dosiergeraet fuer Reiniger ` +
	`und Klarspueler (aetzende Konzentrate). 4-fach-Laugenfiltration mit Filter. Doppelwandige Tuer ` +
	`mit Sicherheitsschalter und Rastposition (Thermostopp). Elektromotor (Drehstrom) 400 V. ` +
	`Touch-Steuerung (SPS) mit Bedienfeld und HMI, USB-Schnittstelle fuer Softwareupdates, ` +
	`PIN-geschuetzter Servicetechniker-Fernzugriff. Cool-Ausfuehrung mit kalter Nachspuelung. ` +
	`Untertischmontage. Eingreifen in die Spuelkammer moeglich. Aerosole und Daempfe der ` +
	`Reinigungschemie gelangen in die Atemzone. Manuelles Be- und Entladen der Spuelkoerbe von Hand. ` +
	`Reinigung und Wartung durch Servicetechniker. Branche Lebensmittel und Getraenke. ` +
	`Siebe und scharfe Blechkanten in der Spuelkammer. Boiler kann bei Wassermangel trockenlaufen. ` +
	`Frequenzumrichter und Elektronik mit Restspannung nach dem Abschalten. Wartung nur im ` +
	`freigeschalteten Zustand; Gefahr des unerwarteten Wiederanlaufs. Frischwasseranschluss mit ` +
	`Rueckflussverhinderer gegen Ruecksaugen in das Trinkwassernetz. Stehwasser im Boiler ` +
	`(Hygiene/Legionellen). Standsicherheit bei Untertischmontage.`

// warewashingCyberCategories is the set of hazard categories treated as native
// cyber/AI hazards. It mirrors handlers.nativeCyberSecurityCategories: such
// hazards are routed to the CRA module, not the CE hazard log. A pattern whose
// categories all appear here is dropped by warewashingEngineOutput.
var warewashingCyberCategories = map[string]bool{
	"unauthorized_access":   true,
	"firmware_corruption":   true,
	"cyber_resilience":      true,
	"logging_audit_failure": true,
	"cyber_network":         true,
	"sensor_spoofing":       true,
	"ai_specific":           true,
	"ai_misclassification":  true,
	"false_classification":  true,
	"model_drift":           true,
	"data_poisoning":        true,
	"unintended_bias":       true,
}

// warewashingEngineOutput feeds the UC-M narrative through ParseNarrative, the
// pattern engine, the relevance filter and the cyber-only skip, then converts
// the surviving patterns via patternsToHazardsAndMitigations. This is meant to
// follow the production chain so the result is the set the user would see.
//
// It returns, in order: the hazards and the mitigations derived from the kept
// patterns, and the kept patterns themselves.
func warewashingEngineOutput() ([]Hazard, []Mitigation, []PatternMatch) {
	const machineType = "Gewerbliche Untertisch-Geschirrspuelmaschine (vernetzt)"

	res := ParseNarrative(warewashingNarrative, machineType)

	// Negated components are excluded from both the engine input and the
	// name list used by the relevance filter.
	var compIDs, compNames []string
	for _, c := range res.Components {
		if c.Negated {
			continue
		}
		compIDs = append(compIDs, c.LibraryID)
		compNames = append(compNames, c.NameDE)
	}

	var energyIDs []string
	for _, e := range res.EnergySources {
		energyIDs = append(energyIDs, e.SourceID)
	}

	// Parsed phases plus a fixed set that is always supplied. The parsed slice
	// is copied first so the parser result is never written to.
	lifecycles := append([]string{}, res.LifecyclePhases...)
	lifecycles = append(lifecycles, "normal_operation", "maintenance", "cleaning", "setup", "fault_clearing")

	// The full slice expression caps the capacity at the length, forcing append
	// to allocate instead of writing into the parser result's backing array —
	// the same isolation the lifecycle copy provides.
	states := res.OperationalStates
	states = append(states[:len(states):len(states)], "normal_operation", "cleaning", "maintenance")

	input := MatchInput{
		ComponentLibraryIDs: compIDs,
		EnergySourceIDs:     energyIDs,
		LifecyclePhases:     lifecycles,
		CustomTags:          res.CustomTags,
		OperationalStates:   states,
		HumanRoles:          res.Roles,
		MachineTypes:        []string{"food_processing", machineType},
	}

	out := NewPatternEngine().Match(input)

	var kept []PatternMatch
	for _, pm := range out.MatchedPatterns {
		if !IsPatternRelevant(pm, warewashingNarrative, compNames) {
			continue
		}
		// Skip a pattern only when it has at least one category and every one
		// of them is a native cyber/AI category; a single other category keeps it.
		cyberOnly := len(pm.HazardCats) > 0
		for _, cat := range pm.HazardCats {
			if !warewashingCyberCategories[cat] {
				cyberOnly = false
				break
			}
		}
		if cyberOnly {
			continue
		}
		kept = append(kept, pm)
	}

	// Work on a shallow copy so the engine's own result keeps its full match list.
	filtered := *out
	filtered.MatchedPatterns = kept
	hazards, mitigations := patternsToHazardsAndMitigations(&filtered)
	return hazards, mitigations, kept
}

// TestWarewashing_GTCoverage compares the filtered engine output for the UC-M
// narrative with testdata/ground_truth_warewashing.json and logs the baseline:
// recall, precision, missing ground-truth entries, hazards without a measure
// and extra engine hazards. The only hard assertion is a 40% recall floor.
func TestWarewashing_GTCoverage(t *testing.T) {
	data, err := os.ReadFile(filepath.Join("testdata", "ground_truth_warewashing.json"))
	if err != nil {
		t.Fatalf("read GT: %v", err)
	}
	var gt GroundTruth
	if err = json.Unmarshal(data, &gt); err != nil {
		t.Fatalf("parse GT: %v", err)
	}

	// Diagnostic only: list the non-negated components the parser produced.
	parsed := ParseNarrative(warewashingNarrative, "Gewerbliche Untertisch-Geschirrspuelmaschine (vernetzt)")
	var componentNames []string
	for _, comp := range parsed.Components {
		if comp.Negated {
			continue
		}
		componentNames = append(componentNames, comp.NameDE)
	}
	t.Logf("Parsed components: %v", componentNames)

	hazards, mitigations, keptPatterns := warewashingEngineOutput()
	t.Logf("Engine: %d patterns kept (relevance+cyber filter) -> %d hazards", len(keptPatterns), len(hazards))

	result := CompareBenchmark(&gt, hazards, mitigations)
	matched := len(result.MatchedPairs)

	// Precision stays 0 when the engine produced nothing, avoiding a divide by zero.
	var precision float64
	if result.TotalEngine > 0 {
		precision = float64(matched) / float64(result.TotalEngine)
	}

	t.Logf("=== Warewashing-GT (GT #3) Baseline ===")
	t.Logf("Recall (Coverage): %.1f%% (%d/%d matched, %d missing)",
		result.CoverageScore*100, matched, result.TotalGT, len(result.MissingFromEngine))
	t.Logf("Precision:         %.1f%% (%d engine hazards, %d extra)",
		precision*100, result.TotalEngine, len(result.ExtraInEngine))

	if missing := result.MissingFromEngine; len(missing) > 0 {
		t.Logf("--- MISSING (recall gaps) ---")
		for _, gap := range missing {
			t.Logf("  MISS %s: %s", gap.Nr, abbrev(gap.HazardType, 60))
		}
	}

	// Measure completeness: report every generated hazard that no mitigation
	// refers to.
	t.Logf("--- Measure completeness ---")
	t.Logf("Measure coverage (GT-matched): %.0f%%", result.MeasureCoverage*100)
	covered := make(map[string]struct{})
	for _, mit := range mitigations {
		covered[mit.HazardID.String()] = struct{}{}
	}
	uncovered := 0
	for _, hz := range hazards {
		if _, ok := covered[hz.ID.String()]; ok {
			continue
		}
		uncovered++
		// Fall back to the scenario text when the hazard has no name.
		label := hz.Name
		if label == "" {
			label = hz.Scenario
		}
		t.Logf("  NO-MEASURE: [%s] %s", hz.Category, abbrev(label, 60))
	}
	t.Logf("Hazards without any measure: %d/%d", uncovered, len(hazards))

	if extras := result.ExtraInEngine; len(extras) > 0 {
		t.Logf("--- EXTRA (false positives / precision loss) ---")
		labels := make([]string, len(extras))
		for i, extra := range extras {
			label := extra.Name
			if label == "" {
				label = extra.Scenario
			}
			labels[i] = "[" + extra.Category + "] " + label
		}
		// Sorted so the log is stable and grouped by category prefix.
		sort.Strings(labels)
		for _, label := range labels {
			t.Logf("  EXTRA %s", abbrev(label, 85))
		}
	}

	// Loose smoke floor for the baseline — fixes should push recall up, not down.
	if result.CoverageScore < 0.4 {
		t.Errorf("warewashing recall below 40%% floor: %.1f%%", result.CoverageScore*100)
	}
}

// TestWarewashing_DedupProposer runs the offline dedup-candidate proposer
// end-to-end on the real warewashing engine output: it detects candidates,
// screens each one against the ground truth, sends the survivors to a judge
// and logs the resulting human-review queue.
//
// A candidate is blocked when screening reports a recall drop or distinct GT
// entries. The one assertion is a consistency check on the screen result: a
// candidate that passes both blocks must also be flagged Safe.
func TestWarewashing_DedupProposer(t *testing.T) {
	gtBytes, err := os.ReadFile(filepath.Join("testdata", "ground_truth_warewashing.json"))
	if err != nil {
		t.Fatalf("read GT: %v", err)
	}
	var gt GroundTruth
	if err = json.Unmarshal(gtBytes, &gt); err != nil {
		t.Fatalf("parse GT: %v", err)
	}

	hazards, mitigations, fired := warewashingEngineOutput()

	// Index the fired patterns so the judge can be given both sides of a candidate.
	patternByID := make(map[string]PatternMatch, len(fired))
	for _, pm := range fired {
		patternByID[pm.PatternID] = pm
	}

	// 0.25 is a deliberately permissive candidate threshold: the proposer is meant
	// to over-surface, because the deterministic GT wall below (and a human, and the
	// LLM judge) is the precision filter — not the detector.
	candidates := FindDedupCandidates(fired, 0.25)
	t.Logf("Proposer: %d dedup candidate(s) from %d fired patterns", len(candidates), len(fired))

	// Deterministic judge in the test; the dev-time CLI swaps in LLMJudge.
	judge := HeuristicJudge{}
	ctx := context.Background()

	var (
		queue   []JudgedProposal
		blocked int
	)
	for _, cand := range candidates {
		screen := ScreenSupersession(&gt, hazards, mitigations, cand.KeepHazardName, cand.DropName)

		// The recall check takes precedence over the distinct-GT check.
		if screen.RecallAfter < screen.RecallBefore {
			t.Logf("[BLOCK recall-load-bearing] keep %s / drop %s", cand.KeepPattern, cand.DropPattern)
			blocked++
			continue
		}
		if screen.DistinctGT {
			t.Logf("[BLOCK distinct GT %s vs %s] keep %s / drop %s", screen.KeepGT, screen.DropGT, cand.KeepPattern, cand.DropPattern)
			blocked++
			continue
		}

		// Passed both blocks: the screen result itself must agree. A mismatch is
		// reported but the candidate is still judged and queued.
		if !screen.Safe {
			t.Errorf("RECALL-SAFE branch but ScreenResult.Safe=false for drop %s", cand.DropPattern)
		}
		verdict, confidence, rationale := judge.Judge(ctx, cand, patternByID[cand.KeepPattern], patternByID[cand.DropPattern])
		queue = append(queue, JudgedProposal{
			Candidate:  cand,
			Screen:     screen,
			Verdict:    verdict,
			Confidence: confidence,
			Rationale:  rationale,
			Judge:      judge.Name(),
		})
	}

	t.Logf("\n%s", RenderProposalQueue("Gewerbliche Geschirrspuelmaschine (vernetzt)", queue))
	t.Logf("Proposer summary: %d candidate(s) in queue (judge=%s), %d BLOCKED by the GT wall — propose-only, nothing auto-applied",
		len(queue), judge.Name(), blocked)
}