0ce4794767
Adds the semantic judgement layer on top of the slice-1 detector + GT wall. DEV-TIME, propose-only — nothing mutates the library or runtime. - CandidateJudge interface with two implementations: HeuristicJudge (deterministic default/fallback, used in tests) and LLMJudge (offline, over the shared llm.ProviderRegistry via the LLMCompleter adapter). LLMJudge degrades to "uncertain" on any transport/parse error — it can never break a run. - BuildJudgePrompt: the ISO 12100 same-vs-distinct prompt, unit-tested deterministically even though the call is not. - RenderProposalQueue: markdown human-review queue with a suggested action per candidate (supersede / keep both / needs review). On real warewashing output the heuristic punts to "uncertain — needs the LLM judge" for exactly the two recall-safe near-dupes (HP807/HP033 update, HP101/HP096 winding-vs-friction), making the LLM's role explicit. All 3 GTs unaffected (read-only). Live qwen wiring + a CLI/file queue are slice 3. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
238 lines
9.2 KiB
Go
238 lines
9.2 KiB
Go
package iace
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"os"
|
|
"path/filepath"
|
|
"sort"
|
|
"testing"
|
|
)
|
|
|
|
// GT #3 — commercial UNDERCOUNTER dishwasher (Winterhalter UC-M). Self-assessed
// ground truth: we can judge what a dishwasher is. The test runs the narrative
// through the SAME chain as production (ParseNarrative -> engine -> relevance
// filter + cyber-skip), so keyword/gating fixes are measured on the hazard set
// the user actually sees — not the raw pattern flood.
|
|
|
|
// warewashingNarrative is the condensed UC-M limits_form narrative fed to the
// parser. It deliberately includes "Cool-Ausfuehrung" and "Filter" so the known
// false components (Kuehlaggregat, Absauganlage) are reproduced and visible in
// the baseline.
//
// The text is assembled from raw-string fragments purely to keep source lines
// short; the resulting value is a single line without newlines.
const warewashingNarrative = `Gewerbliche Untertisch-Geschirrspuelmaschine fuer Gastronomie-Kueche, ` +
	`vernetzt ueber LAN und WLAN (Connected Wash Internetportal). Heisswasser-Boiler mit ` +
	`Nachspueltemperatur ca. 85 Grad C, Tank mit Hygiene-Tankheizkoerper. Spuelpumpe 150-200 l/min ` +
	`mit rotierenden Spuelfeldern und Spuelarmen, Ablaufpumpe. Eingebautes Dosiergeraet fuer Reiniger ` +
	`und Klarspueler (aetzende Konzentrate). 4-fach-Laugenfiltration mit Filter. Doppelwandige Tuer ` +
	`mit Sicherheitsschalter und Rastposition (Thermostopp). Elektromotor (Drehstrom) 400 V. ` +
	`Touch-Steuerung (SPS) mit Bedienfeld und HMI, USB-Schnittstelle fuer Softwareupdates, ` +
	`PIN-geschuetzter Servicetechniker-Fernzugriff. Cool-Ausfuehrung mit kalter Nachspuelung. ` +
	`Untertischmontage. Eingreifen in die Spuelkammer moeglich. Aerosole und Daempfe der ` +
	`Reinigungschemie gelangen in die Atemzone. Manuelles Be- und Entladen der Spuelkoerbe von Hand. ` +
	`Reinigung und Wartung durch Servicetechniker. Branche Lebensmittel und Getraenke. ` +
	`Siebe und scharfe Blechkanten in der Spuelkammer. Boiler kann bei Wassermangel trockenlaufen. ` +
	`Frequenzumrichter und Elektronik mit Restspannung nach dem Abschalten. Wartung nur im ` +
	`freigeschalteten Zustand; Gefahr des unerwarteten Wiederanlaufs. Frischwasseranschluss mit ` +
	`Rueckflussverhinderer gegen Ruecksaugen in das Trinkwassernetz. Stehwasser im Boiler ` +
	`(Hygiene/Legionellen). Standsicherheit bei Untertischmontage.`
|
|
|
|
// warewashingCyberCategories mirrors handlers.nativeCyberSecurityCategories —
// native cyber/AI hazards are routed to the CRA module, not the CE hazard log.
//
// It is used as a set: a lookup yields true for a cyber/AI hazard category and
// false (the zero value) for everything else.
var warewashingCyberCategories = map[string]bool{
	"unauthorized_access": true, "firmware_corruption": true, "cyber_resilience": true,
	"logging_audit_failure": true, "cyber_network": true, "sensor_spoofing": true,
	"ai_specific": true, "ai_misclassification": true, "false_classification": true,
	"model_drift": true, "data_poisoning": true, "unintended_bias": true,
}
|
|
|
|
// warewashingEngineOutput runs the production chain and returns the filtered
|
|
// hazards/mitigations the user would see for the UC-M.
|
|
func warewashingEngineOutput() ([]Hazard, []Mitigation, []PatternMatch) {
|
|
res := ParseNarrative(warewashingNarrative, "Gewerbliche Untertisch-Geschirrspuelmaschine (vernetzt)")
|
|
|
|
var compIDs, compNames []string
|
|
for _, c := range res.Components {
|
|
if c.Negated {
|
|
continue
|
|
}
|
|
compIDs = append(compIDs, c.LibraryID)
|
|
compNames = append(compNames, c.NameDE)
|
|
}
|
|
var energyIDs []string
|
|
for _, e := range res.EnergySources {
|
|
energyIDs = append(energyIDs, e.SourceID)
|
|
}
|
|
lifecycles := append([]string{}, res.LifecyclePhases...)
|
|
lifecycles = append(lifecycles, "normal_operation", "maintenance", "cleaning", "setup", "fault_clearing")
|
|
|
|
input := MatchInput{
|
|
ComponentLibraryIDs: compIDs,
|
|
EnergySourceIDs: energyIDs,
|
|
LifecyclePhases: lifecycles,
|
|
CustomTags: res.CustomTags,
|
|
OperationalStates: append(res.OperationalStates, "normal_operation", "cleaning", "maintenance"),
|
|
HumanRoles: res.Roles,
|
|
MachineTypes: []string{"food_processing", "Gewerbliche Untertisch-Geschirrspuelmaschine (vernetzt)"},
|
|
}
|
|
|
|
out := NewPatternEngine().Match(input)
|
|
|
|
var kept []PatternMatch
|
|
for _, pm := range out.MatchedPatterns {
|
|
if !IsPatternRelevant(pm, warewashingNarrative, compNames) {
|
|
continue
|
|
}
|
|
allCyber := len(pm.HazardCats) > 0
|
|
for _, c := range pm.HazardCats {
|
|
if !warewashingCyberCategories[c] {
|
|
allCyber = false
|
|
}
|
|
}
|
|
if allCyber {
|
|
continue
|
|
}
|
|
kept = append(kept, pm)
|
|
}
|
|
filtered := *out
|
|
filtered.MatchedPatterns = kept
|
|
hazards, mitigations := patternsToHazardsAndMitigations(&filtered)
|
|
return hazards, mitigations, kept
|
|
}
|
|
|
|
func TestWarewashing_GTCoverage(t *testing.T) {
|
|
gtPath := filepath.Join("testdata", "ground_truth_warewashing.json")
|
|
raw, err := os.ReadFile(gtPath)
|
|
if err != nil {
|
|
t.Fatalf("read GT: %v", err)
|
|
}
|
|
var gt GroundTruth
|
|
if err := json.Unmarshal(raw, >); err != nil {
|
|
t.Fatalf("parse GT: %v", err)
|
|
}
|
|
|
|
{
|
|
res := ParseNarrative(warewashingNarrative, "Gewerbliche Untertisch-Geschirrspuelmaschine (vernetzt)")
|
|
var cn []string
|
|
for _, c := range res.Components {
|
|
if !c.Negated {
|
|
cn = append(cn, c.NameDE)
|
|
}
|
|
}
|
|
t.Logf("Parsed components: %v", cn)
|
|
}
|
|
|
|
hazards, mitigations, keptPatterns := warewashingEngineOutput()
|
|
t.Logf("Engine: %d patterns kept (relevance+cyber filter) -> %d hazards", len(keptPatterns), len(hazards))
|
|
|
|
result := CompareBenchmark(>, hazards, mitigations)
|
|
precision := 0.0
|
|
if result.TotalEngine > 0 {
|
|
precision = float64(len(result.MatchedPairs)) / float64(result.TotalEngine)
|
|
}
|
|
t.Logf("=== Warewashing-GT (GT #3) Baseline ===")
|
|
t.Logf("Recall (Coverage): %.1f%% (%d/%d matched, %d missing)",
|
|
result.CoverageScore*100, len(result.MatchedPairs), result.TotalGT, len(result.MissingFromEngine))
|
|
t.Logf("Precision: %.1f%% (%d engine hazards, %d extra)",
|
|
precision*100, result.TotalEngine, len(result.ExtraInEngine))
|
|
|
|
if len(result.MissingFromEngine) > 0 {
|
|
t.Logf("--- MISSING (recall gaps) ---")
|
|
for _, m := range result.MissingFromEngine {
|
|
t.Logf(" MISS %s: %s", m.Nr, abbrev(m.HazardType, 60))
|
|
}
|
|
}
|
|
|
|
// Measure completeness: which generated hazards have NO protective measure?
|
|
t.Logf("--- Measure completeness ---")
|
|
t.Logf("Measure coverage (GT-matched): %.0f%%", result.MeasureCoverage*100)
|
|
withMeas := make(map[string]bool)
|
|
for _, m := range mitigations {
|
|
withMeas[m.HazardID.String()] = true
|
|
}
|
|
noMeasure := 0
|
|
for _, h := range hazards {
|
|
if !withMeas[h.ID.String()] {
|
|
noMeasure++
|
|
n := h.Name
|
|
if n == "" {
|
|
n = h.Scenario
|
|
}
|
|
t.Logf(" NO-MEASURE: [%s] %s", h.Category, abbrev(n, 60))
|
|
}
|
|
}
|
|
t.Logf("Hazards without any measure: %d/%d", noMeasure, len(hazards))
|
|
if len(result.ExtraInEngine) > 0 {
|
|
t.Logf("--- EXTRA (false positives / precision loss) ---")
|
|
names := make([]string, 0, len(result.ExtraInEngine))
|
|
for _, e := range result.ExtraInEngine {
|
|
n := e.Name
|
|
if n == "" {
|
|
n = e.Scenario
|
|
}
|
|
names = append(names, "["+e.Category+"] "+n)
|
|
}
|
|
sort.Strings(names)
|
|
for _, n := range names {
|
|
t.Logf(" EXTRA %s", abbrev(n, 85))
|
|
}
|
|
}
|
|
|
|
// Loose smoke floor for the baseline — fixes should push recall up, not down.
|
|
if result.CoverageScore < 0.4 {
|
|
t.Errorf("warewashing recall below 40%% floor: %.1f%%", result.CoverageScore*100)
|
|
}
|
|
}
|
|
|
|
// TestWarewashing_DedupProposer exercises the offline dedup-candidate proposer
|
|
// end-to-end on the real warewashing engine output: detect candidates, screen
|
|
// each against the GT, and log the human-review queue. It asserts the WALL is
|
|
// self-consistent — a PASS verdict may never coincide with a recall drop.
|
|
func TestWarewashing_DedupProposer(t *testing.T) {
|
|
raw, err := os.ReadFile(filepath.Join("testdata", "ground_truth_warewashing.json"))
|
|
if err != nil {
|
|
t.Fatalf("read GT: %v", err)
|
|
}
|
|
var gt GroundTruth
|
|
if err := json.Unmarshal(raw, >); err != nil {
|
|
t.Fatalf("parse GT: %v", err)
|
|
}
|
|
|
|
hazards, mits, kept := warewashingEngineOutput()
|
|
byID := map[string]PatternMatch{}
|
|
for _, pm := range kept {
|
|
byID[pm.PatternID] = pm
|
|
}
|
|
// 0.25 is a deliberately permissive candidate threshold: the proposer is meant
|
|
// to over-surface, because the deterministic GT wall below (and a human, and the
|
|
// LLM judge) is the precision filter — not the detector.
|
|
candidates := FindDedupCandidates(kept, 0.25)
|
|
t.Logf("Proposer: %d dedup candidate(s) from %d fired patterns", len(candidates), len(kept))
|
|
|
|
// Deterministic judge in the test; the dev-time CLI swaps in LLMJudge.
|
|
judge := HeuristicJudge{}
|
|
var judged []JudgedProposal
|
|
blocked := 0
|
|
for _, c := range candidates {
|
|
sr := ScreenSupersession(>, hazards, mits, c.KeepHazardName, c.DropName)
|
|
switch {
|
|
case sr.RecallAfter < sr.RecallBefore:
|
|
t.Logf("[BLOCK recall-load-bearing] keep %s / drop %s", c.KeepPattern, c.DropPattern)
|
|
blocked++
|
|
case sr.DistinctGT:
|
|
t.Logf("[BLOCK distinct GT %s vs %s] keep %s / drop %s", sr.KeepGT, sr.DropGT, c.KeepPattern, c.DropPattern)
|
|
blocked++
|
|
default:
|
|
if !sr.Safe {
|
|
t.Errorf("RECALL-SAFE branch but ScreenResult.Safe=false for drop %s", c.DropPattern)
|
|
}
|
|
v, conf, rat := judge.Judge(context.Background(), c, byID[c.KeepPattern], byID[c.DropPattern])
|
|
judged = append(judged, JudgedProposal{
|
|
Candidate: c, Screen: sr, Verdict: v, Confidence: conf, Rationale: rat, Judge: judge.Name(),
|
|
})
|
|
}
|
|
}
|
|
|
|
t.Logf("\n%s", RenderProposalQueue("Gewerbliche Geschirrspuelmaschine (vernetzt)", judged))
|
|
t.Logf("Proposer summary: %d candidate(s) in queue (judge=%s), %d BLOCKED by the GT wall — propose-only, nothing auto-applied",
|
|
len(judged), judge.Name(), blocked)
|
|
}
|