From 8674b2cd9a04965458e9abae6d4893cf5f1e1ed0 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 25 Jun 2026 08:43:32 +0200 Subject: [PATCH] feat(ai-sdk): offline dedup-candidate proposer + deterministic GT wall (P2 slice 1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit First thin slice of the offline library-improvement proposer. DEV-TIME ONLY, propose-only — it never mutates the pattern library or the runtime. - FindDedupCandidates (proposer_dedup.go): structural near-duplicate detection over the fired patterns (category + measure/zone/scenario overlap). Bakes in the P1 lesson: only same-category pairs compare, and pairs with different operational states are never proposed (normal-operation vs maintenance are legitimately distinct, e.g. HP011 vs HP077). - ScreenSupersession (proposer_screen.go): the wall. A proposal is safe only if (1) dropping the hazard does not reduce GT recall AND (2) keep/drop do not credit DIFFERENT GT entries. Check 2 catches distinct hazards that merely share measures (HP2201 hot surface GT 1.3 vs HP2202 hot ware GT 1.4) which recall alone would wave through. On real warewashing output: 3 candidates -> 1 BLOCKED (distinct GT), 2 RECALL-SAFE for human/LLM review (the update + winding/friction near-dupes). Nothing auto-applied. All 3 GTs unaffected (read-only). The LLM judgement and a CLI/file queue are slice 2. 
Co-Authored-By: Claude Opus 4.7 --- .../internal/iace/gt_warewashing_test.go | 54 ++++++- .../internal/iace/proposer_dedup.go | 152 ++++++++++++++++++ .../internal/iace/proposer_dedup_test.go | 67 ++++++++ .../internal/iace/proposer_screen.go | 61 +++++++ 4 files changed, 330 insertions(+), 4 deletions(-) create mode 100644 ai-compliance-sdk/internal/iace/proposer_dedup.go create mode 100644 ai-compliance-sdk/internal/iace/proposer_dedup_test.go create mode 100644 ai-compliance-sdk/internal/iace/proposer_screen.go diff --git a/ai-compliance-sdk/internal/iace/gt_warewashing_test.go b/ai-compliance-sdk/internal/iace/gt_warewashing_test.go index 6644bd36..71e8d960 100644 --- a/ai-compliance-sdk/internal/iace/gt_warewashing_test.go +++ b/ai-compliance-sdk/internal/iace/gt_warewashing_test.go @@ -45,7 +45,7 @@ var warewashingCyberCategories = map[string]bool{ // warewashingEngineOutput runs the production chain and returns the filtered // hazards/mitigations the user would see for the UC-M. -func warewashingEngineOutput() ([]Hazard, []Mitigation, int) { +func warewashingEngineOutput() ([]Hazard, []Mitigation, []PatternMatch) { res := ParseNarrative(warewashingNarrative, "Gewerbliche Untertisch-Geschirrspuelmaschine (vernetzt)") var compIDs, compNames []string @@ -94,7 +94,7 @@ func warewashingEngineOutput() ([]Hazard, []Mitigation, int) { filtered := *out filtered.MatchedPatterns = kept hazards, mitigations := patternsToHazardsAndMitigations(&filtered) - return hazards, mitigations, len(kept) + return hazards, mitigations, kept } func TestWarewashing_GTCoverage(t *testing.T) { @@ -119,8 +119,8 @@ func TestWarewashing_GTCoverage(t *testing.T) { t.Logf("Parsed components: %v", cn) } - hazards, mitigations, nPatterns := warewashingEngineOutput() - t.Logf("Engine: %d patterns kept (relevance+cyber filter) -> %d hazards", nPatterns, len(hazards)) + hazards, mitigations, keptPatterns := warewashingEngineOutput() + t.Logf("Engine: %d patterns kept (relevance+cyber filter) -> 
%d hazards", len(keptPatterns), len(hazards)) result := CompareBenchmark(&gt, hazards, mitigations) precision := 0.0 @@ -180,3 +180,49 @@ func TestWarewashing_GTCoverage(t *testing.T) { t.Errorf("warewashing recall below 40%% floor: %.1f%%", result.CoverageScore*100) } } + +// TestWarewashing_DedupProposer exercises the offline dedup-candidate proposer +// end-to-end on the real warewashing engine output: detect candidates, screen +// each against the GT, and log the human-review queue. It asserts the WALL is +// self-consistent — a PASS verdict may never coincide with a recall drop. +func TestWarewashing_DedupProposer(t *testing.T) { + raw, err := os.ReadFile(filepath.Join("testdata", "ground_truth_warewashing.json")) + if err != nil { + t.Fatalf("read GT: %v", err) + } + var gt GroundTruth + if err := json.Unmarshal(raw, &gt); err != nil { + t.Fatalf("parse GT: %v", err) + } + + hazards, mits, kept := warewashingEngineOutput() + // 0.25 is a deliberately permissive candidate threshold: the proposer is meant + // to over-surface, because the deterministic GT wall below (and a human, and in + // slice 2 an LLM) is the precision filter — not the detector. 
+ candidates := FindDedupCandidates(kept, 0.25) + t.Logf("Proposer: %d dedup candidate(s) from %d fired patterns", len(candidates), len(kept)) + + safe, blocked := 0, 0 + for _, c := range candidates { + sr := ScreenSupersession(&gt, hazards, mits, c.KeepHazardName, c.DropName) + var verdict string + switch { + case sr.RecallAfter < sr.RecallBefore: + verdict, blocked = "BLOCK (recall-load-bearing)", blocked+1 + case sr.DistinctGT: + verdict, blocked = "BLOCK (distinct GT "+sr.KeepGT+" vs "+sr.DropGT+")", blocked+1 + default: + verdict, safe = "RECALL-SAFE (needs semantic review)", safe+1 + } + t.Logf("[%s] keep %s / drop %s score=%.2f recall %.1f%%->%.1f%% | %s", + verdict, c.KeepPattern, c.DropPattern, c.Score, + sr.RecallBefore*100, sr.RecallAfter*100, c.Rationale) + + // The wall must be sound: Safe implies recall preserved AND not distinct. + if sr.Safe && (sr.RecallAfter < sr.RecallBefore || sr.DistinctGT) { + t.Errorf("screen inconsistent for drop %s: Safe but recall dropped or distinct GT", c.DropPattern) + } + } + t.Logf("Proposer summary: %d RECALL-SAFE candidate(s) for human/LLM review, %d BLOCKED by the GT wall — propose-only, nothing auto-applied", + safe, blocked) +} diff --git a/ai-compliance-sdk/internal/iace/proposer_dedup.go b/ai-compliance-sdk/internal/iace/proposer_dedup.go new file mode 100644 index 00000000..9ed0ccb8 --- /dev/null +++ b/ai-compliance-sdk/internal/iace/proposer_dedup.go @@ -0,0 +1,152 @@ +package iace + +import ( + "fmt" + "math" + "regexp" + "sort" + "strings" +) + +// Offline dedup-candidate proposer (P2, type 1). DEV-TIME ONLY. +// +// It inspects the patterns that fired for one machine and proposes which look +// like duplicates, so a human (later an LLM) can decide a supersession/merge. It +// NEVER mutates the pattern library or the runtime — it only surfaces candidates. +// The deterministic GT screen (ScreenSupersession, proposer_screen.go) is the +// wall that proves a proposal is safe before a human ever sees it. 
+//
+// Detection here is purely structural (category + zone + measure + scenario
+// overlap) and therefore reproducible. Two safety rules bake in what P1 taught
+// us about the dishwasher review:
+//   - only patterns with the SAME primary category are ever compared;
+//   - a pair with DIFFERENT operational states is NEVER proposed, because
+//     normal-operation and maintenance are legitimately distinct contexts with
+//     different protective measures (e.g. HP011 vs HP077). Merging them would
+//     erase the maintenance view.
+
+// DedupCandidate is a proposed near-duplicate pattern pair for one machine class.
+// It is a plain report record: the overlap figures are stored already rounded to
+// two decimals, and Rationale carries the same figures as human-readable text.
+type DedupCandidate struct {
+	KeepPattern     string  `json:"keep_pattern"`     // PatternID of the higher-priority survivor (first of the pair on equal priority)
+	DropPattern     string  `json:"drop_pattern"`     // PatternID of the supersession target
+	KeepName        string  `json:"keep_name"`        // PatternName of the keep pattern
+	KeepHazardName  string  `json:"keep_hazard_name"` // keep pattern ScenarioDE (for the GT-distinctness screen)
+	DropName        string  `json:"drop_name"`        // == generated hazard Name (ScenarioDE) of the drop pattern
+	Category        string  `json:"category"`         // primary hazard category (first HazardCats entry) shared by both patterns
+	ZoneJaccard     float64 `json:"zone_jaccard"`     // Jaccard overlap of the comma-separated ZoneDE terms, rounded to 2 decimals
+	MeasureJaccard  float64 `json:"measure_jaccard"`  // Jaccard overlap of the SuggestedMeasureIDs, rounded to 2 decimals
+	ScenarioJaccard float64 `json:"scenario_jaccard"` // Jaccard overlap of the ScenarioDE word tokens, rounded to 2 decimals
+	Score           float64 `json:"score"`            // 0.4*measure + 0.3*zone + 0.3*scenario from the unrounded overlaps, then rounded to 2 decimals
+	Rationale       string  `json:"rationale"`        // one-line explanation: category, the three overlaps in percent, keep/drop ids with priorities
+}
+
+// FindDedupCandidates compares the fired patterns pairwise and returns near-dup
+// candidates whose combined overlap score meets threshold, deterministically
+// ordered (score desc, then drop-pattern id). The combined score weights measure
+// overlap highest (shared measures are the strongest duplicate signal), then zone
+// and scenario equally.
+//
+// A pair is skipped when the first pattern has no primary category, when the
+// primary categories differ, or when the operational-state sets differ. On equal
+// priority the pattern that appears first in fired is kept. Candidates that tie
+// on both score and drop-pattern id stay in pair-enumeration order (stable sort).
+func FindDedupCandidates(fired []PatternMatch, threshold float64) []DedupCandidate {
+	var candidates []DedupCandidate
+	for idx, first := range fired {
+		cat := primaryCat(first)
+		if cat == "" {
+			// A pattern without a primary category can never form a pair.
+			continue
+		}
+		// Only look at later patterns so every unordered pair is visited once.
+		for _, second := range fired[idx+1:] {
+			if primaryCat(second) != cat {
+				continue
+			}
+			if !sameOpStateSet(first.OperationalStates, second.OperationalStates) {
+				// Differing operational states are legitimate lifecycle variants
+				// and must never be proposed for a merge.
+				continue
+			}
+
+			zoneJ := tokenJaccard(zoneTokenSet(first.ZoneDE), zoneTokenSet(second.ZoneDE))
+			measureJ := tokenJaccard(toSet(first.SuggestedMeasureIDs), toSet(second.SuggestedMeasureIDs))
+			scenarioJ := tokenJaccard(wordTokenSet(first.ScenarioDE), wordTokenSet(second.ScenarioDE))
+			// The threshold is applied to the unrounded combined score.
+			combined := 0.4*measureJ + 0.3*zoneJ + 0.3*scenarioJ
+			if combined < threshold {
+				continue
+			}
+
+			// The strictly higher priority survives; on a tie the earlier pattern does.
+			survivor, loser := first, second
+			if first.Priority < second.Priority {
+				survivor, loser = second, first
+			}
+
+			rationale := fmt.Sprintf(
+				"same category %q · measure overlap %.0f%% · zone overlap %.0f%% · scenario overlap %.0f%% → keep %s (P%d), supersede %s (P%d)",
+				cat, measureJ*100, zoneJ*100, scenarioJ*100,
+				survivor.PatternID, survivor.Priority, loser.PatternID, loser.Priority)
+			candidates = append(candidates, DedupCandidate{
+				KeepPattern:     survivor.PatternID,
+				DropPattern:     loser.PatternID,
+				KeepName:        survivor.PatternName,
+				KeepHazardName:  survivor.ScenarioDE,
+				DropName:        loser.ScenarioDE,
+				Category:        cat,
+				ZoneJaccard:     round2(zoneJ),
+				MeasureJaccard:  round2(measureJ),
+				ScenarioJaccard: round2(scenarioJ),
+				Score:           round2(combined),
+				Rationale:       rationale,
+			})
+		}
+	}
+
+	// Highest (rounded) score first, then ascending drop-pattern id; the stable
+	// sort leaves remaining ties in pair-enumeration order.
+	sort.SliceStable(candidates, func(x, y int) bool {
+		left, right := candidates[x], candidates[y]
+		if left.Score == right.Score {
+			return left.DropPattern < right.DropPattern
+		}
+		return left.Score > right.Score
+	})
+	return candidates
+}
+
+// primaryCat returns the first hazard category of pm, or "" when it has none.
+func primaryCat(pm PatternMatch) string {
+	if len(pm.HazardCats) > 0 {
+		return pm.HazardCats[0]
+	}
+	return ""
+}
+
+// sameOpStateSet reports whether a and b hold the same set of operational
+// states once converted with toSet, so order and repeats are irrelevant and two
+// empty/nil lists compare equal.
+func sameOpStateSet(a, b []string) bool {
+	left, right := toSet(a), toSet(b)
+	if len(left) != len(right) {
+		return false
+	}
+	// Equal sizes plus "every left key is in right" implies set equality.
+	for state := range left {
+		if present := right[state]; !present {
+			return false
+		}
+	}
+	return true
+}
+
+// proposerWordSplit matches every run of non-letter characters; it is the
+// separator used when tokenising free text.
+var proposerWordSplit = regexp.MustCompile(`[^\p{L}]+`)
+
+// zoneTokenSet splits a comma-separated zone
string into its component terms. +func zoneTokenSet(zone string) map[string]bool { + out := map[string]bool{} + for _, part := range strings.Split(strings.ToLower(zone), ",") { + if t := strings.TrimSpace(part); len([]rune(t)) >= 3 { + out[t] = true + } + } + return out +} + +// wordTokenSet tokenises free text into words of length >= 4 (drops connectives). +func wordTokenSet(s string) map[string]bool { + out := map[string]bool{} + for _, w := range proposerWordSplit.Split(strings.ToLower(s), -1) { + if len([]rune(w)) >= 4 { + out[w] = true + } + } + return out +} + +func tokenJaccard(a, b map[string]bool) float64 { + if len(a) == 0 && len(b) == 0 { + return 0 + } + inter := 0 + for k := range a { + if b[k] { + inter++ + } + } + union := len(a) + len(b) - inter + if union == 0 { + return 0 + } + return float64(inter) / float64(union) +} + +func round2(x float64) float64 { return math.Round(x*100) / 100 } diff --git a/ai-compliance-sdk/internal/iace/proposer_dedup_test.go b/ai-compliance-sdk/internal/iace/proposer_dedup_test.go new file mode 100644 index 00000000..d3418305 --- /dev/null +++ b/ai-compliance-sdk/internal/iace/proposer_dedup_test.go @@ -0,0 +1,67 @@ +package iace + +import "testing" + +func mkPM(id, cat, zone, scenario string, prio int, measures, opstates []string) PatternMatch { + return PatternMatch{ + PatternID: id, PatternName: id, Priority: prio, + HazardCats: []string{cat}, ZoneDE: zone, ScenarioDE: scenario, + SuggestedMeasureIDs: measures, OperationalStates: opstates, + } +} + +func TestFindDedupCandidates_FindsOverlappingPair(t *testing.T) { + fired := []PatternMatch{ + mkPM("HPa", "update_failure", "Steuerung, SPS", "Software-Update der Steuerung scheitert nach Abbruch", 80, + []string{"M138", "M146"}, nil), + mkPM("HPb", "update_failure", "Steuerung, Antriebsregler", "Software-Update der Steuerung schlaegt fehl", 75, + []string{"M138", "M146", "M141"}, nil), + mkPM("HPc", "mechanical_hazard", "Tuer", "Quetschen der Finger an der Tuer", 70, + 
[]string{"M003"}, nil), + } + got := FindDedupCandidates(fired, 0.4) + if len(got) != 1 { + t.Fatalf("want 1 candidate, got %d: %+v", len(got), got) + } + // Higher-priority pattern survives, lower one is the drop target. + if got[0].KeepPattern != "HPa" || got[0].DropPattern != "HPb" { + t.Errorf("want keep HPa / drop HPb, got keep %s / drop %s", got[0].KeepPattern, got[0].DropPattern) + } + if got[0].DropName != "Software-Update der Steuerung schlaegt fehl" { + t.Errorf("DropName must equal drop pattern ScenarioDE, got %q", got[0].DropName) + } +} + +func TestFindDedupCandidates_LifecycleGuard(t *testing.T) { + // Same category, zone and measures — but normal-operation vs maintenance. + // These are legitimate variants (HP011 vs HP077) and must NOT be proposed. + fired := []PatternMatch{ + mkPM("HP011", "electrical_hazard", "Schaltschrank, Klemmenkasten", "Person beruehrt spannungsfuehrende Teile", 95, + []string{"M481", "M482"}, nil), + mkPM("HP077", "electrical_hazard", "Schaltschrank, Klemmenkasten", "Person beruehrt spannungsfuehrende Teile", 80, + []string{"M481", "M482"}, []string{"maintenance"}), + } + if got := FindDedupCandidates(fired, 0.4); len(got) != 0 { + t.Fatalf("lifecycle guard failed: want 0 candidates, got %d: %+v", len(got), got) + } +} + +func TestFindDedupCandidates_DifferentCategoryIgnored(t *testing.T) { + fired := []PatternMatch{ + mkPM("HPa", "thermal_hazard", "Boiler", "Heisse Oberflaeche am Boiler", 80, []string{"M071"}, nil), + mkPM("HPb", "mechanical_hazard", "Boiler", "Heisse Oberflaeche am Boiler", 80, []string{"M071"}, nil), + } + if got := FindDedupCandidates(fired, 0.3); len(got) != 0 { + t.Fatalf("cross-category pair must not be proposed, got %d", len(got)) + } +} + +func TestFindDedupCandidates_BelowThresholdDropped(t *testing.T) { + fired := []PatternMatch{ + mkPM("HPa", "mechanical_hazard", "Tuer", "Quetschen an der Tuer", 80, []string{"M003"}, nil), + mkPM("HPb", "mechanical_hazard", "Foerderband", "Einzug am Foerderband", 
80, []string{"M540"}, nil), + } + if got := FindDedupCandidates(fired, 0.4); len(got) != 0 { + t.Fatalf("disjoint pair must be below threshold, got %d: %+v", len(got), got) + } +} diff --git a/ai-compliance-sdk/internal/iace/proposer_screen.go b/ai-compliance-sdk/internal/iace/proposer_screen.go new file mode 100644 index 00000000..f7f582b5 --- /dev/null +++ b/ai-compliance-sdk/internal/iace/proposer_screen.go @@ -0,0 +1,61 @@ +package iace + +import "github.com/google/uuid" + +// ScreenResult is the deterministic GT verdict for one proposed supersession. +type ScreenResult struct { + RecallBefore float64 `json:"recall_before"` + RecallAfter float64 `json:"recall_after"` + KeepGT string `json:"keep_gt,omitempty"` // GT entry the keeper credits (if any) + DropGT string `json:"drop_gt,omitempty"` // GT entry the drop credits (if any) + DistinctGT bool `json:"distinct_gt"` // keep & drop credit DIFFERENT GT entries -> distinct hazards + Safe bool `json:"safe"` // recall preserved AND not distinct +} + +// ScreenSupersession is the WALL between "propose" and "decide". A proposal is +// safe only if BOTH deterministic checks pass: +// +// 1. RECALL is not reduced when the drop-hazard (and its mitigations) is removed +// — otherwise the drop is load-bearing for GT coverage. +// 2. The two hazards do NOT credit DIFFERENT ground-truth entries. Recall alone +// is necessary but not sufficient: two genuinely distinct hazards that share +// the same measures (e.g. hot boiler surface vs hot ware on unloading) keep +// recall at 100% when one is dropped, yet must NOT be merged. If keep and +// drop each match a different GT entry, they are distinct. +// +// Whatever survives both is still only RECALL-SAFE — a candidate for a human (and +// in slice 2, an LLM) to confirm semantically. Deterministic; reuses +// CompareBenchmark; touches neither the library nor the runtime. 
+func ScreenSupersession(gt *GroundTruth, hazards []Hazard, mits []Mitigation, keepHazardName, dropHazardName string) ScreenResult {
+	// Baseline: benchmark the unmodified engine output against the GT.
+	baseline := CompareBenchmark(gt, hazards, mits)
+
+	// Index the matched pairs by engine hazard name -> GT entry number.
+	// NOTE(review): the map is keyed by name, so if one name occurs in several
+	// matched pairs only the last pair's GT number is retained — confirm that
+	// CompareBenchmark yields at most one pair per engine hazard name.
+	creditedGT := make(map[string]string)
+	for _, pair := range baseline.MatchedPairs {
+		creditedGT[pair.EngineHazard.Name] = pair.GTEntry.Nr
+	}
+	keepGT := creditedGT[keepHazardName]
+	dropGT := creditedGT[dropHazardName]
+	// Distinct only when BOTH sides credit a GT entry and those entries differ;
+	// an unmatched side ("") never counts as distinct.
+	distinct := !(keepGT == "" || dropGT == "" || keepGT == dropGT)
+
+	// Remove every hazard carrying the drop name and remember its ID so the
+	// mitigations attached to it can be removed as well.
+	// NOTE(review): matching is by Name, so a keeper with the same name as the
+	// drop hazard is removed too — verify names are unique per pattern.
+	removedIDs := map[uuid.UUID]bool{}
+	remaining := make([]Hazard, 0, len(hazards))
+	for _, hz := range hazards {
+		if hz.Name != dropHazardName {
+			remaining = append(remaining, hz)
+			continue
+		}
+		removedIDs[hz.ID] = true
+	}
+	remainingMits := make([]Mitigation, 0, len(mits))
+	for _, mit := range mits {
+		if removedIDs[mit.HazardID] {
+			continue
+		}
+		remainingMits = append(remainingMits, mit)
+	}
+
+	// Re-run the benchmark without the dropped hazard and its mitigations.
+	reduced := CompareBenchmark(gt, remaining, remainingMits)
+
+	result := ScreenResult{
+		RecallBefore: baseline.CoverageScore,
+		RecallAfter:  reduced.CoverageScore,
+		KeepGT:       keepGT,
+		DropGT:       dropGT,
+		DistinctGT:   distinct,
+	}
+	// Safe requires both walls: recall not reduced AND no distinct GT credit.
+	result.Safe = reduced.CoverageScore >= baseline.CoverageScore && !distinct
+	return result
+}