feat(ai-sdk): offline dedup-candidate proposer + deterministic GT wall (P2 slice 1)

First thin slice of the offline library-improvement proposer. DEV-TIME ONLY, propose-only — it never mutates the pattern library or the runtime.

- FindDedupCandidates (proposer_dedup.go): structural near-duplicate detection over the fired patterns (category + measure/zone/scenario overlap). Bakes in the P1 lesson: only same-category pairs compare, and pairs with different operational states are never proposed (normal-operation vs maintenance are legitimately distinct, e.g. HP011 vs HP077).
- ScreenSupersession (proposer_screen.go): the wall. A proposal is safe only if (1) dropping the hazard does not reduce GT recall AND (2) keep/drop do not credit DIFFERENT GT entries. Check 2 catches distinct hazards that merely share measures (HP2201 hot surface GT 1.3 vs HP2202 hot ware GT 1.4) which recall alone would wave through.

On real warewashing output: 3 candidates -> 1 BLOCKED (distinct GT), 2 RECALL-SAFE for human/LLM review (the update + winding/friction near-dupes). Nothing auto-applied. All 3 GTs unaffected (read-only). The LLM judgement and a CLI/file queue are slice 2.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-25 08:43:32 +02:00
parent 80862e7073
commit 8674b2cd9a
4 changed files with 330 additions and 4 deletions
@@ -45,7 +45,7 @@ var warewashingCyberCategories = map[string]bool{

 // warewashingEngineOutput runs the production chain and returns the filtered
 // hazards/mitigations the user would see for the UC-M.
-func warewashingEngineOutput() ([]Hazard, []Mitigation, int) {
+func warewashingEngineOutput() ([]Hazard, []Mitigation, []PatternMatch) {
 	res := ParseNarrative(warewashingNarrative, "Gewerbliche Untertisch-Geschirrspuelmaschine (vernetzt)")

 	var compIDs, compNames []string
@@ -94,7 +94,7 @@ func warewashingEngineOutput() ([]Hazard, []Mitigation, int) {
 	filtered := *out
 	filtered.MatchedPatterns = kept
 	hazards, mitigations := patternsToHazardsAndMitigations(&filtered)
-	return hazards, mitigations, len(kept)
+	return hazards, mitigations, kept
 }

 func TestWarewashing_GTCoverage(t *testing.T) {
@@ -119,8 +119,8 @@ func TestWarewashing_GTCoverage(t *testing.T) {
 		t.Logf("Parsed components: %v", cn)
 	}

-	hazards, mitigations, nPatterns := warewashingEngineOutput()
-	t.Logf("Engine: %d patterns kept (relevance+cyber filter) -> %d hazards", nPatterns, len(hazards))
+	hazards, mitigations, keptPatterns := warewashingEngineOutput()
+	t.Logf("Engine: %d patterns kept (relevance+cyber filter) -> %d hazards", len(keptPatterns), len(hazards))

 	result := CompareBenchmark(&gt, hazards, mitigations)
 	precision := 0.0
@@ -180,3 +180,49 @@ func TestWarewashing_GTCoverage(t *testing.T) {
 		t.Errorf("warewashing recall below 40%% floor: %.1f%%", result.CoverageScore*100)
 	}
 }
+
+// TestWarewashing_DedupProposer exercises the offline dedup-candidate proposer
+// end-to-end on the real warewashing engine output: detect candidates, screen
+// each against the GT, and log the human-review queue. It asserts the wall is
+// self-consistent: Safe may never coincide with a recall drop or a distinct GT.
+func TestWarewashing_DedupProposer(t *testing.T) {
+	raw, err := os.ReadFile(filepath.Join("testdata", "ground_truth_warewashing.json"))
+	if err != nil {
+		t.Fatalf("read GT: %v", err)
+	}
+	var gt GroundTruth
+	if err := json.Unmarshal(raw, &gt); err != nil {
+		t.Fatalf("parse GT: %v", err)
+	}
+
+	hazards, mits, kept := warewashingEngineOutput()
+	// 0.25 is a deliberately permissive candidate threshold: the proposer is meant
+	// to over-surface, because the deterministic GT wall below (and a human, and in
+	// slice 2 an LLM) is the precision filter — not the detector.
+	candidates := FindDedupCandidates(kept, 0.25)
+	t.Logf("Proposer: %d dedup candidate(s) from %d fired patterns", len(candidates), len(kept))
+
+	safe, blocked := 0, 0
+	for _, c := range candidates {
+		sr := ScreenSupersession(&gt, hazards, mits, c.KeepHazardName, c.DropName)
+		var verdict string
+		switch {
+		case sr.RecallAfter < sr.RecallBefore:
+			verdict, blocked = "BLOCK (recall-load-bearing)", blocked+1
+		case sr.DistinctGT:
+			verdict, blocked = "BLOCK (distinct GT "+sr.KeepGT+" vs "+sr.DropGT+")", blocked+1
+		default:
+			verdict, safe = "RECALL-SAFE (needs semantic review)", safe+1
+		}
+		t.Logf("[%s] keep %s / drop %s  score=%.2f  recall %.1f%%->%.1f%%  | %s",
+			verdict, c.KeepPattern, c.DropPattern, c.Score,
+			sr.RecallBefore*100, sr.RecallAfter*100, c.Rationale)
+
+		// Soundness check: Safe must imply recall preserved AND no distinct-GT finding.
+		if sr.Safe && (sr.RecallAfter < sr.RecallBefore || sr.DistinctGT) {
+			t.Errorf("screen inconsistent for drop %s: Safe but recall dropped or distinct GT", c.DropPattern)
+		}
+	}
+	t.Logf("Proposer summary: %d RECALL-SAFE candidate(s) for human/LLM review, %d BLOCKED by the GT wall — propose-only, nothing auto-applied",
+		safe, blocked)
+}