feat(ai-sdk): offline dedup-candidate proposer + deterministic GT wall (P2 slice 1)

First thin slice of the offline library-improvement proposer. DEV-TIME ONLY,
propose-only — it never mutates the pattern library or the runtime.

- FindDedupCandidates (proposer_dedup.go): structural near-duplicate detection
  over the fired patterns (category + measure/zone/scenario overlap). Bakes in
  the P1 lesson: only same-category pairs compare, and pairs with different
  operational states are never proposed (normal-operation vs maintenance are
  legitimately distinct, e.g. HP011 vs HP077).
- ScreenSupersession (proposer_screen.go): the wall. A proposal is safe only if
  (1) dropping the hazard does not reduce GT recall AND (2) keep/drop do not
  credit DIFFERENT GT entries. Check 2 catches distinct hazards that merely share
  measures (HP2201 hot surface GT 1.3 vs HP2202 hot ware GT 1.4) which recall
  alone would wave through.

On real warewashing output: 3 candidates -> 1 BLOCKED (distinct GT), 2
RECALL-SAFE for human/LLM review (the update + winding/friction near-dupes).
Nothing auto-applied. All 3 GTs unaffected (read-only). The LLM judgement and a
CLI/file queue are slice 2.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-06-25 08:43:32 +02:00
parent 80862e7073
commit 8674b2cd9a
4 changed files with 330 additions and 4 deletions
@@ -45,7 +45,7 @@ var warewashingCyberCategories = map[string]bool{
// warewashingEngineOutput runs the production chain and returns the filtered
// hazards/mitigations the user would see for the UC-M.
func warewashingEngineOutput() ([]Hazard, []Mitigation, int) {
func warewashingEngineOutput() ([]Hazard, []Mitigation, []PatternMatch) {
res := ParseNarrative(warewashingNarrative, "Gewerbliche Untertisch-Geschirrspuelmaschine (vernetzt)")
var compIDs, compNames []string
@@ -94,7 +94,7 @@ func warewashingEngineOutput() ([]Hazard, []Mitigation, int) {
filtered := *out
filtered.MatchedPatterns = kept
hazards, mitigations := patternsToHazardsAndMitigations(&filtered)
return hazards, mitigations, len(kept)
return hazards, mitigations, kept
}
func TestWarewashing_GTCoverage(t *testing.T) {
@@ -119,8 +119,8 @@ func TestWarewashing_GTCoverage(t *testing.T) {
t.Logf("Parsed components: %v", cn)
}
hazards, mitigations, nPatterns := warewashingEngineOutput()
t.Logf("Engine: %d patterns kept (relevance+cyber filter) -> %d hazards", nPatterns, len(hazards))
hazards, mitigations, keptPatterns := warewashingEngineOutput()
t.Logf("Engine: %d patterns kept (relevance+cyber filter) -> %d hazards", len(keptPatterns), len(hazards))
result := CompareBenchmark(&gt, hazards, mitigations)
precision := 0.0
@@ -180,3 +180,49 @@ func TestWarewashing_GTCoverage(t *testing.T) {
t.Errorf("warewashing recall below 40%% floor: %.1f%%", result.CoverageScore*100)
}
}
// TestWarewashing_DedupProposer runs the offline dedup-candidate proposer
// against the warewashing engine output. It loads the ground truth (GT) from
// testdata, asks FindDedupCandidates for candidate pairs, screens every pair
// with ScreenSupersession and logs one verdict line per candidate plus a
// summary. The only hard assertion is a consistency check on the screening
// result: a result flagged Safe must not also report a recall drop or a
// distinct-GT conflict.
func TestWarewashing_DedupProposer(t *testing.T) {
	gtPath := filepath.Join("testdata", "ground_truth_warewashing.json")
	data, readErr := os.ReadFile(gtPath)
	if readErr != nil {
		t.Fatalf("read GT: %v", readErr)
	}

	var gt GroundTruth
	if parseErr := json.Unmarshal(data, &gt); parseErr != nil {
		t.Fatalf("parse GT: %v", parseErr)
	}

	hazards, mits, kept := warewashingEngineOutput()

	// The 0.25 threshold is intentionally loose. Over-surfacing candidates is
	// the desired behaviour here, since precision comes from the deterministic
	// GT screen below (followed by a human, and an LLM in slice 2) rather than
	// from the detector itself.
	candidates := FindDedupCandidates(kept, 0.25)
	t.Logf("Proposer: %d dedup candidate(s) from %d fired patterns", len(candidates), len(kept))

	var safe, blocked int
	for _, cand := range candidates {
		sr := ScreenSupersession(&gt, hazards, mits, cand.KeepHazardName, cand.DropName)
		recallDropped := sr.RecallAfter < sr.RecallBefore

		// A recall drop takes precedence over a distinct-GT conflict when
		// choosing the verdict label.
		verdict := "RECALL-SAFE (needs semantic review)"
		if recallDropped {
			verdict = "BLOCK (recall-load-bearing)"
			blocked++
		} else if sr.DistinctGT {
			verdict = "BLOCK (distinct GT " + sr.KeepGT + " vs " + sr.DropGT + ")"
			blocked++
		} else {
			safe++
		}

		t.Logf("[%s] keep %s / drop %s score=%.2f recall %.1f%%->%.1f%% | %s",
			verdict, cand.KeepPattern, cand.DropPattern, cand.Score,
			sr.RecallBefore*100, sr.RecallAfter*100, cand.Rationale)

		// Soundness of the screen: whenever Safe is set, recall must be
		// preserved and the keep/drop pair must not be flagged as distinct GT.
		if sr.Safe {
			if recallDropped || sr.DistinctGT {
				t.Errorf("screen inconsistent for drop %s: Safe but recall dropped or distinct GT", cand.DropPattern)
			}
		}
	}

	t.Logf("Proposer summary: %d RECALL-SAFE candidate(s) for human/LLM review, %d BLOCKED by the GT wall — propose-only, nothing auto-applied",
		safe, blocked)
}