feat(ai-sdk): offline dedup-candidate proposer + deterministic GT wall (P2 slice 1)
First thin slice of the offline library-improvement proposer. DEV-TIME ONLY, propose-only — it never mutates the pattern library or the runtime.

- FindDedupCandidates (proposer_dedup.go): structural near-duplicate detection over the fired patterns (category + measure/zone/scenario overlap). Bakes in the P1 lesson: only same-category pairs are compared, and pairs with different operational states are never proposed (normal-operation vs maintenance are legitimately distinct, e.g. HP011 vs HP077).
- ScreenSupersession (proposer_screen.go): the wall. A proposal is safe only if (1) dropping the hazard does not reduce GT recall AND (2) keep/drop do not credit DIFFERENT GT entries. Check 2 catches distinct hazards that merely share measures (HP2201 hot surface GT 1.3 vs HP2202 hot ware GT 1.4), which recall alone would wave through.

On real warewashing output: 3 candidates -> 1 BLOCKED (distinct GT), 2 RECALL-SAFE for human/LLM review (the update + winding/friction near-dupes). Nothing auto-applied. All 3 GTs unaffected (read-only). The LLM judgement and a CLI/file queue are slice 2.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -45,7 +45,7 @@ var warewashingCyberCategories = map[string]bool{
|
||||
|
||||
// warewashingEngineOutput runs the production chain and returns the filtered
|
||||
// hazards/mitigations the user would see for the UC-M.
|
||||
func warewashingEngineOutput() ([]Hazard, []Mitigation, int) {
|
||||
func warewashingEngineOutput() ([]Hazard, []Mitigation, []PatternMatch) {
|
||||
res := ParseNarrative(warewashingNarrative, "Gewerbliche Untertisch-Geschirrspuelmaschine (vernetzt)")
|
||||
|
||||
var compIDs, compNames []string
|
||||
@@ -94,7 +94,7 @@ func warewashingEngineOutput() ([]Hazard, []Mitigation, int) {
|
||||
filtered := *out
|
||||
filtered.MatchedPatterns = kept
|
||||
hazards, mitigations := patternsToHazardsAndMitigations(&filtered)
|
||||
return hazards, mitigations, len(kept)
|
||||
return hazards, mitigations, kept
|
||||
}
|
||||
|
||||
func TestWarewashing_GTCoverage(t *testing.T) {
|
||||
@@ -119,8 +119,8 @@ func TestWarewashing_GTCoverage(t *testing.T) {
|
||||
t.Logf("Parsed components: %v", cn)
|
||||
}
|
||||
|
||||
hazards, mitigations, nPatterns := warewashingEngineOutput()
|
||||
t.Logf("Engine: %d patterns kept (relevance+cyber filter) -> %d hazards", nPatterns, len(hazards))
|
||||
hazards, mitigations, keptPatterns := warewashingEngineOutput()
|
||||
t.Logf("Engine: %d patterns kept (relevance+cyber filter) -> %d hazards", len(keptPatterns), len(hazards))
|
||||
|
||||
result := CompareBenchmark(&gt, hazards, mitigations)
|
||||
precision := 0.0
|
||||
@@ -180,3 +180,49 @@ func TestWarewashing_GTCoverage(t *testing.T) {
|
||||
t.Errorf("warewashing recall below 40%% floor: %.1f%%", result.CoverageScore*100)
|
||||
}
|
||||
}
|
||||
|
||||
// TestWarewashing_DedupProposer exercises the offline dedup-candidate proposer
|
||||
// end-to-end on the real warewashing engine output: detect candidates, screen
|
||||
// each against the GT, and log the human-review queue. It asserts the WALL is
|
||||
// self-consistent — a PASS verdict may never coincide with a recall drop.
|
||||
func TestWarewashing_DedupProposer(t *testing.T) {
|
||||
raw, err := os.ReadFile(filepath.Join("testdata", "ground_truth_warewashing.json"))
|
||||
if err != nil {
|
||||
t.Fatalf("read GT: %v", err)
|
||||
}
|
||||
var gt GroundTruth
|
||||
if err := json.Unmarshal(raw, >); err != nil {
|
||||
t.Fatalf("parse GT: %v", err)
|
||||
}
|
||||
|
||||
hazards, mits, kept := warewashingEngineOutput()
|
||||
// 0.25 is a deliberately permissive candidate threshold: the proposer is meant
|
||||
// to over-surface, because the deterministic GT wall below (and a human, and in
|
||||
// slice 2 an LLM) is the precision filter — not the detector.
|
||||
candidates := FindDedupCandidates(kept, 0.25)
|
||||
t.Logf("Proposer: %d dedup candidate(s) from %d fired patterns", len(candidates), len(kept))
|
||||
|
||||
safe, blocked := 0, 0
|
||||
for _, c := range candidates {
|
||||
sr := ScreenSupersession(>, hazards, mits, c.KeepHazardName, c.DropName)
|
||||
var verdict string
|
||||
switch {
|
||||
case sr.RecallAfter < sr.RecallBefore:
|
||||
verdict, blocked = "BLOCK (recall-load-bearing)", blocked+1
|
||||
case sr.DistinctGT:
|
||||
verdict, blocked = "BLOCK (distinct GT "+sr.KeepGT+" vs "+sr.DropGT+")", blocked+1
|
||||
default:
|
||||
verdict, safe = "RECALL-SAFE (needs semantic review)", safe+1
|
||||
}
|
||||
t.Logf("[%s] keep %s / drop %s score=%.2f recall %.1f%%->%.1f%% | %s",
|
||||
verdict, c.KeepPattern, c.DropPattern, c.Score,
|
||||
sr.RecallBefore*100, sr.RecallAfter*100, c.Rationale)
|
||||
|
||||
// The wall must be sound: Safe implies recall preserved AND not distinct.
|
||||
if sr.Safe && (sr.RecallAfter < sr.RecallBefore || sr.DistinctGT) {
|
||||
t.Errorf("screen inconsistent for drop %s: Safe but recall dropped or distinct GT", c.DropPattern)
|
||||
}
|
||||
}
|
||||
t.Logf("Proposer summary: %d RECALL-SAFE candidate(s) for human/LLM review, %d BLOCKED by the GT wall — propose-only, nothing auto-applied",
|
||||
safe, blocked)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user