feat(ai-sdk): offline dedup-candidate proposer + deterministic GT wall (P2 slice 1)
First thin slice of the offline library-improvement proposer. DEV-TIME ONLY, propose-only — it never mutates the pattern library or the runtime. - FindDedupCandidates (proposer_dedup.go): structural near-duplicate detection over the fired patterns (category + measure/zone/scenario overlap). Bakes in the P1 lesson: only same-category pairs compare, and pairs with different operational states are never proposed (normal-operation vs maintenance are legitimately distinct, e.g. HP011 vs HP077). - ScreenSupersession (proposer_screen.go): the wall. A proposal is safe only if (1) dropping the hazard does not reduce GT recall AND (2) keep/drop do not credit DIFFERENT GT entries. Check 2 catches distinct hazards that merely share measures (HP2201 hot surface GT 1.3 vs HP2202 hot ware GT 1.4) which recall alone would wave through. On real warewashing output: 3 candidates -> 1 BLOCKED (distinct GT), 2 RECALL-SAFE for human/LLM review (the update + winding/friction near-dupes). Nothing auto-applied. All 3 GTs unaffected (read-only). The LLM judgement and a CLI/file queue are slice 2. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -45,7 +45,7 @@ var warewashingCyberCategories = map[string]bool{
|
||||
|
||||
// warewashingEngineOutput runs the production chain and returns the filtered
|
||||
// hazards/mitigations the user would see for the UC-M.
|
||||
func warewashingEngineOutput() ([]Hazard, []Mitigation, int) {
|
||||
func warewashingEngineOutput() ([]Hazard, []Mitigation, []PatternMatch) {
|
||||
res := ParseNarrative(warewashingNarrative, "Gewerbliche Untertisch-Geschirrspuelmaschine (vernetzt)")
|
||||
|
||||
var compIDs, compNames []string
|
||||
@@ -94,7 +94,7 @@ func warewashingEngineOutput() ([]Hazard, []Mitigation, int) {
|
||||
filtered := *out
|
||||
filtered.MatchedPatterns = kept
|
||||
hazards, mitigations := patternsToHazardsAndMitigations(&filtered)
|
||||
return hazards, mitigations, len(kept)
|
||||
return hazards, mitigations, kept
|
||||
}
|
||||
|
||||
func TestWarewashing_GTCoverage(t *testing.T) {
|
||||
@@ -119,8 +119,8 @@ func TestWarewashing_GTCoverage(t *testing.T) {
|
||||
t.Logf("Parsed components: %v", cn)
|
||||
}
|
||||
|
||||
hazards, mitigations, nPatterns := warewashingEngineOutput()
|
||||
t.Logf("Engine: %d patterns kept (relevance+cyber filter) -> %d hazards", nPatterns, len(hazards))
|
||||
hazards, mitigations, keptPatterns := warewashingEngineOutput()
|
||||
t.Logf("Engine: %d patterns kept (relevance+cyber filter) -> %d hazards", len(keptPatterns), len(hazards))
|
||||
|
||||
result := CompareBenchmark(>, hazards, mitigations)
|
||||
precision := 0.0
|
||||
@@ -180,3 +180,49 @@ func TestWarewashing_GTCoverage(t *testing.T) {
|
||||
t.Errorf("warewashing recall below 40%% floor: %.1f%%", result.CoverageScore*100)
|
||||
}
|
||||
}
|
||||
|
||||
// TestWarewashing_DedupProposer exercises the offline dedup-candidate proposer
|
||||
// end-to-end on the real warewashing engine output: detect candidates, screen
|
||||
// each against the GT, and log the human-review queue. It asserts the WALL is
|
||||
// self-consistent — a PASS verdict may never coincide with a recall drop.
|
||||
func TestWarewashing_DedupProposer(t *testing.T) {
|
||||
raw, err := os.ReadFile(filepath.Join("testdata", "ground_truth_warewashing.json"))
|
||||
if err != nil {
|
||||
t.Fatalf("read GT: %v", err)
|
||||
}
|
||||
var gt GroundTruth
|
||||
if err := json.Unmarshal(raw, >); err != nil {
|
||||
t.Fatalf("parse GT: %v", err)
|
||||
}
|
||||
|
||||
hazards, mits, kept := warewashingEngineOutput()
|
||||
// 0.25 is a deliberately permissive candidate threshold: the proposer is meant
|
||||
// to over-surface, because the deterministic GT wall below (and a human, and in
|
||||
// slice 2 an LLM) is the precision filter — not the detector.
|
||||
candidates := FindDedupCandidates(kept, 0.25)
|
||||
t.Logf("Proposer: %d dedup candidate(s) from %d fired patterns", len(candidates), len(kept))
|
||||
|
||||
safe, blocked := 0, 0
|
||||
for _, c := range candidates {
|
||||
sr := ScreenSupersession(>, hazards, mits, c.KeepHazardName, c.DropName)
|
||||
var verdict string
|
||||
switch {
|
||||
case sr.RecallAfter < sr.RecallBefore:
|
||||
verdict, blocked = "BLOCK (recall-load-bearing)", blocked+1
|
||||
case sr.DistinctGT:
|
||||
verdict, blocked = "BLOCK (distinct GT "+sr.KeepGT+" vs "+sr.DropGT+")", blocked+1
|
||||
default:
|
||||
verdict, safe = "RECALL-SAFE (needs semantic review)", safe+1
|
||||
}
|
||||
t.Logf("[%s] keep %s / drop %s score=%.2f recall %.1f%%->%.1f%% | %s",
|
||||
verdict, c.KeepPattern, c.DropPattern, c.Score,
|
||||
sr.RecallBefore*100, sr.RecallAfter*100, c.Rationale)
|
||||
|
||||
// The wall must be sound: Safe implies recall preserved AND not distinct.
|
||||
if sr.Safe && (sr.RecallAfter < sr.RecallBefore || sr.DistinctGT) {
|
||||
t.Errorf("screen inconsistent for drop %s: Safe but recall dropped or distinct GT", c.DropPattern)
|
||||
}
|
||||
}
|
||||
t.Logf("Proposer summary: %d RECALL-SAFE candidate(s) for human/LLM review, %d BLOCKED by the GT wall — propose-only, nothing auto-applied",
|
||||
safe, blocked)
|
||||
}
|
||||
|
||||
@@ -0,0 +1,152 @@
|
||||
package iace
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Offline dedup-candidate proposer (P2, type 1). DEV-TIME ONLY.
|
||||
//
|
||||
// It inspects the patterns that fired for one machine and proposes which look
|
||||
// like duplicates, so a human (later an LLM) can decide a supersession/merge. It
|
||||
// NEVER mutates the pattern library or the runtime — it only surfaces candidates.
|
||||
// The deterministic GT screen (ScreenSupersession, proposer_screen.go) is the
|
||||
// wall that proves a proposal is safe before a human ever sees it.
|
||||
//
|
||||
// Detection here is purely structural (category + zone + measure + scenario
|
||||
// overlap) and therefore reproducible. Two safety rules bake in what P1 taught
|
||||
// us about the dishwasher review:
|
||||
// - only patterns with the SAME primary category are ever compared;
|
||||
// - a pair with DIFFERENT operational states is NEVER proposed, because
|
||||
// normal-operation and maintenance are legitimately distinct contexts with
|
||||
// different protective measures (e.g. HP011 vs HP077). Merging them would
|
||||
// erase the maintenance view.
|
||||
|
||||
// DedupCandidate describes one proposed near-duplicate pair of fired patterns:
// which pattern would survive, which would be superseded, and the overlap
// figures that led to the proposal.
type DedupCandidate struct {
	// Identity of the pair.
	KeepPattern    string `json:"keep_pattern"`     // PatternID of the survivor (the higher-priority pattern)
	DropPattern    string `json:"drop_pattern"`     // PatternID of the supersession target
	KeepName       string `json:"keep_name"`        // PatternName of the survivor
	KeepHazardName string `json:"keep_hazard_name"` // ScenarioDE of the survivor; input to the GT-distinctness screen
	DropName       string `json:"drop_name"`        // ScenarioDE of the drop pattern, i.e. its generated hazard name

	// Shared primary hazard category of both patterns.
	Category string `json:"category"`

	// Overlap figures, each rounded to two decimals.
	ZoneJaccard     float64 `json:"zone_jaccard"`
	MeasureJaccard  float64 `json:"measure_jaccard"`
	ScenarioJaccard float64 `json:"scenario_jaccard"`
	Score           float64 `json:"score"` // weighted combination of the three overlaps

	// Human-readable explanation of why the pair was proposed.
	Rationale string `json:"rationale"`
}
|
||||
|
||||
// FindDedupCandidates compares the fired patterns pairwise and returns near-dup
|
||||
// candidates whose combined overlap score meets threshold, deterministically
|
||||
// ordered (score desc, then drop-pattern id). The combined score weights measure
|
||||
// overlap highest (shared measures are the strongest duplicate signal), then zone
|
||||
// and scenario equally.
|
||||
func FindDedupCandidates(fired []PatternMatch, threshold float64) []DedupCandidate {
|
||||
var out []DedupCandidate
|
||||
for i := 0; i < len(fired); i++ {
|
||||
for j := i + 1; j < len(fired); j++ {
|
||||
a, b := fired[i], fired[j]
|
||||
ca := primaryCat(a)
|
||||
if ca == "" || ca != primaryCat(b) {
|
||||
continue
|
||||
}
|
||||
if !sameOpStateSet(a.OperationalStates, b.OperationalStates) {
|
||||
continue // legitimate lifecycle variants — never propose a merge
|
||||
}
|
||||
zj := tokenJaccard(zoneTokenSet(a.ZoneDE), zoneTokenSet(b.ZoneDE))
|
||||
mj := tokenJaccard(toSet(a.SuggestedMeasureIDs), toSet(b.SuggestedMeasureIDs))
|
||||
sj := tokenJaccard(wordTokenSet(a.ScenarioDE), wordTokenSet(b.ScenarioDE))
|
||||
score := 0.4*mj + 0.3*zj + 0.3*sj
|
||||
if score < threshold {
|
||||
continue
|
||||
}
|
||||
keep, drop := a, b
|
||||
if b.Priority > a.Priority {
|
||||
keep, drop = b, a
|
||||
}
|
||||
out = append(out, DedupCandidate{
|
||||
KeepPattern: keep.PatternID, DropPattern: drop.PatternID,
|
||||
KeepName: keep.PatternName, KeepHazardName: keep.ScenarioDE, DropName: drop.ScenarioDE,
|
||||
Category: ca, ZoneJaccard: round2(zj), MeasureJaccard: round2(mj),
|
||||
ScenarioJaccard: round2(sj), Score: round2(score),
|
||||
Rationale: fmt.Sprintf(
|
||||
"same category %q · measure overlap %.0f%% · zone overlap %.0f%% · scenario overlap %.0f%% → keep %s (P%d), supersede %s (P%d)",
|
||||
ca, mj*100, zj*100, sj*100, keep.PatternID, keep.Priority, drop.PatternID, drop.Priority),
|
||||
})
|
||||
}
|
||||
}
|
||||
sort.SliceStable(out, func(i, j int) bool {
|
||||
if out[i].Score != out[j].Score {
|
||||
return out[i].Score > out[j].Score
|
||||
}
|
||||
return out[i].DropPattern < out[j].DropPattern
|
||||
})
|
||||
return out
|
||||
}
|
||||
|
||||
func primaryCat(pm PatternMatch) string {
|
||||
if len(pm.HazardCats) == 0 {
|
||||
return ""
|
||||
}
|
||||
return pm.HazardCats[0]
|
||||
}
|
||||
|
||||
func sameOpStateSet(a, b []string) bool {
|
||||
sa, sb := toSet(a), toSet(b)
|
||||
if len(sa) != len(sb) {
|
||||
return false
|
||||
}
|
||||
for k := range sa {
|
||||
if !sb[k] {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// proposerWordSplit matches every run of one or more non-letter characters
// (anything outside Unicode category L); wordTokenSet splits text on it.
var proposerWordSplit = regexp.MustCompile(`[^\p{L}]+`)
|
||||
|
||||
// zoneTokenSet turns a comma-separated zone string into a set of terms. Each
// term is lower-cased and trimmed of surrounding whitespace; terms shorter than
// three runes (including empty ones) are discarded.
func zoneTokenSet(zone string) map[string]bool {
	terms := make(map[string]bool)
	lowered := strings.ToLower(zone)
	for _, raw := range strings.Split(lowered, ",") {
		term := strings.TrimSpace(raw)
		// Length is measured in runes so umlauts count as one character.
		if len([]rune(term)) < 3 {
			continue
		}
		terms[term] = true
	}
	return terms
}
|
||||
|
||||
// wordTokenSet tokenises free text into words of length >= 4 (drops connectives).
|
||||
func wordTokenSet(s string) map[string]bool {
|
||||
out := map[string]bool{}
|
||||
for _, w := range proposerWordSplit.Split(strings.ToLower(s), -1) {
|
||||
if len([]rune(w)) >= 4 {
|
||||
out[w] = true
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// tokenJaccard returns |a ∩ b| / |a ∪ b| for two token sets. A key counts as
// shared only when it is present in a and maps to true in b. When the union is
// empty (both sets empty or nil) the result is 0 rather than NaN.
func tokenJaccard(a, b map[string]bool) float64 {
	shared := 0
	for token := range a {
		if b[token] {
			shared++
		}
	}

	// Inclusion–exclusion; this is zero only when both sets are empty.
	total := len(a) + len(b) - shared
	if total == 0 {
		return 0
	}
	return float64(shared) / float64(total)
}
|
||||
|
||||
// round2 rounds x to two decimal places using math.Round (halves round away
// from zero).
func round2(x float64) float64 {
	const scale = 100
	return math.Round(x*scale) / scale
}
|
||||
@@ -0,0 +1,67 @@
|
||||
package iace
|
||||
|
||||
import "testing"
|
||||
|
||||
func mkPM(id, cat, zone, scenario string, prio int, measures, opstates []string) PatternMatch {
|
||||
return PatternMatch{
|
||||
PatternID: id, PatternName: id, Priority: prio,
|
||||
HazardCats: []string{cat}, ZoneDE: zone, ScenarioDE: scenario,
|
||||
SuggestedMeasureIDs: measures, OperationalStates: opstates,
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindDedupCandidates_FindsOverlappingPair(t *testing.T) {
|
||||
fired := []PatternMatch{
|
||||
mkPM("HPa", "update_failure", "Steuerung, SPS", "Software-Update der Steuerung scheitert nach Abbruch", 80,
|
||||
[]string{"M138", "M146"}, nil),
|
||||
mkPM("HPb", "update_failure", "Steuerung, Antriebsregler", "Software-Update der Steuerung schlaegt fehl", 75,
|
||||
[]string{"M138", "M146", "M141"}, nil),
|
||||
mkPM("HPc", "mechanical_hazard", "Tuer", "Quetschen der Finger an der Tuer", 70,
|
||||
[]string{"M003"}, nil),
|
||||
}
|
||||
got := FindDedupCandidates(fired, 0.4)
|
||||
if len(got) != 1 {
|
||||
t.Fatalf("want 1 candidate, got %d: %+v", len(got), got)
|
||||
}
|
||||
// Higher-priority pattern survives, lower one is the drop target.
|
||||
if got[0].KeepPattern != "HPa" || got[0].DropPattern != "HPb" {
|
||||
t.Errorf("want keep HPa / drop HPb, got keep %s / drop %s", got[0].KeepPattern, got[0].DropPattern)
|
||||
}
|
||||
if got[0].DropName != "Software-Update der Steuerung schlaegt fehl" {
|
||||
t.Errorf("DropName must equal drop pattern ScenarioDE, got %q", got[0].DropName)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindDedupCandidates_LifecycleGuard(t *testing.T) {
|
||||
// Same category, zone and measures — but normal-operation vs maintenance.
|
||||
// These are legitimate variants (HP011 vs HP077) and must NOT be proposed.
|
||||
fired := []PatternMatch{
|
||||
mkPM("HP011", "electrical_hazard", "Schaltschrank, Klemmenkasten", "Person beruehrt spannungsfuehrende Teile", 95,
|
||||
[]string{"M481", "M482"}, nil),
|
||||
mkPM("HP077", "electrical_hazard", "Schaltschrank, Klemmenkasten", "Person beruehrt spannungsfuehrende Teile", 80,
|
||||
[]string{"M481", "M482"}, []string{"maintenance"}),
|
||||
}
|
||||
if got := FindDedupCandidates(fired, 0.4); len(got) != 0 {
|
||||
t.Fatalf("lifecycle guard failed: want 0 candidates, got %d: %+v", len(got), got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindDedupCandidates_DifferentCategoryIgnored(t *testing.T) {
|
||||
fired := []PatternMatch{
|
||||
mkPM("HPa", "thermal_hazard", "Boiler", "Heisse Oberflaeche am Boiler", 80, []string{"M071"}, nil),
|
||||
mkPM("HPb", "mechanical_hazard", "Boiler", "Heisse Oberflaeche am Boiler", 80, []string{"M071"}, nil),
|
||||
}
|
||||
if got := FindDedupCandidates(fired, 0.3); len(got) != 0 {
|
||||
t.Fatalf("cross-category pair must not be proposed, got %d", len(got))
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindDedupCandidates_BelowThresholdDropped(t *testing.T) {
|
||||
fired := []PatternMatch{
|
||||
mkPM("HPa", "mechanical_hazard", "Tuer", "Quetschen an der Tuer", 80, []string{"M003"}, nil),
|
||||
mkPM("HPb", "mechanical_hazard", "Foerderband", "Einzug am Foerderband", 80, []string{"M540"}, nil),
|
||||
}
|
||||
if got := FindDedupCandidates(fired, 0.4); len(got) != 0 {
|
||||
t.Fatalf("disjoint pair must be below threshold, got %d: %+v", len(got), got)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,61 @@
|
||||
package iace
|
||||
|
||||
import "github.com/google/uuid"
|
||||
|
||||
// ScreenResult holds the outcome of screening one proposed supersession against
// the ground truth (GT).
type ScreenResult struct {
	// Coverage score with all hazards present, and with the drop hazard removed.
	RecallBefore float64 `json:"recall_before"`
	RecallAfter  float64 `json:"recall_after"`

	// GT entry numbers credited by the keeper and by the drop hazard; empty
	// when the respective hazard matched no GT entry.
	KeepGT string `json:"keep_gt,omitempty"`
	DropGT string `json:"drop_gt,omitempty"`

	// DistinctGT is true when keeper and drop each credit a GT entry and the
	// two entries differ, i.e. they are distinct hazards.
	DistinctGT bool `json:"distinct_gt"`

	// Safe is true only when recall is preserved and DistinctGT is false.
	Safe bool `json:"safe"`
}
|
||||
|
||||
// ScreenSupersession is the WALL between "propose" and "decide". A proposal is
|
||||
// safe only if BOTH deterministic checks pass:
|
||||
//
|
||||
// 1. RECALL is not reduced when the drop-hazard (and its mitigations) is removed
|
||||
// — otherwise the drop is load-bearing for GT coverage.
|
||||
// 2. The two hazards do NOT credit DIFFERENT ground-truth entries. Recall alone
|
||||
// is necessary but not sufficient: two genuinely distinct hazards that share
|
||||
// the same measures (e.g. hot boiler surface vs hot ware on unloading) keep
|
||||
// recall at 100% when one is dropped, yet must NOT be merged. If keep and
|
||||
// drop each match a different GT entry, they are distinct.
|
||||
//
|
||||
// Whatever survives both is still only RECALL-SAFE — a candidate for a human (and
|
||||
// in slice 2, an LLM) to confirm semantically. Deterministic; reuses
|
||||
// CompareBenchmark; touches neither the library nor the runtime.
|
||||
func ScreenSupersession(gt *GroundTruth, hazards []Hazard, mits []Mitigation, keepHazardName, dropHazardName string) ScreenResult {
|
||||
before := CompareBenchmark(gt, hazards, mits)
|
||||
|
||||
gtOf := map[string]string{}
|
||||
for _, p := range before.MatchedPairs {
|
||||
gtOf[p.EngineHazard.Name] = p.GTEntry.Nr
|
||||
}
|
||||
keepGT, dropGT := gtOf[keepHazardName], gtOf[dropHazardName]
|
||||
distinct := keepGT != "" && dropGT != "" && keepGT != dropGT
|
||||
|
||||
kept := make([]Hazard, 0, len(hazards))
|
||||
dropped := map[uuid.UUID]bool{}
|
||||
for _, h := range hazards {
|
||||
if h.Name == dropHazardName {
|
||||
dropped[h.ID] = true
|
||||
continue
|
||||
}
|
||||
kept = append(kept, h)
|
||||
}
|
||||
keptMits := make([]Mitigation, 0, len(mits))
|
||||
for _, m := range mits {
|
||||
if !dropped[m.HazardID] {
|
||||
keptMits = append(keptMits, m)
|
||||
}
|
||||
}
|
||||
after := CompareBenchmark(gt, kept, keptMits)
|
||||
|
||||
return ScreenResult{
|
||||
RecallBefore: before.CoverageScore, RecallAfter: after.CoverageScore,
|
||||
KeepGT: keepGT, DropGT: dropGT, DistinctGT: distinct,
|
||||
Safe: after.CoverageScore >= before.CoverageScore && !distinct,
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user