8674b2cd9a
First thin slice of the offline library-improvement proposer. DEV-TIME ONLY, propose-only — it never mutates the pattern library or the runtime. - FindDedupCandidates (proposer_dedup.go): structural near-duplicate detection over the fired patterns (category + measure/zone/scenario overlap). Bakes in the P1 lesson: only same-category pairs compare, and pairs with different operational states are never proposed (normal-operation vs maintenance are legitimately distinct, e.g. HP011 vs HP077). - ScreenSupersession (proposer_screen.go): the wall. A proposal is safe only if (1) dropping the hazard does not reduce GT recall AND (2) keep/drop do not credit DIFFERENT GT entries. Check 2 catches distinct hazards that merely share measures (HP2201 hot surface GT 1.3 vs HP2202 hot ware GT 1.4) which recall alone would wave through. On real warewashing output: 3 candidates -> 1 BLOCKED (distinct GT), 2 RECALL-SAFE for human/LLM review (the update + winding/friction near-dupes). Nothing auto-applied. All 3 GTs unaffected (read-only). The LLM judgement and a CLI/file queue are slice 2. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
153 lines
5.0 KiB
Go
153 lines
5.0 KiB
Go
package iace
|
|
|
|
import (
|
|
"fmt"
|
|
"math"
|
|
"regexp"
|
|
"sort"
|
|
"strings"
|
|
)
|
|
|
|
// Offline dedup-candidate proposer (P2, type 1). DEV-TIME ONLY.
|
|
//
|
|
// It inspects the patterns that fired for one machine and proposes which look
|
|
// like duplicates, so a human (later an LLM) can decide a supersession/merge. It
|
|
// NEVER mutates the pattern library or the runtime — it only surfaces candidates.
|
|
// The deterministic GT screen (ScreenSupersession, proposer_screen.go) is the
|
|
// wall that proves a proposal is safe before a human ever sees it.
|
|
//
|
|
// Detection here is purely structural (category + zone + measure + scenario
|
|
// overlap) and therefore reproducible. Two safety rules bake in what P1 taught
|
|
// us about the dishwasher review:
|
|
// - only patterns with the SAME primary category are ever compared;
|
|
// - a pair with DIFFERENT operational states is NEVER proposed, because
|
|
// normal-operation and maintenance are legitimately distinct contexts with
|
|
// different protective measures (e.g. HP011 vs HP077). Merging them would
|
|
// erase the maintenance view.
|
|
|
|
// DedupCandidate is a proposed near-duplicate pattern pair for one machine class.
//
// FindDedupCandidates fills it from two fired patterns: "keep" is the pattern
// with the higher Priority (the first of the pair when priorities are equal)
// and "drop" is the other one. The three overlap values and Score are stored
// rounded to two decimals.
type DedupCandidate struct {
	KeepPattern     string  `json:"keep_pattern"`     // higher-priority survivor (its PatternID)
	DropPattern     string  `json:"drop_pattern"`     // supersession target (its PatternID)
	KeepName        string  `json:"keep_name"`        // PatternName of the keep pattern
	KeepHazardName  string  `json:"keep_hazard_name"` // keep pattern ScenarioDE (for the GT-distinctness screen)
	DropName        string  `json:"drop_name"`        // == generated hazard Name (ScenarioDE) of the drop pattern
	Category        string  `json:"category"`         // primary hazard category shared by both patterns
	ZoneJaccard     float64 `json:"zone_jaccard"`     // Jaccard overlap of the comma-separated ZoneDE terms
	MeasureJaccard  float64 `json:"measure_jaccard"`  // Jaccard overlap of the SuggestedMeasureIDs
	ScenarioJaccard float64 `json:"scenario_jaccard"` // Jaccard overlap of the ScenarioDE word tokens
	Score           float64 `json:"score"`            // 0.4*measure + 0.3*zone + 0.3*scenario, from the unrounded overlaps
	Rationale       string  `json:"rationale"`        // human-readable summary of the overlaps and the keep/drop decision
}
|
|
|
|
// FindDedupCandidates compares the fired patterns pairwise and returns near-dup
|
|
// candidates whose combined overlap score meets threshold, deterministically
|
|
// ordered (score desc, then drop-pattern id). The combined score weights measure
|
|
// overlap highest (shared measures are the strongest duplicate signal), then zone
|
|
// and scenario equally.
|
|
func FindDedupCandidates(fired []PatternMatch, threshold float64) []DedupCandidate {
|
|
var out []DedupCandidate
|
|
for i := 0; i < len(fired); i++ {
|
|
for j := i + 1; j < len(fired); j++ {
|
|
a, b := fired[i], fired[j]
|
|
ca := primaryCat(a)
|
|
if ca == "" || ca != primaryCat(b) {
|
|
continue
|
|
}
|
|
if !sameOpStateSet(a.OperationalStates, b.OperationalStates) {
|
|
continue // legitimate lifecycle variants — never propose a merge
|
|
}
|
|
zj := tokenJaccard(zoneTokenSet(a.ZoneDE), zoneTokenSet(b.ZoneDE))
|
|
mj := tokenJaccard(toSet(a.SuggestedMeasureIDs), toSet(b.SuggestedMeasureIDs))
|
|
sj := tokenJaccard(wordTokenSet(a.ScenarioDE), wordTokenSet(b.ScenarioDE))
|
|
score := 0.4*mj + 0.3*zj + 0.3*sj
|
|
if score < threshold {
|
|
continue
|
|
}
|
|
keep, drop := a, b
|
|
if b.Priority > a.Priority {
|
|
keep, drop = b, a
|
|
}
|
|
out = append(out, DedupCandidate{
|
|
KeepPattern: keep.PatternID, DropPattern: drop.PatternID,
|
|
KeepName: keep.PatternName, KeepHazardName: keep.ScenarioDE, DropName: drop.ScenarioDE,
|
|
Category: ca, ZoneJaccard: round2(zj), MeasureJaccard: round2(mj),
|
|
ScenarioJaccard: round2(sj), Score: round2(score),
|
|
Rationale: fmt.Sprintf(
|
|
"same category %q · measure overlap %.0f%% · zone overlap %.0f%% · scenario overlap %.0f%% → keep %s (P%d), supersede %s (P%d)",
|
|
ca, mj*100, zj*100, sj*100, keep.PatternID, keep.Priority, drop.PatternID, drop.Priority),
|
|
})
|
|
}
|
|
}
|
|
sort.SliceStable(out, func(i, j int) bool {
|
|
if out[i].Score != out[j].Score {
|
|
return out[i].Score > out[j].Score
|
|
}
|
|
return out[i].DropPattern < out[j].DropPattern
|
|
})
|
|
return out
|
|
}
|
|
|
|
func primaryCat(pm PatternMatch) string {
|
|
if len(pm.HazardCats) == 0 {
|
|
return ""
|
|
}
|
|
return pm.HazardCats[0]
|
|
}
|
|
|
|
func sameOpStateSet(a, b []string) bool {
|
|
sa, sb := toSet(a), toSet(b)
|
|
if len(sa) != len(sb) {
|
|
return false
|
|
}
|
|
for k := range sa {
|
|
if !sb[k] {
|
|
return false
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
|
|
// proposerWordSplit matches one or more consecutive characters that are not
// Unicode letters (\p{L}). wordTokenSet uses it as the delimiter when splitting
// free text into words, so digits and punctuation separate words as well.
var proposerWordSplit = regexp.MustCompile(`[^\p{L}]+`)
|
|
|
|
// zoneTokenSet turns a comma-separated zone string into a set of its terms.
// The input is lower-cased, each term is trimmed of surrounding whitespace, and
// terms shorter than three runes (including empty ones) are discarded. The
// returned map is never nil.
func zoneTokenSet(zone string) map[string]bool {
	const minTermRunes = 3

	terms := make(map[string]bool)
	for _, raw := range strings.Split(strings.ToLower(zone), ",") {
		term := strings.TrimSpace(raw)
		if len([]rune(term)) < minTermRunes {
			continue
		}
		terms[term] = true
	}
	return terms
}
|
|
|
|
// wordTokenSet tokenises free text into words of length >= 4 (drops connectives).
|
|
func wordTokenSet(s string) map[string]bool {
|
|
out := map[string]bool{}
|
|
for _, w := range proposerWordSplit.Split(strings.ToLower(s), -1) {
|
|
if len([]rune(w)) >= 4 {
|
|
out[w] = true
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
// tokenJaccard returns the Jaccard index |a ∩ b| / |a ∪ b| of two token sets.
// Two empty (or nil) sets yield 0 rather than 1, so missing data never counts
// as overlap.
func tokenJaccard(a, b map[string]bool) float64 {
	sizeA, sizeB := len(a), len(b)
	if sizeA+sizeB == 0 {
		return 0
	}

	shared := 0
	for tok := range a {
		if b[tok] {
			shared++
		}
	}

	// At least one set is non-empty here and shared never exceeds either
	// size, so the union size below is always positive.
	return float64(shared) / float64(sizeA+sizeB-shared)
}
|
|
|
|
// round2 rounds x to two decimal places; halves are rounded away from zero,
// as math.Round does.
func round2(x float64) float64 {
	const scale = 100
	return math.Round(x*scale) / scale
}
|