package iace

import (
	"fmt"
	"math"
	"regexp"
	"sort"
	"strings"
)

// Offline dedup-candidate proposer (P2, type 1). DEV-TIME ONLY.
//
// It inspects the patterns that fired for one machine and proposes which look
// like duplicates, so a human (later an LLM) can decide a supersession/merge. It
// NEVER mutates the pattern library or the runtime — it only surfaces candidates.
// The deterministic GT screen (ScreenSupersession, proposer_screen.go) is the
// wall that proves a proposal is safe before a human ever sees it.
//
// Detection here is purely structural (category + zone + measure + scenario
// overlap) and therefore reproducible. Two safety rules bake in what P1 taught
// us about the dishwasher review:
//   - only patterns with the SAME primary category are ever compared;
//   - a pair with DIFFERENT operational states is NEVER proposed, because
//     normal-operation and maintenance are legitimately distinct contexts with
//     different protective measures (e.g. HP011 vs HP077). Merging them would
//     erase the maintenance view.

// DedupCandidate is a proposed near-duplicate pattern pair for one machine class.
// All overlap values and the score are stored rounded to two decimals.
type DedupCandidate struct {
	KeepPattern     string  `json:"keep_pattern"`     // pattern ID of the higher-priority survivor
	DropPattern     string  `json:"drop_pattern"`     // pattern ID of the supersession target
	KeepName        string  `json:"keep_name"`        // PatternName of the keep pattern
	KeepHazardName  string  `json:"keep_hazard_name"` // keep pattern ScenarioDE (for the GT-distinctness screen)
	DropName        string  `json:"drop_name"`        // == generated hazard Name (ScenarioDE) of the drop pattern
	Category        string  `json:"category"`         // primary hazard category shared by both patterns
	ZoneJaccard     float64 `json:"zone_jaccard"`     // Jaccard overlap of the zone terms
	MeasureJaccard  float64 `json:"measure_jaccard"`  // Jaccard overlap of the suggested measure IDs
	ScenarioJaccard float64 `json:"scenario_jaccard"` // Jaccard overlap of the scenario words
	Score           float64 `json:"score"`            // 0.4*measure + 0.3*zone + 0.3*scenario
	Rationale       string  `json:"rationale"`        // human-readable summary of the overlaps and the keep/supersede decision
}

// FindDedupCandidates compares the fired patterns pairwise and returns near-dup
// candidates whose combined overlap score meets threshold, deterministically
// ordered (score desc, then drop-pattern id).
The combined score weights measure // overlap highest (shared measures are the strongest duplicate signal), then zone // and scenario equally. func FindDedupCandidates(fired []PatternMatch, threshold float64) []DedupCandidate { var out []DedupCandidate for i := 0; i < len(fired); i++ { for j := i + 1; j < len(fired); j++ { a, b := fired[i], fired[j] ca := primaryCat(a) if ca == "" || ca != primaryCat(b) { continue } if !sameOpStateSet(a.OperationalStates, b.OperationalStates) { continue // legitimate lifecycle variants — never propose a merge } zj := tokenJaccard(zoneTokenSet(a.ZoneDE), zoneTokenSet(b.ZoneDE)) mj := tokenJaccard(toSet(a.SuggestedMeasureIDs), toSet(b.SuggestedMeasureIDs)) sj := tokenJaccard(wordTokenSet(a.ScenarioDE), wordTokenSet(b.ScenarioDE)) score := 0.4*mj + 0.3*zj + 0.3*sj if score < threshold { continue } keep, drop := a, b if b.Priority > a.Priority { keep, drop = b, a } out = append(out, DedupCandidate{ KeepPattern: keep.PatternID, DropPattern: drop.PatternID, KeepName: keep.PatternName, KeepHazardName: keep.ScenarioDE, DropName: drop.ScenarioDE, Category: ca, ZoneJaccard: round2(zj), MeasureJaccard: round2(mj), ScenarioJaccard: round2(sj), Score: round2(score), Rationale: fmt.Sprintf( "same category %q · measure overlap %.0f%% · zone overlap %.0f%% · scenario overlap %.0f%% → keep %s (P%d), supersede %s (P%d)", ca, mj*100, zj*100, sj*100, keep.PatternID, keep.Priority, drop.PatternID, drop.Priority), }) } } sort.SliceStable(out, func(i, j int) bool { if out[i].Score != out[j].Score { return out[i].Score > out[j].Score } return out[i].DropPattern < out[j].DropPattern }) return out } func primaryCat(pm PatternMatch) string { if len(pm.HazardCats) == 0 { return "" } return pm.HazardCats[0] } func sameOpStateSet(a, b []string) bool { sa, sb := toSet(a), toSet(b) if len(sa) != len(sb) { return false } for k := range sa { if !sb[k] { return false } } return true } var proposerWordSplit = regexp.MustCompile(`[^\p{L}]+`) // zoneTokenSet splits a 
comma-separated zone string into its component terms. func zoneTokenSet(zone string) map[string]bool { out := map[string]bool{} for _, part := range strings.Split(strings.ToLower(zone), ",") { if t := strings.TrimSpace(part); len([]rune(t)) >= 3 { out[t] = true } } return out } // wordTokenSet tokenises free text into words of length >= 4 (drops connectives). func wordTokenSet(s string) map[string]bool { out := map[string]bool{} for _, w := range proposerWordSplit.Split(strings.ToLower(s), -1) { if len([]rune(w)) >= 4 { out[w] = true } } return out } func tokenJaccard(a, b map[string]bool) float64 { if len(a) == 0 && len(b) == 0 { return 0 } inter := 0 for k := range a { if b[k] { inter++ } } union := len(a) + len(b) - inter if union == 0 { return 0 } return float64(inter) / float64(union) } func round2(x float64) float64 { return math.Round(x*100) / 100 }