// breakpilot-compliance/ai-compliance-sdk/internal/iace/proposer_dedup.go

package iace

import (
	"fmt"
	"math"
	"regexp"
	"sort"
	"strings"
)

// Offline dedup-candidate proposer (P2, type 1). DEV-TIME ONLY.
//
// It inspects the patterns that fired for one machine and proposes which look
// like duplicates, so a human (later an LLM) can decide a supersession/merge. It
// NEVER mutates the pattern library or the runtime — it only surfaces candidates.
// The deterministic GT screen (ScreenSupersession, proposer_screen.go) is the
// wall that proves a proposal is safe before a human ever sees it.
//
// Detection here is purely structural (category + zone + measure + scenario
// overlap) and therefore reproducible. Two safety rules bake in what P1 taught
// us about the dishwasher review:
//   - only patterns with the SAME primary category are ever compared;
//   - a pair with DIFFERENT operational states is NEVER proposed, because
//     normal-operation and maintenance are legitimately distinct contexts with
//     different protective measures (e.g. HP011 vs HP077). Merging them would
//     erase the maintenance view.

// DedupCandidate is a proposed near-duplicate pattern pair for one machine class.
//
// FindDedupCandidates fills the fields as follows:
//   - KeepPattern / DropPattern are the PatternIDs of the pair. The pattern with
//     the strictly higher Priority is kept; on equal priority the pattern that
//     appears earlier in the fired slice is kept.
//   - KeepName is the PatternName of the kept pattern.
//   - KeepHazardName and DropName are the ScenarioDE texts of the kept and the
//     dropped pattern respectively.
//   - Category is the primary hazard category shared by both patterns (the
//     first entry of HazardCats).
//   - ZoneJaccard, MeasureJaccard and ScenarioJaccard are the per-dimension
//     Jaccard overlaps, each rounded to two decimals.
//   - Score is 0.4*measure + 0.3*zone + 0.3*scenario computed from the unrounded
//     overlaps and then rounded to two decimals.
//   - Rationale is a human-readable one-line summary of the values above.
type DedupCandidate struct {
	KeepPattern     string  `json:"keep_pattern"` // higher-priority survivor
	DropPattern     string  `json:"drop_pattern"` // supersession target
	KeepName        string  `json:"keep_name"`
	KeepHazardName  string  `json:"keep_hazard_name"` // keep pattern ScenarioDE (for the GT-distinctness screen)
	DropName        string  `json:"drop_name"`        // == generated hazard Name (ScenarioDE) of the drop pattern
	Category        string  `json:"category"`
	ZoneJaccard     float64 `json:"zone_jaccard"`
	MeasureJaccard  float64 `json:"measure_jaccard"`
	ScenarioJaccard float64 `json:"scenario_jaccard"`
	Score           float64 `json:"score"`
	Rationale       string  `json:"rationale"`
}

// FindDedupCandidates walks every unordered pair of fired patterns and emits a
// DedupCandidate for each pair that looks like a duplicate.
//
// A pair is only scored when both patterns have the same non-empty primary
// category and the same set of operational states. The score is
// 0.4*measure + 0.3*zone + 0.3*scenario Jaccard overlap; pairs scoring below
// threshold are discarded. Within a pair the pattern with the strictly higher
// Priority survives; on a tie the one that comes first in fired survives.
//
// The result is sorted by rounded score (descending), then by drop-pattern ID
// (ascending). The sort is stable, so remaining ties keep discovery order. The
// returned slice is nil when no pair qualifies.
func FindDedupCandidates(fired []PatternMatch, threshold float64) []DedupCandidate {
	var candidates []DedupCandidate

	for i, first := range fired {
		category := primaryCat(first)
		if category == "" {
			// Without a primary category there is nothing to compare against.
			continue
		}

		for _, second := range fired[i+1:] {
			if primaryCat(second) != category {
				continue
			}
			// Differing operational states mark legitimate lifecycle variants;
			// such pairs must never be proposed for a merge.
			if !sameOpStateSet(first.OperationalStates, second.OperationalStates) {
				continue
			}

			zoneSim := tokenJaccard(zoneTokenSet(first.ZoneDE), zoneTokenSet(second.ZoneDE))
			measureSim := tokenJaccard(toSet(first.SuggestedMeasureIDs), toSet(second.SuggestedMeasureIDs))
			scenarioSim := tokenJaccard(wordTokenSet(first.ScenarioDE), wordTokenSet(second.ScenarioDE))

			combined := 0.4*measureSim + 0.3*zoneSim + 0.3*scenarioSim
			if combined < threshold {
				continue
			}

			// Default to keeping the earlier pattern; only a strictly higher
			// priority on the later one flips the roles.
			survivor, superseded := first, second
			if second.Priority > first.Priority {
				survivor, superseded = second, first
			}

			rationale := fmt.Sprintf(
				"same category %q · measure overlap %.0f%% · zone overlap %.0f%% · scenario overlap %.0f%% → keep %s (P%d), supersede %s (P%d)",
				category, measureSim*100, zoneSim*100, scenarioSim*100,
				survivor.PatternID, survivor.Priority, superseded.PatternID, superseded.Priority)

			candidates = append(candidates, DedupCandidate{
				KeepPattern:     survivor.PatternID,
				DropPattern:     superseded.PatternID,
				KeepName:        survivor.PatternName,
				KeepHazardName:  survivor.ScenarioDE,
				DropName:        superseded.ScenarioDE,
				Category:        category,
				ZoneJaccard:     round2(zoneSim),
				MeasureJaccard:  round2(measureSim),
				ScenarioJaccard: round2(scenarioSim),
				Score:           round2(combined),
				Rationale:       rationale,
			})
		}
	}

	sort.SliceStable(candidates, func(x, y int) bool {
		left, right := candidates[x], candidates[y]
		if left.Score == right.Score {
			return left.DropPattern < right.DropPattern
		}
		return left.Score > right.Score
	})
	return candidates
}

// primaryCat returns the first hazard category of pm, or "" when pm carries no
// hazard categories at all.
func primaryCat(pm PatternMatch) string {
	if cats := pm.HazardCats; len(cats) > 0 {
		return cats[0]
	}
	return ""
}

// sameOpStateSet reports whether the sets that toSet builds from a and b are
// equal: both must have the same size and every member of the first must be
// present in the second.
func sameOpStateSet(a, b []string) bool {
	left, right := toSet(a), toSet(b)
	if len(left) != len(right) {
		return false
	}
	// With equal sizes, one-directional containment implies equality.
	for state := range left {
		if right[state] {
			continue
		}
		return false
	}
	return true
}

// proposerWordSplit matches a run of one or more characters that are not
// Unicode letters (category L). wordTokenSet uses it as the word separator, so
// digits, punctuation and whitespace all split words.
var proposerWordSplit = regexp.MustCompile(`[^\p{L}]+`)

// zoneTokenSet turns a comma-separated zone description into a set of terms.
// Each term is lower-cased and stripped of surrounding whitespace; terms shorter
// than three runes (including empty ones) are discarded. The returned map is
// never nil.
func zoneTokenSet(zone string) map[string]bool {
	terms := make(map[string]bool)
	lowered := strings.ToLower(zone)
	for _, raw := range strings.Split(lowered, ",") {
		term := strings.TrimSpace(raw)
		// Length is measured in runes so umlauts count as one character.
		if len([]rune(term)) < 3 {
			continue
		}
		terms[term] = true
	}
	return terms
}

// wordTokenSet lower-cases s, splits it on proposerWordSplit and returns the set
// of resulting words that are at least four runes long. Shorter fragments
// (typically articles and connectives, plus the empty strings produced at the
// edges of the split) are dropped. The returned map is never nil.
func wordTokenSet(s string) map[string]bool {
	words := make(map[string]bool)
	for _, candidate := range proposerWordSplit.Split(strings.ToLower(s), -1) {
		// Length is measured in runes so umlauts count as one character.
		if len([]rune(candidate)) < 4 {
			continue
		}
		words[candidate] = true
	}
	return words
}

// tokenJaccard returns the Jaccard similarity |a ∩ b| / |a ∪ b| of two token
// sets, a value between 0 and 1.
//
// A key of a counts towards the intersection when b maps it to true. Two empty
// (or nil) sets yield 0 rather than 1: having no tokens on either side is
// treated as "no evidence of overlap", not as a perfect match.
func tokenJaccard(a, b map[string]bool) float64 {
	inter := 0
	for k := range a {
		if b[k] {
			inter++
		}
	}
	// inter never exceeds the size of either set, so the union is zero only
	// when both sets are empty; this single guard also prevents 0/0.
	union := len(a) + len(b) - inter
	if union == 0 {
		return 0
	}
	return float64(inter) / float64(union)
}

// round2 rounds x to two decimal places; halves are rounded away from zero, as
// math.Round does.
func round2(x float64) float64 {
	const scale = 100
	return math.Round(x*scale) / scale
}