breakpilot-compliance/ai-compliance-sdk/internal/iace/audit/vocabulary.go

package audit

import (
	"regexp"
	"sort"
	"strings"

	"github.com/breakpilot/ai-compliance-sdk/internal/iace"
)

// init installs runVocabulary as the implementation behind the
// runVocabularyImpl hook. The hook variable is not declared in this
// file — presumably it lives elsewhere in package audit; confirm there.
//
// The installed check takes a limits-form payload (the structured
// machine description filled in by the engineer) and asks: which of its
// words are unknown to the keyword dictionary yet appear in any
// pattern's scenario/trigger/harm/zone/name text? Each such word is a
// dictionary gap — the engineer typed a term that some pattern is
// waiting for, but the parser cannot translate it into a tag.
func init() {
	runVocabularyImpl = runVocabulary
}

// tokenRE matches candidate words: runs of at least four letters drawn
// from ASCII a-z/A-Z plus the German ä, ö, ü, ß, Ä, Ö, Ü. Any other
// character (digits, hyphens, whitespace, punctuation) ends a token.
var tokenRE = regexp.MustCompile(`[a-zäöüßA-ZÄÖÜ]{4,}`)

// stopWords lists German and English filler words that show up in any
// narrative but carry no engineering meaning. Kept short on purpose —
// we only want to drop obvious filler.
//
// Keys are lowercase. German words containing umlauts or ß are listed
// in both the transliterated spelling ("ueber") and the native spelling
// ("über"): tokens are only lowercased before the lookup, never
// transliterated, so text typed with real umlauts would otherwise slip
// past the filter.
var stopWords = map[string]bool{
	"oder": true, "und": true, "auch": true, "wenn": true, "wird": true,
	"werden": true, "kann": true, "koennen": true, "soll": true, "muss": true,
	"sind": true, "eine": true, "einer": true, "einem": true, "einen": true,
	"diese": true, "dieser": true, "dieses": true, "diesem": true, "diesen": true,
	"durch": true, "nach": true, "ueber": true, "unter": true, "zwischen": true,
	"nicht": true, "ohne": true, "fuer": true, "bzw": true, "etc": true,
	"sowie": true, "siehe": true, "etwa": true, "ggf": true, "the": true,
	"with": true, "from": true, "this": true, "that": true, "have": true,
	"insbesondere": true, "ausschliesslich": true, "ebenfalls": true,
	"jeweils": true, "weitere": true, "weiteren": true, "weiterer": true,
	// Native-spelling counterparts of the transliterated entries above.
	"können": true, "über": true, "für": true, "ausschließlich": true,
}

// runVocabulary audits the words of a limits-form payload against the
// keyword dictionary and the hazard patterns.
//
// form is either the payload wrapper holding the inner object under the
// "limits_form" key, or the inner object itself; when the key is missing
// or is not a map, form is scanned directly.
//
// The returned report contains:
//   - UniqueTokens: the number of distinct lowercase tokens found,
//     counted before stop words are removed;
//   - KnownTokens / UnknownTokens: the non-stop-word tokens, split by
//     whether dictTokenHit recognizes them, each sorted alphabetically;
//   - SuggestedDictionaryEntries: every unknown token that at least one
//     pattern mentions, ordered by number of mentioning patterns
//     (descending), with ties kept in alphabetical token order.
func runVocabulary(form map[string]any) VocabularyReport {
	limits, ok := form["limits_form"].(map[string]any)
	if !ok {
		// Form may already be the inner object
		limits = form
	}

	tokens := map[string]bool{}
	for _, v := range limits {
		extractTokens(v, tokens)
	}
	report := VocabularyReport{UniqueTokens: len(tokens)}

	dictTokens := dictionaryVocabulary()

	for tok := range tokens {
		if stopWords[tok] {
			continue
		}
		if dictTokenHit(tok, dictTokens) {
			report.KnownTokens = append(report.KnownTokens, tok)
		} else {
			report.UnknownTokens = append(report.UnknownTokens, tok)
		}
	}
	// Map iteration order is random; sorting makes the report reproducible.
	sort.Strings(report.KnownTokens)
	sort.Strings(report.UnknownTokens)

	// For each unknown token check if any pattern names it
	patterns := iace.AllPatterns()
	for _, tok := range report.UnknownTokens {
		hits := patternsMentioning(tok, patterns)
		if len(hits) == 0 {
			continue
		}
		report.SuggestedDictionaryEntries = append(report.SuggestedDictionaryEntries, DictionarySuggestion{
			Token:      tok,
			PatternIDs: hits,
		})
	}
	// Suggestions were appended in alphabetical token order. A stable sort
	// keeps that order among entries with the same hit count, which is
	// common because the hit list per token is capped.
	sort.SliceStable(report.SuggestedDictionaryEntries, func(i, j int) bool {
		return len(report.SuggestedDictionaryEntries[i].PatternIDs) > len(report.SuggestedDictionaryEntries[j].PatternIDs)
	})
	return report
}

// extractTokens walks v recursively and records every word matched by
// tokenRE, lowercased, as a key in out. Strings are tokenized directly;
// []any and map[string]any values are descended into (map keys are not
// tokenized). Values of any other type are ignored.
func extractTokens(v any, out map[string]bool) {
	switch val := v.(type) {
	case map[string]any:
		for key := range val {
			extractTokens(val[key], out)
		}
	case []any:
		for i := range val {
			extractTokens(val[i], out)
		}
	case string:
		words := tokenRE.FindAllString(val, -1)
		for i := range words {
			lowered := strings.ToLower(words[i])
			out[lowered] = true
		}
	}
}

// dictionaryVocabulary collects every keyword of every entry returned
// by iace.GetKeywordDictionary into a set, lowercased. No further
// normalization is applied here: keywords appear in the set exactly as
// the dictionary spells them, apart from case.
func dictionaryVocabulary() map[string]bool {
	vocab := make(map[string]bool)
	for _, entry := range iace.GetKeywordDictionary() {
		for _, word := range entry.Keywords {
			lowered := strings.ToLower(word)
			vocab[lowered] = true
		}
	}
	return vocab
}

// dictTokenHit reports whether tok would be matched by any dictionary
// entry. Dictionary entries can be substrings, so the dict is treated
// as a set of stem-like matchers: a token is "known" if it equals a
// dict word, contains a dict word as a substring, or is itself a
// substring of a dict word.
//
// Empty strings never count as a match on either side. strings.Contains
// reports true for an empty substring, so without these guards a single
// empty dictionary keyword would mark every token as known, and an
// empty token would match any dictionary word.
func dictTokenHit(tok string, dict map[string]bool) bool {
	if tok == "" {
		return false
	}
	if dict[tok] {
		return true
	}
	for d := range dict {
		if d == "" {
			continue
		}
		if strings.Contains(tok, d) || strings.Contains(d, tok) {
			return true
		}
	}
	return false
}

// patternsMentioning lists the IDs of patterns whose German scenario,
// trigger, harm, zone or name text contains tok as a case-insensitive
// substring. IDs are returned in the order the patterns are given, each
// ID at most once, and the scan stops once 8 IDs have been collected.
// The result is nil when no pattern mentions the token.
func patternsMentioning(tok string, patterns []iace.HazardPattern) []string {
	const maxHits = 8

	needle := strings.ToLower(tok)
	var ids []string
	recorded := make(map[string]bool)
	for i := range patterns {
		p := &patterns[i]
		text := strings.ToLower(p.ScenarioDE + " " + p.TriggerDE + " " + p.HarmDE + " " + p.ZoneDE + " " + p.NameDE)
		if !strings.Contains(text, needle) || recorded[p.ID] {
			continue
		}
		recorded[p.ID] = true
		ids = append(ids, p.ID)
		if len(ids) >= maxHits {
			break
		}
	}
	return ids
}