package audit

import (
	"regexp"
	"sort"
	"strings"

	"github.com/breakpilot/ai-compliance-sdk/internal/iace"
)

// init installs runVocabulary as the implementation behind the
// runVocabularyImpl hook (declared elsewhere in this package).
//
// runVocabulary takes a limits-form payload (the structured machine
// description filled in by the engineer) and asks: which of its words
// are unknown to the keyword dictionary yet appear in any pattern's
// scenario/trigger/harm/zone text? Each such word is a dictionary gap:
// the engineer typed a term that some pattern is waiting for, but the
// parser cannot translate it into a tag.
func init() { runVocabularyImpl = runVocabulary }

// tokenRE matches runs of at least four ASCII or German letters. Shorter
// words never become tokens.
var tokenRE = regexp.MustCompile(`[a-zäöüßA-ZÄÖÜ]{4,}`)

// vocabUmlautFolder rewrites lowercase German umlauts and sharp s to their
// ASCII transliteration.
// NOTE(review): assumes the parser normalizes ä→ae, ö→oe, ü→ue, ß→ss, as the
// spelling of the stop words below suggests. Confirm against the iace parser.
var vocabUmlautFolder = strings.NewReplacer("ä", "ae", "ö", "oe", "ü", "ue", "ß", "ss")

// vocabFoldUmlauts returns s with lowercase umlauts transliterated to ASCII.
// Callers pass already-lowercased text.
func vocabFoldUmlauts(s string) string {
	return vocabUmlautFolder.Replace(s)
}

// German + English stop words that show up in any narrative but carry
// no engineering meaning. Kept short on purpose: we only want to drop
// obvious filler.
//
// Entries are spelled in transliterated form ("ueber", "koennen"), so lookups
// must also try the umlaut-folded token. Entries shorter than four letters
// ("und", "the", "bzw", ...) can never match because tokenRE requires at
// least four letters; they are kept for completeness.
var stopWords = map[string]bool{
	"oder": true, "und": true, "auch": true, "wenn": true, "wird": true,
	"werden": true, "kann": true, "koennen": true, "soll": true, "muss": true,
	"sind": true, "eine": true, "einer": true, "einem": true, "einen": true,
	"diese": true, "dieser": true, "dieses": true, "diesem": true, "diesen": true,
	"durch": true, "nach": true, "ueber": true, "unter": true, "zwischen": true,
	"nicht": true, "ohne": true, "fuer": true, "bzw": true, "etc": true,
	"sowie": true, "siehe": true, "etwa": true, "ggf": true,
	"the": true, "with": true, "from": true, "this": true, "that": true, "have": true,
	"insbesondere": true, "ausschliesslich": true, "ebenfalls": true,
	"jeweils": true, "weitere": true, "weiteren": true, "weiterer": true,
}

// runVocabulary audits the words of a limits form against the keyword
// dictionary.
//
// form is either an object with a "limits_form" key holding the form, or the
// form object itself. Every string found anywhere inside it is tokenized.
//
// The returned report contains:
//   - UniqueTokens: the number of distinct lowercase tokens, stop words included;
//   - KnownTokens / UnknownTokens: the non-stop-word tokens, sorted, split by
//     whether the dictionary matches them in raw or umlaut-folded form;
//   - SuggestedDictionaryEntries: unknown tokens that at least one pattern
//     mentions, most-mentioned first, ties in alphabetical token order.
func runVocabulary(form map[string]any) VocabularyReport {
	limits, ok := form["limits_form"].(map[string]any)
	if !ok {
		// Form may already be the inner object.
		limits = form
	}

	tokens := map[string]bool{}
	for _, v := range limits {
		extractTokens(v, tokens)
	}

	report := VocabularyReport{UniqueTokens: len(tokens)}
	dictTokens := dictionaryVocabulary()
	for tok := range tokens {
		// Stop words and dictionary keywords may be spelled without
		// umlauts, so test the transliterated form as well.
		folded := vocabFoldUmlauts(tok)
		if stopWords[tok] || stopWords[folded] {
			continue
		}
		known := dictTokenHit(tok, dictTokens) ||
			(folded != tok && dictTokenHit(folded, dictTokens))
		if known {
			report.KnownTokens = append(report.KnownTokens, tok)
			continue
		}
		report.UnknownTokens = append(report.UnknownTokens, tok)
	}
	// Map iteration order is random; sort for deterministic output.
	sort.Strings(report.KnownTokens)
	sort.Strings(report.UnknownTokens)

	// For each unknown token check if any pattern names it.
	patterns := iace.AllPatterns()
	for _, tok := range report.UnknownTokens {
		hits := patternsMentioning(tok, patterns)
		if len(hits) == 0 {
			continue
		}
		report.SuggestedDictionaryEntries = append(report.SuggestedDictionaryEntries, DictionarySuggestion{
			Token:      tok,
			PatternIDs: hits,
		})
	}
	// Stable sort: entries with the same hit count keep the alphabetical
	// order inherited from UnknownTokens.
	sort.SliceStable(report.SuggestedDictionaryEntries, func(i, j int) bool {
		return len(report.SuggestedDictionaryEntries[i].PatternIDs) >
			len(report.SuggestedDictionaryEntries[j].PatternIDs)
	})
	return report
}

// extractTokens walks v recursively and records every lowercase token
// (as matched by tokenRE) found in string values into out. Only strings,
// []any and map[string]any are traversed; map keys and values of any other
// type are ignored.
func extractTokens(v any, out map[string]bool) {
	switch x := v.(type) {
	case string:
		for _, m := range tokenRE.FindAllString(x, -1) {
			out[strings.ToLower(m)] = true
		}
	case []any:
		for _, e := range x {
			extractTokens(e, out)
		}
	case map[string]any:
		for _, e := range x {
			extractTokens(e, out)
		}
	}
}

// dictionaryVocabulary builds the lowercase set of all keyword strings
// that the parser will recognize, including normalized forms (umlauts
// replaced like in the keyword dictionary). Empty keywords are skipped:
// an empty string is a substring of every token and would make every
// token look known.
func dictionaryVocabulary() map[string]bool {
	out := map[string]bool{}
	for _, kw := range iace.GetKeywordDictionary() {
		for _, k := range kw.Keywords {
			lower := strings.ToLower(k)
			if lower == "" {
				continue
			}
			out[lower] = true
			out[vocabFoldUmlauts(lower)] = true
		}
	}
	return out
}

// dictTokenHit returns true if the token would be matched by any
// dictionary entry. Dictionary entries can be substrings, so we treat
// the dict as a set of stem-like matchers: a token is "known" if it
// equals a dict word OR contains a dict word as substring OR the dict
// word contains the token.
func dictTokenHit(tok string, dict map[string]bool) bool { if dict[tok] { return true } for d := range dict { if strings.Contains(tok, d) || strings.Contains(d, tok) { return true } } return false } // patternsMentioning returns up to 8 pattern IDs whose scenario/trigger/ // harm/zone text contains the token (case-insensitive substring). func patternsMentioning(tok string, patterns []iace.HazardPattern) []string { tokLower := strings.ToLower(tok) seen := map[string]bool{} var out []string for _, p := range patterns { hay := strings.ToLower(p.ScenarioDE + " " + p.TriggerDE + " " + p.HarmDE + " " + p.ZoneDE + " " + p.NameDE) if !strings.Contains(hay, tokLower) { continue } if seen[p.ID] { continue } seen[p.ID] = true out = append(out, p.ID) if len(out) >= 8 { break } } return out }