package policy import ( "context" "regexp" "strings" "sync" ) // PIIDetector detects personally identifiable information in text. type PIIDetector struct { store *Store compiledRules map[string]*regexp.Regexp rulesMu sync.RWMutex } // NewPIIDetector creates a new PIIDetector instance. func NewPIIDetector(store *Store) *PIIDetector { return &PIIDetector{ store: store, compiledRules: make(map[string]*regexp.Regexp), } } // Detect scans text for PII patterns and returns all matches. func (d *PIIDetector) Detect(ctx context.Context, text string) (*PIITestResponse, error) { rules, err := d.store.ListPIIRules(ctx, true) if err != nil { return nil, err } response := &PIITestResponse{ HasPII: false, Matches: []PIIMatch{}, ShouldBlock: false, } highestSeverity := PIISeverity("") for _, rule := range rules { matches := d.findMatches(text, &rule) if len(matches) > 0 { response.HasPII = true response.Matches = append(response.Matches, matches...) // Track highest severity if compareSeverity(rule.Severity, highestSeverity) > 0 { highestSeverity = rule.Severity } } } response.BlockLevel = highestSeverity response.ShouldBlock = highestSeverity == PIISeverityBlock return response, nil } // findMatches finds all matches for a single rule in the text. func (d *PIIDetector) findMatches(text string, rule *PIIRule) []PIIMatch { var matches []PIIMatch switch rule.RuleType { case PIIRuleTypeRegex: matches = d.findRegexMatches(text, rule) case PIIRuleTypeKeyword: matches = d.findKeywordMatches(text, rule) } return matches } // findRegexMatches finds all regex pattern matches in text. func (d *PIIDetector) findRegexMatches(text string, rule *PIIRule) []PIIMatch { re := d.getCompiledRegex(rule.ID.String(), rule.Pattern) if re == nil { return nil } var matches []PIIMatch allMatches := re.FindAllStringIndex(text, -1) for _, loc := range allMatches { matches = append(matches, PIIMatch{ RuleID: rule.ID, RuleName: rule.Name, RuleType: rule.RuleType, Severity: rule.Severity, Match: text[loc[0]:loc[1]], StartIndex: loc[0], EndIndex: loc[1], }) } return matches } // findKeywordMatches finds all keyword matches in text (case-insensitive). func (d *PIIDetector) findKeywordMatches(text string, rule *PIIRule) []PIIMatch { var matches []PIIMatch lowerText := strings.ToLower(text) // Split pattern by commas or pipes for multiple keywords keywords := strings.FieldsFunc(rule.Pattern, func(r rune) bool { return r == ',' || r == '|' }) for _, keyword := range keywords { keyword = strings.TrimSpace(keyword) if keyword == "" { continue } lowerKeyword := strings.ToLower(keyword) startIdx := 0 for { idx := strings.Index(lowerText[startIdx:], lowerKeyword) if idx == -1 { break } actualIdx := startIdx + idx matches = append(matches, PIIMatch{ RuleID: rule.ID, RuleName: rule.Name, RuleType: rule.RuleType, Severity: rule.Severity, Match: text[actualIdx : actualIdx+len(keyword)], StartIndex: actualIdx, EndIndex: actualIdx + len(keyword), }) startIdx = actualIdx + len(keyword) } } return matches } // getCompiledRegex returns a compiled regex, caching for performance. func (d *PIIDetector) getCompiledRegex(ruleID, pattern string) *regexp.Regexp { d.rulesMu.RLock() re, ok := d.compiledRules[ruleID] d.rulesMu.RUnlock() if ok { return re } // Compile and cache d.rulesMu.Lock() defer d.rulesMu.Unlock() // Double-check after acquiring write lock if re, ok = d.compiledRules[ruleID]; ok { return re } compiled, err := regexp.Compile(pattern) if err != nil { // Invalid regex - don't cache return nil } d.compiledRules[ruleID] = compiled return compiled } // ClearCache clears the compiled regex cache (call after rule updates). func (d *PIIDetector) ClearCache() { d.rulesMu.Lock() defer d.rulesMu.Unlock() d.compiledRules = make(map[string]*regexp.Regexp) } // RefreshRules reloads rules and clears the cache. func (d *PIIDetector) RefreshRules() { d.ClearCache() } // compareSeverity compares two severity levels. // Returns: 1 if a > b, -1 if a < b, 0 if equal. func compareSeverity(a, b PIISeverity) int { severityOrder := map[PIISeverity]int{ "": 0, PIISeverityWarn: 1, PIISeverityRedact: 2, PIISeverityBlock: 3, } aOrder := severityOrder[a] bOrder := severityOrder[b] if aOrder > bOrder { return 1 } else if aOrder < bOrder { return -1 } return 0 } // ============================================================================= // PREDEFINED PII PATTERNS (German Context) // ============================================================================= // DefaultPIIRules returns a set of default PII detection rules for German context. func DefaultPIIRules() []PIIRuleConfig { return []PIIRuleConfig{ // Email Addresses { Name: "Email Addresses", Type: "regex", Pattern: `[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}`, Severity: "block", }, // German Phone Numbers { Name: "German Phone Numbers", Type: "regex", Pattern: `(?:\+49|0)[\s.-]?\d{2,4}[\s.-]?\d{3,}[\s.-]?\d{2,}`, Severity: "block", }, // German Mobile Numbers { Name: "German Mobile Numbers", Type: "regex", Pattern: `(?:\+49|0)1[567]\d[\s.-]?\d{3,}[\s.-]?\d{2,}`, Severity: "block", }, // IBAN (German) { Name: "German IBAN", Type: "regex", Pattern: `DE\d{2}\s?\d{4}\s?\d{4}\s?\d{4}\s?\d{4}\s?\d{2}`, Severity: "block", }, // German Social Security Number (Sozialversicherungsnummer) { Name: "German Social Security Number", Type: "regex", Pattern: `\d{2}[0-3]\d[01]\d{2}[A-Z]\d{3}`, Severity: "block", }, // German Tax ID (Steuer-ID) { Name: "German Tax ID", Type: "regex", Pattern: `\d{2}\s?\d{3}\s?\d{3}\s?\d{3}`, Severity: "block", }, // Credit Card Numbers (Luhn-compatible patterns) { Name: "Credit Card Numbers", Type: "regex", Pattern: `(?:\d{4}[\s.-]?){3}\d{4}`, Severity: "block", }, // German Postal Code + City Pattern (potential address) { Name: "German Address Pattern", Type: "regex", Pattern: `\d{5}\s+[A-ZÄÖÜ][a-zäöüß]+`, Severity: "warn", }, // Date of Birth Patterns (DD.MM.YYYY) { Name: "Date of Birth", Type: "regex", Pattern: `(?:geboren|geb\.|Geburtsdatum|DoB)[\s:]*\d{1,2}[\./]\d{1,2}[\./]\d{2,4}`, Severity: "warn", }, // Personal Names with Titles { Name: "Personal Names with Titles", Type: "regex", Pattern: `(?:Herr|Frau|Dr\.|Prof\.)\s+[A-ZÄÖÜ][a-zäöüß]+\s+[A-ZÄÖÜ][a-zäöüß]+`, Severity: "warn", }, // German Health Insurance Number { Name: "Health Insurance Number", Type: "regex", Pattern: `[A-Z]\d{9}`, Severity: "block", }, // Vehicle Registration (German) { Name: "German Vehicle Registration", Type: "regex", Pattern: `[A-ZÄÖÜ]{1,3}[\s-]?[A-Z]{1,2}[\s-]?\d{1,4}[HE]?`, Severity: "warn", }, } } // ============================================================================= // REDACTION // ============================================================================= // RedactText redacts PII from text based on the matches. func (d *PIIDetector) RedactText(text string, matches []PIIMatch) string { if len(matches) == 0 { return text } // Sort matches by start index (descending) to replace from end sortedMatches := make([]PIIMatch, len(matches)) copy(sortedMatches, matches) // Simple bubble sort for small number of matches for i := 0; i < len(sortedMatches)-1; i++ { for j := 0; j < len(sortedMatches)-i-1; j++ { if sortedMatches[j].StartIndex < sortedMatches[j+1].StartIndex { sortedMatches[j], sortedMatches[j+1] = sortedMatches[j+1], sortedMatches[j] } } } result := text for _, match := range sortedMatches { if match.Severity == PIISeverityRedact || match.Severity == PIISeverityBlock { replacement := strings.Repeat("*", match.EndIndex-match.StartIndex) result = result[:match.StartIndex] + replacement + result[match.EndIndex:] } } return result } // FilterContent filters content based on PII detection. // Returns the filtered content and whether it should be blocked. func (d *PIIDetector) FilterContent(ctx context.Context, content string) (string, bool, error) { response, err := d.Detect(ctx, content) if err != nil { return content, false, err } if !response.HasPII { return content, false, nil } if response.ShouldBlock { return "", true, nil } // Redact content for warn/redact severity redacted := d.RedactText(content, response.Matches) return redacted, false, nil }