package llm

import (
	"regexp"
	"strings"

	"github.com/breakpilot/ai-compliance-sdk/internal/rbac"
)

// PIIType represents a type of personally identifiable information.
type PIIType string

// Known PII types. PIITypeSocialSec is declared for completeness, but no
// pattern in this file detects it.
const (
	PIITypeEmail       PIIType = "email"
	PIITypePhone       PIIType = "phone"
	PIITypeIPv4        PIIType = "ip_v4"
	PIITypeIPv6        PIIType = "ip_v6"
	PIITypeIBAN        PIIType = "iban"
	PIITypeUUID        PIIType = "uuid"
	PIITypeName        PIIType = "name"
	PIITypeSocialSec   PIIType = "social_security"
	PIITypeCreditCard  PIIType = "credit_card"
	PIITypeDateOfBirth PIIType = "date_of_birth"
	PIITypeSalary      PIIType = "salary"
	PIITypeAddress     PIIType = "address"
)

// PIIPattern defines a pattern for identifying PII.
type PIIPattern struct {
	Type        PIIType
	Pattern     *regexp.Regexp
	Replacement string
	Level       rbac.PIIRedactionLevel // Minimum level at which this is redacted
}

// PIIFinding represents a found PII instance. Start and End are byte offsets
// into the scanned text (End is exclusive).
type PIIFinding struct {
	Type  string `json:"type"`
	Match string `json:"match"`
	Start int    `json:"start"`
	End   int    `json:"end"`
}

// PIIDetector detects and redacts personally identifiable information using
// an ordered list of patterns.
type PIIDetector struct {
	patterns []*PIIPattern
}

// Pre-compiled patterns for common PII types.
var (
	// emailPattern matches e-mail addresses.
	emailPattern = regexp.MustCompile(`\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b`)

	// ipv4Pattern matches dotted-quad IPv4 addresses with octets in 0-255.
	ipv4Pattern = regexp.MustCompile(`\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b`)

	// ipv6Pattern matches only the full eight-group form; compressed ("::")
	// addresses are not matched.
	ipv6Pattern = regexp.MustCompile(`\b(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\b`)

	// phonePattern matches, in order: numbers with a +49/0049 prefix, numbers
	// with a leading 0, and numbers with any other "+" country code. The last
	// alternative has no leading \b on purpose: "+" is not a word character,
	// so a \b in front of it would only hold when the "+" directly follows a
	// letter, digit or underscore, and would reject a number at the start of
	// the text or after whitespace.
	phonePattern = regexp.MustCompile(`(?:\+49|0049)[\s.-]?\d{2,4}[\s.-]?\d{3,8}|\b0\d{2,4}[\s.-]?\d{3,8}\b|\+\d{1,3}[\s.-]?\d{2,4}[\s.-]?\d{3,8}\b`)

	// ibanPattern matches IBAN-shaped strings: two letters, two check digits
	// and groups of digits, optionally separated by whitespace.
	ibanPattern = regexp.MustCompile(`(?i)\b[A-Z]{2}\d{2}[\s]?(?:\d{4}[\s]?){3,5}\d{1,4}\b`)

	// uuidPattern matches 8-4-4-4-12 hexadecimal UUIDs, case-insensitively.
	uuidPattern = regexp.MustCompile(`(?i)\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b`)

	// namePattern matches a salutation followed by one or two capitalized
	// words. There is no trailing \b: Go's \b is an ASCII-only word boundary,
	// so with it a name ending in a non-ASCII letter (e.g. "Weiß") would only
	// match up to its last ASCII letter and leak the rest. The greedy
	// lower-case letter class already terminates the match.
	namePattern = regexp.MustCompile(`\b(?:Herr|Frau|Hr\.|Fr\.|Mr\.|Mrs\.|Ms\.)\s+[A-ZÄÖÜ][a-zäöüß]+(?:\s+[A-ZÄÖÜ][a-zäöüß]+)?`)

	// creditCardPattern matches four groups of four digits, optionally
	// separated by whitespace or "-".
	creditCardPattern = regexp.MustCompile(`\b(?:\d{4}[\s-]?){3}\d{4}\b`)

	// dobPattern matches DD.MM.YYYY dates with a year starting with 19 or 20.
	dobPattern = regexp.MustCompile(`\b(?:0[1-9]|[12][0-9]|3[01])\.(?:0[1-9]|1[012])\.(?:19|20)\d{2}\b`)

	// salaryPattern matches a salary keyword followed by an amount with an
	// optional currency marker before or after it.
	salaryPattern = regexp.MustCompile(`(?i)(?:gehalt|salary|lohn|vergütung|einkommen)[:\s]+(?:€|EUR|USD|\$)?\s*[\d.,]+(?:\s*(?:€|EUR|USD|\$))?`)

	// addressPattern matches a street keyword followed by a house number.
	// NOTE(review): because of the leading \b the keyword must start a word,
	// so compound street names such as "Hauptstraße 12" are not matched, and
	// only the keyword and number (not the street name) are covered.
	addressPattern = regexp.MustCompile(`(?i)\b(?:str\.|straße|strasse|weg|platz|allee)\s+\d+[a-z]?\b`)
)

// NewPIIDetector creates a new PII detector with the patterns returned by
// DefaultPIIPatterns.
func NewPIIDetector() *PIIDetector {
	return &PIIDetector{
		patterns: DefaultPIIPatterns(),
	}
}

// NewPIIDetectorWithPatterns creates a new PII detector with custom patterns.
// The slice is stored as given (not copied); patterns are applied in slice
// order.
func NewPIIDetectorWithPatterns(patterns []*PIIPattern) *PIIDetector {
	return &PIIDetector{
		patterns: patterns,
	}
}

// DefaultPIIPatterns returns the default set of PII patterns: e-mail, IPv4,
// IPv6 and phone, all redacted from the minimal level upwards.
func DefaultPIIPatterns() []*PIIPattern {
	return []*PIIPattern{
		{Type: PIITypeEmail, Pattern: emailPattern, Replacement: "[EMAIL_REDACTED]", Level: rbac.PIIRedactionMinimal},
		{Type: PIITypeIPv4, Pattern: ipv4Pattern, Replacement: "[IP_REDACTED]", Level: rbac.PIIRedactionMinimal},
		{Type: PIITypeIPv6, Pattern: ipv6Pattern, Replacement: "[IP_REDACTED]", Level: rbac.PIIRedactionMinimal},
		{Type: PIITypePhone, Pattern: phonePattern, Replacement: "[PHONE_REDACTED]", Level: rbac.PIIRedactionMinimal},
	}
}

// AllPIIPatterns returns all available PII patterns.
//
// Order matters because Redact applies the patterns one after another to the
// already-redacted text. The strictly structured identifiers (IBAN, UUID,
// credit card) come before the loose phone pattern; otherwise a digit group
// starting with 0 inside an IBAN or card number (e.g. "0044 0532") is
// replaced as a phone number first, the structured pattern no longer matches,
// and the remaining digits leak.
func AllPIIPatterns() []*PIIPattern {
	return []*PIIPattern{
		{Type: PIITypeEmail, Pattern: emailPattern, Replacement: "[EMAIL_REDACTED]", Level: rbac.PIIRedactionMinimal},
		{Type: PIITypeIPv4, Pattern: ipv4Pattern, Replacement: "[IP_REDACTED]", Level: rbac.PIIRedactionMinimal},
		{Type: PIITypeIPv6, Pattern: ipv6Pattern, Replacement: "[IP_REDACTED]", Level: rbac.PIIRedactionMinimal},
		{Type: PIITypeIBAN, Pattern: ibanPattern, Replacement: "[IBAN_REDACTED]", Level: rbac.PIIRedactionModerate},
		{Type: PIITypeUUID, Pattern: uuidPattern, Replacement: "[UUID_REDACTED]", Level: rbac.PIIRedactionStrict},
		{Type: PIITypeName, Pattern: namePattern, Replacement: "[NAME_REDACTED]", Level: rbac.PIIRedactionModerate},
		{Type: PIITypeCreditCard, Pattern: creditCardPattern, Replacement: "[CARD_REDACTED]", Level: rbac.PIIRedactionMinimal},
		{Type: PIITypePhone, Pattern: phonePattern, Replacement: "[PHONE_REDACTED]", Level: rbac.PIIRedactionMinimal},
		{Type: PIITypeDateOfBirth, Pattern: dobPattern, Replacement: "[DOB_REDACTED]", Level: rbac.PIIRedactionModerate},
		{Type: PIITypeSalary, Pattern: salaryPattern, Replacement: "[SALARY_REDACTED]", Level: rbac.PIIRedactionStrict},
		{Type: PIITypeAddress, Pattern: addressPattern, Replacement: "[ADDRESS_REDACTED]", Level: rbac.PIIRedactionModerate},
	}
}

// FindPII finds all PII in the text, regardless of redaction level.
//
// Findings are grouped by pattern in the detector's pattern order, not sorted
// by position, and matches of different patterns may overlap. It returns nil
// for empty text or when nothing matches.
func (d *PIIDetector) FindPII(text string) []PIIFinding {
	if text == "" {
		return nil
	}

	var findings []PIIFinding
	for _, pattern := range d.patterns {
		for _, loc := range pattern.Pattern.FindAllStringIndex(text, -1) {
			findings = append(findings, PIIFinding{
				Type:  string(pattern.Type),
				Match: text[loc[0]:loc[1]],
				Start: loc[0],
				End:   loc[1],
			})
		}
	}
	return findings
}

// ContainsPII reports whether any of the detector's patterns matches the
// text, regardless of redaction level.
func (d *PIIDetector) ContainsPII(text string) bool {
	if text == "" {
		return false
	}

	for _, pattern := range d.patterns {
		if pattern.Pattern.MatchString(text) {
			return true
		}
	}
	return false
}

// Redact replaces PII in the given text with each pattern's replacement
// string. Only patterns whose Level is at or below the requested level are
// applied; the text is returned unchanged when it is empty or the level is
// rbac.PIIRedactionNone.
//
// Patterns are applied sequentially in the detector's pattern order, each one
// operating on the output of the previous one.
func (d *PIIDetector) Redact(text string, level rbac.PIIRedactionLevel) string {
	if text == "" || level == rbac.PIIRedactionNone {
		return text
	}

	result := text
	for _, pattern := range d.patterns {
		if d.shouldRedactAtLevel(pattern.Level, level) {
			result = pattern.Pattern.ReplaceAllString(result, pattern.Replacement)
		}
	}
	return result
}

// shouldRedactAtLevel reports whether a pattern with minimum level
// patternLevel applies when redaction at requestedLevel is asked for.
func (d *PIIDetector) shouldRedactAtLevel(patternLevel, requestedLevel rbac.PIIRedactionLevel) bool {
	return piiRedactionRank(requestedLevel) >= piiRedactionRank(patternLevel)
}

// piiRedactionRank maps a redaction level to its position in the ordering
// none < minimal < moderate < strict. Unknown levels rank as 0, like none.
// A switch is used instead of a map so that no allocation happens per call;
// this runs once per pattern for every Redact call.
func piiRedactionRank(level rbac.PIIRedactionLevel) int {
	switch level {
	case rbac.PIIRedactionMinimal:
		return 1
	case rbac.PIIRedactionModerate:
		return 2
	case rbac.PIIRedactionStrict:
		return 3
	default:
		return 0
	}
}

// RedactMap returns a new map in which every string value has been passed
// through Redact. Nested map[string]any and []any values are processed
// recursively; values of any other type (including other map or slice types)
// are carried over unchanged. The input map is not modified.
func (d *PIIDetector) RedactMap(data map[string]any, level rbac.PIIRedactionLevel) map[string]any {
	result := make(map[string]any, len(data))
	for key, value := range data {
		switch v := value.(type) {
		case string:
			result[key] = d.Redact(v, level)
		case map[string]any:
			result[key] = d.RedactMap(v, level)
		case []any:
			result[key] = d.redactSlice(v, level)
		default:
			result[key] = v
		}
	}
	return result
}

// redactSlice is the slice counterpart of RedactMap: it returns a new slice
// of the same length with strings redacted and nested map[string]any / []any
// values processed recursively.
func (d *PIIDetector) redactSlice(data []any, level rbac.PIIRedactionLevel) []any {
	result := make([]any, len(data))
	for i, value := range data {
		switch v := value.(type) {
		case string:
			result[i] = d.Redact(v, level)
		case map[string]any:
			result[i] = d.RedactMap(v, level)
		case []any:
			result[i] = d.redactSlice(v, level)
		default:
			result[i] = v
		}
	}
	return result
}

// SafeLogString creates a safe-to-log version of a string by redacting at
// rbac.PIIRedactionStrict, i.e. with every pattern this detector was
// configured with. A detector built by NewPIIDetector only carries the four
// default patterns.
func (d *PIIDetector) SafeLogString(text string) string {
	return d.Redact(text, rbac.PIIRedactionStrict)
}

// DetectDataCategories attempts to detect data categories in text using the
// package-level patterns and keyword lists (the detector's own pattern list
// is not consulted). Possible categories, in the order they are reported, are
// "salary", "health", "financial", "personal" and "hr". It returns nil for
// empty text or when no category is detected.
func (d *PIIDetector) DetectDataCategories(text string) []string {
	if text == "" {
		return nil
	}

	var categories []string
	textLower := strings.ToLower(text)

	// Salary detection
	if salaryPattern.MatchString(text) || strings.Contains(textLower, "gehalt") || strings.Contains(textLower, "salary") {
		categories = append(categories, "salary")
	}

	// Health detection
	healthKeywords := []string{"diagnose", "krankheit", "medikament", "therapie", "arzt", "krankenhaus",
		"health", "medical", "diagnosis", "treatment", "hospital"}
	for _, kw := range healthKeywords {
		if strings.Contains(textLower, kw) {
			categories = append(categories, "health")
			break
		}
	}

	// Financial detection
	if ibanPattern.MatchString(text) || creditCardPattern.MatchString(text) ||
		strings.Contains(textLower, "konto") || strings.Contains(textLower, "bank") {
		categories = append(categories, "financial")
	}

	// Personal detection (names, addresses, DOB)
	if namePattern.MatchString(text) || addressPattern.MatchString(text) || dobPattern.MatchString(text) {
		categories = append(categories, "personal")
	}

	// HR detection
	hrKeywords := []string{"mitarbeiter", "employee", "kündigung", "termination", "beförderung", "promotion",
		"leistungsbeurteilung", "performance review", "personalakte"}
	for _, kw := range hrKeywords {
		if strings.Contains(textLower, kw) {
			categories = append(categories, "hr")
			break
		}
	}

	return categories
}

// defaultDetector is the package-level detector behind the convenience
// functions below. It is configured with AllPIIPatterns, not with the smaller
// DefaultPIIPatterns set.
var defaultDetector = NewPIIDetectorWithPatterns(AllPIIPatterns())

// RedactPII is a convenience function that redacts text with the
// package-level detector (all patterns, filtered by level).
func RedactPII(text string, level rbac.PIIRedactionLevel) string {
	return defaultDetector.Redact(text, level)
}

// ContainsPIIDefault reports whether text contains PII according to the
// package-level detector (all patterns).
func ContainsPIIDefault(text string) bool {
	return defaultDetector.ContainsPII(text)
}

// FindPIIDefault finds PII using the package-level detector (all patterns).
func FindPIIDefault(text string) []PIIFinding {
	return defaultDetector.FindPII(text)
}

// DetectDataCategoriesDefault detects data categories using the package-level
// detector.
func DetectDataCategoriesDefault(text string) []string {
	return defaultDetector.DetectDataCategories(text)
}