Files
breakpilot-lehrer/edu-search-service/internal/policy/pii_detector.go
Benjamin Boenisch 414e0f5ec0
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
feat: edu-search-service migriert, voice-service/geo-service entfernt
- edu-search-service von breakpilot-pwa nach breakpilot-lehrer kopiert (ohne vendor)
- opensearch + edu-search-service in docker-compose.yml hinzugefuegt
- voice-service aus docker-compose.yml entfernt (jetzt in breakpilot-core)
- geo-service aus docker-compose.yml entfernt (nicht mehr benoetigt)
- CI/CD: edu-search-service zu Gitea Actions und Woodpecker hinzugefuegt
  (Go lint, test mit go mod download, build, SBOM)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 18:36:38 +01:00

351 lines
8.7 KiB
Go

package policy
import (
"context"
"regexp"
"strings"
"sync"
)
// PIIDetector detects personally identifiable information in text.
//
// It holds a rule store and a cache of compiled regular expressions. Because
// it embeds a mutex it must be used through a pointer and never copied.
type PIIDetector struct {
	// store is the source the detector asks for PII rules.
	store *Store

	// rulesMu guards compiledRules.
	rulesMu sync.RWMutex
	// compiledRules caches compiled regex patterns, keyed by rule ID string.
	compiledRules map[string]*regexp.Regexp
}
// NewPIIDetector creates a new PIIDetector instance.
//
// The detector keeps the given store for later rule lookups and starts with an
// empty compiled-regex cache.
func NewPIIDetector(store *Store) *PIIDetector {
	detector := new(PIIDetector)
	detector.store = store
	detector.compiledRules = map[string]*regexp.Regexp{}
	return detector
}
// Detect scans text for PII patterns and returns all matches.
//
// Rules are fetched from the store on every call (the boolean passed to
// ListPIIRules presumably restricts the listing to enabled rules; verify
// against the store). A store error is returned as-is with a nil response.
//
// The response lists the matches of every rule in rule order, BlockLevel is
// the highest severity among rules that matched (empty if none matched), and
// ShouldBlock is set only when that severity is PIISeverityBlock.
func (d *PIIDetector) Detect(ctx context.Context, text string) (*PIITestResponse, error) {
	rules, err := d.store.ListPIIRules(ctx, true)
	if err != nil {
		return nil, err
	}

	// found starts as an empty, non-nil slice so the response never carries a
	// nil Matches field.
	found := []PIIMatch{}
	var worst PIISeverity

	for _, rule := range rules {
		hits := d.findMatches(text, &rule)
		if len(hits) == 0 {
			continue
		}
		found = append(found, hits...)
		// Only rules that actually matched may raise the block level.
		if compareSeverity(rule.Severity, worst) > 0 {
			worst = rule.Severity
		}
	}

	return &PIITestResponse{
		HasPII:      len(found) > 0,
		Matches:     found,
		BlockLevel:  worst,
		ShouldBlock: worst == PIISeverityBlock,
	}, nil
}
// findMatches finds all matches for a single rule in the text.
//
// It dispatches on the rule type: regex rules go to findRegexMatches, keyword
// rules to findKeywordMatches. Any other rule type yields nil.
func (d *PIIDetector) findMatches(text string, rule *PIIRule) []PIIMatch {
	if rule.RuleType == PIIRuleTypeRegex {
		return d.findRegexMatches(text, rule)
	}
	if rule.RuleType == PIIRuleTypeKeyword {
		return d.findKeywordMatches(text, rule)
	}
	return nil
}
// findRegexMatches finds all regex pattern matches in text.
//
// The rule's pattern is resolved through the detector's compiled-regex cache.
// If no compiled expression is available (getCompiledRegex returned nil) or
// nothing matches, the result is nil. Start and end indexes are the byte
// offsets reported by the regexp package.
func (d *PIIDetector) findRegexMatches(text string, rule *PIIRule) []PIIMatch {
	pattern := d.getCompiledRegex(rule.ID.String(), rule.Pattern)
	if pattern == nil {
		return nil
	}

	spans := pattern.FindAllStringIndex(text, -1)
	if len(spans) == 0 {
		return nil
	}

	found := make([]PIIMatch, len(spans))
	for i, span := range spans {
		begin, end := span[0], span[1]
		found[i] = PIIMatch{
			RuleID:     rule.ID,
			RuleName:   rule.Name,
			RuleType:   rule.RuleType,
			Severity:   rule.Severity,
			Match:      text[begin:end],
			StartIndex: begin,
			EndIndex:   end,
		}
	}
	return found
}
// findKeywordMatches finds all keyword matches in text (case-insensitive).
//
// rule.Pattern holds one or more keywords separated by commas or pipes;
// surrounding whitespace is trimmed and blank entries are skipped. For each
// keyword, in pattern order, every non-overlapping occurrence is reported from
// left to right. Match, StartIndex and EndIndex always refer to the original
// text (byte offsets), so text[StartIndex:EndIndex] == Match.
//
// The search runs on a lower-cased copy of the text. Lower-casing can change
// the byte length of a string (for example U+1E9E 'ẞ' becomes the shorter
// 'ß', and invalid UTF-8 bytes become the three-byte U+FFFD), so offsets found
// in the lowered copy are not used directly. They are translated back by
// counting runes, because lower-casing maps runes one-to-one.
func (d *PIIDetector) findKeywordMatches(text string, rule *PIIRule) []PIIMatch {
	var matches []PIIMatch
	lowerText := strings.ToLower(text)

	// Split pattern by commas or pipes for multiple keywords.
	keywords := strings.FieldsFunc(rule.Pattern, func(r rune) bool {
		return r == ',' || r == '|'
	})

	for _, keyword := range keywords {
		lowerKeyword := strings.ToLower(strings.TrimSpace(keyword))
		if lowerKeyword == "" {
			continue
		}
		keywordRunes := piiRuneCount(lowerKeyword)

		// lowerPos (in lowerText) and textPos (in text) always address the
		// same rune; both only move forward, so the translation costs one
		// extra pass over the text per keyword.
		lowerPos, textPos := 0, 0
		for {
			idx := strings.Index(lowerText[lowerPos:], lowerKeyword)
			if idx == -1 {
				break
			}

			// Skip as many runes in the original as were skipped in the
			// lowered copy, then span the keyword's rune count.
			skipped := piiRuneCount(lowerText[lowerPos : lowerPos+idx])
			start := textPos + piiRunePrefixLen(text[textPos:], skipped)
			end := start + piiRunePrefixLen(text[start:], keywordRunes)

			matches = append(matches, PIIMatch{
				RuleID:     rule.ID,
				RuleName:   rule.Name,
				RuleType:   rule.RuleType,
				Severity:   rule.Severity,
				Match:      text[start:end],
				StartIndex: start,
				EndIndex:   end,
			})

			// Continue after this occurrence so matches never overlap.
			lowerPos += idx + len(lowerKeyword)
			textPos = end
		}
	}
	return matches
}

// piiRuneCount returns the number of runes in s; every invalid UTF-8 byte
// counts as one rune, matching how range iterates a string.
func piiRuneCount(s string) int {
	count := 0
	for range s {
		count++
	}
	return count
}

// piiRunePrefixLen returns the byte length of the first n runes of s, or
// len(s) if s holds no more than n runes.
func piiRunePrefixLen(s string, n int) int {
	for i := range s {
		if n == 0 {
			return i
		}
		n--
	}
	return len(s)
}
// getCompiledRegex returns a compiled regex, caching for performance.
//
// The cache is keyed by ruleID only: once a rule has a cached expression, the
// pattern argument is ignored until the cache is cleared. A pattern that fails
// to compile yields nil and is not cached, so it is compiled again on the next
// call.
func (d *PIIDetector) getCompiledRegex(ruleID, pattern string) *regexp.Regexp {
	// Common case: the expression is already cached; a read lock suffices.
	d.rulesMu.RLock()
	cached, hit := d.compiledRules[ruleID]
	d.rulesMu.RUnlock()
	if hit {
		return cached
	}

	d.rulesMu.Lock()
	defer d.rulesMu.Unlock()

	// Another goroutine may have filled the entry between the two locks.
	if cached, hit = d.compiledRules[ruleID]; hit {
		return cached
	}

	fresh, err := regexp.Compile(pattern)
	if err != nil {
		// Invalid pattern: report "no regex" and leave the cache untouched.
		return nil
	}
	d.compiledRules[ruleID] = fresh
	return fresh
}
// ClearCache clears the compiled regex cache (call after rule updates).
//
// The cache map is replaced by a fresh empty one under the write lock.
func (d *PIIDetector) ClearCache() {
	fresh := map[string]*regexp.Regexp{}

	d.rulesMu.Lock()
	d.compiledRules = fresh
	d.rulesMu.Unlock()
}
// RefreshRules discards every cached compiled regex.
//
// It does not load any rules itself; it only resets the cache, which has the
// same effect as ClearCache.
func (d *PIIDetector) RefreshRules() {
	d.rulesMu.Lock()
	defer d.rulesMu.Unlock()
	d.compiledRules = make(map[string]*regexp.Regexp)
}
// compareSeverity compares two severity levels.
// Returns: 1 if a > b, -1 if a < b, 0 if equal.
//
// The order is: empty/unknown < warn < redact < block. Any severity that is
// not one of the three known constants ranks like the empty severity.
func compareSeverity(a, b PIISeverity) int {
	// A switch replaces the lookup map that used to be rebuilt on every call.
	rank := func(s PIISeverity) int {
		switch s {
		case PIISeverityWarn:
			return 1
		case PIISeverityRedact:
			return 2
		case PIISeverityBlock:
			return 3
		default:
			return 0
		}
	}

	switch ra, rb := rank(a), rank(b); {
	case ra > rb:
		return 1
	case ra < rb:
		return -1
	default:
		return 0
	}
}
// =============================================================================
// PREDEFINED PII PATTERNS (German Context)
// =============================================================================
// DefaultPIIRules returns a set of default PII detection rules for German context.
//
// Every rule is a regex rule with severity "block" or "warn". The patterns are
// unanchored heuristics: they can match inside longer tokens and none of them
// verifies a checksum.
func DefaultPIIRules() []PIIRuleConfig {
	return []PIIRuleConfig{
		// Email addresses.
		{
			Name:     "Email Addresses",
			Type:     "regex",
			Pattern:  `[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}`,
			Severity: "block",
		},
		// German phone numbers: +49 or a leading 0, then digit groups that
		// may be separated by a space, dot or hyphen.
		{
			Name:     "German Phone Numbers",
			Type:     "regex",
			Pattern:  `(?:\+49|0)[\s.-]?\d{2,4}[\s.-]?\d{3,}[\s.-]?\d{2,}`,
			Severity: "block",
		},
		// German mobile numbers: prefixes 15x, 16x and 17x.
		{
			Name:     "German Mobile Numbers",
			Type:     "regex",
			Pattern:  `(?:\+49|0)1[567]\d[\s.-]?\d{3,}[\s.-]?\d{2,}`,
			Severity: "block",
		},
		// IBAN (German): "DE", two check digits and 18 further digits,
		// optionally grouped by whitespace.
		{
			Name:     "German IBAN",
			Type:     "regex",
			Pattern:  `DE\d{2}\s?\d{4}\s?\d{4}\s?\d{4}\s?\d{4}\s?\d{2}`,
			Severity: "block",
		},
		// German Social Security Number (Sozialversicherungsnummer), 12
		// characters: 2-digit area number, birth date DDMMYY, initial letter
		// of the birth name, 2-digit serial number and 1 check digit. The
		// previous pattern was one digit short in the birth-date part and
		// therefore missed most real numbers.
		{
			Name:     "German Social Security Number",
			Type:     "regex",
			Pattern:  `\d{2}[0-3]\d[01]\d\d{2}[A-Z]\d{3}`,
			Severity: "block",
		},
		// German Tax ID (Steuer-ID): 11 digits, optionally grouped 2-3-3-3.
		{
			Name:     "German Tax ID",
			Type:     "regex",
			Pattern:  `\d{2}\s?\d{3}\s?\d{3}\s?\d{3}`,
			Severity: "block",
		},
		// Credit card numbers: 16 digits in four groups of four. Only the
		// shape is matched; the Luhn checksum is not verified.
		{
			Name:     "Credit Card Numbers",
			Type:     "regex",
			Pattern:  `(?:\d{4}[\s.-]?){3}\d{4}`,
			Severity: "block",
		},
		// German postal code followed by a capitalized word (potential address).
		{
			Name:     "German Address Pattern",
			Type:     "regex",
			Pattern:  `\d{5}\s+[A-ZÄÖÜ][a-zäöüß]+`,
			Severity: "warn",
		},
		// Date of birth: a birth keyword followed by a date whose parts are
		// separated by dots or slashes.
		{
			Name:     "Date of Birth",
			Type:     "regex",
			Pattern:  `(?:geboren|geb\.|Geburtsdatum|DoB)[\s:]*\d{1,2}[\./]\d{1,2}[\./]\d{2,4}`,
			Severity: "warn",
		},
		// Personal names introduced by a title, followed by two capitalized words.
		{
			Name:     "Personal Names with Titles",
			Type:     "regex",
			Pattern:  `(?:Herr|Frau|Dr\.|Prof\.)\s+[A-ZÄÖÜ][a-zäöüß]+\s+[A-ZÄÖÜ][a-zäöüß]+`,
			Severity: "warn",
		},
		// German health insurance number: one letter followed by nine digits.
		{
			Name:     "Health Insurance Number",
			Type:     "regex",
			Pattern:  `[A-Z]\d{9}`,
			Severity: "block",
		},
		// Vehicle registration (German): district code, letters, digits and
		// an optional trailing H or E.
		{
			Name:     "German Vehicle Registration",
			Type:     "regex",
			Pattern:  `[A-ZÄÖÜ]{1,3}[\s-]?[A-Z]{1,2}[\s-]?\d{1,4}[HE]?`,
			Severity: "warn",
		},
	}
}
// =============================================================================
// REDACTION
// =============================================================================
// RedactText redacts PII from text based on the matches.
//
// Every match whose severity is PIISeverityRedact or PIISeverityBlock has its
// byte range [StartIndex, EndIndex) overwritten with '*'; matches of any other
// severity are left untouched. Masking is byte-for-byte, so the result always
// has the same byte length as text and the offsets of the remaining matches
// stay valid. This also makes the order of matches irrelevant and lets
// overlapping matches be applied independently.
//
// Ranges are clamped to the bounds of text and empty or inverted ranges are
// skipped, so stale or malformed offsets cannot cause a panic. If nothing is
// masked, text is returned unchanged.
func (d *PIIDetector) RedactText(text string, matches []PIIMatch) string {
	// masked is allocated only once a span actually has to be overwritten.
	var masked []byte

	for i := range matches {
		m := &matches[i]
		if m.Severity != PIISeverityRedact && m.Severity != PIISeverityBlock {
			continue
		}

		start, end := m.StartIndex, m.EndIndex
		if start < 0 {
			start = 0
		}
		if end > len(text) {
			end = len(text)
		}
		if start >= end {
			continue
		}

		if masked == nil {
			masked = []byte(text)
		}
		for j := start; j < end; j++ {
			masked[j] = '*'
		}
	}

	if masked == nil {
		return text
	}
	return string(masked)
}
// FilterContent filters content based on PII detection.
// Returns the filtered content and whether it should be blocked.
//
// Outcomes, in order of precedence:
//   - detection failed: the original content, false and the error;
//   - no PII found: the original content, false;
//   - detection asks for blocking: an empty string, true;
//   - otherwise: the content after RedactText, false. RedactText masks only
//     redact- and block-severity matches, so warn-level matches stay readable.
func (d *PIIDetector) FilterContent(ctx context.Context, content string) (string, bool, error) {
	result, err := d.Detect(ctx, content)
	switch {
	case err != nil:
		return content, false, err
	case !result.HasPII:
		return content, false, nil
	case result.ShouldBlock:
		return "", true, nil
	default:
		return d.RedactText(content, result.Matches), false, nil
	}
}