All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Successful in 1m45s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 21s
- edu-search-service von breakpilot-pwa nach breakpilot-lehrer kopiert (ohne vendor) - opensearch + edu-search-service in docker-compose.yml hinzugefuegt - voice-service aus docker-compose.yml entfernt (jetzt in breakpilot-core) - geo-service aus docker-compose.yml entfernt (nicht mehr benoetigt) - CI/CD: edu-search-service zu Gitea Actions und Woodpecker hinzugefuegt (Go lint, test mit go mod download, build, SBOM) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
351 lines
8.7 KiB
Go
351 lines
8.7 KiB
Go
package policy
|
|
|
|
import (
|
|
"context"
|
|
"regexp"
|
|
"strings"
|
|
"sync"
|
|
)
|
|
|
|
// PIIDetector detects personally identifiable information in text.
|
|
type PIIDetector struct {
|
|
store *Store
|
|
compiledRules map[string]*regexp.Regexp
|
|
rulesMu sync.RWMutex
|
|
}
|
|
|
|
// NewPIIDetector creates a new PIIDetector instance.
|
|
func NewPIIDetector(store *Store) *PIIDetector {
|
|
return &PIIDetector{
|
|
store: store,
|
|
compiledRules: make(map[string]*regexp.Regexp),
|
|
}
|
|
}
|
|
|
|
// Detect scans text for PII patterns and returns all matches.
|
|
func (d *PIIDetector) Detect(ctx context.Context, text string) (*PIITestResponse, error) {
|
|
rules, err := d.store.ListPIIRules(ctx, true)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
response := &PIITestResponse{
|
|
HasPII: false,
|
|
Matches: []PIIMatch{},
|
|
ShouldBlock: false,
|
|
}
|
|
|
|
highestSeverity := PIISeverity("")
|
|
|
|
for _, rule := range rules {
|
|
matches := d.findMatches(text, &rule)
|
|
if len(matches) > 0 {
|
|
response.HasPII = true
|
|
response.Matches = append(response.Matches, matches...)
|
|
|
|
// Track highest severity
|
|
if compareSeverity(rule.Severity, highestSeverity) > 0 {
|
|
highestSeverity = rule.Severity
|
|
}
|
|
}
|
|
}
|
|
|
|
response.BlockLevel = highestSeverity
|
|
response.ShouldBlock = highestSeverity == PIISeverityBlock
|
|
|
|
return response, nil
|
|
}
|
|
|
|
// findMatches finds all matches for a single rule in the text.
|
|
func (d *PIIDetector) findMatches(text string, rule *PIIRule) []PIIMatch {
|
|
var matches []PIIMatch
|
|
|
|
switch rule.RuleType {
|
|
case PIIRuleTypeRegex:
|
|
matches = d.findRegexMatches(text, rule)
|
|
case PIIRuleTypeKeyword:
|
|
matches = d.findKeywordMatches(text, rule)
|
|
}
|
|
|
|
return matches
|
|
}
|
|
|
|
// findRegexMatches finds all regex pattern matches in text.
|
|
func (d *PIIDetector) findRegexMatches(text string, rule *PIIRule) []PIIMatch {
|
|
re := d.getCompiledRegex(rule.ID.String(), rule.Pattern)
|
|
if re == nil {
|
|
return nil
|
|
}
|
|
|
|
var matches []PIIMatch
|
|
allMatches := re.FindAllStringIndex(text, -1)
|
|
|
|
for _, loc := range allMatches {
|
|
matches = append(matches, PIIMatch{
|
|
RuleID: rule.ID,
|
|
RuleName: rule.Name,
|
|
RuleType: rule.RuleType,
|
|
Severity: rule.Severity,
|
|
Match: text[loc[0]:loc[1]],
|
|
StartIndex: loc[0],
|
|
EndIndex: loc[1],
|
|
})
|
|
}
|
|
|
|
return matches
|
|
}
|
|
|
|
// findKeywordMatches finds all keyword matches in text (case-insensitive).
|
|
func (d *PIIDetector) findKeywordMatches(text string, rule *PIIRule) []PIIMatch {
|
|
var matches []PIIMatch
|
|
lowerText := strings.ToLower(text)
|
|
|
|
// Split pattern by commas or pipes for multiple keywords
|
|
keywords := strings.FieldsFunc(rule.Pattern, func(r rune) bool {
|
|
return r == ',' || r == '|'
|
|
})
|
|
|
|
for _, keyword := range keywords {
|
|
keyword = strings.TrimSpace(keyword)
|
|
if keyword == "" {
|
|
continue
|
|
}
|
|
|
|
lowerKeyword := strings.ToLower(keyword)
|
|
startIdx := 0
|
|
|
|
for {
|
|
idx := strings.Index(lowerText[startIdx:], lowerKeyword)
|
|
if idx == -1 {
|
|
break
|
|
}
|
|
|
|
actualIdx := startIdx + idx
|
|
matches = append(matches, PIIMatch{
|
|
RuleID: rule.ID,
|
|
RuleName: rule.Name,
|
|
RuleType: rule.RuleType,
|
|
Severity: rule.Severity,
|
|
Match: text[actualIdx : actualIdx+len(keyword)],
|
|
StartIndex: actualIdx,
|
|
EndIndex: actualIdx + len(keyword),
|
|
})
|
|
|
|
startIdx = actualIdx + len(keyword)
|
|
}
|
|
}
|
|
|
|
return matches
|
|
}
|
|
|
|
// getCompiledRegex returns a compiled regex, caching for performance.
|
|
func (d *PIIDetector) getCompiledRegex(ruleID, pattern string) *regexp.Regexp {
|
|
d.rulesMu.RLock()
|
|
re, ok := d.compiledRules[ruleID]
|
|
d.rulesMu.RUnlock()
|
|
|
|
if ok {
|
|
return re
|
|
}
|
|
|
|
// Compile and cache
|
|
d.rulesMu.Lock()
|
|
defer d.rulesMu.Unlock()
|
|
|
|
// Double-check after acquiring write lock
|
|
if re, ok = d.compiledRules[ruleID]; ok {
|
|
return re
|
|
}
|
|
|
|
compiled, err := regexp.Compile(pattern)
|
|
if err != nil {
|
|
// Invalid regex - don't cache
|
|
return nil
|
|
}
|
|
|
|
d.compiledRules[ruleID] = compiled
|
|
return compiled
|
|
}
|
|
|
|
// ClearCache clears the compiled regex cache (call after rule updates).
|
|
func (d *PIIDetector) ClearCache() {
|
|
d.rulesMu.Lock()
|
|
defer d.rulesMu.Unlock()
|
|
d.compiledRules = make(map[string]*regexp.Regexp)
|
|
}
|
|
|
|
// RefreshRules reloads rules and clears the cache.
|
|
func (d *PIIDetector) RefreshRules() {
|
|
d.ClearCache()
|
|
}
|
|
|
|
// compareSeverity compares two severity levels.
|
|
// Returns: 1 if a > b, -1 if a < b, 0 if equal.
|
|
func compareSeverity(a, b PIISeverity) int {
|
|
severityOrder := map[PIISeverity]int{
|
|
"": 0,
|
|
PIISeverityWarn: 1,
|
|
PIISeverityRedact: 2,
|
|
PIISeverityBlock: 3,
|
|
}
|
|
|
|
aOrder := severityOrder[a]
|
|
bOrder := severityOrder[b]
|
|
|
|
if aOrder > bOrder {
|
|
return 1
|
|
} else if aOrder < bOrder {
|
|
return -1
|
|
}
|
|
return 0
|
|
}
|
|
|
|
// =============================================================================
// PREDEFINED PII PATTERNS (German Context)
// =============================================================================
|
|
|
|
// DefaultPIIRules returns a set of default PII detection rules for German context.
|
|
func DefaultPIIRules() []PIIRuleConfig {
|
|
return []PIIRuleConfig{
|
|
// Email Addresses
|
|
{
|
|
Name: "Email Addresses",
|
|
Type: "regex",
|
|
Pattern: `[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}`,
|
|
Severity: "block",
|
|
},
|
|
// German Phone Numbers
|
|
{
|
|
Name: "German Phone Numbers",
|
|
Type: "regex",
|
|
Pattern: `(?:\+49|0)[\s.-]?\d{2,4}[\s.-]?\d{3,}[\s.-]?\d{2,}`,
|
|
Severity: "block",
|
|
},
|
|
// German Mobile Numbers
|
|
{
|
|
Name: "German Mobile Numbers",
|
|
Type: "regex",
|
|
Pattern: `(?:\+49|0)1[567]\d[\s.-]?\d{3,}[\s.-]?\d{2,}`,
|
|
Severity: "block",
|
|
},
|
|
// IBAN (German)
|
|
{
|
|
Name: "German IBAN",
|
|
Type: "regex",
|
|
Pattern: `DE\d{2}\s?\d{4}\s?\d{4}\s?\d{4}\s?\d{4}\s?\d{2}`,
|
|
Severity: "block",
|
|
},
|
|
// German Social Security Number (Sozialversicherungsnummer)
|
|
{
|
|
Name: "German Social Security Number",
|
|
Type: "regex",
|
|
Pattern: `\d{2}[0-3]\d[01]\d{2}[A-Z]\d{3}`,
|
|
Severity: "block",
|
|
},
|
|
// German Tax ID (Steuer-ID)
|
|
{
|
|
Name: "German Tax ID",
|
|
Type: "regex",
|
|
Pattern: `\d{2}\s?\d{3}\s?\d{3}\s?\d{3}`,
|
|
Severity: "block",
|
|
},
|
|
// Credit Card Numbers (Luhn-compatible patterns)
|
|
{
|
|
Name: "Credit Card Numbers",
|
|
Type: "regex",
|
|
Pattern: `(?:\d{4}[\s.-]?){3}\d{4}`,
|
|
Severity: "block",
|
|
},
|
|
// German Postal Code + City Pattern (potential address)
|
|
{
|
|
Name: "German Address Pattern",
|
|
Type: "regex",
|
|
Pattern: `\d{5}\s+[A-ZÄÖÜ][a-zäöüß]+`,
|
|
Severity: "warn",
|
|
},
|
|
// Date of Birth Patterns (DD.MM.YYYY)
|
|
{
|
|
Name: "Date of Birth",
|
|
Type: "regex",
|
|
Pattern: `(?:geboren|geb\.|Geburtsdatum|DoB)[\s:]*\d{1,2}[\./]\d{1,2}[\./]\d{2,4}`,
|
|
Severity: "warn",
|
|
},
|
|
// Personal Names with Titles
|
|
{
|
|
Name: "Personal Names with Titles",
|
|
Type: "regex",
|
|
Pattern: `(?:Herr|Frau|Dr\.|Prof\.)\s+[A-ZÄÖÜ][a-zäöüß]+\s+[A-ZÄÖÜ][a-zäöüß]+`,
|
|
Severity: "warn",
|
|
},
|
|
// German Health Insurance Number
|
|
{
|
|
Name: "Health Insurance Number",
|
|
Type: "regex",
|
|
Pattern: `[A-Z]\d{9}`,
|
|
Severity: "block",
|
|
},
|
|
// Vehicle Registration (German)
|
|
{
|
|
Name: "German Vehicle Registration",
|
|
Type: "regex",
|
|
Pattern: `[A-ZÄÖÜ]{1,3}[\s-]?[A-Z]{1,2}[\s-]?\d{1,4}[HE]?`,
|
|
Severity: "warn",
|
|
},
|
|
}
|
|
}
|
|
|
|
// =============================================================================
// REDACTION
// =============================================================================
|
|
|
|
// RedactText redacts PII from text based on the matches.
|
|
func (d *PIIDetector) RedactText(text string, matches []PIIMatch) string {
|
|
if len(matches) == 0 {
|
|
return text
|
|
}
|
|
|
|
// Sort matches by start index (descending) to replace from end
|
|
sortedMatches := make([]PIIMatch, len(matches))
|
|
copy(sortedMatches, matches)
|
|
|
|
// Simple bubble sort for small number of matches
|
|
for i := 0; i < len(sortedMatches)-1; i++ {
|
|
for j := 0; j < len(sortedMatches)-i-1; j++ {
|
|
if sortedMatches[j].StartIndex < sortedMatches[j+1].StartIndex {
|
|
sortedMatches[j], sortedMatches[j+1] = sortedMatches[j+1], sortedMatches[j]
|
|
}
|
|
}
|
|
}
|
|
|
|
result := text
|
|
for _, match := range sortedMatches {
|
|
if match.Severity == PIISeverityRedact || match.Severity == PIISeverityBlock {
|
|
replacement := strings.Repeat("*", match.EndIndex-match.StartIndex)
|
|
result = result[:match.StartIndex] + replacement + result[match.EndIndex:]
|
|
}
|
|
}
|
|
|
|
return result
|
|
}
|
|
|
|
// FilterContent filters content based on PII detection.
|
|
// Returns the filtered content and whether it should be blocked.
|
|
func (d *PIIDetector) FilterContent(ctx context.Context, content string) (string, bool, error) {
|
|
response, err := d.Detect(ctx, content)
|
|
if err != nil {
|
|
return content, false, err
|
|
}
|
|
|
|
if !response.HasPII {
|
|
return content, false, nil
|
|
}
|
|
|
|
if response.ShouldBlock {
|
|
return "", true, nil
|
|
}
|
|
|
|
// Redact content for warn/redact severity
|
|
redacted := d.RedactText(content, response.Matches)
|
|
return redacted, false, nil
|
|
}
|