fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
350
edu-search-service/internal/policy/pii_detector.go
Normal file
350
edu-search-service/internal/policy/pii_detector.go
Normal file
@@ -0,0 +1,350 @@
|
||||
package policy
|
||||
|
||||
import (
|
||||
"context"
|
||||
"regexp"
|
||||
"strings"
|
||||
"sync"
|
||||
)
|
||||
|
||||
// PIIDetector detects personally identifiable information in text.
|
||||
type PIIDetector struct {
|
||||
store *Store
|
||||
compiledRules map[string]*regexp.Regexp
|
||||
rulesMu sync.RWMutex
|
||||
}
|
||||
|
||||
// NewPIIDetector creates a new PIIDetector instance.
|
||||
func NewPIIDetector(store *Store) *PIIDetector {
|
||||
return &PIIDetector{
|
||||
store: store,
|
||||
compiledRules: make(map[string]*regexp.Regexp),
|
||||
}
|
||||
}
|
||||
|
||||
// Detect scans text for PII patterns and returns all matches.
|
||||
func (d *PIIDetector) Detect(ctx context.Context, text string) (*PIITestResponse, error) {
|
||||
rules, err := d.store.ListPIIRules(ctx, true)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
response := &PIITestResponse{
|
||||
HasPII: false,
|
||||
Matches: []PIIMatch{},
|
||||
ShouldBlock: false,
|
||||
}
|
||||
|
||||
highestSeverity := PIISeverity("")
|
||||
|
||||
for _, rule := range rules {
|
||||
matches := d.findMatches(text, &rule)
|
||||
if len(matches) > 0 {
|
||||
response.HasPII = true
|
||||
response.Matches = append(response.Matches, matches...)
|
||||
|
||||
// Track highest severity
|
||||
if compareSeverity(rule.Severity, highestSeverity) > 0 {
|
||||
highestSeverity = rule.Severity
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
response.BlockLevel = highestSeverity
|
||||
response.ShouldBlock = highestSeverity == PIISeverityBlock
|
||||
|
||||
return response, nil
|
||||
}
|
||||
|
||||
// findMatches finds all matches for a single rule in the text.
|
||||
func (d *PIIDetector) findMatches(text string, rule *PIIRule) []PIIMatch {
|
||||
var matches []PIIMatch
|
||||
|
||||
switch rule.RuleType {
|
||||
case PIIRuleTypeRegex:
|
||||
matches = d.findRegexMatches(text, rule)
|
||||
case PIIRuleTypeKeyword:
|
||||
matches = d.findKeywordMatches(text, rule)
|
||||
}
|
||||
|
||||
return matches
|
||||
}
|
||||
|
||||
// findRegexMatches finds all regex pattern matches in text.
|
||||
func (d *PIIDetector) findRegexMatches(text string, rule *PIIRule) []PIIMatch {
|
||||
re := d.getCompiledRegex(rule.ID.String(), rule.Pattern)
|
||||
if re == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
var matches []PIIMatch
|
||||
allMatches := re.FindAllStringIndex(text, -1)
|
||||
|
||||
for _, loc := range allMatches {
|
||||
matches = append(matches, PIIMatch{
|
||||
RuleID: rule.ID,
|
||||
RuleName: rule.Name,
|
||||
RuleType: rule.RuleType,
|
||||
Severity: rule.Severity,
|
||||
Match: text[loc[0]:loc[1]],
|
||||
StartIndex: loc[0],
|
||||
EndIndex: loc[1],
|
||||
})
|
||||
}
|
||||
|
||||
return matches
|
||||
}
|
||||
|
||||
// findKeywordMatches finds all keyword matches in text (case-insensitive).
|
||||
func (d *PIIDetector) findKeywordMatches(text string, rule *PIIRule) []PIIMatch {
|
||||
var matches []PIIMatch
|
||||
lowerText := strings.ToLower(text)
|
||||
|
||||
// Split pattern by commas or pipes for multiple keywords
|
||||
keywords := strings.FieldsFunc(rule.Pattern, func(r rune) bool {
|
||||
return r == ',' || r == '|'
|
||||
})
|
||||
|
||||
for _, keyword := range keywords {
|
||||
keyword = strings.TrimSpace(keyword)
|
||||
if keyword == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
lowerKeyword := strings.ToLower(keyword)
|
||||
startIdx := 0
|
||||
|
||||
for {
|
||||
idx := strings.Index(lowerText[startIdx:], lowerKeyword)
|
||||
if idx == -1 {
|
||||
break
|
||||
}
|
||||
|
||||
actualIdx := startIdx + idx
|
||||
matches = append(matches, PIIMatch{
|
||||
RuleID: rule.ID,
|
||||
RuleName: rule.Name,
|
||||
RuleType: rule.RuleType,
|
||||
Severity: rule.Severity,
|
||||
Match: text[actualIdx : actualIdx+len(keyword)],
|
||||
StartIndex: actualIdx,
|
||||
EndIndex: actualIdx + len(keyword),
|
||||
})
|
||||
|
||||
startIdx = actualIdx + len(keyword)
|
||||
}
|
||||
}
|
||||
|
||||
return matches
|
||||
}
|
||||
|
||||
// getCompiledRegex returns a compiled regex, caching for performance.
|
||||
func (d *PIIDetector) getCompiledRegex(ruleID, pattern string) *regexp.Regexp {
|
||||
d.rulesMu.RLock()
|
||||
re, ok := d.compiledRules[ruleID]
|
||||
d.rulesMu.RUnlock()
|
||||
|
||||
if ok {
|
||||
return re
|
||||
}
|
||||
|
||||
// Compile and cache
|
||||
d.rulesMu.Lock()
|
||||
defer d.rulesMu.Unlock()
|
||||
|
||||
// Double-check after acquiring write lock
|
||||
if re, ok = d.compiledRules[ruleID]; ok {
|
||||
return re
|
||||
}
|
||||
|
||||
compiled, err := regexp.Compile(pattern)
|
||||
if err != nil {
|
||||
// Invalid regex - don't cache
|
||||
return nil
|
||||
}
|
||||
|
||||
d.compiledRules[ruleID] = compiled
|
||||
return compiled
|
||||
}
|
||||
|
||||
// ClearCache clears the compiled regex cache (call after rule updates).
|
||||
func (d *PIIDetector) ClearCache() {
|
||||
d.rulesMu.Lock()
|
||||
defer d.rulesMu.Unlock()
|
||||
d.compiledRules = make(map[string]*regexp.Regexp)
|
||||
}
|
||||
|
||||
// RefreshRules reloads rules and clears the cache.
|
||||
func (d *PIIDetector) RefreshRules() {
|
||||
d.ClearCache()
|
||||
}
|
||||
|
||||
// compareSeverity compares two severity levels.
|
||||
// Returns: 1 if a > b, -1 if a < b, 0 if equal.
|
||||
func compareSeverity(a, b PIISeverity) int {
|
||||
severityOrder := map[PIISeverity]int{
|
||||
"": 0,
|
||||
PIISeverityWarn: 1,
|
||||
PIISeverityRedact: 2,
|
||||
PIISeverityBlock: 3,
|
||||
}
|
||||
|
||||
aOrder := severityOrder[a]
|
||||
bOrder := severityOrder[b]
|
||||
|
||||
if aOrder > bOrder {
|
||||
return 1
|
||||
} else if aOrder < bOrder {
|
||||
return -1
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// PREDEFINED PII PATTERNS (German Context)
|
||||
// =============================================================================
|
||||
|
||||
// DefaultPIIRules returns a set of default PII detection rules for German context.
|
||||
func DefaultPIIRules() []PIIRuleConfig {
|
||||
return []PIIRuleConfig{
|
||||
// Email Addresses
|
||||
{
|
||||
Name: "Email Addresses",
|
||||
Type: "regex",
|
||||
Pattern: `[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}`,
|
||||
Severity: "block",
|
||||
},
|
||||
// German Phone Numbers
|
||||
{
|
||||
Name: "German Phone Numbers",
|
||||
Type: "regex",
|
||||
Pattern: `(?:\+49|0)[\s.-]?\d{2,4}[\s.-]?\d{3,}[\s.-]?\d{2,}`,
|
||||
Severity: "block",
|
||||
},
|
||||
// German Mobile Numbers
|
||||
{
|
||||
Name: "German Mobile Numbers",
|
||||
Type: "regex",
|
||||
Pattern: `(?:\+49|0)1[567]\d[\s.-]?\d{3,}[\s.-]?\d{2,}`,
|
||||
Severity: "block",
|
||||
},
|
||||
// IBAN (German)
|
||||
{
|
||||
Name: "German IBAN",
|
||||
Type: "regex",
|
||||
Pattern: `DE\d{2}\s?\d{4}\s?\d{4}\s?\d{4}\s?\d{4}\s?\d{2}`,
|
||||
Severity: "block",
|
||||
},
|
||||
// German Social Security Number (Sozialversicherungsnummer)
|
||||
{
|
||||
Name: "German Social Security Number",
|
||||
Type: "regex",
|
||||
Pattern: `\d{2}[0-3]\d[01]\d{2}[A-Z]\d{3}`,
|
||||
Severity: "block",
|
||||
},
|
||||
// German Tax ID (Steuer-ID)
|
||||
{
|
||||
Name: "German Tax ID",
|
||||
Type: "regex",
|
||||
Pattern: `\d{2}\s?\d{3}\s?\d{3}\s?\d{3}`,
|
||||
Severity: "block",
|
||||
},
|
||||
// Credit Card Numbers (Luhn-compatible patterns)
|
||||
{
|
||||
Name: "Credit Card Numbers",
|
||||
Type: "regex",
|
||||
Pattern: `(?:\d{4}[\s.-]?){3}\d{4}`,
|
||||
Severity: "block",
|
||||
},
|
||||
// German Postal Code + City Pattern (potential address)
|
||||
{
|
||||
Name: "German Address Pattern",
|
||||
Type: "regex",
|
||||
Pattern: `\d{5}\s+[A-ZÄÖÜ][a-zäöüß]+`,
|
||||
Severity: "warn",
|
||||
},
|
||||
// Date of Birth Patterns (DD.MM.YYYY)
|
||||
{
|
||||
Name: "Date of Birth",
|
||||
Type: "regex",
|
||||
Pattern: `(?:geboren|geb\.|Geburtsdatum|DoB)[\s:]*\d{1,2}[\./]\d{1,2}[\./]\d{2,4}`,
|
||||
Severity: "warn",
|
||||
},
|
||||
// Personal Names with Titles
|
||||
{
|
||||
Name: "Personal Names with Titles",
|
||||
Type: "regex",
|
||||
Pattern: `(?:Herr|Frau|Dr\.|Prof\.)\s+[A-ZÄÖÜ][a-zäöüß]+\s+[A-ZÄÖÜ][a-zäöüß]+`,
|
||||
Severity: "warn",
|
||||
},
|
||||
// German Health Insurance Number
|
||||
{
|
||||
Name: "Health Insurance Number",
|
||||
Type: "regex",
|
||||
Pattern: `[A-Z]\d{9}`,
|
||||
Severity: "block",
|
||||
},
|
||||
// Vehicle Registration (German)
|
||||
{
|
||||
Name: "German Vehicle Registration",
|
||||
Type: "regex",
|
||||
Pattern: `[A-ZÄÖÜ]{1,3}[\s-]?[A-Z]{1,2}[\s-]?\d{1,4}[HE]?`,
|
||||
Severity: "warn",
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// REDACTION
|
||||
// =============================================================================
|
||||
|
||||
// RedactText redacts PII from text based on the matches.
|
||||
func (d *PIIDetector) RedactText(text string, matches []PIIMatch) string {
|
||||
if len(matches) == 0 {
|
||||
return text
|
||||
}
|
||||
|
||||
// Sort matches by start index (descending) to replace from end
|
||||
sortedMatches := make([]PIIMatch, len(matches))
|
||||
copy(sortedMatches, matches)
|
||||
|
||||
// Simple bubble sort for small number of matches
|
||||
for i := 0; i < len(sortedMatches)-1; i++ {
|
||||
for j := 0; j < len(sortedMatches)-i-1; j++ {
|
||||
if sortedMatches[j].StartIndex < sortedMatches[j+1].StartIndex {
|
||||
sortedMatches[j], sortedMatches[j+1] = sortedMatches[j+1], sortedMatches[j]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
result := text
|
||||
for _, match := range sortedMatches {
|
||||
if match.Severity == PIISeverityRedact || match.Severity == PIISeverityBlock {
|
||||
replacement := strings.Repeat("*", match.EndIndex-match.StartIndex)
|
||||
result = result[:match.StartIndex] + replacement + result[match.EndIndex:]
|
||||
}
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
// FilterContent filters content based on PII detection.
|
||||
// Returns the filtered content and whether it should be blocked.
|
||||
func (d *PIIDetector) FilterContent(ctx context.Context, content string) (string, bool, error) {
|
||||
response, err := d.Detect(ctx, content)
|
||||
if err != nil {
|
||||
return content, false, err
|
||||
}
|
||||
|
||||
if !response.HasPII {
|
||||
return content, false, nil
|
||||
}
|
||||
|
||||
if response.ShouldBlock {
|
||||
return "", true, nil
|
||||
}
|
||||
|
||||
// Redact content for warn/redact severity
|
||||
redacted := d.RedactText(content, response.Matches)
|
||||
return redacted, false, nil
|
||||
}
|
||||
Reference in New Issue
Block a user