fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
249
backend/llm_gateway/services/pii_detector.py
Normal file
249
backend/llm_gateway/services/pii_detector.py
Normal file
@@ -0,0 +1,249 @@
|
||||
"""
|
||||
PII Detector Service.
|
||||
|
||||
Erkennt und redaktiert personenbezogene Daten (PII) in Texten
|
||||
bevor sie an externe Services wie Tavily gesendet werden.
|
||||
"""
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class PIIType(Enum):
    """Kinds of personally identifiable information (PII) known to this module."""

    # Contact data
    EMAIL = "email"
    PHONE = "phone"

    # Financial identifiers
    IBAN = "iban"
    CREDIT_CARD = "credit_card"

    # German social insurance number (Sozialversicherungsnummer)
    SSN = "ssn"

    # Personal details
    NAME = "name"
    ADDRESS = "address"
    DATE_OF_BIRTH = "date_of_birth"

    # Network identifiers
    IP_ADDRESS = "ip_address"
|
||||
|
||||
|
||||
@dataclass
class PIIMatch:
    """A single PII occurrence found in a text."""

    # Category of the detected PII.
    type: PIIType
    # The exact substring that matched.
    value: str
    # Start offset of the match in the analysed text (inclusive).
    start: int
    # End offset of the match in the analysed text (exclusive).
    end: int
    # Placeholder text that is substituted for the match on redaction.
    replacement: str
|
||||
|
||||
|
||||
@dataclass
class RedactionResult:
    """Outcome of a PII redaction run."""

    # The text exactly as it was passed in.
    original_text: str
    # The text with every detected PII span replaced by its placeholder.
    redacted_text: str
    # Detected PII spans; a fresh empty list per instance by default.
    matches: list[PIIMatch] = field(default_factory=list)
    # Flag indicating whether any PII was detected.
    pii_found: bool = False
|
||||
|
||||
|
||||
class PIIDetector:
    """
    Service that detects and redacts PII in text.

    Detection is purely regex based and covers German and international
    formats. PII types that have no entry in ``PATTERNS`` (currently NAME
    and ADDRESS) are never detected, even when they are enabled.
    """

    # Regex source per detectable PII type. Every pattern is compiled with
    # re.IGNORECASE in __init__.
    PATTERNS = {
        # E-mail addresses. The TLD class is [A-Za-z]: inside a character
        # class "|" is a literal pipe, not alternation, so it must not be
        # listed there.
        PIIType.EMAIL: r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',

        # German phone numbers (various formats). The last digit group is
        # optional *together with* its separator, so a separator that is
        # not followed by digits is left in the text instead of being
        # swallowed by the match.
        PIIType.PHONE: r'(?:\+49|0049|0)[\s\-/]?(?:\d{2,5})[\s\-/]?(?:\d{3,8})(?:[\s\-/]?\d{1,5})?',

        # IBAN (German and international). Optional whitespace is placed
        # in front of each digit group so the match always ends on a digit
        # and never includes trailing whitespace.
        PIIType.IBAN: r'\b[A-Z]{2}\d{2}(?:\s?\d{4}){4,7}(?:\s?\d{1,2})?\b',

        # Credit cards (Visa, Mastercard, Amex prefixes).
        # NOTE: only numbers written as four groups of four digits match.
        PIIType.CREDIT_CARD: r'\b(?:4\d{3}|5[1-5]\d{2}|3[47]\d{2})[\s\-]?\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b',

        # German social insurance number
        PIIType.SSN: r'\b\d{2}[\s]?\d{6}[\s]?[A-Z][\s]?\d{3}\b',

        # IP addresses (IPv4)
        PIIType.IP_ADDRESS: r'\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b',

        # Date of birth (German D.M.YYYY notation, years 1900-2099)
        PIIType.DATE_OF_BIRTH: r'\b(?:0?[1-9]|[12]\d|3[01])\.(?:0?[1-9]|1[0-2])\.(?:19|20)\d{2}\b',
    }

    # Placeholder inserted in place of a match of the given type
    REPLACEMENTS = {
        PIIType.EMAIL: "[EMAIL_REDACTED]",
        PIIType.PHONE: "[PHONE_REDACTED]",
        PIIType.IBAN: "[IBAN_REDACTED]",
        PIIType.CREDIT_CARD: "[CARD_REDACTED]",
        PIIType.SSN: "[SSN_REDACTED]",
        PIIType.NAME: "[NAME_REDACTED]",
        PIIType.ADDRESS: "[ADDRESS_REDACTED]",
        PIIType.DATE_OF_BIRTH: "[DOB_REDACTED]",
        PIIType.IP_ADDRESS: "[IP_REDACTED]",
    }

    # Priority used to resolve overlapping matches (higher wins)
    PRIORITY = {
        PIIType.EMAIL: 100,
        PIIType.IBAN: 90,
        PIIType.CREDIT_CARD: 85,
        PIIType.SSN: 80,
        PIIType.IP_ADDRESS: 70,
        PIIType.DATE_OF_BIRTH: 60,
        PIIType.PHONE: 50,  # Lower priority because of false positives
        PIIType.NAME: 40,
        PIIType.ADDRESS: 30,
    }

    def __init__(self, enabled_types: Optional[list[PIIType]] = None):
        """
        Initialise the PII detector.

        Args:
            enabled_types: PII types to detect.
                None = all types enabled.
                Empty list = no detection at all.
        """
        if enabled_types is not None:
            self.enabled_types = enabled_types
        else:
            self.enabled_types = list(PIIType)

        # Compile once up front; only enabled types that actually have a
        # pattern end up in this mapping.
        self._compiled_patterns = {
            pii_type: re.compile(pattern, re.IGNORECASE)
            for pii_type, pattern in self.PATTERNS.items()
            if pii_type in self.enabled_types
        }

    def detect(self, text: str) -> list[PIIMatch]:
        """
        Detect PII in a text.

        When matches of different types overlap, the match whose type has
        the higher priority is kept (e.g. IBAN over phone number).

        Args:
            text: The text to analyse.

        Returns:
            The non-overlapping matches, ordered by start offset.
        """
        candidates = [
            PIIMatch(
                type=pii_type,
                value=found.group(),
                start=found.start(),
                end=found.end(),
                replacement=self.REPLACEMENTS[pii_type],
            )
            for pii_type, pattern in self._compiled_patterns.items()
            for found in pattern.finditer(text)
        ]

        # Drop overlapping matches (higher priority wins)
        matches = self._filter_overlapping(candidates)

        # Order by position so that redaction can walk the text once
        matches.sort(key=lambda m: m.start)
        return matches

    def _filter_overlapping(self, matches: list[PIIMatch]) -> list[PIIMatch]:
        """
        Remove overlapping matches, keeping the higher-priority one.

        Args:
            matches: All matches that were found.

        Returns:
            A list without overlaps, in descending priority order.
        """
        if not matches:
            return []

        # Highest priority first; the sort is stable, so matches of equal
        # priority keep their original relative order.
        by_priority = sorted(
            matches,
            key=lambda m: self.PRIORITY.get(m.type, 0),
            reverse=True,
        )

        accepted: list[PIIMatch] = []
        for candidate in by_priority:
            # Two half-open ranges overlap iff each starts before the
            # other one ends.
            clashes = any(
                candidate.start < kept.end and candidate.end > kept.start
                for kept in accepted
            )
            if not clashes:
                accepted.append(candidate)

        return accepted

    def redact(self, text: str) -> RedactionResult:
        """
        Detect and redact PII in a text.

        Args:
            text: The text to redact.

        Returns:
            RedactionResult holding the original and the redacted text.
        """
        matches = self.detect(text)

        if not matches:
            return RedactionResult(
                original_text=text,
                redacted_text=text,
                matches=[],
                pii_found=False,
            )

        # detect() returns sorted, non-overlapping matches, so the output
        # can be assembled in one left-to-right pass and joined once.
        pieces: list[str] = []
        cursor = 0
        for match in matches:
            pieces.append(text[cursor:match.start])
            pieces.append(match.replacement)
            cursor = match.end
        pieces.append(text[cursor:])

        return RedactionResult(
            original_text=text,
            redacted_text="".join(pieces),
            matches=matches,
            pii_found=True,
        )

    def contains_pii(self, text: str) -> bool:
        """
        Quickly check whether a text contains PII.

        Args:
            text: The text to check.

        Returns:
            True if any enabled pattern matches.
        """
        return any(
            pattern.search(text) is not None
            for pattern in self._compiled_patterns.values()
        )
|
||||
|
||||
|
||||
# Module-wide detector instance, created on first use
_pii_detector: Optional[PIIDetector] = None


def get_pii_detector() -> PIIDetector:
    """Return the shared PIIDetector, creating it on the first call."""
    global _pii_detector
    detector = _pii_detector
    if detector is None:
        # First call: build the default detector and remember it.
        detector = _pii_detector = PIIDetector()
    return detector
|
||||
Reference in New Issue
Block a user