This repository has been archived on 2026-02-15. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
BreakPilot Dev 19855efacc
Some checks failed
Tests / Go Tests (push) Has been cancelled
Tests / Python Tests (push) Has been cancelled
Tests / Integration Tests (push) Has been cancelled
Tests / Go Lint (push) Has been cancelled
Tests / Python Lint (push) Has been cancelled
Tests / Security Scan (push) Has been cancelled
Tests / All Checks Passed (push) Has been cancelled
Security Scanning / Secret Scanning (push) Has been cancelled
Security Scanning / Dependency Vulnerability Scan (push) Has been cancelled
Security Scanning / Go Security Scan (push) Has been cancelled
Security Scanning / Python Security Scan (push) Has been cancelled
Security Scanning / Node.js Security Scan (push) Has been cancelled
Security Scanning / Docker Image Security (push) Has been cancelled
Security Scanning / Security Summary (push) Has been cancelled
CI/CD Pipeline / Go Tests (push) Has been cancelled
CI/CD Pipeline / Python Tests (push) Has been cancelled
CI/CD Pipeline / Website Tests (push) Has been cancelled
CI/CD Pipeline / Linting (push) Has been cancelled
CI/CD Pipeline / Security Scan (push) Has been cancelled
CI/CD Pipeline / Docker Build & Push (push) Has been cancelled
CI/CD Pipeline / Integration Tests (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / CI Summary (push) Has been cancelled
ci/woodpecker/manual/build-ci-image Pipeline was successful
ci/woodpecker/manual/main Pipeline failed
feat: BreakPilot PWA - Full codebase (clean push without large binaries)
All services: admin-v2, studio-v2, website, ai-compliance-sdk,
consent-service, klausur-service, voice-service, and infrastructure.
Large PDFs and compiled binaries excluded via .gitignore.
2026-02-11 13:25:58 +01:00

250 lines
7.2 KiB
Python

"""
PII Detector Service.
Erkennt und redaktiert personenbezogene Daten (PII) in Texten
bevor sie an externe Services wie Tavily gesendet werden.
"""
import re
from dataclasses import dataclass, field
from typing import Optional
from enum import Enum
class PIIType(Enum):
    """Categories of personally identifiable information (PII).

    Each member's value is the lowercase string identifier of the category.
    """

    # Contact data
    EMAIL = "email"
    PHONE = "phone"

    # Financial identifiers
    IBAN = "iban"
    CREDIT_CARD = "credit_card"

    # Social security / social insurance number
    SSN = "ssn"

    # Personal details
    NAME = "name"
    ADDRESS = "address"
    DATE_OF_BIRTH = "date_of_birth"

    # Network identifiers
    IP_ADDRESS = "ip_address"
@dataclass
class PIIMatch:
    """A single PII occurrence located in a text.

    Attributes:
        type: Category of the detected PII.
        value: The exact substring that was matched.
        start: Index of the first matched character in the source text.
        end: Index one past the last matched character (slice-style).
        replacement: Placeholder that should be substituted for ``value``.
    """

    type: PIIType
    value: str
    start: int
    end: int
    replacement: str
@dataclass
class RedactionResult:
    """Outcome of running PII redaction over a text.

    Attributes:
        original_text: The text exactly as it was passed in.
        redacted_text: The text with PII replaced by placeholders.
        matches: The PII occurrences that were found; defaults to a fresh
            empty list per instance.
        pii_found: Whether any PII was found; defaults to ``False``.
    """

    original_text: str
    redacted_text: str
    matches: list[PIIMatch] = field(default_factory=list)
    pii_found: bool = False
class PIIDetector:
    """
    Regex-based detection and redaction of PII in free text.

    Patterns cover German and some international formats. Only the types
    listed in ``PATTERNS`` can actually be detected; ``PIIType.NAME`` and
    ``PIIType.ADDRESS`` have replacement texts and priorities but no
    pattern, so they never produce matches.

    All patterns are compiled with ``re.IGNORECASE``, so letter classes such
    as the IBAN country code also match lowercase input.
    """

    # Regex patterns per PII type.
    PATTERNS = {
        # The TLD class must be [A-Za-z]; a "|" inside a character class is a
        # literal pipe, not an alternation.
        PIIType.EMAIL: r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
        # German phone numbers: prefix (+49 / 0049 / 0), 2-5 digits, 3-8
        # digits and an optional trailing group of 1-5 digits, each part
        # optionally separated by whitespace, "-" or "/". The trailing
        # separator is only consumed together with following digits so that
        # the character after the number is never swallowed by the match.
        PIIType.PHONE: r'(?:\+49|0049|0)[\s\-/]?\d{2,5}[\s\-/]?\d{3,8}(?:[\s\-/]?\d{1,5})?',
        # IBAN (German and international, digits-only account part): country
        # code, two check digits, 4-7 groups of four digits and up to two
        # trailing digits. Whitespace is only consumed *before* a digit group
        # so the match never ends in a space.
        PIIType.IBAN: r'\b[A-Z]{2}\d{2}(?:\s?\d{4}){4,7}(?:\s?\d{1,2})?\b',
        # Credit cards (Visa, Mastercard, Amex prefixes), 16 digits in groups
        # of four with optional space or dash separators.
        PIIType.CREDIT_CARD: r'\b(?:4\d{3}|5[1-5]\d{2}|3[47]\d{2})[\s\-]?\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b',
        # German social insurance number: 2 digits, 6 digits, a letter,
        # 3 digits, optionally separated by whitespace.
        PIIType.SSN: r'\b\d{2}[\s]?\d{6}[\s]?[A-Z][\s]?\d{3}\b',
        # IPv4 addresses.
        PIIType.IP_ADDRESS: r'\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b',
        # Dates in German notation (D.M.YYYY, years 19xx/20xx). Any such
        # date is treated as a potential date of birth.
        PIIType.DATE_OF_BIRTH: r'\b(?:0?[1-9]|[12]\d|3[01])\.(?:0?[1-9]|1[0-2])\.(?:19|20)\d{2}\b',
    }

    # Placeholder inserted in place of each PII type.
    REPLACEMENTS = {
        PIIType.EMAIL: "[EMAIL_REDACTED]",
        PIIType.PHONE: "[PHONE_REDACTED]",
        PIIType.IBAN: "[IBAN_REDACTED]",
        PIIType.CREDIT_CARD: "[CARD_REDACTED]",
        PIIType.SSN: "[SSN_REDACTED]",
        PIIType.NAME: "[NAME_REDACTED]",
        PIIType.ADDRESS: "[ADDRESS_REDACTED]",
        PIIType.DATE_OF_BIRTH: "[DOB_REDACTED]",
        PIIType.IP_ADDRESS: "[IP_REDACTED]",
    }

    # Priority used to resolve overlapping matches (higher wins).
    PRIORITY = {
        PIIType.EMAIL: 100,
        PIIType.IBAN: 90,
        PIIType.CREDIT_CARD: 85,
        PIIType.SSN: 80,
        PIIType.IP_ADDRESS: 70,
        PIIType.DATE_OF_BIRTH: 60,
        PIIType.PHONE: 50,  # Lower priority because of false positives
        PIIType.NAME: 40,
        PIIType.ADDRESS: 30,
    }

    def __init__(self, enabled_types: Optional[list[PIIType]] = None):
        """
        Initialise the detector.

        Args:
            enabled_types: PII types to detect. ``None`` enables all types;
                an empty list disables detection entirely.
        """
        if enabled_types is not None:
            # Copy so that later mutation of the caller's list cannot drift
            # away from the patterns compiled below.
            self.enabled_types = list(enabled_types)
        else:
            self.enabled_types = list(PIIType)

        self._compiled_patterns = {
            pii_type: re.compile(pattern, re.IGNORECASE)
            for pii_type, pattern in self.PATTERNS.items()
            if pii_type in self.enabled_types
        }

    def detect(self, text: str) -> list[PIIMatch]:
        """
        Find PII in a text.

        When matches of different types overlap, the one with the higher
        ``PRIORITY`` is kept (e.g. IBAN over phone).

        Args:
            text: The text to analyse.

        Returns:
            Non-overlapping matches, sorted by start position.
        """
        candidates: list[PIIMatch] = []
        for pii_type, pattern in self._compiled_patterns.items():
            replacement = self.REPLACEMENTS[pii_type]
            candidates.extend(
                PIIMatch(
                    type=pii_type,
                    value=found.group(),
                    start=found.start(),
                    end=found.end(),
                    replacement=replacement,
                )
                for found in pattern.finditer(text)
            )

        matches = self._filter_overlapping(candidates)
        # Ascending position order is what redact() relies on.
        matches.sort(key=lambda m: m.start)
        return matches

    def _filter_overlapping(self, matches: list[PIIMatch]) -> list[PIIMatch]:
        """
        Drop overlapping matches, keeping the higher-priority one.

        Args:
            matches: All candidate matches, in any order.

        Returns:
            The accepted matches in descending priority order; no two of
            them overlap.
        """
        # sorted() is stable, so candidates of equal priority keep their
        # incoming order.
        ranked = sorted(
            matches,
            key=lambda m: self.PRIORITY.get(m.type, 0),
            reverse=True,
        )

        accepted: list[PIIMatch] = []
        for candidate in ranked:
            # Half-open ranges overlap iff each starts before the other ends.
            collides = any(
                candidate.start < kept.end and candidate.end > kept.start
                for kept in accepted
            )
            if not collides:
                accepted.append(candidate)
        return accepted

    def redact(self, text: str) -> RedactionResult:
        """
        Detect PII in a text and replace it with placeholders.

        Args:
            text: The text to redact.

        Returns:
            A RedactionResult holding the original text, the redacted text
            and the matches that were replaced.
        """
        matches = self.detect(text)
        if not matches:
            return RedactionResult(
                original_text=text,
                redacted_text=text,
                matches=[],
                pii_found=False,
            )

        # detect() returns sorted, non-overlapping matches, so the result can
        # be assembled in one left-to-right pass over the original text.
        pieces: list[str] = []
        cursor = 0
        for match in matches:
            pieces.append(text[cursor:match.start])
            pieces.append(match.replacement)
            cursor = match.end
        pieces.append(text[cursor:])

        return RedactionResult(
            original_text=text,
            redacted_text="".join(pieces),
            matches=matches,
            pii_found=True,
        )

    def contains_pii(self, text: str) -> bool:
        """
        Quickly check whether a text contains PII.

        Stops at the first enabled pattern that matches instead of
        collecting all matches.

        Args:
            text: The text to check.

        Returns:
            True if at least one enabled pattern matches.
        """
        return any(
            pattern.search(text) is not None
            for pattern in self._compiled_patterns.values()
        )
# Module-level detector instance, created lazily on first request.
_pii_detector: Optional[PIIDetector] = None


def get_pii_detector() -> PIIDetector:
    """Return the shared PIIDetector, creating it with default settings on first use."""
    global _pii_detector
    if _pii_detector is not None:
        return _pii_detector
    _pii_detector = PIIDetector()
    return _pii_detector