/**
 * PII Sanitizer — cleans context data before an LLM call.
 *
 * Removes personally identifiable information (PII) from AllowedFacts
 * before they are handed to the LLM.
 *
 * On failure: hard abort — no LLM call without successful sanitization.
 */

import type { AllowedFacts } from './allowed-facts'

// ============================================================================
// Types
// ============================================================================

/** Sanitized fact budget (PII-free), branded with a `__sanitized` marker. */
export type SanitizedFacts = AllowedFacts & { __sanitized: true }

/** Audit record describing what the sanitization changed. */
export interface SanitizationAudit {
  sanitizationApplied: boolean
  redactedFieldsCount: number
  redactedFieldNames: string[]
}

/** Result of a sanitization run: the cleaned facts plus the audit record. */
export interface SanitizationResult {
  facts: SanitizedFacts
  audit: SanitizationAudit
}

/** Sanitization failure (triggers the hard abort). */
export class SanitizationError extends Error {
  constructor(
    message: string,
    public readonly field: string,
    public readonly reason: string
  ) {
    super(message)
    this.name = 'SanitizationError'
  }
}

// ============================================================================
// PII Detection Patterns
// ============================================================================

// NOTE: all patterns carry the `g` flag and are shared module state. They are
// only ever used through `String.prototype.replace` / `search`, which do not
// depend on a stale `lastIndex`; do not call `.test()` / `.exec()` on them.
const PII_PATTERNS = {
  email: /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g,
  phone: /(\+?\d{1,3}[-.\s]?)?\(?\d{2,5}\)?[-.\s]?\d{3,10}/g,
  ipAddress: /\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b/g,
  internalId: /\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b/gi,
  apiKey: /\b(sk-|pk-|api[_-]?key[_-]?)[a-zA-Z0-9]{20,}\b/gi,
} as const

/** Placeholder written in place of every redacted PII match. */
const REDACTION_TOKEN = '[REDACTED]'

/** A phone-pattern match is only treated as a phone number from this many digits on. */
const MIN_PHONE_DIGITS = 6

/**
 * Precisely shaped patterns, applied in this order BEFORE the loose phone
 * pattern. The phone pattern matches almost any digit run, so running it first
 * would redact only a fragment of an IP address / UUID / API key and leave the
 * rest of the value in the output.
 */
const STRUCTURED_PATTERNS: readonly RegExp[] = [
  PII_PATTERNS.email,
  PII_PATTERNS.apiKey,
  PII_PATTERNS.internalId,
  PII_PATTERNS.ipAddress,
]

/**
 * German street name + house number. The lookbehind replaces a leading `\b`:
 * `\b` only knows ASCII word characters, so a street starting with an umlaut
 * ("Überseering 30") would otherwise be matched from its second letter on.
 */
const STREET_PATTERN =
  /(?<![A-Za-zÄÖÜäöüß])[A-ZÄÖÜ][a-zäöüß]+(?:straße|str\.|weg|gasse|platz|allee|ring|damm)\s*\d+[a-z]?\b/gi

/** German 5-digit postal code, including one at the very end of the string. */
const POSTAL_CODE_PATTERN = /\b\d{5}\b\s*/g

/** Checks run by `validateNoRemainingPII`, in reporting order. */
const REMAINING_PII_CHECKS: ReadonlyArray<{ pattern: RegExp; label: string }> = [
  { pattern: PII_PATTERNS.email, label: 'Verbleibende E-Mail' },
  { pattern: PII_PATTERNS.ipAddress, label: 'Verbleibende IP-Adresse' },
  { pattern: PII_PATTERNS.apiKey, label: 'Verbleibender API-Key' },
]

// ============================================================================
// Sanitizer
// ============================================================================

/**
 * Removes PII from AllowedFacts before the LLM call.
 *
 * The input object is not mutated; a sanitized copy is returned together with
 * an audit record listing every field in which something was redacted.
 *
 * @param facts - The fact budget to clean.
 * @returns The sanitized facts (branded `__sanitized: true`) and the audit record.
 * @throws {SanitizationError} When a field cannot be sanitized (non-string value).
 */
export function sanitizeAllowedFacts(facts: AllowedFacts): SanitizationResult {
  const redactedFields: string[] = []

  // Properties are evaluated top to bottom, which fixes the order of the
  // entries in `redactedFieldNames`.
  const sanitized: AllowedFacts = {
    ...facts,
    // Company name: allowed (it is needed), but checked for embedded PII.
    companyName: sanitizeString(facts.companyName, 'companyName', redactedFields),
    // Legal form: allowed, short value.
    legalForm: sanitizeString(facts.legalForm, 'legalForm', redactedFields),
    // Industry: allowed.
    industry: sanitizeString(facts.industry, 'industry', redactedFields),
    // Location: city/region allowed, street and house number are not.
    location: sanitizeAddress(facts.location, 'location', redactedFields),
    // employeeCount is carried over unchanged by the spread above.
    // Team structure: allowed (generic).
    teamStructure: sanitizeString(facts.teamStructure, 'teamStructure', redactedFields),
    // IT landscape: allowed (generic).
    itLandscape: sanitizeString(facts.itLandscape, 'itLandscape', redactedFields),
    // Special features: checked entry by entry.
    specialFeatures: facts.specialFeatures.map((feature, i) =>
      sanitizeString(feature, `specialFeatures[${i}]`, redactedFields)
    ),
    // Regulations: allowed (generic).
    triggeredRegulations: facts.triggeredRegulations.map((regulation, i) =>
      sanitizeString(regulation, `triggeredRegulations[${i}]`, redactedFields)
    ),
    // Use cases: checked entry by entry.
    primaryUseCases: facts.primaryUseCases.map((useCase, i) =>
      sanitizeString(useCase, `primaryUseCases[${i}]`, redactedFields)
    ),
    // Narrative tags are copied without being scanned.
    narrativeTags: { ...facts.narrativeTags },
  }

  const branded: SanitizedFacts = { ...sanitized, __sanitized: true }

  return {
    facts: branded,
    audit: {
      sanitizationApplied: true,
      redactedFieldsCount: redactedFields.length,
      redactedFieldNames: redactedFields,
    },
  }
}

/**
 * Type guard: checks whether a value carries the `__sanitized: true` marker.
 *
 * @param facts - Any value.
 * @returns `true` only for non-null objects whose `__sanitized` property is `true`.
 */
export function isSanitized(facts: unknown): facts is SanitizedFacts {
  if (typeof facts !== 'object' || facts === null) return false
  return '__sanitized' in facts && (facts as SanitizedFacts).__sanitized === true
}

// ============================================================================
// Private Helpers
// ============================================================================

/**
 * Removes PII from a single string value.
 *
 * Falsy values (empty string, `null`, `undefined`) are returned unchanged.
 * When anything was redacted, `fieldName` is appended to `redactedFields`.
 *
 * @param value - The text to clean.
 * @param fieldName - Name recorded in the audit list and in errors.
 * @param redactedFields - Audit list; mutated in place.
 * @returns The text with every detected PII match replaced by `[REDACTED]`.
 * @throws {SanitizationError} When `value` is a truthy non-string.
 */
function sanitizeString(
  value: string,
  fieldName: string,
  redactedFields: string[]
): string {
  if (!value) return value

  // Runtime data may not honour the static type; abort explicitly instead of
  // failing with an opaque TypeError further down.
  if (typeof value !== 'string') {
    throw new SanitizationError(
      `Cannot sanitize field "${fieldName}": expected a string`,
      fieldName,
      'non-string value'
    )
  }

  let result: string = value

  // `replace` with a global regex always scans from index 0, so a `lastIndex`
  // left behind by other code cannot cause a match to be skipped.
  for (const pattern of STRUCTURED_PATTERNS) {
    result = result.replace(pattern, REDACTION_TOKEN)
  }

  // Phone numbers last; short digit groups (years, counts, postal codes) stay.
  result = result.replace(PII_PATTERNS.phone, (candidate) =>
    candidate.replace(/\D/g, '').length >= MIN_PHONE_DIGITS ? REDACTION_TOKEN : candidate
  )

  if (result !== value) {
    redactedFields.push(fieldName)
  }

  return result
}

/**
 * Cleans an address field: keeps city/region, removes street + house number
 * and 5-digit postal codes, after the generic PII pass.
 *
 * Falsy values are returned unchanged; otherwise the result is trimmed.
 * `fieldName` is recorded in `redactedFields` at most once.
 *
 * @param value - The address text.
 * @param fieldName - Name recorded in the audit list and in errors.
 * @param redactedFields - Audit list; mutated in place.
 * @returns The cleaned, trimmed address text.
 * @throws {SanitizationError} When `value` is a truthy non-string.
 */
function sanitizeAddress(
  value: string,
  fieldName: string,
  redactedFields: string[]
): string {
  if (!value) return value

  // Generic PII pass first (may already record the field).
  const generic = sanitizeString(value, fieldName, redactedFields)

  const stripped = generic.replace(STREET_PATTERN, '').replace(POSTAL_CODE_PATTERN, '')

  if (stripped !== generic && !redactedFields.includes(fieldName)) {
    redactedFields.push(fieldName)
  }

  return stripped.trim()
}

/**
 * Scans every string value of a SanitizedFacts object (recursively) for
 * remaining e-mail addresses, IP addresses and API keys.
 *
 * @param facts - The sanitized facts to verify.
 * @returns One warning per finding; empty when nothing was found.
 */
export function validateNoRemainingPII(facts: SanitizedFacts): string[] {
  const warnings: string[] = []

  for (const { path, value } of extractAllStringValues(facts)) {
    if (path === '__sanitized') continue

    for (const { pattern, label } of REMAINING_PII_CHECKS) {
      // `search` ignores and restores `lastIndex`, so the shared global
      // regexes are left exactly as they were.
      if (value.search(pattern) !== -1) {
        warnings.push(`${label} in ${path}`)
      }
    }
  }

  return warnings
}

/**
 * Collects all string values of an object, recursing into nested objects and
 * into objects contained in arrays.
 *
 * @param obj - The object to walk.
 * @param prefix - Path of `obj` within the root object ('' for the root).
 * @returns Path/value pairs; paths use `a.b` and `a[0]` notation.
 */
function extractAllStringValues(
  obj: object,
  prefix = ''
): Array<{ path: string; value: string }> {
  const results: Array<{ path: string; value: string }> = []
  const entries: Array<[string, unknown]> = Object.entries(obj)

  for (const [key, val] of entries) {
    const path = prefix ? `${prefix}.${key}` : key

    if (typeof val === 'string') {
      results.push({ path, value: val })
    } else if (Array.isArray(val)) {
      val.forEach((item: unknown, i: number) => {
        const itemPath = `${path}[${i}]`
        if (typeof item === 'string') {
          results.push({ path: itemPath, value: item })
        } else if (typeof item === 'object' && item !== null) {
          results.push(...extractAllStringValues(item, itemPath))
        }
      })
    } else if (typeof val === 'object' && val !== null) {
      results.push(...extractAllStringValues(val, path))
    }
  }

  return results
}