Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-ai-compliance (push) Failing after 30s
CI / test-python-backend-compliance (push) Successful in 30s
CI / test-python-document-crawler (push) Successful in 21s
CI / test-python-dsms-gateway (push) Successful in 17s
- Ruff: 144 auto-fixes (unused imports, == None → is None), F821/F811/F841 manuell - CVEs: python-multipart>=0.0.22, weasyprint>=68.0, pillow>=12.1.1, npm audit fix (0 vulns) - TS: 5 tote Drafting-Engine-Dateien entfernt, allowed-facts/sanitizer/StepHeader/context fixes - Tests: +104 (ISMS 58, Evidence 18, VVT 14, Generation 14) → 1449 passed - Refactoring: collect_ci_evidence (F→A), row_to_response (E→A), extract_requirements (E→A) - Dead Code: pca-platform, 7 Go-Handler, dsr_api.py, duplicate Schemas entfernt Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
299 lines
8.8 KiB
TypeScript
299 lines
8.8 KiB
TypeScript
/**
 * PII Sanitizer — cleans context data before an LLM call.
 *
 * Removes personally identifiable information (PII) from AllowedFacts
 * before they are passed on to the LLM.
 *
 * On failure: hard abort — no LLM call without successful sanitization.
 */
|
|
|
|
import type { AllowedFacts } from './allowed-facts'
|
|
|
|
// ============================================================================
|
|
// Types
|
|
// ============================================================================
|
|
|
|
/**
 * Fact budget after PII redaction.
 *
 * Structurally identical to `AllowedFacts`, plus the literal brand
 * `__sanitized: true` that `isSanitized` checks for at runtime.
 */
export type SanitizedFacts = AllowedFacts & { __sanitized: true }
|
|
|
|
/** Audit record describing what a sanitization run did. */
export interface SanitizationAudit {
  /** Whether the sanitizer was run. */
  sanitizationApplied: boolean
  /** Number of entries in `redactedFieldNames`. */
  redactedFieldsCount: number
  /** Names (paths) of the fields in which something was redacted. */
  redactedFieldNames: string[]
}
|
|
|
|
/** Outcome of a sanitization run: the redacted facts together with their audit record. */
export interface SanitizationResult {
  /** The facts after redaction, carrying the `__sanitized` brand. */
  facts: SanitizedFacts
  /** What was redacted while producing `facts`. */
  audit: SanitizationAudit
}
|
|
|
|
/**
 * Error describing a failed sanitization.
 *
 * Per the module header, a sanitization failure is meant to abort the LLM call
 * (hard abort); this class carries the affected field and the reason for it.
 */
export class SanitizationError extends Error {
  /** Name (path) of the field that could not be sanitized. */
  public readonly field: string
  /** Explanation of why sanitization failed. */
  public readonly reason: string

  constructor(message: string, field: string, reason: string) {
    super(message)
    this.field = field
    this.reason = reason
    // Override the default "Error" so the type is recognisable in logs.
    this.name = 'SanitizationError'
  }
}
|
|
|
|
// ============================================================================
|
|
// PII Detection Patterns
|
|
// ============================================================================
|
|
|
|
/**
 * Regular expressions used to detect PII-like tokens in free text.
 *
 * Every pattern carries the `g` flag so that `String.prototype.replace` hits
 * all occurrences. Note that global regexes keep `lastIndex` state between
 * `test()`/`exec()` calls, and these instances are shared module-wide.
 */
const PII_PATTERNS = {
  // local-part@domain.tld with a TLD of at least two letters
  email: /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g,

  // optional 1-3 digit prefix (optionally with "+"), a 2-5 digit group that
  // may be parenthesised, then 3-10 digits; groups separated by "-", "." or space
  phone: /(\+?\d{1,3}[-.\s]?)?\(?\d{2,5}\)?[-.\s]?\d{3,10}/g,

  // four dot-separated groups of 1-3 digits (octet range is not validated)
  ipAddress: /\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b/g,

  // 8-4-4-4-12 hexadecimal groups (UUID layout), case-insensitive
  internalId: /\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b/gi,

  // "sk-", "pk-" or an "api key" style prefix followed by 20+ alphanumerics
  apiKey: /\b(sk-|pk-|api[_-]?key[_-]?)[a-zA-Z0-9]{20,}\b/gi,
} as const
|
|
|
|
// ============================================================================
|
|
// Sanitizer
|
|
// ============================================================================
|
|
|
|
/**
 * Produces a PII-redacted copy of `facts` for use in an LLM call.
 *
 * - companyName, legalForm, industry, teamStructure and itLandscape are passed
 *   through `sanitizeString`.
 * - location is passed through `sanitizeAddress` (street / postal code removal).
 * - Every entry of specialFeatures, triggeredRegulations and primaryUseCases is
 *   passed through `sanitizeString`.
 * - narrativeTags is shallow-copied without inspection; all remaining
 *   properties (e.g. employeeCount) are copied unchanged.
 *
 * The input object is not mutated.
 *
 * @param facts - Facts to sanitize.
 * @returns The redacted facts branded with `__sanitized: true`, plus an audit
 *          record naming every field in which something was redacted.
 * @throws {SanitizationError} If sanitizing a field fails for any reason (for
 *         example because the field is missing or not a string at runtime).
 */
export function sanitizeAllowedFacts(facts: AllowedFacts): SanitizationResult {
  const redactedFields: string[] = []

  // Runs the sanitizer for one field and converts any unexpected failure into a
  // SanitizationError, so callers can rely on the documented error type instead
  // of having to recognise incidental TypeErrors.
  const guarded = <T>(field: string, run: () => T): T => {
    try {
      return run()
    } catch (err: unknown) {
      if (err instanceof SanitizationError) throw err
      const reason = err instanceof Error ? err.message : String(err)
      throw new SanitizationError(
        `Sanitization fehlgeschlagen fuer Feld "${field}"`,
        field,
        reason
      )
    }
  }

  // The value is read lazily inside the guard so that a failing property
  // access is reported for the right field as well.
  const cleanText = (field: string, read: () => string): string =>
    guarded(field, () => sanitizeString(read(), field, redactedFields))

  const cleanList = (field: string, read: () => readonly string[]): string[] =>
    guarded(field, () =>
      read().map((entry, i) => sanitizeString(entry, `${field}[${i}]`, redactedFields))
    )

  // Property order matters: it determines the order of the audit entries.
  const sanitized: AllowedFacts = {
    ...facts,
    companyName: cleanText('companyName', () => facts.companyName),
    legalForm: cleanText('legalForm', () => facts.legalForm),
    industry: cleanText('industry', () => facts.industry),
    // City/region is allowed, street and house number are not.
    location: guarded('location', () =>
      sanitizeAddress(facts.location, 'location', redactedFields)
    ),
    teamStructure: cleanText('teamStructure', () => facts.teamStructure),
    itLandscape: cleanText('itLandscape', () => facts.itLandscape),
    specialFeatures: cleanList('specialFeatures', () => facts.specialFeatures),
    triggeredRegulations: cleanList('triggeredRegulations', () => facts.triggeredRegulations),
    primaryUseCases: cleanList('primaryUseCases', () => facts.primaryUseCases),
    // Copied so the result does not share the object with the input; the
    // tags themselves are not scanned.
    narrativeTags: guarded('narrativeTags', () => ({ ...facts.narrativeTags })),
  }

  return {
    facts: { ...sanitized, __sanitized: true } as SanitizedFacts,
    audit: {
      sanitizationApplied: true,
      redactedFieldsCount: redactedFields.length,
      redactedFieldNames: redactedFields,
    },
  }
}
|
|
|
|
/**
 * Type guard: reports whether `facts` carries the sanitizer's brand.
 *
 * Only the marker is inspected — the value must be a non-null object whose
 * `__sanitized` property is strictly `true`. The content itself is not scanned.
 */
export function isSanitized(facts: unknown): facts is SanitizedFacts {
  if (typeof facts !== 'object' || facts === null) {
    return false
  }
  if (!('__sanitized' in facts)) {
    return false
  }
  return (facts as { __sanitized?: unknown }).__sanitized === true
}
|
|
|
|
// ============================================================================
|
|
// Private Helpers
|
|
// ============================================================================
|
|
|
|
/**
 * Redacts PII-like tokens in a single string.
 *
 * E-mail addresses, IP addresses, UUIDs, API keys and phone-number-like digit
 * runs (6 or more digits) are each replaced by `[REDACTED]`.
 *
 * @param value - Text to clean. Falsy values (e.g. the empty string) are
 *                returned unchanged.
 * @param fieldName - Name recorded in `redactedFields` when something was redacted.
 * @param redactedFields - Audit list; `fieldName` is appended at most once per call.
 * @returns The text with all detected tokens replaced.
 */
function sanitizeString(
  value: string,
  fieldName: string,
  redactedFields: string[]
): string {
  if (!value) return value

  const placeholder = '[REDACTED]'

  // Structured tokens are handled before phone numbers. The phone pattern is
  // very permissive and would otherwise consume part of an IP address or the
  // digit tail of a UUID, after which the dedicated pattern no longer matches
  // and the remaining fragment stays in the text.
  const structuredPatterns: readonly RegExp[] = [
    PII_PATTERNS.email,
    PII_PATTERNS.ipAddress,
    PII_PATTERNS.internalId,
    PII_PATTERNS.apiKey,
  ]

  let result = value
  for (const pattern of structuredPatterns) {
    // replace() with a global regex always starts at index 0 and leaves
    // lastIndex at 0, so the outcome does not depend on state left behind on
    // the shared pattern objects by earlier test()/exec() calls.
    result = result.replace(pattern, placeholder)
  }

  // Phone numbers: only digit runs with at least 6 digits are treated as PII,
  // shorter matches (e.g. 5-digit numbers) are kept as they are.
  result = result.replace(PII_PATTERNS.phone, (candidate) =>
    candidate.replace(/\D/g, '').length >= 6 ? placeholder : candidate
  )

  // Every redaction swaps a match for the placeholder, so any redaction
  // necessarily changes the text.
  if (result !== value) {
    redactedFields.push(fieldName)
  }

  return result
}
|
|
|
|
/**
 * Sanitizes an address-like field: keeps city/region text, removes German
 * street + house number combinations and 5-digit postal codes.
 *
 * The generic `sanitizeString` pass runs first. If a street or postal code is
 * removed afterwards, separators left dangling by the removal are tidied up.
 *
 * @param value - Address text. Falsy values are returned unchanged.
 * @param fieldName - Name recorded in `redactedFields` when something was removed.
 * @param redactedFields - Audit list; this function adds `fieldName` only if it
 *                         is not already present.
 * @returns The cleaned, trimmed text.
 */
function sanitizeAddress(
  value: string,
  fieldName: string,
  redactedFields: string[]
): string {
  if (!value) return value

  // Generic PII pass first (e-mail, phone, IP, ...).
  const generic = sanitizeString(value, fieldName, redactedFields)

  // Street name + house number (German). The leading boundary is a lookbehind
  // rather than `\b`: `\b` only knows ASCII word characters, so for names that
  // start with Ä/Ö/Ü the match would begin after the umlaut and leave it behind.
  const streetPattern =
    /(?<![A-Za-zÄÖÜäöüß0-9_])[A-ZÄÖÜ][a-zäöüß]+(?:straße|str\.|weg|gasse|platz|allee|ring|damm)\s*\d+[a-z]?\b/gi

  // Postal code: 5 digits followed by whitespace (German format).
  const plzPattern = /\b\d{5}\s+/g

  const stripped = generic.replace(streetPattern, '').replace(plzPattern, '')

  // Nothing address-specific found: same result as the generic pass.
  if (stripped === generic) return generic.trim()

  if (!redactedFields.includes(fieldName)) {
    redactedFields.push(fieldName)
  }

  // Removing a part can leave ", Berlin" or "Mitte, , Berlin" behind; collapse
  // repeated commas and drop separators at either end.
  return stripped
    .replace(/\s*,(?:\s*,)+/g, ',')
    .replace(/^[\s,;]+|[\s,;]+$/g, '')
}
|
|
|
|
/**
 * Scans every string value of already sanitized facts for leftover PII.
 *
 * Checks for e-mail addresses, IP addresses and API keys only; phone numbers
 * and UUIDs are not checked here.
 *
 * @param facts - Sanitized facts to verify.
 * @returns One warning per finding, naming the path of the offending value;
 *          an empty array if nothing was found.
 */
export function validateNoRemainingPII(facts: SanitizedFacts): string[] {
  const checks: ReadonlyArray<{ pattern: RegExp; label: string }> = [
    { pattern: PII_PATTERNS.email, label: 'Verbleibende E-Mail' },
    { pattern: PII_PATTERNS.ipAddress, label: 'Verbleibende IP-Adresse' },
    { pattern: PII_PATTERNS.apiKey, label: 'Verbleibender API-Key' },
  ]

  const warnings: string[] = []
  const entries = extractAllStringValues(facts as unknown as Record<string, unknown>)

  for (const { path, value } of entries) {
    // The brand itself is never reported.
    if (path === '__sanitized') continue

    for (const { pattern, label } of checks) {
      // search() always scans from index 0 and restores the regex's lastIndex
      // afterwards. test() on these shared global patterns would leave
      // lastIndex pointing behind the match and could make a later test()
      // elsewhere in this module skip the beginning of its input.
      if (value.search(pattern) !== -1) {
        warnings.push(`${label} in ${path}`)
      }
    }
  }

  return warnings
}
|
|
|
|
/**
 * Recursively collects every string value reachable from `obj`.
 *
 * Paths use dot notation for object keys and `[i]` for array positions, e.g.
 * `a.b[2].c`. Non-string primitives and `null` are skipped. Array elements
 * that are themselves objects (or arrays) are descended into as objects.
 *
 * @param obj - Object to walk.
 * @param prefix - Path of `obj` within the overall structure; empty at the root.
 * @returns All string values with their paths, in traversal order.
 */
function extractAllStringValues(
  obj: Record<string, unknown>,
  prefix = ''
): Array<{ path: string; value: string }> {
  const collected: Array<{ path: string; value: string }> = []

  for (const [key, val] of Object.entries(obj)) {
    const path = prefix ? `${prefix}.${key}` : key

    if (typeof val === 'string') {
      collected.push({ path, value: val })
      continue
    }

    // Remaining primitives (and null) carry no strings.
    if (val === null || typeof val !== 'object') continue

    if (!Array.isArray(val)) {
      collected.push(...extractAllStringValues(val as Record<string, unknown>, path))
      continue
    }

    val.forEach((item: unknown, index: number) => {
      const itemPath = `${path}[${index}]`
      if (typeof item === 'string') {
        collected.push({ path: itemPath, value: item })
      } else if (typeof item === 'object' && item !== null) {
        collected.push(...extractAllStringValues(item as Record<string, unknown>, itemPath))
      }
    })
  }

  return collected
}
|