Files
breakpilot-compliance/admin-compliance/lib/sdk/drafting-engine/sanitizer.ts
Benjamin Admin 95fcba34cd
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-ai-compliance (push) Failing after 30s
CI / test-python-backend-compliance (push) Successful in 30s
CI / test-python-document-crawler (push) Successful in 21s
CI / test-python-dsms-gateway (push) Successful in 17s
fix(quality): Ruff/CVE/TS-Fixes, 104 neue Tests, Complexity-Refactoring
- Ruff: 144 auto-fixes (unused imports, == None → is None), F821/F811/F841 manuell
- CVEs: python-multipart>=0.0.22, weasyprint>=68.0, pillow>=12.1.1, npm audit fix (0 vulns)
- TS: 5 tote Drafting-Engine-Dateien entfernt, allowed-facts/sanitizer/StepHeader/context fixes
- Tests: +104 (ISMS 58, Evidence 18, VVT 14, Generation 14) → 1449 passed
- Refactoring: collect_ci_evidence (F→A), row_to_response (E→A), extract_requirements (E→A)
- Dead Code: pca-platform, 7 Go-Handler, dsr_api.py, duplicate Schemas entfernt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-07 19:00:33 +01:00

299 lines
8.8 KiB
TypeScript

/**
* PII Sanitizer — Bereinigt Kontextdaten vor LLM-Aufruf
*
* Entfernt personenbezogene Daten (PII) aus AllowedFacts
* bevor sie an das LLM weitergegeben werden.
*
* Bei Fehler: Hard Abort — kein LLM-Aufruf ohne erfolgreiche Sanitization.
*/
import type { AllowedFacts } from './allowed-facts'
// ============================================================================
// Types
// ============================================================================
/**
 * Fact set that has passed through the sanitizer (PII-free).
 *
 * Structurally an `AllowedFacts` object plus the literal marker
 * `__sanitized: true`. `sanitizeAllowedFacts` adds the marker to its result
 * and `isSanitized` checks for it at runtime.
 */
export type SanitizedFacts = AllowedFacts & {
/** Marker property; always the literal `true`. */
__sanitized: true
}
/**
 * Audit record describing what a sanitization run changed.
 */
export interface SanitizationAudit {
/** `sanitizeAllowedFacts` sets this to `true` on every result it returns. */
sanitizationApplied: boolean
/** Number of entries in `redactedFieldNames`. */
redactedFieldsCount: number
/** Paths of the fields in which something was redacted, e.g. `companyName` or `specialFeatures[0]`. */
redactedFieldNames: string[]
}
/**
 * Result of a sanitization run: the cleaned facts together with the audit
 * record describing which fields were touched.
 */
export interface SanitizationResult {
/** Sanitized copy of the input facts, carrying the `__sanitized` marker. */
facts: SanitizedFacts
/** Summary of the redactions that were applied. */
audit: SanitizationAudit
}
/**
 * Error type that signals a failed sanitization (triggers a hard abort).
 *
 * In addition to the regular error message it exposes the name of the
 * affected field and a short reason.
 */
export class SanitizationError extends Error {
  public readonly field: string
  public readonly reason: string

  constructor(message: string, field: string, reason: string) {
    super(message)
    // Same assignment order as the parameter-property form: fields first, then the name.
    this.field = field
    this.reason = reason
    this.name = 'SanitizationError'
  }
}
// ============================================================================
// PII Detection Patterns
// ============================================================================
/**
 * Regular expressions used to detect PII-like tokens in free text.
 *
 * Every pattern carries the `g` flag so that `String.prototype.replace`
 * substitutes all occurrences. Note that a RegExp with the `g` flag keeps its
 * `lastIndex` between `test()`/`exec()` calls, and these instances are shared
 * by every function in this module; code that calls `test()` on them has to
 * make sure `lastIndex` is 0 beforehand.
 */
const PII_PATTERNS = {
// Email address: local part, `@`, domain, and a TLD of at least two letters.
email: /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g,
// Phone-like digit sequence: optional 1-3 digit prefix (optionally with `+`),
// a 2-5 digit group (optionally parenthesised) and a 3-10 digit group, separated
// by `-`, `.` or whitespace. Deliberately broad; sanitizeString only redacts
// matches that contain at least 6 digits.
phone: /(\+?\d{1,3}[-.\s]?)?\(?\d{2,5}\)?[-.\s]?\d{3,10}/g,
// Dotted quad of 1-3 digit groups (IPv4 shape; octet range is not validated).
ipAddress: /\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b/g,
// Hex string in 8-4-4-4-12 grouping (UUID shape), case-insensitive.
internalId: /\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b/gi,
// `sk-`, `pk-` or an `api key` / `api_key` / `api-key` style prefix followed by
// at least 20 alphanumeric characters, case-insensitive.
apiKey: /\b(sk-|pk-|api[_-]?key[_-]?)[a-zA-Z0-9]{20,}\b/gi,
} as const
// ============================================================================
// Sanitizer
// ============================================================================
/**
 * Removes PII from an `AllowedFacts` object before it is handed to the LLM.
 *
 * The input is not mutated; a sanitized copy is returned together with an
 * audit record listing every field in which something was redacted.
 * `employeeCount` and `narrativeTags` are passed through unchanged
 * (`narrativeTags` as a shallow copy).
 *
 * Any unexpected failure while processing a field (for example a value that
 * is not a string, or a list field that is not an array) is reported as a
 * `SanitizationError` naming that field, so callers can hard-abort on one
 * well-defined error type instead of a bare `TypeError`.
 *
 * @param facts - Facts to sanitize.
 * @returns The sanitized facts (marked with `__sanitized: true`) and the audit record.
 * @throws {SanitizationError} If `facts` is not an object or a field cannot be sanitized.
 */
export function sanitizeAllowedFacts(facts: AllowedFacts): SanitizationResult {
  if (typeof facts !== 'object' || facts === null) {
    throw new SanitizationError(
      'Sanitization failed: facts must be a non-null object',
      'facts',
      'invalid input'
    )
  }

  const redactedFields: string[] = []

  // Runs one field-level sanitizer and converts unexpected failures into
  // SanitizationError so the documented hard-abort contract holds.
  const guarded = <T>(field: string, run: () => T): T => {
    try {
      return run()
    } catch (error) {
      if (error instanceof SanitizationError) throw error
      const reason = error instanceof Error ? error.message : String(error)
      throw new SanitizationError(
        `Sanitization failed for field "${field}": ${reason}`,
        field,
        reason
      )
    }
  }

  const cleanText = (value: string, field: string): string =>
    guarded(field, () => sanitizeString(value, field, redactedFields))

  const cleanList = (values: readonly string[], field: string): string[] =>
    guarded(field, () =>
      values.map((item, index) => cleanText(item, `${field}[${index}]`))
    )

  // Fields are sanitized in a fixed order so that the audit list is stable.
  const sanitized: AllowedFacts = {
    ...facts,
    // Company name: allowed (it is needed), but checked for embedded PII.
    companyName: cleanText(facts.companyName, 'companyName'),
    // Legal form: allowed, short value.
    legalForm: cleanText(facts.legalForm, 'legalForm'),
    // Industry: allowed.
    industry: cleanText(facts.industry, 'industry'),
    // Location: city/region is allowed, street and house number are not.
    location: guarded('location', () =>
      sanitizeAddress(facts.location, 'location', redactedFields)
    ),
    // employeeCount is not PII and is carried over by the spread above.
    // Team structure and IT landscape: allowed (generic), still checked.
    teamStructure: cleanText(facts.teamStructure, 'teamStructure'),
    itLandscape: cleanText(facts.itLandscape, 'itLandscape'),
    // List fields: every entry is checked individually.
    specialFeatures: cleanList(facts.specialFeatures, 'specialFeatures'),
    triggeredRegulations: cleanList(facts.triggeredRegulations, 'triggeredRegulations'),
    primaryUseCases: cleanList(facts.primaryUseCases, 'primaryUseCases'),
    // Narrative tags are left as they are; only a shallow copy is taken.
    narrativeTags: { ...facts.narrativeTags },
  }

  return {
    facts: { ...sanitized, __sanitized: true } as SanitizedFacts,
    audit: {
      sanitizationApplied: true,
      redactedFieldsCount: redactedFields.length,
      redactedFieldNames: redactedFields,
    },
  }
}
/**
 * Type guard: reports whether a value carries the sanitizer's marker.
 *
 * @param facts - Arbitrary value to inspect.
 * @returns `true` only for non-null objects whose `__sanitized` property is strictly `true`.
 */
export function isSanitized(facts: unknown): facts is SanitizedFacts {
  if (typeof facts !== 'object' || facts === null) {
    return false
  }
  if (!('__sanitized' in facts)) {
    return false
  }
  return (facts as SanitizedFacts).__sanitized === true
}
// ============================================================================
// Private Helpers
// ============================================================================
/**
 * Redacts PII-like tokens in a single string value.
 *
 * Emails, API keys, UUID-shaped IDs, IPv4-shaped addresses and phone-like
 * digit sequences (with at least 6 digits) are replaced by `[REDACTED]`.
 * If anything was replaced, `fieldName` is appended to `redactedFields`.
 *
 * @param value - Text to clean; falsy values are returned unchanged.
 * @param fieldName - Path of the field, recorded in the audit list on redaction.
 * @param redactedFields - Audit list that is appended to in place.
 * @returns The cleaned text.
 */
function sanitizeString(
  value: string,
  fieldName: string,
  redactedFields: string[]
): string {
  if (!value) return value

  const placeholder = '[REDACTED]'
  let result = value

  // Well-structured tokens are handled before the broad phone pattern.
  // Otherwise the phone pattern consumes digit runs inside them (for example
  // "192.168.1.1" would become "[REDACTED].1.1") and the specific pattern no
  // longer matches, leaving part of the token in the output.
  //
  // String.prototype.replace with a global regex always scans from index 0,
  // so this does not depend on the lastIndex state of the shared patterns.
  const structuredPatterns = [
    PII_PATTERNS.email,
    PII_PATTERNS.apiKey,
    PII_PATTERNS.internalId,
    PII_PATTERNS.ipAddress,
  ]
  for (const pattern of structuredPatterns) {
    result = result.replace(pattern, placeholder)
  }

  // Phone numbers: the pattern is permissive, so only matches containing at
  // least 6 digits are redacted. The replacer callback substitutes each match
  // at the position where it was found.
  result = result.replace(PII_PATTERNS.phone, (candidate) =>
    candidate.replace(/\D/g, '').length >= 6 ? placeholder : candidate
  )

  // A replacement always changes the text, so inequality means "redacted".
  if (result !== value) {
    redactedFields.push(fieldName)
  }
  return result
}
/**
 * Cleans a location value: keeps city/region, strips street + house number
 * and German 5-digit postal codes.
 *
 * The generic PII cleaning of `sanitizeString` is applied first. Street
 * detection only covers single-word street names ending in one of the listed
 * suffixes (e.g. "Musterstraße 12", "Hauptstr. 5a"). When something was
 * removed, separators left dangling by the removal are tidied up, so
 * "Musterstraße 12, 10115 Berlin" yields "Berlin" rather than ", Berlin".
 *
 * @param value - Location text; falsy values are returned unchanged.
 * @param fieldName - Path of the field, recorded in the audit list on redaction.
 * @param redactedFields - Audit list that is appended to in place (at most once per field here).
 * @returns The cleaned, trimmed location text.
 */
function sanitizeAddress(
  value: string,
  fieldName: string,
  redactedFields: string[]
): string {
  if (!value) return value

  // Generic PII cleaning first.
  const generic = sanitizeString(value, fieldName, redactedFields)

  // Street + house number (German). The leading lookbehind replaces `\b`,
  // which is ASCII-only and would otherwise let a name starting with an
  // umlaut ("Ährenweg 3") match from its second letter, leaving "Ä" behind.
  const streetPattern =
    /(?<![A-Za-zÄÖÜäöüß])[A-ZÄÖÜ][a-zäöüß]+(?:straße|str\.|weg|gasse|platz|allee|ring|damm)\s*\d+[a-z]?\b/gi
  const withoutStreet = generic.replace(streetPattern, '')

  // Postal code (5 digits, German). Also matched at the end of the text or in
  // front of punctuation, not only when followed by whitespace.
  const plzPattern = /\b\d{5}\b\s*/g
  const withoutPlz = withoutStreet.replace(plzPattern, '')

  if (withoutPlz === generic) {
    // Nothing address-specific was removed.
    return generic.trim()
  }

  if (!redactedFields.includes(fieldName)) {
    redactedFields.push(fieldName)
  }

  // Tidy up what the removals left behind: doubled whitespace, empty comma
  // segments, and leading/trailing separators.
  return withoutPlz
    .replace(/\s{2,}/g, ' ')
    .replace(/\s+,/g, ',')
    .replace(/,(?:\s*,)+/g, ',')
    .replace(/^[\s,;]+|[\s,;]+$/g, '')
}
/**
 * Scans a sanitized facts object for PII that is still present.
 *
 * Every string value reachable from `facts` is checked against the email,
 * IP-address and API-key patterns. The check is read-only: nothing is
 * redacted, findings are only reported.
 *
 * @param facts - Sanitized facts to verify.
 * @returns One warning message per finding; empty when nothing was found.
 */
export function validateNoRemainingPII(facts: SanitizedFacts): string[] {
  // The patterns are shared module-wide and carry the `g` flag, so `test()`
  // advances their `lastIndex` on a hit. Reset it before AND after each check;
  // a stale `lastIndex` would make the next `test()` on the same pattern start
  // mid-string and miss PII near the beginning of a value.
  const containsMatch = (pattern: RegExp, text: string): boolean => {
    pattern.lastIndex = 0
    const hit = pattern.test(text)
    pattern.lastIndex = 0
    return hit
  }

  const warnings: string[] = []
  const allValues = extractAllStringValues(facts as unknown as Record<string, unknown>)

  for (const { path, value } of allValues) {
    if (path === '__sanitized') continue

    if (containsMatch(PII_PATTERNS.email, value)) {
      warnings.push(`Verbleibende E-Mail in ${path}`)
    }
    if (containsMatch(PII_PATTERNS.ipAddress, value)) {
      warnings.push(`Verbleibende IP-Adresse in ${path}`)
    }
    if (containsMatch(PII_PATTERNS.apiKey, value)) {
      warnings.push(`Verbleibender API-Key in ${path}`)
    }
  }

  return warnings
}
/**
 * Recursively collects every string value reachable from `obj`.
 *
 * Object keys are joined with `.` and array positions are rendered as `[i]`.
 * Values that are neither strings nor non-null objects are skipped. Entries
 * are returned in traversal order.
 *
 * @param obj - Object to walk.
 * @param prefix - Path of `obj` inside the outer structure; empty for the root.
 * @returns One `{ path, value }` entry per string found.
 */
function extractAllStringValues(
  obj: Record<string, unknown>,
  prefix = ''
): Array<{ path: string; value: string }> {
  const found: Array<{ path: string; value: string }> = []

  // Anything that can be descended into (plain objects and nested arrays).
  const isNested = (candidate: unknown): candidate is Record<string, unknown> =>
    typeof candidate === 'object' && candidate !== null

  for (const [key, entry] of Object.entries(obj)) {
    const entryPath = prefix ? `${prefix}.${key}` : key

    if (typeof entry === 'string') {
      found.push({ path: entryPath, value: entry })
      continue
    }

    if (Array.isArray(entry)) {
      for (const [index, item] of entry.entries()) {
        const itemPath = `${entryPath}[${index}]`
        if (typeof item === 'string') {
          found.push({ path: itemPath, value: item })
        } else if (isNested(item)) {
          found.push(...extractAllStringValues(item, itemPath))
        }
      }
      continue
    }

    if (isNested(entry)) {
      found.push(...extractAllStringValues(entry, entryPath))
    }
  }

  return found
}