// breakpilot-compliance/admin-compliance/lib/sdk/drafting-engine/allowed-facts.ts

/**
 * Allowed Facts Governance — Kontrolliertes Faktenbudget fuer LLM
 *
 * Definiert welche Fakten das LLM in Prosa-Bloecken verwenden darf
 * und welche Themen explizit verboten sind.
 *
 * Verhindert Halluzinationen durch explizites Whitelisting.
 */

import type { SDKState, CompanyProfile } from '../types'
import type { NarrativeTags } from './narrative-tags'

// ============================================================================
// Types
// ============================================================================

/**
 * Explicit budget of facts the LLM is allowed to use.
 *
 * Instances are produced by `buildAllowedFacts`.
 */
export interface AllowedFacts {
  // Company profile
  companyName: string
  legalForm: string
  industry: string
  location: string
  // A single number; `buildAllowedFacts` derives it from the profile's
  // employee-count string and uses 0 when that string is missing or unparseable.
  employeeCount: number

  // Organization
  teamStructure: string
  itLandscape: string
  specialFeatures: string[]

  // Compliance context
  triggeredRegulations: string[]
  primaryUseCases: string[]

  // Narrative tags (deterministic); passed through unchanged by `buildAllowedFacts`
  narrativeTags: NarrativeTags
}

/** Rules stating which topics are allowed and which are forbidden. */
export interface FactPolicy {
  // Topics the LLM may write about
  allowedTopics: string[]
  // Topics the LLM must not write about
  disallowedTopics: string[]
}

// ============================================================================
// Default Policy
// ============================================================================

/**
 * Policy used when a caller does not pass its own.
 *
 * It is the default argument of `disallowedTopicsToPromptString` and
 * `checkForDisallowedContent`. The topic labels are German because they are
 * emitted verbatim into the prompt.
 */
export const DEFAULT_FACT_POLICY: FactPolicy = {
  allowedTopics: [
    'Branche',
    'Unternehmensgroesse',
    'Teamstruktur',
    'IT-Strategie',
    'Regulatorischer Kontext',
    'Anwendungsfaelle',
    'Organisationsform',
    'Standort',
    'Rechtsform',
  ],
  disallowedTopics: [
    'Umsatz',
    'Gewinn',
    'Kundenzahlen',
    'konkrete Zertifizierungen',
    'interne Tool-Namen',
    'Personennamen',
    'E-Mail-Adressen',
    'Telefonnummern',
    'IP-Adressen',
    'konkrete Prozentwerte',
    'konkrete Scores',
    'Compliance-Level-Bezeichnungen',
    'interne Projektnamen',
    'Passwoerter',
    'API-Keys',
    'Vertragsinhalte',
    'Gehaltsinformationen',
  ],
}

// ============================================================================
// Builder
// ============================================================================

/**
 * Builds the `AllowedFacts` for one SDK state.
 *
 * Only the fields read below are carried over; everything else in the state
 * stays out of the fact budget.
 *
 * @param state - SDK state; `companyProfile`, `complianceScope` and `useCases` are read.
 * @param narrativeTags - Passed through unchanged.
 * @returns The fact budget, with fallbacks ('Unbekannt', '', 0) where the profile has no value.
 */
export function buildAllowedFacts(
  state: SDKState,
  narrativeTags: NarrativeTags
): AllowedFacts {
  const { companyProfile, complianceScope } = state

  // The industry may be a list or a single value; lists are flattened to "a, b".
  const rawIndustry = companyProfile?.industry
  const industry = Array.isArray(rawIndustry) ? rawIndustry.join(', ') : (rawIndustry ?? '')

  const facts: AllowedFacts = {
    companyName: companyProfile?.companyName ?? 'Unbekannt',
    legalForm: companyProfile?.legalForm ?? '',
    industry,
    location: companyProfile?.headquartersCity ?? '',
    employeeCount: parseEmployeeCount(companyProfile?.employeeCount),

    teamStructure: deriveTeamStructure(companyProfile),
    itLandscape: deriveItLandscape(companyProfile),
    specialFeatures: deriveSpecialFeatures(companyProfile),

    triggeredRegulations: deriveTriggeredRegulations(complianceScope),
    primaryUseCases: derivePrimaryUseCases(state),

    narrativeTags,
  }
  return facts
}

// ============================================================================
// Serialization
// ============================================================================

/**
 * Renders the allowed facts as a bullet list for the LLM prompt.
 *
 * The six scalar lines are always present; empty values (and an employee
 * count of 0) are shown as "nicht angegeben". The three list lines are only
 * added when the list is non-empty.
 *
 * @param facts - Fact budget to render.
 * @returns Newline-separated "- Label: value" lines.
 */
export function allowedFactsToPromptString(facts: AllowedFacts): string {
  const missing = 'nicht angegeben'
  const bullet = (label: string, value: string | number): string => `- ${label}: ${value}`

  // The legal form is appended in parentheses only when it is known.
  const legalFormSuffix = facts.legalForm ? ` (${facts.legalForm})` : ''

  const output = [
    bullet('Firma', `${facts.companyName}${legalFormSuffix}`),
    bullet('Branche', facts.industry || missing),
    bullet('Standort', facts.location || missing),
    bullet('Mitarbeiter', facts.employeeCount || missing),
    bullet('Teamstruktur', facts.teamStructure || missing),
    bullet('IT-Umgebung', facts.itLandscape || missing),
  ]

  const optionalLists: Array<[string, string[]]> = [
    ['Relevante Regulierungen', facts.triggeredRegulations],
    ['Anwendungsfaelle', facts.primaryUseCases],
    ['Besonderheiten', facts.specialFeatures],
  ]
  for (const [label, items] of optionalLists) {
    if (items.length > 0) {
      output.push(bullet(label, items.join(', ')))
    }
  }

  return output.join('\n')
}

/**
 * Renders the forbidden topics of a policy as a bullet list for the LLM prompt.
 *
 * @param policy - Policy to render; defaults to `DEFAULT_FACT_POLICY`.
 * @returns One "- topic" line per entry, joined with newlines ('' for an empty list).
 */
export function disallowedTopicsToPromptString(policy: FactPolicy = DEFAULT_FACT_POLICY): string {
  const bullets: string[] = []
  for (const topic of policy.disallowedTopics) {
    bullets.push(`- ${topic}`)
  }
  return bullets.join('\n')
}

// ============================================================================
// Validation
// ============================================================================

/**
 * Scans a text for content that must not appear in LLM-generated prose.
 *
 * Every check is a regex heuristic over the raw text: percent values, scores,
 * compliance-level labels (L1-L4), e-mail addresses, phone numbers, IPv4
 * addresses and formal direct address (Sie/Ihr...).
 *
 * @param text - Text to scan.
 * @param policy - Kept for API compatibility; the checks are fixed and do not
 *   read the policy's topic lists.
 * @returns One German message per check that fired, in the order listed above;
 *   an empty array when nothing was found.
 */
export function checkForDisallowedContent(
  text: string,
  policy: FactPolicy = DEFAULT_FACT_POLICY
): string[] {
  const violations: string[] = []

  const ipv4Pattern = /\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b/
  const phonePattern = /(\+?\d{1,3}[-.\s]?)?\(?\d{2,5}\)?[-.\s]?\d{3,10}/g

  // Percent values such as "25%" or "25 %"
  if (/\d+\s*%/.test(text)) {
    violations.push('Konkrete Prozentwerte gefunden')
  }

  // The word "score" (any case) followed by a number
  if (/score[:\s]*\d+/i.test(text)) {
    violations.push('Konkrete Scores gefunden')
  }

  // Compliance level labels
  if (/\b(L1|L2|L3|L4)\b/.test(text)) {
    violations.push('Compliance-Level-Bezeichnungen (L1-L4) gefunden')
  }

  // E-mail addresses
  if (/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/.test(text)) {
    violations.push('E-Mail-Adresse gefunden')
  }

  // Phone numbers. The pattern is loose, so IPv4 addresses (reported by their
  // own check) and dd.mm.yyyy dates are masked first; otherwise a date such as
  // "25.05.2018" would be reported as a phone number. '#' is used as the mask
  // because the phone pattern can never match across it.
  const phoneHaystack = text
    .replace(new RegExp(ipv4Pattern.source, 'g'), '#')
    .replace(/\b\d{1,2}\.\d{1,2}\.\d{4}\b/g, '#')
  const phoneCandidates = phoneHaystack.match(phonePattern) ?? []
  // Short digit runs (e.g. an employee count) are not phone numbers: require >= 6 digits.
  if (phoneCandidates.some(candidate => candidate.replace(/\D/g, '').length >= 6)) {
    violations.push('Telefonnummer gefunden')
  }

  // IPv4 addresses
  if (ipv4Pattern.test(text)) {
    violations.push('IP-Adresse gefunden')
  }

  // Formal direct address: Sie, Ihnen and every inflected form of "Ihr"
  // (Ihr, Ihre, Ihrem, Ihren, Ihrer, Ihres). Capitalised forms only.
  if (/\b(Sie|Ihnen|Ihr(?:e[mnrs]?)?)\b/.test(text)) {
    violations.push('Direkte Ansprache (Sie/Ihr) gefunden')
  }

  return violations
}

// ============================================================================
// Private Helpers
// ============================================================================

/**
 * Turns an employee-count string into a single number.
 *
 * - "1000+"  -> the lower bound (1000)
 * - "50-249" -> the rounded midpoint of the range (150)
 * - anything else -> whatever `parseInt` reads from the start of the string
 *
 * @param value - Employee-count string, e.g. "1-9", "50-249", "1000+".
 * @returns The parsed number, or 0 when the value is missing, empty or not numeric.
 */
function parseEmployeeCount(value: string | undefined | null): number {
  if (!value) {
    return 0
  }

  // Open-ended bucket ("1000+"): the lower bound stands in for the bucket.
  const openEnded = value.match(/^(\d+)\+$/)
  if (openEnded) {
    const [, lowerBound] = openEnded
    return Number.parseInt(lowerBound, 10)
  }

  // Closed range ("50-249"): use the midpoint.
  const closedRange = value.match(/^(\d+)-(\d+)$/)
  if (closedRange) {
    const [, from, to] = closedRange
    const sum = Number.parseInt(from, 10) + Number.parseInt(to, 10)
    return Math.round(sum / 2)
  }

  // Plain number, or a string that merely starts with one.
  const plain = Number.parseInt(value, 10)
  if (Number.isNaN(plain)) {
    return 0
  }
  return plain
}

/**
 * Derives a coarse company-size label from the profile's employee count.
 *
 * @param profile - Company profile, or null when none exists.
 * @returns 'Konzernstruktur' (> 500), 'mittelstaendisch' (> 50),
 *   'Kleinunternehmen' (1-50), or '' when there is no profile or the
 *   employee count is unknown.
 */
function deriveTeamStructure(profile: CompanyProfile | null): string {
  if (!profile) return ''
  const count = parseEmployeeCount(profile.employeeCount)
  // parseEmployeeCount yields 0 for a missing or unparseable count. Labelling
  // that as 'Kleinunternehmen' would hand the LLM a fact nobody provided, so
  // an unknown size stays empty, like a missing profile.
  if (count <= 0) return ''
  if (count > 500) return 'Konzernstruktur'
  if (count > 50) return 'mittelstaendisch'
  return 'Kleinunternehmen'
}

/**
 * Derives an IT-landscape label from the profile's business model.
 *
 * @param profile - Company profile, or null when none exists.
 * @returns '' without a profile, 'Cloud-First' when the business model
 *   contains 'SaaS' or 'Cloud', otherwise 'Hybrid' (also when no business
 *   model is set).
 */
function deriveItLandscape(profile: CompanyProfile | null): string {
  if (!profile) {
    return ''
  }
  const model = profile.businessModel
  const isCloudModel = model?.includes('SaaS') || model?.includes('Cloud')
  return isCloudModel ? 'Cloud-First' : 'Hybrid'
}

/**
 * Collects special-feature labels for the profile.
 *
 * @param profile - Company profile, or null when none exists.
 * @returns 'Grossunternehmen' when the parsed employee count exceeds 250 and
 *   'Interner DSB benannt' when `dpoName` is set, in that order; [] without a profile.
 */
function deriveSpecialFeatures(profile: CompanyProfile | null): string[] {
  if (!profile) {
    return []
  }
  const isLargeCompany = parseEmployeeCount(profile.employeeCount) > 250
  // NOTE(review): a non-empty dpoName is the only evidence used here; whether
  // that DPO is actually internal is not checked. Confirm against the profile schema.
  const hasDpo = Boolean(profile.dpoName)

  return [
    ...(isLargeCompany ? ['Grossunternehmen'] : []),
    ...(hasDpo ? ['Interner DSB benannt'] : []),
  ]
}

/**
 * Lists the regulations implied by the hard triggers of the scope decision.
 *
 * 'DSGVO' is always included. The rule id of each triggered hard trigger is
 * searched case-insensitively for the markers 'ai_act'/'ai-act', 'nis2' and
 * 'ttdsg'.
 *
 * @param scope - Compliance scope state, or null when no scope exists.
 * @returns Regulation names without duplicates: 'DSGVO' first, the rest in
 *   the order they were first triggered.
 */
function deriveTriggeredRegulations(
  scope: import('../compliance-scope-types').ComplianceScopeState | null
): string[] {
  if (!scope?.decision) return ['DSGVO']
  const regs = new Set<string>(['DSGVO'])
  for (const trigger of scope.decision.triggeredHardTriggers ?? []) {
    // Lowercase once so every marker is matched regardless of how the id is
    // cased (previously only some markers were checked in two casings).
    const id = trigger.rule.id.toLowerCase()
    if (id.includes('ai_act') || id.includes('ai-act')) regs.add('AI Act')
    if (id.includes('nis2')) regs.add('NIS2')
    if (id.includes('ttdsg')) regs.add('TTDSG')
  }
  return Array.from(regs)
}

/**
 * Returns the names of the first three use cases in the state.
 *
 * @param state - SDK state; only `useCases` is read.
 * @returns Up to three names, with 'Unbenannt' for a use case without a name;
 *   [] when there are no use cases.
 */
function derivePrimaryUseCases(state: SDKState): string[] {
  const useCases = state.useCases
  if (!useCases || useCases.length === 0) {
    return []
  }

  const names: string[] = []
  for (const useCase of useCases.slice(0, 3)) {
    names.push(useCase.name || 'Unbenannt')
  }
  return names
}