feat: Unified Compliance-Check — 8 document types in one form

New 3-tab structure: Website-Scan, Compliance-Check, Banner-Check.

Compliance-Check Tab (replaces Dokumenten-Pruefung + Impressum-Check):
- 8 document rows: DSI, Impressum, Social Media, Cookie, AGB,
  Nutzungsbedingungen, Widerruf, DSB-Kontakt
- Each row: URL input + "Text laden" + file upload + manual text
- "Text laden" extracts via consent-tester, shows in editable textarea
- User verifies/corrects text before checking
- Empty fields = "not present" → own finding

Business Profiler (business_profiler.py):
- Detects B2B/B2C/B2G from all documents together
- Recognizes regulated professions, online shops, editorial content
- Context-aware: INFO checks become PASS/FAIL based on profile

Backend: /compliance-check + /extract-text endpoints
Frontend: ComplianceCheckTab.tsx + DocumentRow.tsx
API proxies: compliance-check/route.ts + extract-text/route.ts

Also: Impressum regex fixes (Telefon, AG, Geschaeftsfuehrung)
and INFO severity for context-dependent checks.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-11 20:56:10 +02:00
parent b214cbc003
commit 0d0e705117
8 changed files with 1252 additions and 8 deletions
@@ -0,0 +1,39 @@
/**
* Unified Compliance Check Proxy
* POST: start check for all documents, GET: poll status
*/
import { NextRequest, NextResponse } from 'next/server'
const BACKEND_URL = process.env.BACKEND_API_URL || 'http://backend-compliance:8002'
export async function POST(request: NextRequest) {
try {
const body = await request.text()
const response = await fetch(`${BACKEND_URL}/api/compliance/agent/compliance-check`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body,
signal: AbortSignal.timeout(30000),
})
const data = await response.json()
return NextResponse.json(data, { status: response.status })
} catch (error) {
return NextResponse.json({ error: 'Pruefung konnte nicht gestartet werden' }, { status: 503 })
}
}
export async function GET(request: NextRequest) {
const checkId = request.nextUrl.searchParams.get('check_id')
if (!checkId) return NextResponse.json({ error: 'check_id required' }, { status: 400 })
try {
const response = await fetch(
`${BACKEND_URL}/api/compliance/agent/compliance-check/${checkId}`,
{ signal: AbortSignal.timeout(10000) },
)
const data = await response.json()
return NextResponse.json(data)
} catch {
return NextResponse.json({ error: 'Status-Abfrage fehlgeschlagen' }, { status: 503 })
}
}
@@ -0,0 +1,27 @@
/**
* Text Extraction Proxy — extract text from a URL via consent-tester
* POST: { url: string } -> { text, word_count, title, error }
*/
import { NextRequest, NextResponse } from 'next/server'
const BACKEND_URL = process.env.BACKEND_API_URL || 'http://backend-compliance:8002'
export async function POST(request: NextRequest) {
try {
const body = await request.text()
const response = await fetch(`${BACKEND_URL}/api/compliance/agent/extract-text`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body,
signal: AbortSignal.timeout(120000),
})
const data = await response.json()
return NextResponse.json(data, { status: response.status })
} catch (error) {
return NextResponse.json(
{ text: '', word_count: 0, title: '', error: 'Text-Extraktion fehlgeschlagen' },
{ status: 503 },
)
}
}
@@ -0,0 +1,352 @@
'use client'
import React, { useState, useCallback } from 'react'
import { ChecklistView } from './ChecklistView'
import { DocumentRow } from './DocumentRow'
const DOCUMENT_TYPES = [
{ id: 'dse', label: 'DSI (Datenschutzinformation)', required: true },
{ id: 'impressum', label: 'Impressum', required: true },
{ id: 'social_media', label: 'Social Media DSE', required: false },
{ id: 'cookie', label: 'Cookie-Richtlinie', required: false },
{ id: 'agb', label: 'AGB', required: false },
{ id: 'nutzungsbedingungen', label: 'Nutzungsbedingungen', required: false },
{ id: 'widerruf', label: 'Widerrufsbelehrung', required: false },
{ id: 'dsb', label: 'DSB-Kontakt', required: false },
] as const
type DocTypeId = typeof DOCUMENT_TYPES[number]['id']
interface DocState {
url: string
text: string
loading: boolean
error: string | null
}
type DocsState = Record<DocTypeId, DocState>
const STORAGE_KEY_STATE = 'compliance-check-state'
const STORAGE_KEY_RESULTS = 'compliance-check-results'
const STORAGE_KEY_HISTORY = 'compliance-check-history'
function emptyDocState(): DocState {
return { url: '', text: '', loading: false, error: null }
}
function initState(): DocsState {
if (typeof window === 'undefined') {
return Object.fromEntries(DOCUMENT_TYPES.map(d => [d.id, emptyDocState()])) as DocsState
}
try {
const saved = localStorage.getItem(STORAGE_KEY_STATE)
if (saved) {
const parsed = JSON.parse(saved) as Record<string, { url?: string; text?: string }>
return Object.fromEntries(
DOCUMENT_TYPES.map(d => [d.id, {
url: parsed[d.id]?.url || '',
text: parsed[d.id]?.text || '',
loading: false,
error: null,
}])
) as DocsState
}
} catch { /* ignore */ }
return Object.fromEntries(DOCUMENT_TYPES.map(d => [d.id, emptyDocState()])) as DocsState
}
function countWords(text: string): number {
if (!text.trim()) return 0
return text.trim().split(/\s+/).length
}
interface HistoryEntry {
date: string
docCount: number
findings: number
resultKey: string
}
export function ComplianceCheckTab() {
const [docs, setDocs] = useState<DocsState>(initState)
const [useAgent, setUseAgent] = useState(false)
const [loading, setLoading] = useState(false)
const [progress, setProgress] = useState('')
const [results, setResults] = useState<any>(() => {
if (typeof window === 'undefined') return null
try { const s = localStorage.getItem(STORAGE_KEY_RESULTS); return s ? JSON.parse(s) : null } catch { return null }
})
const [error, setError] = useState<string | null>(null)
const [history, setHistory] = useState<HistoryEntry[]>(() => {
if (typeof window === 'undefined') return []
try { return JSON.parse(localStorage.getItem(STORAGE_KEY_HISTORY) || '[]') } catch { return [] }
})
// Persist URLs and texts (not loading/error state)
React.useEffect(() => {
const toSave: Record<string, { url: string; text: string }> = {}
for (const [key, val] of Object.entries(docs)) {
toSave[key] = { url: val.url, text: val.text }
}
try { localStorage.setItem(STORAGE_KEY_STATE, JSON.stringify(toSave)) } catch { /* quota */ }
}, [docs])
const updateDoc = useCallback((docType: DocTypeId, patch: Partial<DocState>) => {
setDocs(prev => ({ ...prev, [docType]: { ...prev[docType], ...patch } }))
}, [])
const handleFetchText = useCallback(async (docType: DocTypeId) => {
const url = docs[docType].url.trim()
if (!url) return
updateDoc(docType, { loading: true, error: null })
try {
const res = await fetch('/api/sdk/v1/agent/extract-text', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ url }),
})
if (!res.ok) {
const msg = res.status === 404
? 'Seite nicht erreichbar'
: `Fehler beim Laden (${res.status})`
throw new Error(msg)
}
const data = await res.json()
updateDoc(docType, { text: data.text || '', loading: false })
} catch (e) {
updateDoc(docType, {
loading: false,
error: e instanceof Error ? e.message : 'Text konnte nicht geladen werden',
})
}
}, [docs, updateDoc])
const handleFileUpload = useCallback(async (docType: DocTypeId, file: File) => {
// For now, read as text. PDF/DOCX parsing can be added server-side later.
const reader = new FileReader()
reader.onload = () => {
updateDoc(docType, { text: reader.result as string })
}
reader.readAsText(file)
}, [updateDoc])
const filledCount = Object.values(docs).filter(d => d.url.trim() || d.text.trim()).length
const handleSubmit = async () => {
if (filledCount === 0) return
setLoading(true)
setError(null)
setResults(null)
setProgress('Compliance-Check wird gestartet...')
try {
const entries = DOCUMENT_TYPES
.filter(dt => docs[dt.id].url.trim() || docs[dt.id].text.trim())
.map(dt => ({
doc_type: dt.id,
label: dt.label,
url: docs[dt.id].url.trim(),
text: docs[dt.id].text.trim() || undefined,
}))
const startRes = await fetch('/api/sdk/v1/agent/compliance-check', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
entries,
use_agent: useAgent,
}),
})
if (!startRes.ok) throw new Error(`Pruefung konnte nicht gestartet werden: ${startRes.status}`)
const { check_id } = await startRes.json()
if (!check_id) throw new Error('Keine Check-ID erhalten')
// Poll for results
let attempts = 0
while (attempts < 120) {
await new Promise(r => setTimeout(r, 3000))
const pollRes = await fetch(`/api/sdk/v1/agent/compliance-check?check_id=${check_id}`)
if (!pollRes.ok) { attempts++; continue }
const pollData = await pollRes.json()
if (pollData.progress) setProgress(pollData.progress)
if (pollData.status === 'completed' && pollData.result) {
setResults(pollData.result)
setProgress('')
localStorage.setItem(STORAGE_KEY_RESULTS, JSON.stringify(pollData.result))
const resultKey = `compliance-check-result-${Date.now()}`
try { localStorage.setItem(resultKey, JSON.stringify(pollData.result)) } catch { /* quota */ }
const entry: HistoryEntry = {
date: new Date().toISOString(),
docCount: entries.length,
findings: pollData.result.total_findings || 0,
resultKey,
}
const updated = [entry, ...history].slice(0, 30)
setHistory(updated)
localStorage.setItem(STORAGE_KEY_HISTORY, JSON.stringify(updated))
break
}
if (pollData.status === 'failed') {
throw new Error(pollData.error || 'Pruefung fehlgeschlagen')
}
attempts++
}
if (attempts >= 120) throw new Error('Zeitlimit ueberschritten')
} catch (e) {
setError(e instanceof Error ? e.message : 'Unbekannter Fehler')
setProgress('')
} finally {
setLoading(false)
}
}
const loadFromHistory = (entry: HistoryEntry) => {
if (entry.resultKey) {
try {
const saved = localStorage.getItem(entry.resultKey)
if (saved) { setResults(JSON.parse(saved)); return }
} catch { /* ignore */ }
}
try {
const last = localStorage.getItem(STORAGE_KEY_RESULTS)
if (last) setResults(JSON.parse(last))
} catch { /* ignore */ }
}
return (
<div className="space-y-4">
{/* Info box */}
<div className="bg-purple-50 border border-purple-200 rounded-lg p-4">
<h3 className="text-sm font-semibold text-purple-900">Compliance-Check (Alle Dokumente)</h3>
<p className="text-xs text-purple-700 mt-1">
Geben Sie die URLs Ihrer Rechtstexte ein oder laden Sie die Dokumente hoch.
Das System prueft alle Pflichtangaben nach DSGVO, TDDDG, TMG und UWG.
Pflichtdokumente sind mit * markiert.
</p>
</div>
{/* Document rows */}
<div className="space-y-2">
{DOCUMENT_TYPES.map(dt => (
<DocumentRow
key={dt.id}
label={dt.label}
docType={dt.id}
required={dt.required}
url={docs[dt.id].url}
text={docs[dt.id].text}
loading={docs[dt.id].loading}
error={docs[dt.id].error}
wordCount={countWords(docs[dt.id].text)}
onUrlChange={url => updateDoc(dt.id, { url })}
onFetchText={() => handleFetchText(dt.id)}
onTextChange={text => updateDoc(dt.id, { text })}
onFileUpload={file => handleFileUpload(dt.id, file)}
/>
))}
</div>
{/* Agent toggle + submit */}
<div className="flex items-center justify-between">
<button
type="button"
onClick={() => setUseAgent(!useAgent)}
className={`flex items-center gap-2 px-3 py-1.5 rounded-full text-xs font-medium border transition-colors ${
useAgent
? 'bg-emerald-100 border-emerald-300 text-emerald-800'
: 'bg-gray-50 border-gray-200 text-gray-500 hover:bg-gray-100'
}`}
>
<span className={`w-2 h-2 rounded-full ${useAgent ? 'bg-emerald-500' : 'bg-gray-300'}`} />
{useAgent ? 'KI-Agent aktiv (alle MCs)' : 'KI-Agent aus'}
</button>
<span className="text-xs text-gray-500">
{filledCount} von {DOCUMENT_TYPES.length} Dokumenten ausgefuellt
</span>
</div>
{/* Submit button */}
<button
onClick={handleSubmit}
disabled={loading || filledCount === 0}
className="w-full px-4 py-3 bg-purple-600 text-white rounded-lg font-medium hover:bg-purple-700 disabled:opacity-50 transition-colors text-sm flex items-center justify-center gap-2"
>
{loading ? (
<>
<svg className="animate-spin w-4 h-4" fill="none" viewBox="0 0 24 24">
<circle className="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" strokeWidth="4" />
<path className="opacity-75" fill="currentColor" d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4z" />
</svg>
Pruefe...
</>
) : (
`Compliance-Check starten (${filledCount} Dokument${filledCount !== 1 ? 'e' : ''})`
)}
</button>
{/* Progress */}
{progress && (
<div className="bg-purple-50 border border-purple-200 rounded-lg p-3 text-sm text-purple-700 flex items-center gap-3">
<svg className="animate-spin w-4 h-4 text-purple-500 shrink-0" fill="none" viewBox="0 0 24 24">
<circle className="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" strokeWidth="4" />
<path className="opacity-75" fill="currentColor" d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4z" />
</svg>
{progress}
</div>
)}
{/* Error */}
{error && (
<div className="bg-red-50 border border-red-200 rounded-lg p-3 text-sm text-red-700">{error}</div>
)}
{/* Results */}
{results && results.results && (
<div className="bg-white border border-gray-200 rounded-xl p-6 shadow-sm">
<ChecklistView results={results.results} />
{/* Email status */}
{results.email_status && (
<div className="mt-3 text-xs text-gray-500 flex items-center gap-2">
<span className={`w-2 h-2 rounded-full ${results.email_status === 'sent' ? 'bg-green-400' : 'bg-gray-300'}`} />
E-Mail: {results.email_status === 'sent' ? 'Gesendet' : results.email_status}
</div>
)}
</div>
)}
{/* History */}
{history.length > 0 && (
<div className="border border-gray-200 rounded-xl p-4">
<h4 className="text-sm font-medium text-gray-700 mb-2">Letzte Compliance-Checks</h4>
<div className="space-y-1">
{history.map((h, i) => (
<button
key={i}
onClick={() => loadFromHistory(h)}
className="w-full flex items-center justify-between text-sm py-2 px-2 rounded-lg border border-gray-50 hover:border-purple-200 hover:bg-purple-50/30 transition-all text-left"
>
<span className="text-gray-600">
{new Date(h.date).toLocaleDateString('de-DE', {
day: '2-digit', month: '2-digit', year: 'numeric',
hour: '2-digit', minute: '2-digit',
})}
</span>
<div className="flex items-center gap-3">
<span className="text-xs text-gray-500">{h.docCount} Dok.</span>
<span className={`text-xs font-medium ${h.findings > 0 ? 'text-amber-600' : 'text-green-600'}`}>
{h.findings} Findings
</span>
</div>
</button>
))}
</div>
</div>
)}
</div>
)
}
@@ -0,0 +1,163 @@
'use client'
import React, { useState, useRef } from 'react'
interface DocumentRowProps {
label: string
docType: string
required?: boolean
url: string
text: string
loading: boolean
error: string | null
wordCount: number
onUrlChange: (url: string) => void
onFetchText: () => void
onTextChange: (text: string) => void
onFileUpload: (file: File) => void
}
export function DocumentRow({
label,
docType,
required,
url,
text,
loading,
error,
wordCount,
onUrlChange,
onFetchText,
onTextChange,
onFileUpload,
}: DocumentRowProps) {
const [showText, setShowText] = useState(false)
const fileRef = useRef<HTMLInputElement>(null)
const textVisible = showText || text.length > 0
const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
const file = e.target.files?.[0]
if (!file) return
// Read text-based files directly
const reader = new FileReader()
reader.onload = () => {
const content = reader.result as string
onTextChange(content)
}
reader.onerror = () => {
// Let parent handle via onFileUpload for binary formats
onFileUpload(file)
}
if (file.name.endsWith('.txt') || file.type === 'text/plain') {
reader.readAsText(file)
} else {
// PDF, DOCX — pass to parent for server-side parsing
onFileUpload(file)
}
// Reset input so the same file can be re-selected
e.target.value = ''
}
return (
<div className="border border-gray-200 rounded-lg p-3 space-y-2">
{/* Header row: label + inputs */}
<div className="flex items-center gap-2">
<div className="w-52 shrink-0">
<span className="text-sm font-medium text-gray-700">
{label}
{required && <span className="text-red-500 ml-0.5">*</span>}
</span>
</div>
<input
type="url"
value={url}
onChange={e => onUrlChange(e.target.value)}
placeholder="https://example.com/datenschutz"
className="flex-1 px-3 py-2 border border-gray-300 rounded-lg text-sm focus:ring-2 focus:ring-purple-500 focus:border-transparent"
/>
{/* Fetch text button */}
<button
type="button"
onClick={onFetchText}
disabled={loading || !url.trim()}
className="px-3 py-2 border border-gray-300 rounded-lg text-sm text-gray-700 hover:bg-gray-50 disabled:opacity-40 disabled:cursor-not-allowed whitespace-nowrap transition-colors"
>
{loading ? (
<svg className="animate-spin w-4 h-4 text-purple-500" fill="none" viewBox="0 0 24 24">
<circle className="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" strokeWidth="4" />
<path className="opacity-75" fill="currentColor" d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4z" />
</svg>
) : (
'Text laden'
)}
</button>
{/* File upload button */}
<button
type="button"
onClick={() => fileRef.current?.click()}
className="px-3 py-2 border border-gray-300 rounded-lg text-sm text-gray-700 hover:bg-gray-50 transition-colors"
title="PDF, DOCX oder TXT hochladen"
>
<svg className="w-4 h-4" fill="none" stroke="currentColor" viewBox="0 0 24 24">
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2}
d="M4 16v1a3 3 0 003 3h10a3 3 0 003-3v-1m-4-8l-4-4m0 0L8 8m4-4v12" />
</svg>
</button>
<input
ref={fileRef}
type="file"
accept=".pdf,.docx,.doc,.txt"
onChange={handleFileChange}
className="hidden"
/>
{/* Toggle text area */}
<button
type="button"
onClick={() => setShowText(!showText)}
className={`px-3 py-2 border rounded-lg text-sm transition-colors ${
textVisible
? 'border-purple-300 bg-purple-50 text-purple-700'
: 'border-gray-300 text-gray-700 hover:bg-gray-50'
}`}
title={textVisible ? 'Text ausblenden' : 'Text anzeigen'}
>
<svg className={`w-4 h-4 transition-transform ${textVisible ? 'rotate-180' : ''}`}
fill="none" stroke="currentColor" viewBox="0 0 24 24">
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M19 9l-7 7-7-7" />
</svg>
</button>
{/* Word count badge */}
{wordCount > 0 && (
<span className="text-xs px-2 py-1 rounded-full bg-green-100 text-green-700 font-medium shrink-0">
{wordCount.toLocaleString('de-DE')} W.
</span>
)}
</div>
{/* Error */}
{error && (
<div className="text-xs text-red-600 px-1">{error}</div>
)}
{/* Collapsible textarea */}
{textVisible && (
<textarea
value={text}
onChange={e => onTextChange(e.target.value)}
placeholder="Dokumenttext hier einfuegen oder per URL / Upload laden..."
rows={6}
className="w-full px-3 py-2 border border-gray-300 rounded-lg text-sm font-mono resize-y focus:ring-2 focus:ring-purple-500 focus:border-transparent"
/>
)}
</div>
)
}
+5 -8
View File
@@ -2,23 +2,21 @@
import React, { useState } from 'react'
import { ScanResult } from './_components/ScanResult'
import { DocCheckTab } from './_components/DocCheckTab'
import { ComplianceCheckTab } from './_components/ComplianceCheckTab'
import { BannerCheckTab } from './_components/BannerCheckTab'
import { ImpressumCheckTab } from './_components/ImpressumCheckTab'
import { ComplianceFAQ } from './_components/ComplianceFAQ'
type AnalysisTab = 'scan' | 'doc-check' | 'banner-check' | 'impressum-check'
type AnalysisTab = 'scan' | 'compliance-check' | 'banner-check'
const TABS: { id: AnalysisTab; label: string; desc: string }[] = [
{ id: 'scan', label: 'Website-Scan', desc: 'Rechtliche Dokumente finden + Dienstleister erkennen' },
{ id: 'doc-check', label: 'Dokumenten-Pruefung', desc: 'DSI, AGB, Cookie-Richtlinie inhaltlich pruefen' },
{ id: 'compliance-check', label: 'Compliance-Check', desc: 'Alle rechtlichen Dokumente zusammen pruefen' },
{ id: 'banner-check', label: 'Banner-Check', desc: 'Cookie-Banner auf DSGVO-Konformitaet testen' },
{ id: 'impressum-check', label: 'Impressum-Check', desc: 'Impressum auf §5 TMG Pflichtangaben pruefen' },
]
export default function AgentPage() {
const [url, setUrl] = useState(() => typeof window !== 'undefined' ? localStorage.getItem('agent-scan-url') || '' : '')
const [tab, setTab] = useState<AnalysisTab>(() => (typeof window !== 'undefined' ? localStorage.getItem('agent-scan-tab') as AnalysisTab : null) || 'scan')
const [tab, setTab] = useState<AnalysisTab>(() => (typeof window !== 'undefined' ? localStorage.getItem('agent-scan-tab') as AnalysisTab : null) || 'compliance-check')
const [scanLoading, setScanLoading] = useState(false)
const [scanError, setScanError] = useState<string | null>(null)
const [scanData, setScanData] = useState<any>(() => {
@@ -186,9 +184,8 @@ export default function AgentPage() {
</div>
)}
{tab === 'doc-check' && <DocCheckTab />}
{tab === 'compliance-check' && <ComplianceCheckTab />}
{tab === 'banner-check' && <BannerCheckTab />}
{tab === 'impressum-check' && <ImpressumCheckTab />}
<ComplianceFAQ />
</div>
@@ -0,0 +1,439 @@
"""
Unified Compliance Check Routes — check all documents in one request.
POST /compliance/agent/extract-text — extract text from a URL
POST /compliance/agent/compliance-check — unified check for all documents
GET /compliance/agent/compliance-check/{check_id} — poll status
"""
import asyncio
import logging
import os
import uuid as _uuid
from dataclasses import asdict
from datetime import datetime, timezone
import httpx
from fastapi import APIRouter
from pydantic import BaseModel
from compliance.services.smtp_sender import send_email
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/compliance/agent", tags=["agent"])
CONSENT_TESTER_URL = "http://bp-compliance-consent-tester:8094"
# In-memory job store (same pattern as doc-check)
_compliance_check_jobs: dict[str, dict] = {}
# ── Models ───────────────────────────────────────────────────────────
class ExtractTextRequest(BaseModel):
url: str
class DocumentInput(BaseModel):
doc_type: str # dse, agb, impressum, cookie, widerruf, avv, loeschkonzept, etc.
url: str = ""
text: str = "" # text has priority over URL
class ComplianceCheckRequest(BaseModel):
documents: list[DocumentInput]
use_agent: bool = False
recipient: str = "dsb@breakpilot.local"
class ComplianceCheckStartResponse(BaseModel):
check_id: str
status: str = "running"
class ComplianceCheckStatusResponse(BaseModel):
check_id: str
status: str
progress: str = ""
result: dict | None = None
error: str = ""
# ── Extract text endpoint ────────────────────────────────────────────
@router.post("/extract-text")
async def extract_text(req: ExtractTextRequest):
"""Extract text from a URL via consent-tester DSI discovery."""
try:
async with httpx.AsyncClient(timeout=90.0) as client:
resp = await client.post(
f"{CONSENT_TESTER_URL}/dsi-discovery",
json={"url": req.url, "max_documents": 1},
)
if resp.status_code != 200:
return {
"text": "", "word_count": 0, "title": "",
"error": f"HTTP {resp.status_code} von Consent-Tester",
}
data = resp.json()
docs = data.get("documents", [])
if not docs:
return {
"text": "", "word_count": 0, "title": "",
"error": "Kein Text extrahierbar",
}
doc = docs[0]
text = doc.get("full_text", "") or doc.get("text_preview", "") or doc.get("text", "")
title = doc.get("title", "") or doc.get("doc_type", "")
word_count = doc.get("word_count", 0) or len(text.split())
return {
"text": text,
"word_count": word_count,
"title": title,
"error": "",
}
except Exception as e:
logger.warning("extract-text failed for %s: %s", req.url, e)
return {
"text": "", "word_count": 0, "title": "",
"error": str(e)[:200],
}
# ── Unified compliance check ────────────────────────────────────────
@router.post("/compliance-check")
async def start_compliance_check(req: ComplianceCheckRequest):
"""Start async compliance check for all documents."""
check_id = str(_uuid.uuid4())[:8]
_compliance_check_jobs[check_id] = {
"status": "running",
"progress": "Pruefung gestartet...",
"result": None,
"error": "",
}
asyncio.create_task(_run_compliance_check(check_id, req))
return ComplianceCheckStartResponse(check_id=check_id, status="running")
@router.get("/compliance-check/{check_id}")
async def get_compliance_check_status(check_id: str):
"""Poll compliance check status."""
job = _compliance_check_jobs.get(check_id)
if not job:
return {"check_id": check_id, "status": "not_found"}
return ComplianceCheckStatusResponse(
check_id=check_id,
status=job["status"],
progress=job.get("progress", ""),
result=job.get("result"),
error=job.get("error", ""),
)
async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
"""Background task: check all documents with business-profile context."""
try:
from compliance.services.business_profiler import detect_business_profile
from compliance.services.doc_checks.runner import check_document_completeness
from compliance.services.rag_document_checker import check_document_with_controls
from .agent_doc_check_routes import CheckItem, DocCheckResult
from .agent_doc_check_report import build_html_report
# Step 1: Resolve texts (fetch from URL if needed)
_update(check_id, "Texte werden geladen...")
doc_texts: dict[str, str] = {}
doc_entries: list[dict] = []
for i, doc in enumerate(req.documents):
_update(check_id, f"Dokument {i+1}/{len(req.documents)}: {doc.doc_type}...")
text = doc.text
if not text and doc.url:
text = await _fetch_text(doc.url)
if text:
doc_texts[doc.doc_type] = text
doc_entries.append({
"doc_type": doc.doc_type,
"url": doc.url,
"text": text,
"word_count": len(text.split()) if text else 0,
})
# Step 2: Detect business profile
_update(check_id, "Geschaeftsmodell wird erkannt...")
profile = await detect_business_profile(doc_texts)
profile_dict = asdict(profile)
# Step 3: Check each document
results: list[DocCheckResult] = []
total_findings = 0
use_agent_flag = req.use_agent or os.getenv(
"COMPLIANCE_USE_AGENT", "false"
).lower() == "true"
for i, entry in enumerate(doc_entries):
text = entry["text"]
doc_type = entry["doc_type"]
label = _doc_type_label(doc_type)
url = entry["url"]
_update(check_id, f"Pruefe {label} ({i+1}/{len(doc_entries)})...")
if not text or len(text) < 50:
results.append(DocCheckResult(
label=label, url=url, doc_type=doc_type,
error="Kein Text vorhanden oder zu kurz",
))
continue
result = await _check_single(
text, doc_type, label, url,
entry["word_count"], use_agent_flag,
)
# Apply profile context filter
result = _apply_profile_filter(result, profile, doc_type)
results.append(result)
total_findings += result.findings_count
# Step 4: Build report
_update(check_id, "Report wird erstellt...")
report_html = build_html_report(results, None)
# Prepend profile summary to report
profile_html = _build_profile_html(profile)
full_html = profile_html + report_html
# Step 5: Send email
doc_count = len([r for r in results if not r.error])
email_result = send_email(
recipient=req.recipient,
subject=f"[COMPLIANCE-CHECK] {doc_count} Dokumente geprueft",
body_html=full_html,
)
# Step 6: Store result
response = {
"results": [_result_to_dict(r) for r in results],
"business_profile": profile_dict,
"total_documents": len(results),
"total_findings": total_findings,
"email_status": email_result.get("status", "failed"),
"checked_at": datetime.now(timezone.utc).isoformat(),
}
_compliance_check_jobs[check_id]["status"] = "completed"
_compliance_check_jobs[check_id]["result"] = response
_compliance_check_jobs[check_id]["progress"] = "Fertig"
except Exception as e:
logger.error("Compliance check %s failed: %s", check_id, e, exc_info=True)
_compliance_check_jobs[check_id]["status"] = "failed"
_compliance_check_jobs[check_id]["error"] = str(e)[:500]
def _update(check_id: str, msg: str):
_compliance_check_jobs[check_id]["progress"] = msg
async def _fetch_text(url: str) -> str:
"""Fetch text from URL via consent-tester."""
try:
async with httpx.AsyncClient(timeout=90.0) as client:
resp = await client.post(
f"{CONSENT_TESTER_URL}/dsi-discovery",
json={"url": url, "max_documents": 1},
)
if resp.status_code != 200:
return ""
docs = resp.json().get("documents", [])
if not docs:
return ""
doc = docs[0]
return doc.get("full_text", "") or doc.get("text_preview", "") or ""
except Exception as e:
logger.warning("Text fetch failed for %s: %s", url, e)
return ""
async def _check_single(
text: str, doc_type: str, label: str, url: str,
word_count: int, use_agent: bool,
):
"""Run regex + MC checks on a single document."""
from compliance.services.doc_checks.runner import check_document_completeness
from compliance.services.rag_document_checker import check_document_with_controls
from .agent_doc_check_routes import CheckItem, DocCheckResult
# Regex checklist
findings = check_document_completeness(text, doc_type, label, url)
all_checks: list[CheckItem] = []
completeness = 0
correctness = 0
for f in findings:
if "SCORE" in f.get("code", ""):
for c in f.get("all_checks", []):
all_checks.append(CheckItem(
id=c["id"], label=c["label"], passed=c["passed"],
severity=c["severity"], matched_text=c.get("matched_text", ""),
level=c.get("level", 1), parent=c.get("parent"),
skipped=c.get("skipped", False), hint=c.get("hint", ""),
))
completeness = f.get("completeness_pct", 0)
correctness = f.get("correctness_pct", 0)
# Master Control checks
try:
mc_results = await check_document_with_controls(
text, doc_type, label, max_controls=0, use_agent=use_agent,
)
if mc_results:
for mc in mc_results:
all_checks.append(CheckItem(**mc))
l2 = [c for c in all_checks if c.level == 2 and not c.skipped]
l2_passed = sum(1 for c in l2 if c.passed)
correctness = round(l2_passed / len(l2) * 100) if l2 else 0
except Exception as e:
logger.warning("MC check skipped for %s: %s", label, e)
# LLM verification of regex fails
failed = [c for c in all_checks if not c.passed and not c.skipped and c.hint]
if failed:
try:
from compliance.services.doc_checks.llm_verify import verify_failed_checks
overturns = await verify_failed_checks(
text,
[{"id": c.id, "label": c.label, "hint": c.hint} for c in failed],
label,
)
for c in all_checks:
if c.id in overturns and overturns[c.id]["overturned"]:
c.passed = True
c.matched_text = f"[LLM] {overturns[c.id]['evidence']}"
l2_active = [c for c in all_checks if c.level == 2 and not c.skipped]
l2_passed = sum(1 for c in l2_active if c.passed)
if l2_active:
correctness = round(l2_passed / len(l2_active) * 100)
except Exception as e:
logger.warning("LLM verification skipped: %s", e)
non_score = [f for f in findings if "SCORE" not in f.get("code", "")]
return DocCheckResult(
label=label, url=url, doc_type=doc_type,
word_count=word_count or len(text.split()),
completeness_pct=completeness, correctness_pct=correctness,
checks=all_checks, findings_count=len(non_score),
)
def _apply_profile_filter(result, profile, doc_type: str):
"""Adjust INFO-level checks based on business profile context.
For example: ODR check only relevant for B2C online shops.
"""
from .agent_doc_check_routes import CheckItem
for check in result.checks:
cid = check.id.lower()
# ODR/OS-Link only relevant for B2C online shops
if "odr" in cid or "os-link" in cid or "streitbeilegung" in check.label.lower():
if not profile.needs_odr:
check.skipped = True
check.hint = "Nicht relevant (kein B2C Online-Shop)"
# Widerruf only relevant for B2C
if doc_type == "widerruf" and profile.business_type not in ("b2c", "unknown"):
if check.severity == "INFO":
check.skipped = True
# Regulated profession: check for Kammer info
if "kammer" in cid or "berufsordnung" in check.label.lower():
if not profile.is_regulated_profession:
check.skipped = True
check.hint = "Nicht relevant (kein regulierter Beruf)"
return result
# ── Helpers ──────────────────────────────────────────────────────────
_DOC_TYPE_LABELS = {
"dse": "Datenschutzerklaerung",
"datenschutz": "Datenschutzerklaerung",
"privacy": "Datenschutzerklaerung",
"impressum": "Impressum",
"agb": "AGB",
"widerruf": "Widerrufsbelehrung",
"cookie": "Cookie-Richtlinie",
"avv": "Auftragsverarbeitung",
"loeschkonzept": "Loeschkonzept",
"dsfa": "Datenschutz-Folgenabschaetzung",
"social_media": "Social Media Datenschutz",
}
def _doc_type_label(doc_type: str) -> str:
return _DOC_TYPE_LABELS.get(doc_type, doc_type.upper())
def _result_to_dict(r) -> dict:
"""Convert DocCheckResult to JSON-serializable dict."""
return {
"label": r.label, "url": r.url, "doc_type": r.doc_type,
"word_count": r.word_count, "completeness_pct": r.completeness_pct,
"correctness_pct": r.correctness_pct,
"checks": [
{
"id": c.id, "label": c.label, "passed": c.passed,
"severity": c.severity, "matched_text": c.matched_text,
"level": c.level, "parent": c.parent,
"skipped": c.skipped, "hint": c.hint,
}
for c in r.checks
],
"findings_count": r.findings_count, "error": r.error,
}
def _build_profile_html(profile) -> str:
"""Build a small HTML block summarizing the detected business profile."""
service_tags = ", ".join(profile.detected_services[:10]) or "keine erkannt"
flags = []
if profile.has_online_shop:
flags.append("Online-Shop")
if profile.has_editorial_content:
flags.append("Redaktionelle Inhalte")
if profile.is_regulated_profession:
flags.append(f"Regulierter Beruf ({profile.regulated_profession_type})")
if profile.needs_odr:
flags.append("ODR-pflichtig")
flags_str = ", ".join(flags) or "keine"
return (
'<div style="font-family:-apple-system,BlinkMacSystemFont,sans-serif;'
'max-width:700px;margin:0 auto 16px;padding:12px 16px;'
'background:#f0f9ff;border:1px solid #bae6fd;border-radius:8px">'
'<h3 style="margin:0 0 8px;font-size:14px;color:#0369a1">'
'Erkanntes Geschaeftsmodell</h3>'
'<table style="font-size:13px;color:#374151">'
f'<tr><td style="padding:2px 12px 2px 0;color:#6b7280">Typ:</td>'
f'<td><strong>{profile.business_type.upper()}</strong>'
f' ({profile.industry})</td></tr>'
f'<tr><td style="padding:2px 12px 2px 0;color:#6b7280">Merkmale:</td>'
f'<td>{flags_str}</td></tr>'
f'<tr><td style="padding:2px 12px 2px 0;color:#6b7280">Dienste:</td>'
f'<td>{service_tags}</td></tr>'
f'<tr><td style="padding:2px 12px 2px 0;color:#6b7280">Konfidenz:</td>'
f'<td>{int(profile.confidence * 100)}%</td></tr>'
'</table></div>'
)
@@ -0,0 +1,223 @@
"""
Business Profiler — detect business model from document texts.
Pure keyword-based detection (deterministic, no LLM). Analyzes
DSE, Impressum, AGB, Widerruf etc. together to build a profile
that drives context-aware compliance checks.
Example:
profile = await detect_business_profile({"dse": "...", "impressum": "..."})
profile.business_type # "b2c"
profile.has_online_shop # True
"""
from __future__ import annotations
import re
from dataclasses import dataclass, field
@dataclass
class BusinessProfile:
business_type: str = "unknown" # b2b, b2c, b2g, nonprofit, unknown
industry: str = "unknown" # it_services, retail, healthcare, legal, craft, public, unknown
has_online_shop: bool = False
has_editorial_content: bool = False
is_regulated_profession: bool = False
regulated_profession_type: str = "" # arzt, anwalt, steuerberater, architekt, ""
needs_odr: bool = False # Online-Streitbeilegung
detected_services: list[str] = field(default_factory=list)
confidence: float = 0.0
# ── Keyword lists ────────────────────────────────────────────────────
_B2C_KEYWORDS = [
"verbraucher", "warenkorb", "bestellung", "lieferung", "widerruf",
"shop", "kaufpreis", "rueckgabe", "rückgabe", "endkunde", "kaeufer",
"käufer", "privatkunde", "zahlungspflichtig bestellen",
]
_B2B_KEYWORDS = [
"unternehmen", "geschaeftskunden", "geschäftskunden", "gewerblich",
"auftrag", "auftraggeber", "auftragnehmer", "geschaeftspartner",
"geschäftspartner", "firmenkunde", "b2b",
]
_B2G_KEYWORDS = [
"behoerde", "behörde", "koerperschaft", "körperschaft", "oeffentlich",
"öffentlich", "gemeinde", "amt", "stadtverwaltung", "landesbehoerde",
"landesbehörde", "kommunal",
]
_NONPROFIT_KEYWORDS = [
"gemeinnuetzig", "gemeinnützig", "verein", "stiftung", "e.v.",
"spende", "ehrenamtlich", "satzung",
]
_REGULATED_PROFESSIONS = {
"rechtsanwalt": "anwalt",
"anwalt": "anwalt",
"anwaeltin": "anwalt",
"anwältin": "anwalt",
"kanzlei": "anwalt",
"rechtsanwaltskammer": "anwalt",
"arzt": "arzt",
"ärztin": "arzt",
"aerztin": "arzt",
"praxis": "arzt",
"aerztekammer": "arzt",
"ärztekammer": "arzt",
"steuerberater": "steuerberater",
"steuerberaterin": "steuerberater",
"steuerberaterkammer": "steuerberater",
"architekt": "architekt",
"architektin": "architekt",
"architektenkammer": "architekt",
"notar": "notar",
"notariat": "notar",
"apotheke": "apotheker",
"apotheker": "apotheker",
}
_ONLINE_SHOP_KEYWORDS = [
"warenkorb", "checkout", "bestellung", "lieferung", "versand",
"paypal", "kreditkarte", "klarna", "sofortueberweisung",
"sofortüberweisung", "zahlungsarten", "versandkosten",
"lieferzeit", "retour", "paketdienst",
]
_EDITORIAL_KEYWORDS = [
"blog", "ratgeber", "news", "redaktion", "artikel", "magazin",
"beitrag", "kommentar", "podcast", "newsletter", "autor",
]
_INDUSTRY_KEYWORDS = {
"it_services": ["software", "saas", "cloud", "hosting", "server", "api", "app"],
"retail": ["shop", "warenkorb", "versand", "lieferung", "einzelhandel"],
"healthcare": ["arzt", "praxis", "patient", "gesundheit", "therapie", "klinik"],
"legal": ["kanzlei", "rechtsanwalt", "mandant", "anwalt"],
"craft": ["handwerk", "meister", "werkstatt", "montage", "gewerk"],
"public": ["behoerde", "behörde", "kommune", "verwaltung", "buerger", "bürger"],
"finance": ["bank", "versicherung", "finanz", "kredit", "anlage"],
"education": ["schule", "bildung", "unterricht", "lehrplan", "schueler", "schüler"],
}
_TRACKING_SERVICES = {
"google analytics": "Google Analytics",
"google tag manager": "Google Tag Manager",
"matomo": "Matomo",
"facebook pixel": "Facebook Pixel",
"meta pixel": "Meta Pixel",
"hotjar": "Hotjar",
"hubspot": "HubSpot",
"mailchimp": "Mailchimp",
"linkedin insight": "LinkedIn Insight",
"google ads": "Google Ads",
"google adsense": "Google AdSense",
"google maps": "Google Maps",
"youtube": "YouTube",
"vimeo": "Vimeo",
"cloudflare": "Cloudflare",
"sentry": "Sentry",
"intercom": "Intercom",
"zendesk": "Zendesk",
"stripe": "Stripe",
"paypal": "PayPal",
}
# ── Detection logic ──────────────────────────────────────────────────
def _count_hits(text: str, keywords: list[str]) -> int:
return sum(1 for kw in keywords if kw in text)
async def detect_business_profile(documents: dict[str, str]) -> BusinessProfile:
"""Analyze all document texts together to detect business model.
Args:
documents: dict mapping doc_type -> text (e.g. {"dse": "...", "impressum": "..."})
"""
profile = BusinessProfile()
if not documents:
return profile
# Merge all texts for keyword search
full_text = "\n".join(documents.values()).lower()
full_text = full_text.replace("\xad", "") # strip soft hyphens
# ── Tracking services ────────────────────────────────────────
for pattern, label in _TRACKING_SERVICES.items():
if pattern in full_text:
profile.detected_services.append(label)
# ── Online shop ──────────────────────────────────────────────
shop_hits = _count_hits(full_text, _ONLINE_SHOP_KEYWORDS)
profile.has_online_shop = shop_hits >= 3
# ── Editorial content ────────────────────────────────────────
editorial_hits = _count_hits(full_text, _EDITORIAL_KEYWORDS)
profile.has_editorial_content = editorial_hits >= 2
# ── Regulated profession ─────────────────────────────────────
for keyword, prof_type in _REGULATED_PROFESSIONS.items():
if keyword in full_text:
profile.is_regulated_profession = True
profile.regulated_profession_type = prof_type
break
# ── Business type ────────────────────────────────────────────
b2c_score = _count_hits(full_text, _B2C_KEYWORDS)
b2b_score = _count_hits(full_text, _B2B_KEYWORDS)
b2g_score = _count_hits(full_text, _B2G_KEYWORDS)
nonprofit_score = _count_hits(full_text, _NONPROFIT_KEYWORDS)
# Missing documents as signal
has_agb = "agb" in documents
has_widerruf = "widerruf" in documents
if not has_agb:
b2c_score -= 1 # No AGB → less likely B2C
if not has_widerruf:
b2c_score -= 1 # No Widerruf → less likely B2C shop
if profile.has_online_shop:
b2c_score += 3 # Strong B2C signal
scores = {
"b2c": b2c_score,
"b2b": b2b_score,
"b2g": b2g_score,
"nonprofit": nonprofit_score,
}
best = max(scores, key=scores.get) # type: ignore[arg-type]
best_val = scores[best]
if best_val >= 2:
profile.business_type = best
total = sum(max(0, v) for v in scores.values())
profile.confidence = round(best_val / total, 2) if total > 0 else 0.5
else:
profile.business_type = "unknown"
profile.confidence = 0.2
# ── ODR (Online-Streitbeilegung) ─────────────────────────────
# Required for B2C with online shop (EU Regulation 524/2013)
profile.needs_odr = (
profile.business_type == "b2c" and profile.has_online_shop
)
# ── Industry ─────────────────────────────────────────────────
industry_scores: dict[str, int] = {}
for industry, keywords in _INDUSTRY_KEYWORDS.items():
hits = _count_hits(full_text, keywords)
if hits >= 2:
industry_scores[industry] = hits
if industry_scores:
profile.industry = max(industry_scores, key=industry_scores.get) # type: ignore[arg-type]
elif profile.is_regulated_profession:
prof_map = {"anwalt": "legal", "arzt": "healthcare",
"steuerberater": "finance", "architekt": "craft"}
profile.industry = prof_map.get(profile.regulated_profession_type, "unknown")
return profile
+4
View File
@@ -48,6 +48,8 @@ from compliance.api.agent_scan_routes import router as agent_scan_router
from compliance.api.agent_history_routes import router as agent_history_router
from compliance.api.agent_recurring_routes import router as agent_recurring_router
from compliance.api.agent_compare_routes import router as agent_compare_router
from compliance.api.agent_doc_check_routes import router as agent_doc_check_router
from compliance.api.agent_compliance_check_routes import router as agent_compliance_check_router
# Middleware
from middleware import (
@@ -150,6 +152,8 @@ app.include_router(agent_scan_router, prefix="/api")
app.include_router(agent_history_router, prefix="/api")
app.include_router(agent_recurring_router, prefix="/api")
app.include_router(agent_compare_router, prefix="/api")
app.include_router(agent_doc_check_router, prefix="/api")
app.include_router(agent_compliance_check_router, prefix="/api")
if __name__ == "__main__":