diff --git a/admin-compliance/app/sdk/template-rule-editor/_components/ConditionBuilder.tsx b/admin-compliance/app/sdk/template-rule-editor/_components/ConditionBuilder.tsx
new file mode 100644
index 00000000..42195fb5
--- /dev/null
+++ b/admin-compliance/app/sdk/template-rule-editor/_components/ConditionBuilder.tsx
@@ -0,0 +1,232 @@
+'use client'
+
+/**
+ * Strukturierter Editor fuer JSONB-Conditions:
+ * { kind: 'all'|'any', clauses: [{field, op, value}] }
+ *
+ * Wird im RuleEditor verwendet. Reine Praesentations-Komponente — Parent
+ * verwaltet State.
+ */
+
+import type {
+ ClauseOperator, RuleClause, RuleCondition,
+} from '../_types'
+import { OPERATOR_LABELS, PROFILE_FIELDS } from '../_types'
+
+interface Props {
+ value: RuleCondition
+ onChange: (next: RuleCondition) => void
+ readOnly?: boolean
+}
+
+export default function ConditionBuilder({ value, onChange, readOnly }: Props) { // Controlled editor for one RuleCondition: holds no state, every change goes out through onChange.
+ const setKind = (kind: 'all' | 'any') => onChange({ ...value, kind }) // Swaps the combinator, clauses are kept as they are.
+ const setClause = (idx: number, clause: RuleClause) => {
+ const next = [...value.clauses] // Copy first so the parent's array is never mutated.
+ next[idx] = clause
+ onChange({ ...value, clauses: next })
+ }
+ const addClause = () => // Appends a clause preset to the first profile field, operator 'eq' and an empty value.
+ onChange({
+ ...value,
+ clauses: [
+ ...value.clauses,
+ { field: PROFILE_FIELDS[0].key, op: 'eq', value: '' },
+ ],
+ })
+ const removeClause = (idx: number) => // Drops the clause at position idx.
+ onChange({ ...value, clauses: value.clauses.filter((_, i) => i !== idx) })
+
+ return (
+
+
+ Bedingung:
+
+
+
+ {value.clauses.length === 0 && (
+
+ Keine Klauseln — Regel gilt für jedes Profil.
+
+ )}
+
+
+ {value.clauses.map((clause, idx) => (
+ -
+ setClause(idx, c)}
+ readOnly={!!readOnly}
+ />
+ {!readOnly && (
+
+ )}
+
+ ))}
+
+
+ {!readOnly && (
+
+ )}
+
+ )
+}
+
+function ClauseRow({ // One condition row: field, operator and, when the operator takes an operand, a value input.
+ clause, onChange, readOnly,
+}: {
+ clause: RuleClause
+ onChange: (c: RuleClause) => void
+ readOnly: boolean
+}) {
+ const field = PROFILE_FIELDS.find((f) => f.key === clause.field) || PROFILE_FIELDS[0] // NOTE(review): an unknown key is rendered as the first profile field while clause.field keeps the unknown key — confirm this is intended.
+ const operators: ClauseOperator[] = // Operators offered depend on the declared type of the field.
+ field.type === 'enum'
+ ? ['eq', 'neq', 'in', 'not_in', 'exists', 'truthy', 'falsy']
+ : field.type === 'boolean'
+ ? ['truthy', 'falsy', 'eq', 'neq']
+ : field.type === 'number'
+ ? ['eq', 'neq', 'gt', 'gte', 'lt', 'lte']
+ : ['eq', 'neq', 'in', 'not_in', 'exists'] // Remaining case: string fields.
+
+ const requiresValue = !['exists', 'truthy', 'falsy'].includes(clause.op) // These three operators take no operand, so the value input is hidden for them.
+ const multiValue = clause.op === 'in' || clause.op === 'not_in' // List operators switch the value input to multi-select mode.
+
+ return (
+
+
+
+
+
+
+ {requiresValue && (
+ onChange({ ...clause, value: v })}
+ readOnly={readOnly}
+ />
+ )}
+
+
+ )
+}
+
+function ValueInput({ // Picks the value editor that matches the field type: enum (single or multi), number, boolean, otherwise free text.
+ field, multi, value, onChange, readOnly,
+}: {
+ field: typeof PROFILE_FIELDS[number]
+ multi: boolean
+ value: unknown
+ onChange: (v: unknown) => void
+ readOnly: boolean
+}) {
+ if (field.type === 'enum' && field.options) { // An enum field without an options list falls through to the text input at the bottom.
+ if (multi) {
+ const selected = Array.isArray(value) ? (value as string[]) : [] // Any non-array value counts as "nothing selected".
+ return (
+
+ )
+ }
+ return (
+
+ )
+ }
+
+ if (field.type === 'number') { // NOTE(review): the handler below stores Number(input); Number('') is 0 and Number('abc') is NaN — confirm clearing the input should store 0.
+ return (
+ onChange(Number(e.target.value))}
+ />
+ )
+ }
+
+ if (field.type === 'boolean') {
+ return (
+
+ )
+ }
+
+ return (
+ onChange(e.target.value)}
+ />
+ )
+}
diff --git a/admin-compliance/app/sdk/template-rule-editor/_components/RuleEditor.tsx b/admin-compliance/app/sdk/template-rule-editor/_components/RuleEditor.tsx
new file mode 100644
index 00000000..9cd23796
--- /dev/null
+++ b/admin-compliance/app/sdk/template-rule-editor/_components/RuleEditor.tsx
@@ -0,0 +1,414 @@
+'use client'
+
+/**
+ * Rechte Spalte: Detail-Editor fuer die ausgewaehlte Regel.
+ *
+ * - Zeigt Live-Version + offenen Draft (falls vorhanden)
+ * - Erlaubt Draft-Edit (classification, conditions, source_citation, rationale)
+ * - Buttons: "Neuen Draft starten" (kopiert von Live), "Einreichen" (mit Pflicht
+ * change_summary-Modal), "Intern freigeben" (DSB), "Publish" (= Mandanten-Freigabe)
+ * - Versionshistorie + Approval-Trail unten als Akkordeon
+ */
+
+import { useEffect, useMemo, useState } from 'react'
+import type {
+ ApprovalHistoryEntry, Classification, Rule, RuleCondition, RuleVersion,
+} from '../_types'
+import { CLASSIFICATION_LABELS, STATUS_LABELS } from '../_types'
+import ConditionBuilder from './ConditionBuilder'
+
+/**
+ * Props of the rule detail editor. All persistence is delegated to the
+ * parent through the async callbacks; each resolves once the parent has
+ * finished its own request/reload cycle and carries no result value.
+ */
+interface Props {
+  /** Rule currently shown in the editor. */
+  rule: Rule
+  /** All versions of `rule`; the editor picks the live one and the open draft from this list. */
+  versions: RuleVersion[]
+  /** Approval trail entries shown in the history section. */
+  history: ApprovalHistoryEntry[]
+  /** Creates a new draft version from the given content. */
+  onCreateDraft: (payload: {
+    classification: Classification
+    conditions: RuleCondition
+    source_citation: string
+    rationale?: string | null
+  }) => Promise<void>
+  /** Saves changed fields of an existing draft version. */
+  onUpdateDraft: (versionId: string, patch: {
+    classification?: Classification
+    conditions?: RuleCondition
+    source_citation?: string
+    rationale?: string | null
+  }) => Promise<void>
+  /** Submits a draft for review together with a mandatory change summary. */
+  onSubmitForReview: (versionId: string, changeSummary: string) => Promise<void>
+  /** Approves a version that is in review. */
+  onApprove: (versionId: string) => Promise<void>
+  /** Publishes a version. */
+  onPublish: (versionId: string) => Promise<void>
+  /** Rejects a version with the given reason. */
+  onReject: (versionId: string, reason: string) => Promise<void>
+}
+
+export default function RuleEditor({ // Detail editor for one rule: edits the open draft (or shows the live version read-only) and triggers the workflow callbacks.
+ rule, versions, history,
+ onCreateDraft, onUpdateDraft,
+ onSubmitForReview, onApprove, onPublish, onReject,
+}: Props) {
+ const liveVersion = useMemo(
+ () => versions.find((v) => v.is_live) || null,
+ [versions],
+ )
+ const draftVersion = useMemo( // "Draft" here means any open version: status 'draft' or 'review'.
+ () => versions.find((v) => ['draft', 'review'].includes(v.status)) || null,
+ [versions],
+ )
+
+ // Edit state
+ const [classification, setClassification] = useState('required') // NOTE(review): no type argument, so this infers string rather than Classification — confirm against the original file.
+ const [conditions, setConditions] = useState({ kind: 'all', clauses: [] }) // NOTE(review): no type argument, so clauses infers as an empty-array type — confirm RuleCondition was intended.
+ const [sourceCitation, setSourceCitation] = useState('')
+ const [rationale, setRationale] = useState('')
+
+ // Modal state
+ const [showSubmit, setShowSubmit] = useState(false)
+ const [changeSummary, setChangeSummary] = useState('')
+ const [showHistory, setShowHistory] = useState(false)
+ const [rejectReason, setRejectReason] = useState('')
+ const [showReject, setShowReject] = useState(false)
+
+ // Sync edit state with the selected version (draft takes precedence)
+ const sourceVersion = draftVersion || liveVersion
+ useEffect(() => {
+ if (sourceVersion) {
+ setClassification(sourceVersion.classification)
+ setConditions(sourceVersion.conditions)
+ setSourceCitation(sourceVersion.source_citation)
+ setRationale(sourceVersion.rationale || '')
+ }
+ }, [sourceVersion?.id]) // Keyed on the id only: local edits are replaced when a different version is shown, not on every re-fetch of the same one.
+
+ const isDraftMode = !!draftVersion && draftVersion.status === 'draft'
+ const isReviewMode = !!draftVersion && draftVersion.status === 'review'
+ const readOnly = !isDraftMode // Only a version in status 'draft' is editable.
+
+ const handleCreateDraft = () => { // Seeds the new draft from the live version; without one it starts as 'recommended' with no clauses.
+ onCreateDraft({
+ classification: liveVersion?.classification || 'recommended',
+ conditions: liveVersion?.conditions || { kind: 'all', clauses: [] },
+ source_citation: liveVersion?.source_citation || '',
+ rationale: liveVersion?.rationale,
+ })
+ }
+
+ const handleSaveDraft = () => { // Sends all four editable fields, changed or not.
+ if (!draftVersion) return
+ onUpdateDraft(draftVersion.id, {
+ classification, conditions, source_citation: sourceCitation, rationale,
+ })
+ }
+
+ const handleSubmit = () => { // NOTE(review): the dialog is closed and the summary cleared without awaiting onSubmitForReview — confirm a failed submit should discard the text.
+ if (!draftVersion || !changeSummary.trim()) return
+ onSubmitForReview(draftVersion.id, changeSummary.trim())
+ setShowSubmit(false)
+ setChangeSummary('')
+ }
+
+ return (
+
+
+
+
+ {!draftVersion && (
+
+
+ Kein offener Draft. Starte einen neuen Draft, um die Regel zu ändern.
+
+
+
+ )}
+
+ {/* Classification */}
+
+
+
+
+
+ {/* Condition */}
+
+
+ {/* Source citation (required) */}
+
+
+ {/* Rationale */}
+
+
+
+
+ {/* Version history */}
+
+
+ {showHistory && (
+
+ )}
+
+
+
+ {/* Footer actions */}
+
+
+ {showSubmit && (
+
setShowSubmit(false)}
+ onSubmit={handleSubmit}
+ />
+ )}
+
+ {showReject && (
+ { setShowReject(false); setRejectReason('') }}
+ onSubmit={() => {
+ if (!draftVersion || !rejectReason.trim()) return
+ onReject(draftVersion.id, rejectReason.trim())
+ setShowReject(false); setRejectReason('')
+ }}
+ />
+ )}
+
+ )
+}
+
+function HistoryList({ versions, history }: { versions: RuleVersion[]; history: ApprovalHistoryEntry[] }) { // Read-only listing of all versions followed by the approval trail; timestamps are formatted with the de-DE locale.
+ return (
+
+
+
Versionen:
+
+ {versions.map((v) => (
+ -
+
+ v{v.version_number}
+ {STATUS_LABELS[v.status]}
+ {v.is_live && ● Live}
+
+ {new Date(v.created_at).toLocaleString('de-DE')}
+
+
+ {v.change_summary && (
+ Änderung: {v.change_summary}
+ )}
+ {v.source_citation && (
+ Quelle: {v.source_citation}
+ )}
+
+ ))}
+
+
+
+
Approval-Trail:
+
+ {history.map((h) => (
+ -
+ {new Date(h.created_at).toLocaleString('de-DE')} · {h.action}
+ {h.approver && ` · ${h.approver}`}
+ {h.comment && ` — ${h.comment}`}
+
+ ))}
+
+
+
+ )
+}
+
+function SubmitDialog({ // Controlled modal titled "Zur internen Prüfung einreichen"; the text and both actions are owned by the caller.
+ value, onChange, onCancel, onSubmit,
+}: {
+ value: string
+ onChange: (s: string) => void
+ onCancel: () => void
+ onSubmit: () => void
+}) {
+ return (
+
+
e.stopPropagation()}>
+
+ Zur internen Prüfung einreichen
+
+
+
+
+
+
+
+ )
+}
+
+function RejectDialog({ // Controlled modal with the same props as SubmitDialog; presumably collects the rejection reason — the caller owns the text and both actions.
+ value, onChange, onCancel, onSubmit,
+}: {
+ value: string
+ onChange: (s: string) => void
+ onCancel: () => void
+ onSubmit: () => void
+}) {
+ return (
+
+
e.stopPropagation()}>
+
+
+
+
+
+
+
+ )
+}
diff --git a/admin-compliance/app/sdk/template-rule-editor/_components/RuleList.tsx b/admin-compliance/app/sdk/template-rule-editor/_components/RuleList.tsx
new file mode 100644
index 00000000..18d53de7
--- /dev/null
+++ b/admin-compliance/app/sdk/template-rule-editor/_components/RuleList.tsx
@@ -0,0 +1,111 @@
+'use client'
+
+/**
+ * Left column: list of the global recommendation rules.
+ *
+ * Free-text filter over title, rule_key and document_type. Shows a
+ * classification chip and a live indicator per rule.
+ */
+
+import { useMemo, useState } from 'react'
+import type { Rule, RuleVersion } from '../_types'
+import { CLASSIFICATION_LABELS, STATUS_LABELS } from '../_types'
+
+/** Props of the rule list; selection state is owned by the parent. */
+interface Props {
+  /** All rules to list (unfiltered). */
+  rules: Rule[]
+  /** Live version per rule id; `undefined` when the rule has no live version or loading it failed. */
+  versionsByRule: Record<string, RuleVersion | undefined>
+  /** Id of the highlighted rule, or `null` when nothing is selected. */
+  selectedRuleId: string | null
+  /** Called with the rule id when the user picks a rule. */
+  onSelectRule: (ruleId: string) => void
+}
+
+export default function RuleList({ // Searchable rule list; only the filter text is local state.
+ rules, versionsByRule, selectedRuleId, onSelectRule,
+}: Props) {
+ const [filter, setFilter] = useState('')
+ const filtered = useMemo(() => {
+ if (!filter.trim()) return rules // A blank filter shows every rule.
+ const q = filter.toLowerCase() // NOTE(review): lower-cased but not trimmed, so surrounding spaces take part in the match — confirm.
+ return rules.filter(
+ (r) =>
+ r.title.toLowerCase().includes(q) ||
+ r.rule_key.toLowerCase().includes(q) ||
+ r.document_type.toLowerCase().includes(q),
+ )
+ }, [rules, filter])
+
+ return (
+
+
+
setFilter(e.target.value)}
+ className="w-full text-sm px-2 py-1.5 border border-gray-300 rounded"
+ />
+
+ {filtered.length} von {rules.length} Regeln
+
+
+
+
+ {filtered.map((rule) => {
+ const live = versionsByRule[rule.id] // Live version of this rule, if there is one.
+ const isSelected = rule.id === selectedRuleId
+ return (
+ -
+
+
+ )
+ })}
+ {filtered.length === 0 && (
+ -
+ Keine Regeln gefunden.
+
+ )}
+
+
+ )
+}
+
+function ClassificationChip({ classification }: { classification: 'required' | 'recommended' | 'optional' }) { // Small badge showing the display label of a classification.
+ const colorMap = { // CSS utility classes per classification.
+ required: 'bg-rose-100 text-rose-800 border-rose-300',
+ recommended: 'bg-amber-100 text-amber-800 border-amber-300',
+ optional: 'bg-slate-100 text-slate-700 border-slate-300',
+ } as const
+ return (
+
+ {CLASSIFICATION_LABELS[classification]}
+
+ )
+}
diff --git a/admin-compliance/app/sdk/template-rule-editor/_hooks/useRuleEditorActions.ts b/admin-compliance/app/sdk/template-rule-editor/_hooks/useRuleEditorActions.ts
new file mode 100644
index 00000000..65e44fd9
--- /dev/null
+++ b/admin-compliance/app/sdk/template-rule-editor/_hooks/useRuleEditorActions.ts
@@ -0,0 +1,183 @@
+/**
+ * Hook fuer Template-Rule-Editor: laedt Regeln/Versions/History und exponiert
+ * Lifecycle-Actions (submit/approve/publish/reject) + Tenant-Override-CRUD.
+ *
+ * Alle API-Calls gehen ueber /api/sdk/v1/compliance/* (Next.js-Proxy zum
+ * backend-compliance).
+ */
+
+import { useCallback, useMemo } from 'react'
+import type {
+  ApprovalHistoryEntry,
+  Classification,
+  Rule,
+  RuleCondition,
+  RuleVersion,
+  TenantRuleOverride,
+} from '../_types'
+
+const API_BASE = '/api/sdk/v1/compliance'
+
+/**
+ * Minimal JSON client for the compliance API.
+ *
+ * Adds `Content-Type: application/json` unless the caller already set one,
+ * rejects on every non-OK status and otherwise parses the body as JSON.
+ *
+ * @typeParam T - Shape the caller expects in the response body (not validated at runtime).
+ * @param url - Request URL.
+ * @param init - Optional `fetch` options; caller-supplied headers win over the default.
+ * @returns The parsed JSON body, or `undefined` for `204 No Content`.
+ * @throws Error with message `<status>: <response text or status text>` for non-OK responses.
+ */
+async function req<T>(url: string, init?: RequestInit): Promise<T> {
+  // `Headers` understands every HeadersInit shape (plain object, tuple list,
+  // Headers instance); an object spread silently drops the latter two.
+  const headers = new Headers(init?.headers)
+  if (!headers.has('Content-Type')) headers.set('Content-Type', 'application/json')
+
+  const res = await fetch(url, { ...init, headers })
+  if (!res.ok) {
+    // Fall back to the status text when the body is empty or unreadable,
+    // so the message never ends in a bare "404: ".
+    const body = await res.text().catch(() => '')
+    throw new Error(`${res.status}: ${body || res.statusText}`)
+  }
+  if (res.status === 204) return undefined as T
+  return res.json() as Promise<T>
+}
+
+/**
+ * Hook exposing the template-rule, rule-version and tenant-override
+ * endpoints below `API_BASE` as callbacks.
+ *
+ * Every callback has an empty dependency list and the returned object is
+ * memoized, so both keep their identity across renders and are safe to use
+ * in dependency arrays.
+ *
+ * NOTE(review): the response types of `getRule`, `getVersion`, the mutating
+ * version endpoints and the override endpoints are assumed to be the
+ * affected entity — confirm against the backend schema. Only the results of
+ * `listRules`, `listVersions` and `getApprovalHistory` are consumed by the
+ * page in this change.
+ *
+ * @returns Object with one async function per endpoint; each rejects with
+ *   the `Error` thrown by `req` on a non-OK response.
+ */
+export function useRuleEditorActions() {
+  const listRules = useCallback(
+    (documentType?: string) => {
+      const q = documentType ? `?document_type=${encodeURIComponent(documentType)}` : ''
+      return req<Rule[]>(`${API_BASE}/template-rules${q}`)
+    },
+    [],
+  )
+
+  // Ids are encoded before they are placed in a path so that an unexpected
+  // character cannot change which endpoint is addressed.
+  const getRule = useCallback(
+    (ruleId: string) =>
+      req<Rule>(`${API_BASE}/template-rules/${encodeURIComponent(ruleId)}`),
+    [],
+  )
+
+  const listVersions = useCallback(
+    (ruleId: string) =>
+      req<RuleVersion[]>(`${API_BASE}/template-rules/${encodeURIComponent(ruleId)}/versions`),
+    [],
+  )
+
+  const getVersion = useCallback(
+    (versionId: string) =>
+      req<RuleVersion>(`${API_BASE}/template-rule-versions/${encodeURIComponent(versionId)}`),
+    [],
+  )
+
+  const createDraftVersion = useCallback(
+    (
+      ruleId: string,
+      payload: {
+        classification: Classification
+        conditions: RuleCondition
+        source_citation: string
+        rationale?: string | null
+        created_by?: string | null
+      },
+    ) =>
+      req<RuleVersion>(`${API_BASE}/template-rules/${encodeURIComponent(ruleId)}/versions`, {
+        method: 'POST',
+        // The rule id is sent in the body as well as in the path.
+        body: JSON.stringify({
+          rule_id: ruleId,
+          ...payload,
+        }),
+      }),
+    [],
+  )
+
+  const updateDraftVersion = useCallback(
+    (
+      versionId: string,
+      patch: {
+        classification?: Classification
+        conditions?: RuleCondition
+        source_citation?: string
+        rationale?: string | null
+        change_summary?: string | null
+      },
+    ) =>
+      req<RuleVersion>(`${API_BASE}/template-rule-versions/${encodeURIComponent(versionId)}`, {
+        method: 'PATCH',
+        body: JSON.stringify(patch),
+      }),
+    [],
+  )
+
+  const submitForReview = useCallback(
+    (
+      versionId: string,
+      payload: { change_summary: string; submitter?: string; comment?: string },
+    ) =>
+      req<RuleVersion>(
+        `${API_BASE}/template-rule-versions/${encodeURIComponent(versionId)}/submit-review`,
+        {
+          method: 'POST',
+          body: JSON.stringify(payload),
+        },
+      ),
+    [],
+  )
+
+  const approveVersion = useCallback(
+    (versionId: string, payload: { approver?: string; comment?: string } = {}) =>
+      req<RuleVersion>(
+        `${API_BASE}/template-rule-versions/${encodeURIComponent(versionId)}/approve`,
+        {
+          method: 'POST',
+          body: JSON.stringify(payload),
+        },
+      ),
+    [],
+  )
+
+  const publishVersion = useCallback(
+    (versionId: string, payload: { approver?: string; comment?: string } = {}) =>
+      req<RuleVersion>(
+        `${API_BASE}/template-rule-versions/${encodeURIComponent(versionId)}/publish`,
+        {
+          method: 'POST',
+          body: JSON.stringify(payload),
+        },
+      ),
+    [],
+  )
+
+  const rejectVersion = useCallback(
+    (
+      versionId: string,
+      payload: { rejection_reason: string; rejector?: string; comment?: string },
+    ) =>
+      req<RuleVersion>(
+        `${API_BASE}/template-rule-versions/${encodeURIComponent(versionId)}/reject`,
+        {
+          method: 'POST',
+          body: JSON.stringify(payload),
+        },
+      ),
+    [],
+  )
+
+  const getApprovalHistory = useCallback(
+    (versionId: string) =>
+      req<ApprovalHistoryEntry[]>(
+        `${API_BASE}/template-rule-versions/${encodeURIComponent(versionId)}/approval-history`,
+      ),
+    [],
+  )
+
+  const listOverrides = useCallback(
+    () => req<TenantRuleOverride[]>(`${API_BASE}/tenant-rule-overrides`),
+    [],
+  )
+
+  const upsertOverride = useCallback(
+    (payload: {
+      rule_id: string
+      override_classification: Classification | null
+      reason: string
+      created_by?: string
+    }) =>
+      req<TenantRuleOverride>(`${API_BASE}/tenant-rule-overrides`, {
+        method: 'POST',
+        body: JSON.stringify(payload),
+      }),
+    [],
+  )
+
+  const deleteOverride = useCallback(
+    (overrideId: string) =>
+      req<void>(`${API_BASE}/tenant-rule-overrides/${encodeURIComponent(overrideId)}`, {
+        method: 'DELETE',
+      }),
+    [],
+  )
+
+  // Without the memo a new object would be created on every render and any
+  // hook listing it as a dependency would be invalidated each time.
+  return useMemo(
+    () => ({
+      listRules, getRule,
+      listVersions, getVersion,
+      createDraftVersion, updateDraftVersion,
+      submitForReview, approveVersion, publishVersion, rejectVersion,
+      getApprovalHistory,
+      listOverrides, upsertOverride, deleteOverride,
+    }),
+    [
+      listRules, getRule,
+      listVersions, getVersion,
+      createDraftVersion, updateDraftVersion,
+      submitForReview, approveVersion, publishVersion, rejectVersion,
+      getApprovalHistory,
+      listOverrides, upsertOverride, deleteOverride,
+    ],
+  )
+}
diff --git a/admin-compliance/app/sdk/template-rule-editor/_types.ts b/admin-compliance/app/sdk/template-rule-editor/_types.ts
new file mode 100644
index 00000000..eb5b02cb
--- /dev/null
+++ b/admin-compliance/app/sdk/template-rule-editor/_types.ts
@@ -0,0 +1,246 @@
+/**
+ * Types fuer den Template-Rule-Editor (SDK).
+ *
+ * Spiegeln die Pydantic-Modelle aus
+ * backend-compliance/compliance/schemas/template_rule.py.
+ */
+
+export type Classification = 'required' | 'recommended' | 'optional'
+
+export type RuleStatus =
+ | 'draft' | 'review' | 'approved' | 'published' | 'archived' | 'rejected'
+
+export type ClauseOperator =
+ | 'eq' | 'neq' | 'in' | 'not_in'
+ | 'gt' | 'gte' | 'lt' | 'lte'
+ | 'exists' | 'truthy' | 'falsy'
+
+export interface RuleClause { // One comparison inside a RuleCondition.
+ field: string // Profile field key; the editor resolves it against PROFILE_FIELDS.
+ op: ClauseOperator
+ value?: unknown // Operand; the editor shows no value input for exists/truthy/falsy.
+}
+
+export interface RuleCondition { // Stored per rule version as { kind, clauses }.
+ kind: 'all' | 'any' // Combinator over the clauses — presumably all = AND, any = OR; evaluation is not part of this file, confirm in the backend.
+ clauses: RuleClause[] // The editor describes an empty list as "rule applies to every profile".
+}
+
+export interface Rule {
+ id: string
+ rule_key: string
+ document_type: string
+ title: string
+ current_version_id: string | null
+ created_at: string
+ updated_at: string | null
+}
+
+export interface RuleVersion {
+ id: string
+ rule_id: string
+ version_number: number
+ status: RuleStatus
+ is_live: boolean
+ classification: Classification
+ conditions: RuleCondition
+ source_citation: string
+ rationale: string | null
+ change_summary: string | null
+ created_by: string | null
+ submitted_by: string | null
+ submitted_at: string | null
+ approved_by: string | null
+ approved_at: string | null
+ published_by: string | null
+ published_at: string | null
+ rejected_by: string | null
+ rejected_at: string | null
+ rejection_reason: string | null
+ created_at: string
+ updated_at: string | null
+}
+
+export interface ApprovalHistoryEntry {
+ id: string
+ version_id: string
+ action: string
+ approver: string | null
+ comment: string | null
+ created_at: string
+}
+
+export interface TenantRuleOverride {
+ id: string
+ tenant_id: string
+ rule_id: string
+ override_classification: Classification | null
+ reason: string
+ created_by: string | null
+ created_at: string
+ updated_at: string | null
+}
+
+// ---- Profil-Felder fuer Condition-Builder ----
+
+export interface ProfileFieldOption {
+ /** Key used in the profile */
+ key: string
+ /** Label shown in the UI */
+ label: string
+ /** Category used for grouping */
+ category: 'org' | 'proc' | 'prod' | 'comp' | 'tech' | 'compliance'
+ /** Expected data type */
+ type: 'string' | 'number' | 'boolean' | 'enum'
+ /** For enum fields: the possible values with their labels */
+ options?: { value: string; label: string }[]
+}
+
+/**
+ * The 18 profile fields offered by the condition builder: the 17 fields
+ * used in the 33 initial rules (ported from templateRecommendations.ts)
+ * plus compliance_depth_level.
+ */
+export const PROFILE_FIELDS: ProfileFieldOption[] = [
+ {
+ key: 'compliance_depth_level',
+ label: 'Compliance-Tiefe',
+ category: 'compliance', type: 'enum',
+ options: [
+ { value: 'L1', label: 'L1 — Lean Startup' },
+ { value: 'L2', label: 'L2 — Standard' },
+ { value: 'L3', label: 'L3 — Strict' },
+ { value: 'L4', label: 'L4 — Zertifizierungsbereit' },
+ ],
+ },
+ {
+ key: 'org_employee_count',
+ label: 'Mitarbeiterzahl',
+ category: 'org', type: 'enum',
+ options: [
+ { value: 'none', label: 'Keine' },
+ { value: '1_9', label: '1–9' },
+ { value: '10_49', label: '10–49' },
+ { value: '50_249', label: '50–249' },
+ { value: '250_999', label: '250–999' },
+ { value: '1000_plus', label: '1000+' },
+ ],
+ },
+ {
+ key: 'org_has_employees', label: 'Hat Mitarbeiter', category: 'org', type: 'enum',
+ options: [{ value: 'yes', label: 'Ja' }, { value: 'no', label: 'Nein' }],
+ },
+ {
+ key: 'org_business_model', label: 'Geschäftsmodell', category: 'org', type: 'enum',
+ options: [
+ { value: 'b2b_saas', label: 'B2B SaaS' },
+ { value: 'b2c_shop', label: 'B2C Shop' },
+ { value: 'platform', label: 'Plattform' },
+ { value: 'marketplace', label: 'Marktplatz' },
+ { value: 'social', label: 'Social Media' },
+ { value: 'saas', label: 'SaaS' },
+ { value: 'media', label: 'Media' },
+ { value: 'manufacturing', label: 'Maschinenbau' },
+ { value: 'other', label: 'Sonstiges' },
+ ],
+ },
+ {
+ key: 'org_has_social_media', label: 'Hat Social Media', category: 'org', type: 'enum',
+ options: [{ value: 'yes', label: 'Ja' }, { value: 'no', label: 'Nein' }],
+ },
+ {
+ key: 'org_has_video_conferencing', label: 'Hat Video-Konferenzen', category: 'org', type: 'enum',
+ options: [{ value: 'yes', label: 'Ja' }, { value: 'no', label: 'Nein' }],
+ },
+ {
+ key: 'org_cert_target', label: 'Zertifizierungsziel', category: 'org', type: 'enum',
+ options: [
+ { value: 'none', label: 'Keines' },
+ { value: 'iso27001', label: 'ISO 27001' },
+ { value: 'iso27701', label: 'ISO 27701' },
+ { value: 'tisax', label: 'TISAX' },
+ ],
+ },
+ {
+ key: 'proc_ai_usage', label: 'KI-Nutzung', category: 'proc', type: 'enum',
+ options: [
+ { value: 'none', label: 'Keine' },
+ { value: 'limited', label: 'Begrenzt' },
+ { value: 'extensive', label: 'Umfangreich' },
+ ],
+ },
+ {
+ key: 'proc_uses_ai_tools', label: 'Nutzt KI-Tools', category: 'proc', type: 'boolean',
+ },
+ {
+ key: 'proc_byod_allowed', label: 'BYOD erlaubt', category: 'proc', type: 'enum',
+ options: [{ value: 'yes', label: 'Ja' }, { value: 'no', label: 'Nein' }],
+ },
+ {
+ key: 'proc_dsfa_required', label: 'DSFA erforderlich', category: 'proc', type: 'enum',
+ options: [{ value: 'yes', label: 'Ja' }, { value: 'no', label: 'Nein' }],
+ },
+ {
+ key: 'prod_webshop', label: 'Webshop', category: 'prod', type: 'enum',
+ options: [{ value: 'yes', label: 'Ja' }, { value: 'no', label: 'Nein' }],
+ },
+ {
+ key: 'prod_ugc_platform', label: 'UGC-Plattform', category: 'prod', type: 'enum',
+ options: [{ value: 'yes', label: 'Ja' }, { value: 'no', label: 'Nein' }],
+ },
+ {
+ key: 'prod_consent_management', label: 'Consent Management', category: 'prod', type: 'enum',
+ options: [{ value: 'yes', label: 'Ja' }, { value: 'no', label: 'Nein' }],
+ },
+ {
+ key: 'comp_has_processors', label: 'Auftragsverarbeiter', category: 'comp', type: 'enum',
+ options: [{ value: 'yes', label: 'Ja' }, { value: 'no', label: 'Nein' }],
+ },
+ {
+ key: 'comp_vendor_management', label: 'Vendor-Management', category: 'comp', type: 'enum',
+ options: [{ value: 'yes', label: 'Ja' }, { value: 'no', label: 'Nein' }],
+ },
+ {
+ key: 'comp_dsfa_processes', label: 'DSFA-Prozesse', category: 'comp', type: 'enum',
+ options: [{ value: 'required', label: 'Erforderlich' }, { value: 'optional', label: 'Optional' }],
+ },
+ {
+ key: 'tech_third_country', label: 'Drittland-Transfer', category: 'tech', type: 'enum',
+ options: [
+ { value: 'no', label: 'Nein' },
+ { value: 'us_dpf_only', label: 'Nur US-DPF' },
+ { value: 'adequate_only', label: 'Nur Angemessenheitsbeschluss' },
+ { value: 'yes_us', label: 'Ja, USA' },
+ { value: 'yes_other', label: 'Ja, Sonstige' },
+ ],
+ },
+]
+
+
+/**
+ * Display label for every clause operator. Typed by the operator union so
+ * that a newly added operator without a label is a compile error.
+ */
+export const OPERATOR_LABELS: Record<ClauseOperator, string> = {
+  eq: 'gleich (=)',
+  neq: 'ungleich (≠)',
+  in: 'in Liste',
+  not_in: 'nicht in Liste',
+  gt: 'größer (>)',
+  gte: 'größer/gleich (≥)',
+  lt: 'kleiner (<)',
+  lte: 'kleiner/gleich (≤)',
+  exists: 'existiert',
+  truthy: 'ist gesetzt',
+  falsy: 'ist leer',
+}
+
+/** Display label for every classification; the key type keeps the map exhaustive. */
+export const CLASSIFICATION_LABELS: Record<Classification, string> = {
+  required: 'Pflicht',
+  recommended: 'Empfohlen',
+  optional: 'Optional',
+}
+
+/** Display label for every version status; the key type keeps the map exhaustive. */
+export const STATUS_LABELS: Record<RuleStatus, string> = {
+  draft: 'Entwurf',
+  review: 'In Prüfung',
+  approved: 'Freigegeben',
+  published: 'Live',
+  archived: 'Archiviert',
+  rejected: 'Abgelehnt',
+}
diff --git a/admin-compliance/app/sdk/template-rule-editor/page.tsx b/admin-compliance/app/sdk/template-rule-editor/page.tsx
new file mode 100644
index 00000000..e96104df
--- /dev/null
+++ b/admin-compliance/app/sdk/template-rule-editor/page.tsx
@@ -0,0 +1,205 @@
+'use client'
+
+/**
+ * Template Rule Editor — Editorial-UI fuer Anwaelte/DSBs.
+ *
+ * Architektur:
+ * - Links: RuleList mit Filter
+ * - Rechts: RuleEditor mit Klassifikation, Condition-Builder, Source-Citation,
+ * Approval-Workflow (draft → review → approved → published)
+ *
+ * Backend: /api/sdk/v1/compliance/template-rules + /template-rule-versions/*
+ */
+
+import { useEffect, useState, useCallback } from 'react'
+import { useSDK } from '@/lib/sdk'
+import StepHeader from '@/components/sdk/StepHeader/StepHeader'
+import { useRuleEditorActions } from './_hooks/useRuleEditorActions'
+import type {
+ ApprovalHistoryEntry, Classification, Rule, RuleCondition, RuleVersion,
+} from './_types'
+import RuleList from './_components/RuleList'
+import RuleEditor from './_components/RuleEditor'
+
+export default function TemplateRuleEditorPage() { // Page shell: loads rules and versions, owns the selection and forwards workflow actions to the API hook.
+ useSDK() // NOTE(review): return value is unused — presumably called for its side effects; confirm.
+
+ const actions = useRuleEditorActions()
+
+ const [rules, setRules] = useState([])
+ const [liveVersionsByRule, setLiveVersionsByRule] = useState>({})
+ const [selectedRuleId, setSelectedRuleId] = useState(null)
+ const [selectedVersions, setSelectedVersions] = useState([])
+ const [selectedHistory, setSelectedHistory] = useState([])
+ const [loading, setLoading] = useState(true)
+ const [error, setError] = useState(null)
+
+ // Initial load: rules plus their live versions
+ const loadRules = useCallback(async () => {
+ setLoading(true)
+ setError(null)
+ try {
+ const list = await actions.listRules()
+ setRules(list)
+ const byRule: Record = {}
+ // Fetch live versions in parallel
+ await Promise.all(
+ list.map(async (r) => {
+ try {
+ const versions = await actions.listVersions(r.id)
+ const live = versions.find((v) => v.is_live)
+ byRule[r.id] = live
+ } catch { // Best effort: a rule whose versions fail to load is listed without a live version.
+ byRule[r.id] = undefined
+ }
+ }),
+ )
+ setLiveVersionsByRule(byRule)
+ if (list.length > 0 && !selectedRuleId) { // Select the first rule when nothing is selected yet.
+ setSelectedRuleId(list[0].id)
+ }
+ } catch (e) {
+ setError((e as Error).message)
+ } finally {
+ setLoading(false)
+ }
+ }, [actions, selectedRuleId])
+
+ // On selection change: load versions + history
+ const loadSelected = useCallback(async () => {
+ if (!selectedRuleId) {
+ setSelectedVersions([])
+ setSelectedHistory([])
+ return
+ }
+ try {
+ const versions = await actions.listVersions(selectedRuleId) // NOTE(review): no guard against stale responses — a slow reply for a previously selected rule can overwrite the current one; confirm acceptable.
+ setSelectedVersions(versions)
+ const live = versions.find((v) => v.is_live)
+ if (live) { // NOTE(review): history is fetched for the live version only; the trail of an open draft is not loaded — confirm intended.
+ const history = await actions.getApprovalHistory(live.id)
+ setSelectedHistory(history)
+ } else {
+ setSelectedHistory([])
+ }
+ } catch (e) {
+ setError((e as Error).message)
+ }
+ }, [actions, selectedRuleId])
+
+ useEffect(() => { loadRules() }, []) // Runs once on mount.
+ useEffect(() => { loadSelected() }, [selectedRuleId]) // Reloads the detail data whenever the selection changes.
+
+ const handleCreateDraft = async (payload: {
+ classification: Classification
+ conditions: RuleCondition
+ source_citation: string
+ rationale?: string | null
+ }) => {
+ if (!selectedRuleId) return
+ try {
+ await actions.createDraftVersion(selectedRuleId, payload)
+ await loadSelected()
+ } catch (e) {
+ setError((e as Error).message)
+ }
+ }
+
+ const handleUpdateDraft = async (versionId: string, patch: {
+ classification?: Classification
+ conditions?: RuleCondition
+ source_citation?: string
+ rationale?: string | null
+ }) => {
+ try {
+ await actions.updateDraftVersion(versionId, patch)
+ await loadSelected()
+ } catch (e) {
+ setError((e as Error).message)
+ }
+ }
+
+ const handleSubmitForReview = async (versionId: string, changeSummary: string) => {
+ try {
+ await actions.submitForReview(versionId, { change_summary: changeSummary })
+ await loadSelected()
+ } catch (e) {
+ setError((e as Error).message)
+ }
+ }
+
+ const handleApprove = async (versionId: string) => {
+ try {
+ await actions.approveVersion(versionId)
+ await loadSelected()
+ } catch (e) {
+ setError((e as Error).message)
+ }
+ }
+
+ const handlePublish = async (versionId: string) => {
+ try {
+ await actions.publishVersion(versionId)
+ await loadRules() // Also reloads the rule list, which recomputes the per-rule live versions.
+ await loadSelected()
+ } catch (e) {
+ setError((e as Error).message)
+ }
+ }
+
+ const handleReject = async (versionId: string, reason: string) => {
+ try {
+ await actions.rejectVersion(versionId, { rejection_reason: reason })
+ await loadSelected()
+ } catch (e) {
+ setError((e as Error).message)
+ }
+ }
+
+ const selectedRule = rules.find((r) => r.id === selectedRuleId)
+
+ return (
+
+
+ {error && (
+
+ {error}
+
+ )}
+ {loading && (
+
Lade Regeln…
+ )}
+ {!loading && (
+
+
+ {selectedRule ? (
+
+ ) : (
+
+ Wähle links eine Regel zum Bearbeiten.
+
+ )}
+
+ )}
+
+ )
+}
diff --git a/admin-compliance/lib/sdk/types/sdk-steps.ts b/admin-compliance/lib/sdk/types/sdk-steps.ts
index 5035c838..c67d3d56 100644
--- a/admin-compliance/lib/sdk/types/sdk-steps.ts
+++ b/admin-compliance/lib/sdk/types/sdk-steps.ts
@@ -494,4 +494,18 @@ export const SDK_STEPS: SDKStep[] = [
prerequisiteSteps: [],
isOptional: true,
},
+ {
+ id: 'template-rule-editor',
+ seq: 5000,
+ phase: 2,
+ package: 'betrieb',
+ order: 13,
+ name: 'Empfehlungs-Regeln',
+ nameShort: 'Regeln',
+ description: 'Editorial-UI fuer profilbasierte Dokument-Empfehlungen (Anwalt/DSB)',
+ url: '/sdk/template-rule-editor',
+ checkpointId: 'CP-RULES',
+ prerequisiteSteps: [],
+ isOptional: true,
+ },
]
diff --git a/backend-compliance/compliance/services/cookie_screenshot_ocr.py b/backend-compliance/compliance/services/cookie_screenshot_ocr.py
index 6a6ba961..62951e78 100644
--- a/backend-compliance/compliance/services/cookie_screenshot_ocr.py
+++ b/backend-compliance/compliance/services/cookie_screenshot_ocr.py
@@ -1,336 +1,49 @@
-"""Screenshot-basierte Cookie-Extraktion mit Tesseract-OCR.
+"""Screenshot-basierte Cookie-Extraktion (Orchestration).
Pipeline:
1. consent-tester macht Full-Page-Screenshot (Banner akzeptiert,
Accordions ausgeklappt, Timestamp eingebrannt) → PNG b64
2. Tesseract OCR (lang=deu, psm=4) → Rohtext mit Tabellen-Reihen
-3. _parse_ocr_cookie_table(text) → strukturierte Liste {name, category,
- purpose, duration, type, vendor}
+3. parse_ocr_cookie_table(text) → strukturierte Liste
-Funktioniert site-unabhaengig — egal welches CMP, egal welche Sprache
-(Tesseract kann viele), egal welches DOM-Layout. Timestamp im Bild =
-Beweis was wir zum Scan-Zeitpunkt wirklich gesehen haben.
+Phase-1-Split (2026-06-06): Engine-Funktionen
+(_slice_screenshot / vision-OCR / paddle / tesseract / parse) leben
+jetzt in `cookie_screenshot_ocr_engines.py`. Re-Exports halten die
+Public-API stabil — externe Importer (`_phase_d1_vendors_raw.py`)
+brauchen keinen Code-Change.
"""
from __future__ import annotations
import base64 as _b64
-import json
import logging
import os
-import re
import httpx
-logger = logging.getLogger(__name__)
+from .cookie_screenshot_ocr_engines import ( # noqa: F401 (re-exports)
+ OLLAMA_URL,
+ VISION_MODEL,
+ VISION_PROMPT,
+ _PADDLE_OCR,
+ _call_vision_on_slice,
+ _slice_screenshot,
+ ocr_screenshot_via_paddle,
+ ocr_screenshot_via_tesseract,
+ ocr_screenshot_via_vision_slices,
+ parse_ocr_cookie_table,
+ parse_vision_response,
+)
+logger = logging.getLogger(__name__)
CONSENT_TESTER_URL = os.getenv(
"CONSENT_TESTER_URL", "http://bp-compliance-consent-tester:8094"
)
-VISION_MODEL = os.getenv("COOKIE_VISION_MODEL", "qwen2.5vl:32b")
-OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
-def _slice_screenshot(png_bytes: bytes, slice_h: int = 1500,
- max_slices: int = 25) -> list[str]:
- """Cut a tall full-page screenshot into 1280×slice_h slices and return
- each as base64-encoded PNG. Vision models choke on 25k-tall images
- (resampled down to ~1024 → unreadable text); slicing keeps DPI."""
- if not png_bytes:
- return []
- try:
- from PIL import Image
- from io import BytesIO
- except ImportError:
- return []
- img = Image.open(BytesIO(png_bytes)).convert("RGB")
- w, h = img.size
- n = min((h + slice_h - 1) // slice_h, max_slices)
- out: list[str] = []
- for i in range(n):
- top = i * slice_h
- bot = min((i + 1) * slice_h, h)
- chunk = img.crop((0, top, w, bot))
- buf = BytesIO()
- chunk.save(buf, format="PNG", optimize=True)
- out.append(_b64.b64encode(buf.getvalue()).decode("ascii"))
- return out
-
-
-async def _call_vision_on_slice(b64_png: str, timeout_s: float = 240.0) -> str:
- """Ask the vision model to dump all cookie-row text from one slice
- as raw text (NOT JSON). We parse it downstream with parse_flat regex."""
- prompt = (
- "Du siehst einen Bildausschnitt einer Cookie-Richtlinien-Tabelle. "
- "Liste ALLE Tabellen-Zeilen wortwoertlich auf, eine Zeile pro "
- "Cookie. Jede Zeile soll enthalten: Cookie-Name, Kategorie, "
- "Zweck, Speicherdauer, Art (Permanent/Session). "
- "Format: ' | | | | '. "
- "KEINE Cookies erfinden, nur was im Bild steht. Nur die Tabellen-"
- "Zeilen, keine Erklaerungen."
- )
- payload = {
- "model": VISION_MODEL,
- "stream": False,
- "messages": [{
- "role": "user", "content": prompt, "images": [b64_png],
- }],
- "options": {"temperature": 0.05, "num_predict": 4000},
- }
- try:
- async with httpx.AsyncClient(timeout=timeout_s) as c:
- r = await c.post(f"{OLLAMA_URL.rstrip('/')}/api/chat", json=payload)
- r.raise_for_status()
- return (r.json().get("message") or {}).get("content", "") or ""
- except Exception as e:
- logger.debug("vision slice failed: %s", e)
- return ""
-
-
-async def ocr_screenshot_via_vision_slices(png_bytes: bytes,
- max_slices: int = 20) -> str:
- """Slice + vision-OCR each slice + concatenate. Returns raw text that
- can be fed to parse_flat_cookie_text."""
- slices = _slice_screenshot(png_bytes, slice_h=1500, max_slices=max_slices)
- if not slices:
- return ""
- logger.info("Vision-slicing: %d slices → vision-OCR (model=%s)",
- len(slices), VISION_MODEL)
- import asyncio as _aio
- # Run slices SEQUENTIALLY: ollama is single-GPU and loading the same
- # model for parallel requests causes OOM + thrashing on Mac Mini.
- parts: list[str] = []
- for i, s in enumerate(slices):
- txt = await _call_vision_on_slice(s)
- if txt:
- parts.append(txt)
- logger.info("Vision-slice %d/%d: %d chars", i + 1, len(slices),
- len(txt))
- full = "\n".join(parts)
- logger.info("Vision-OCR slicing total: %d chars from %d slices",
- len(full), len(slices))
- return full
-
-
-def ocr_screenshot_via_paddle(png_bytes: bytes) -> str:
- """Run PaddleOCR over the full-page screenshot, returning the
- concatenated text. Deterministic, no LLM halluzination.
-
- Splits tall screenshots into 1280x3000 slices so OCR works in chunks
- without OOM on large pages (VW cookie-page is ~25k px tall).
- """
- if not png_bytes:
- return ""
- try:
- from PIL import Image
- from io import BytesIO
- from paddleocr import PaddleOCR
- except ImportError as e:
- logger.warning("PaddleOCR / PIL not available: %s", e)
- return ""
-
- try:
- img = Image.open(BytesIO(png_bytes)).convert("RGB")
- except Exception as e:
- logger.warning("PIL open failed: %s", e)
- return ""
-
- w, h = img.size
- slice_h = 3000
- n_slices = (h + slice_h - 1) // slice_h
- logger.info("PaddleOCR: %dx%d screenshot → %d slices of %d high",
- w, h, n_slices, slice_h)
-
- # Global OCR instance reused — initial init is ~10s.
- global _PADDLE_OCR
- if "_PADDLE_OCR" not in globals() or _PADDLE_OCR is None:
- try:
- _PADDLE_OCR = PaddleOCR(use_angle_cls=False, lang="german",
- show_log=False)
- except Exception as e:
- logger.warning("PaddleOCR init failed: %s", e)
- return ""
-
- parts: list[str] = []
- import numpy as np
- for i in range(n_slices):
- top = i * slice_h
- bot = min((i + 1) * slice_h, h)
- crop = img.crop((0, top, w, bot))
- arr = np.array(crop)
- try:
- result = _PADDLE_OCR.ocr(arr, cls=False)
- except Exception as e:
- logger.warning("PaddleOCR slice %d failed: %s", i, e)
- continue
- # PaddleOCR returns list-of-lines where each line is
- # [bbox, (text, conf)] — variable nesting depending on version.
- if not result:
- continue
- for page in result:
- if not page: continue
- for line in page:
- if not line: continue
- try:
- if isinstance(line, list) and len(line) >= 2:
- txt = line[1][0] if isinstance(line[1], (list, tuple)) else str(line[1])
- else:
- txt = str(line)
- if txt: parts.append(txt)
- except Exception:
- continue
-
- full_text = "\n".join(parts)
- logger.info("PaddleOCR: extracted %d lines / %d chars from %d slices",
- len(parts), len(full_text), n_slices)
- return full_text
-
-
-_PADDLE_OCR = None
-
-
-# ── Tesseract-based parser ────────────────────────────────────────────
-
-def ocr_screenshot_via_tesseract(png_bytes: bytes,
- lang: str = "deu",
- psm: int = 4) -> str:
- """Run Tesseract OCR on a full-page screenshot. Returns normalized text
- where multi-newline paragraphs are collapsed but blank lines preserved
- (helps anchor-based parsing).
-
- psm=4 means single column of text of variable sizes (cookie-tables).
- """
- if not png_bytes:
- return ""
- try:
- import pytesseract
- from PIL import Image
- from io import BytesIO
- import re as _re
- except ImportError as e:
- logger.warning("tesseract/PIL not available: %s", e)
- return ""
- try:
- img = Image.open(BytesIO(png_bytes)).convert("RGB")
- raw = pytesseract.image_to_string(img, lang=lang,
- config=f"--psm {psm}")
- # Collapse intra-paragraph newlines so OCR cells flow on one line.
- norm = _re.sub(r"[ \t]+", " ", raw)
- norm = _re.sub(r"\n(?!\s*\n)", " ", norm)
- norm = _re.sub(r"\s{2,}", " ", norm)
- logger.info(
- "Tesseract OCR: %d chars / %d words (image %dx%d)",
- len(norm), len(norm.split()), img.size[0], img.size[1],
- )
- return norm
- except Exception as e:
- logger.warning("Tesseract OCR failed: %s (%s)",
- str(e) or "(no msg)", type(e).__name__)
- return ""
-
-
-# Kategorie-Anchor-Tokens that ALWAYS follow the Cookie-Name in the
-# typical column layout: [NAME] [KATEGORIE] [ZWECK] [DAUER] [ART]
-_CATEGORY_ANCHORS = (
- r"Funktionscookie", r"Trackingcookie",
- r"Tracking Cookies?", r"Session Cookies?",
- r"Funktional", r"Marketing", r"Analytics", r"Necessary",
- r"Werbung", r"Personalisierung", r"Statistik",
- r"Notwendig", r"Erforderlich",
-)
-
-_CATEGORY_PATTERN = "(?:" + "|".join(_CATEGORY_ANCHORS) + r")(?:\s*\([^)]*\))?"
-
-# Cookie-Name: alphanum + underscore + dash + dot. Wir erlauben optional
-# einen Suffix-Underscore (Spalten-Umbruch bei VW: `VWD6_ENSIGHTEN_PRIVACY_`
-# als Name-Fragment). Mind. 3, max. 60 chars.
-_COOKIE_NAME_RE = (
- r"(?:[A-Za-z][\w\-.]{2,60}|[A-Za-z][\w\-.]{2,60}<[^>]+>)"
-)
-
-
-def parse_ocr_cookie_table(text: str) -> list[dict]:
- """Extract cookie-records from Tesseract-OCR text using anchor-based
- pattern: .
-
- Returns list of {name, category, purpose, duration, type}. Vendor is
- NOT inferred here — caller maps via _guess_vendor.
-
- KEINE Cookie-Namens-Korrektur — `awsalb` bleibt `awsalb`, nicht
- `awesome`. Falsche Korrektur waere ein Compliance-Verlust.
- """
- if not text or len(text) < 200:
- return []
- import re as _re
- # Pattern: capture name + anchor category, then up to 250 chars
- # forward to grab duration + type tokens.
- pattern = _re.compile(
- rf"(?P{_COOKIE_NAME_RE})\s+"
- rf"(?P{_CATEGORY_PATTERN})"
- rf"(?P[^A-Z]{{0,300}}?)"
- rf"(?:(?P\d+(?:[.,]\s*)?\s*(?:Tage|Jahre?|Monate?|Minuten|Stunden|Sekunden)\.?)?\s*"
- rf"(?PPermanent/Protokoll|Session\s*Cookie|Persistent\s*Cookie|Persistent\s*cookie))?",
- _re.IGNORECASE | _re.DOTALL,
- )
- seen_names: set[str] = set()
- out: list[dict] = []
- for m in pattern.finditer(text):
- name = (m.group("name") or "").strip()
- # Filter obvious garbage (UI strings, navigation, common words)
- if not name or len(name) < 3:
- continue
- nl = name.lower()
- if nl in seen_names:
- continue
- # Reject common non-cookie words. Cookie-Namen sind technische IDs:
- # haben oft Unterstrich/Bindestrich/Camel-Case oder sind kurze IDs.
- if nl in ("name", "art", "zweck", "dauer", "kategorie", "anbieter",
- "cookie", "cookies", "name des cookies",
- "this", "dieser", "diese", "alle", "und", "von", "der",
- "die", "das", "ein", "eine", "session", "permanent",
- "category"):
- continue
- # Cookie-Namen sollen kein reines Lower-Word sein OHNE _ oder -
- # (z.B. "verwendet" wuerde sonst matchen)
- has_marker = any(c in name for c in "_-.<>")
- is_caps = name.upper() == name and len(name) >= 3
- is_camel = any(c.isupper() for c in name[1:]) and any(c.islower() for c in name)
- if not (has_marker or is_caps or is_camel):
- # Lowercase word ohne Marker → vermutlich kein Cookie-Name
- continue
- seen_names.add(nl)
- out.append({
- "name": name[:80],
- "category": (m.group("category") or "").strip()[:60],
- "purpose": (m.group("rest") or "").strip()[:200],
- "duration": (m.group("duration") or "").strip()[:60],
- "type": (m.group("type") or "").strip()[:30],
- "vendor": "",
- })
- logger.info("parse_ocr_cookie_table: %d unique cookies extracted", len(out))
- return out
-
-
-_VISION_PROMPT = (
- "Du analysierst einen Screenshot einer Cookie-Richtlinie. Auf der Seite "
- "ist eine Tabelle mit Cookies aufgelistet. Spalten sind ueblicherweise: "
- "Name des Cookies, Kategorie (z.B. 'Funktional', 'Marketing', "
- "'Analytics'), Verwendungszweck, Speicherdauer, Art des Cookies "
- "(z.B. 'Permanent', 'Session').\n\n"
- "Extrahiere ALLE Cookies aus dem Bild. Wenn die Tabelle abgeschnitten "
- "ist, extrahiere alles was sichtbar ist. KEINE Cookies erfinden, KEINE "
- "Halluzinationen.\n\n"
- "Antworte als reines JSON-Objekt im Format:\n"
- '{"cookies": [\n'
- ' {"name": "", "category": "", '
- '"purpose": "", '
- '"duration": "", '
- '"type": "", '
- '"vendor": ""}\n'
- "]}\n\n"
- "Nur JSON, kein Erklaerungstext, keine Code-Fences."
-)
+# Backward-compat: some callers may import _parse_vision_response
+_parse_vision_response = parse_vision_response
async def capture_cookie_evidence_slices(
@@ -414,9 +127,7 @@ async def capture_cookie_evidence_slices(
def _ocr_one_slice(s: dict) -> tuple[dict, list[dict]]:
- """Helper for parallel execution: tesseract + parse for one slice.
- Returns (slice_metadata_summary, cookies)."""
- import base64 as _b64
+ """Helper for parallel execution: tesseract + parse for one slice."""
try:
png = _b64.b64decode(s.get("png_b64", ""))
except Exception:
@@ -440,10 +151,6 @@ def ocr_slices_extract_cookies(
ThreadPoolExecutor with 4 workers yields ~4x speedup on multi-core
machines (M4 Pro has plenty). Sequential 32 slices = ~60s, parallel
~15s.
-
- Returns (cookies, stats) where stats has:
- per_slice: [{idx, cookies_found, ts, top_y, bot_y}]
- total_raw, total_unique, slices
"""
from concurrent.futures import ThreadPoolExecutor
@@ -451,7 +158,6 @@ def ocr_slices_extract_cookies(
return [], {"per_slice": [], "total_raw": 0,
"total_unique": 0, "slices": 0}
- # Keep slice order so the per-slice report is sequential.
with ThreadPoolExecutor(max_workers=max_workers) as ex:
results = list(ex.map(_ocr_one_slice, slices))
@@ -474,7 +180,8 @@ def ocr_slices_extract_cookies(
}
logger.info(
"ocr_slices_extract_cookies (parallel=%d): %d slices → %d raw → %d unique",
- max_workers, stats["slices"], stats["total_raw"], stats["total_unique"],
+ max_workers, stats["slices"], stats["total_raw"],
+ stats["total_unique"],
)
return all_cookies, stats
@@ -482,11 +189,7 @@ def ocr_slices_extract_cookies(
async def capture_cookie_screenshot(
cookie_url: str, check_id: str = "", timeout_s: float = 60.0,
) -> dict:
- """Trigger consent-tester to capture full-page screenshot of cookie URL.
-
- Returns dict with png_b64, captured_at, url, width_px, height_px etc.
- Empty png_b64 on error.
- """
+ """Trigger consent-tester to capture full-page screenshot of cookie URL."""
if not cookie_url:
return {"png_b64": "", "error": "no url"}
try:
@@ -514,11 +217,7 @@ async def capture_cookie_screenshot(
async def extract_cookies_via_vision(
png_b64: str, timeout_s: float = 240.0,
) -> list[dict]:
- """Call Ollama llama3.2-vision with the screenshot + extraction prompt.
-
- Returns list of {name, category, purpose, duration, type, vendor}.
- Empty list on failure.
- """
+ """Call Ollama vision model with the screenshot + extraction prompt."""
if not png_b64:
return []
payload = {
@@ -527,13 +226,10 @@ async def extract_cookies_via_vision(
"format": "json",
"messages": [{
"role": "user",
- "content": _VISION_PROMPT,
+ "content": VISION_PROMPT,
"images": [png_b64],
}],
- "options": {
- "temperature": 0.05,
- "num_predict": 8000,
- },
+ "options": {"temperature": 0.05, "num_predict": 8000},
}
try:
async with httpx.AsyncClient(timeout=timeout_s) as c:
@@ -543,7 +239,7 @@ async def extract_cookies_via_vision(
)
r.raise_for_status()
content = (r.json().get("message") or {}).get("content", "") or ""
- cookies = _parse_vision_response(content)
+ cookies = parse_vision_response(content)
logger.info(
"Vision-OCR extracted %d cookies (model=%s, response_len=%d)",
len(cookies), VISION_MODEL, len(content),
@@ -557,59 +253,11 @@ async def extract_cookies_via_vision(
return []
-def _parse_vision_response(content: str) -> list[dict]:
- """Be lenient: code fences, leading prose, partial JSON."""
- if not content:
- return []
- txt = content.strip()
- if txt.startswith("```"):
- lines = txt.split("\n")
- if lines and lines[-1].strip().startswith("```"):
- txt = "\n".join(lines[1:-1])
- else:
- txt = "\n".join(lines[1:])
- a, b = txt.find("{"), txt.rfind("}")
- if not (0 <= a < b):
- return []
- try:
- obj = json.loads(txt[a:b + 1])
- except json.JSONDecodeError:
- return []
- if not isinstance(obj, dict):
- return []
- arr = obj.get("cookies") or obj.get("Cookies") or []
- if not isinstance(arr, list):
- return []
- out: list[dict] = []
- for item in arr[:300]: # cap to sanity
- if not isinstance(item, dict):
- continue
- name = (item.get("name") or "").strip()
- if not name or len(name) < 2 or len(name) > 80:
- continue
- # Strip obvious garbage
- if re.fullmatch(r"[\s\-_.]+", name):
- continue
- out.append({
- "name": name[:80],
- "category": (item.get("category") or "").strip()[:60],
- "purpose": (item.get("purpose") or "").strip()[:200],
- "duration": (item.get("duration") or "").strip()[:60],
- "type": (item.get("type") or "").strip()[:30],
- "vendor": (item.get("vendor") or "").strip()[:80],
- })
- return out
-
-
def cookies_to_vendor_records(
cookies: list[dict], guess_vendor_fn=None,
) -> list[dict]:
"""Aggregate OCR-extracted cookies into vendor records compatible with
- cmp_vendors-schema. guess_vendor_fn: optional callable name → vendor.
-
- Each cookie's vendor field is used; if empty, we fall back to
- guess_vendor_fn (e.g. _guess_vendor from cookies_table_parser).
- """
+ cmp_vendors-schema. guess_vendor_fn: optional callable name → vendor."""
by_vendor: dict[str, dict] = {}
for c in cookies:
v_name = (c.get("vendor") or "").strip()
diff --git a/backend-compliance/compliance/services/cookie_screenshot_ocr_engines.py b/backend-compliance/compliance/services/cookie_screenshot_ocr_engines.py
new file mode 100644
index 00000000..452a8c7a
--- /dev/null
+++ b/backend-compliance/compliance/services/cookie_screenshot_ocr_engines.py
@@ -0,0 +1,353 @@
+"""OCR-Engine-Funktionen für cookie_screenshot_ocr (Phase-1 Split).
+
+Aus dem Hauptmodul ausgelagert, damit es unter dem 500-LOC-Hard-Cap bleibt:
+  - PIL-basiertes _slice_screenshot (zerteilt PNG in subimages)
+  - Vision-LLM-OCR (ollama qwen2.5vl:32b)
+  - PaddleOCR fallback
+  - Tesseract OCR (Hauptpfad)
+  - Anchor-basierter Parser parse_ocr_cookie_table
+  - parse_vision_response (JSON-Toleranz für Vision-Output)
+"""
+
+from __future__ import annotations
+
+import base64 as _b64
+import json
+import logging
+import os
+import re
+
+import httpx
+
+logger = logging.getLogger(__name__)
+
+
+VISION_MODEL = os.getenv("COOKIE_VISION_MODEL", "qwen2.5vl:32b")
+OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
+
+_PADDLE_OCR = None # lazy-initialised PaddleOCR instance
+
+
+# ── 1. Screenshot-Slicing für Vision-Models ────────────────────────
+
+def _slice_screenshot(png_bytes: bytes, slice_h: int = 1500,
+ max_slices: int = 25) -> list[str]:
+ """Cut a tall full-page screenshot into 1280×slice_h slices and return
+ each as base64-encoded PNG. Vision models choke on 25k-tall images
+ (resampled down to ~1024 → unreadable text); slicing keeps DPI."""
+ if not png_bytes:
+ return []
+ try:
+ from PIL import Image
+ from io import BytesIO
+ except ImportError:
+ return []
+ img = Image.open(BytesIO(png_bytes)).convert("RGB")
+ w, h = img.size
+ n = min((h + slice_h - 1) // slice_h, max_slices)
+ out: list[str] = []
+ for i in range(n):
+ top = i * slice_h
+ bot = min((i + 1) * slice_h, h)
+ chunk = img.crop((0, top, w, bot))
+ buf = BytesIO()
+ chunk.save(buf, format="PNG", optimize=True)
+ out.append(_b64.b64encode(buf.getvalue()).decode("ascii"))
+ return out
+
+
+# ── 2. Vision-LLM-OCR ──────────────────────────────────────────────
+
+async def _call_vision_on_slice(b64_png: str,
+ timeout_s: float = 240.0) -> str:
+ """Ask the vision model to dump all cookie-row text from one slice
+ as raw text (NOT JSON). We parse it downstream with parse_flat regex."""
+ prompt = (
+ "Du siehst einen Bildausschnitt einer Cookie-Richtlinien-Tabelle. "
+ "Liste ALLE Tabellen-Zeilen wortwoertlich auf, eine Zeile pro "
+ "Cookie. Jede Zeile soll enthalten: Cookie-Name, Kategorie, "
+ "Zweck, Speicherdauer, Art (Permanent/Session). "
+ "Format: ' | | | | '. "
+ "KEINE Cookies erfinden, nur was im Bild steht. Nur die Tabellen-"
+ "Zeilen, keine Erklaerungen."
+ )
+ payload = {
+ "model": VISION_MODEL,
+ "stream": False,
+ "messages": [{
+ "role": "user", "content": prompt, "images": [b64_png],
+ }],
+ "options": {"temperature": 0.05, "num_predict": 4000},
+ }
+ try:
+ async with httpx.AsyncClient(timeout=timeout_s) as c:
+ r = await c.post(f"{OLLAMA_URL.rstrip('/')}/api/chat",
+ json=payload)
+ r.raise_for_status()
+ return (r.json().get("message") or {}).get("content", "") or ""
+ except Exception as e:
+ logger.debug("vision slice failed: %s", e)
+ return ""
+
+
+async def ocr_screenshot_via_vision_slices(png_bytes: bytes,
+ max_slices: int = 20) -> str:
+ """Slice + vision-OCR each slice + concatenate."""
+ slices = _slice_screenshot(png_bytes, slice_h=1500,
+ max_slices=max_slices)
+ if not slices:
+ return ""
+ logger.info("Vision-slicing: %d slices → vision-OCR (model=%s)",
+ len(slices), VISION_MODEL)
+ parts: list[str] = []
+ for i, s in enumerate(slices):
+ txt = await _call_vision_on_slice(s)
+ if txt:
+ parts.append(txt)
+ logger.info("Vision-slice %d/%d: %d chars", i + 1, len(slices),
+ len(txt))
+ full = "\n".join(parts)
+ logger.info("Vision-OCR slicing total: %d chars from %d slices",
+ len(full), len(slices))
+ return full
+
+
+# ── 3. PaddleOCR (fallback) ────────────────────────────────────────
+
+def ocr_screenshot_via_paddle(png_bytes: bytes) -> str:
+ """Run PaddleOCR over the full-page screenshot, returning the
+ concatenated text. Splits tall screenshots into 1280x3000 slices."""
+ if not png_bytes:
+ return ""
+ try:
+ from PIL import Image
+ from io import BytesIO
+ from paddleocr import PaddleOCR
+ except ImportError as e:
+ logger.warning("PaddleOCR / PIL not available: %s", e)
+ return ""
+
+ try:
+ img = Image.open(BytesIO(png_bytes)).convert("RGB")
+ except Exception as e:
+ logger.warning("PIL open failed: %s", e)
+ return ""
+
+ w, h = img.size
+ slice_h = 3000
+ n_slices = (h + slice_h - 1) // slice_h
+ logger.info("PaddleOCR: %dx%d screenshot → %d slices of %d high",
+ w, h, n_slices, slice_h)
+
+ global _PADDLE_OCR
+ if _PADDLE_OCR is None:
+ try:
+ _PADDLE_OCR = PaddleOCR(use_angle_cls=False, lang="german",
+ show_log=False)
+ except Exception as e:
+ logger.warning("PaddleOCR init failed: %s", e)
+ return ""
+
+ parts: list[str] = []
+ import numpy as np
+ for i in range(n_slices):
+ top = i * slice_h
+ bot = min((i + 1) * slice_h, h)
+ crop = img.crop((0, top, w, bot))
+ arr = np.array(crop)
+ try:
+ result = _PADDLE_OCR.ocr(arr, cls=False)
+ except Exception as e:
+ logger.warning("PaddleOCR slice %d failed: %s", i, e)
+ continue
+ if not result:
+ continue
+ for page in result:
+ if not page:
+ continue
+ for line in page:
+ if not line:
+ continue
+ try:
+ if isinstance(line, list) and len(line) >= 2:
+ txt = (line[1][0]
+ if isinstance(line[1], (list, tuple))
+ else str(line[1]))
+ else:
+ txt = str(line)
+ if txt:
+ parts.append(txt)
+ except Exception:
+ continue
+
+ full_text = "\n".join(parts)
+ logger.info("PaddleOCR: extracted %d lines / %d chars from %d slices",
+ len(parts), len(full_text), n_slices)
+ return full_text
+
+
+# ── 4. Tesseract OCR (Hauptpfad) ───────────────────────────────────
+
+def ocr_screenshot_via_tesseract(png_bytes: bytes,
+ lang: str = "deu",
+ psm: int = 4) -> str:
+ """Run Tesseract OCR on a full-page screenshot. psm=4 = single column
+ of text of variable sizes (cookie-tables)."""
+ if not png_bytes:
+ return ""
+ try:
+ import pytesseract
+ from PIL import Image
+ from io import BytesIO
+ import re as _re
+ except ImportError as e:
+ logger.warning("tesseract/PIL not available: %s", e)
+ return ""
+ try:
+ img = Image.open(BytesIO(png_bytes)).convert("RGB")
+ raw = pytesseract.image_to_string(img, lang=lang,
+ config=f"--psm {psm}")
+ norm = _re.sub(r"[ \t]+", " ", raw)
+ norm = _re.sub(r"\n(?!\s*\n)", " ", norm)
+ norm = _re.sub(r"\s{2,}", " ", norm)
+ logger.info(
+ "Tesseract OCR: %d chars / %d words (image %dx%d)",
+ len(norm), len(norm.split()), img.size[0], img.size[1],
+ )
+ return norm
+ except Exception as e:
+ logger.warning("Tesseract OCR failed: %s (%s)",
+ str(e) or "(no msg)", type(e).__name__)
+ return ""
+
+
+# ── 5. Anchor-basierter Parser ─────────────────────────────────────
+
+_CATEGORY_ANCHORS = (
+ r"Funktionscookie", r"Trackingcookie",
+ r"Tracking Cookies?", r"Session Cookies?",
+ r"Funktional", r"Marketing", r"Analytics", r"Necessary",
+ r"Werbung", r"Personalisierung", r"Statistik",
+ r"Notwendig", r"Erforderlich",
+)
+_CATEGORY_PATTERN = ("(?:" + "|".join(_CATEGORY_ANCHORS)
+ + r")(?:\s*\([^)]*\))?")
+_COOKIE_NAME_RE = (
+ r"(?:[A-Za-z][\w\-.]{2,60}|[A-Za-z][\w\-.]{2,60}<[^>]+>)"
+)
+
+
+def parse_ocr_cookie_table(text: str) -> list[dict]:
+    """Extract cookie records from Tesseract OCR text via a name+category
+    anchor regex. Cookie names are never corrected (`awsalb` stays as is)."""
+    if not text or len(text) < 200:
+        return []
+    # NOTE(review): lazy `rest` + optional tail => purpose always ""; confirm.
+    pattern = re.compile(
+        rf"(?P<name>{_COOKIE_NAME_RE})\s+"
+        rf"(?P<category>{_CATEGORY_PATTERN})"
+        rf"(?P<rest>[^A-Z]{{0,300}}?)"
+        rf"(?:(?P<duration>\d+(?:[.,]\s*)?\s*"
+        rf"(?:Tage|Jahre?|Monate?|Minuten|Stunden|Sekunden)\.?)?\s*"
+        rf"(?P<type>Permanent/Protokoll|Session\s*Cookie|"
+        rf"Persistent\s*Cookie|Persistent\s*cookie))?",
+        re.IGNORECASE | re.DOTALL,
+    )
+    seen_names: set[str] = set()  # lower-cased names already emitted
+    out: list[dict] = []
+    for m in pattern.finditer(text):
+        name = (m.group("name") or "").strip()
+        if not name or len(name) < 3:
+            continue
+        nl = name.lower()
+        if nl in seen_names:  # de-duplicate case-insensitively
+            continue
+        # Table headers and common words are not cookie names.
+        if nl in ("name", "art", "zweck", "dauer", "kategorie", "anbieter",
+                  "cookie", "cookies", "name des cookies", "this", "dieser",
+                  "diese", "alle", "und", "von", "der", "die", "das", "ein",
+                  "eine", "session", "permanent", "category"):
+            continue
+        has_marker = any(c in name for c in "_-.<>")
+        is_caps = name.upper() == name and len(name) >= 3
+        is_camel = (any(c.isupper() for c in name[1:])
+                    and any(c.islower() for c in name))
+        if not (has_marker or is_caps or is_camel):  # plain lowercase word
+            continue
+        seen_names.add(nl)
+        out.append({
+            "name": name[:80],
+            "category": (m.group("category") or "").strip()[:60],
+            "purpose": (m.group("rest") or "").strip()[:200],
+            "duration": (m.group("duration") or "").strip()[:60],
+            "type": (m.group("type") or "").strip()[:30],
+            "vendor": "",  # not inferred here; left empty for the caller
+        })
+    logger.info("parse_ocr_cookie_table: %d unique cookies extracted",
+                len(out))
+    return out
+
+
+# ── 6. Vision-Response-Parser ──────────────────────────────────────
+
+VISION_PROMPT = (
+ "Du analysierst einen Screenshot einer Cookie-Richtlinie. Auf der Seite "
+ "ist eine Tabelle mit Cookies aufgelistet. Spalten sind ueblicherweise: "
+ "Name des Cookies, Kategorie (z.B. 'Funktional', 'Marketing', "
+ "'Analytics'), Verwendungszweck, Speicherdauer, Art des Cookies "
+ "(z.B. 'Permanent', 'Session').\n\n"
+ "Extrahiere ALLE Cookies aus dem Bild. Wenn die Tabelle abgeschnitten "
+ "ist, extrahiere alles was sichtbar ist. KEINE Cookies erfinden, KEINE "
+ "Halluzinationen.\n\n"
+ "Antworte als reines JSON-Objekt im Format:\n"
+ '{"cookies": [\n'
+ ' {"name": "", "category": "", '
+ '"purpose": "", '
+ '"duration": "", '
+ '"type": "", '
+ '"vendor": ""}\n'
+ "]}\n\n"
+ "Nur JSON, kein Erklaerungstext, keine Code-Fences."
+)
+
+
+def parse_vision_response(content: str) -> list[dict]:
+    """Leniently parse the vision reply (code fences, leading prose)."""
+    if not content:
+        return []
+    txt = content.strip()
+    if txt.startswith("```"):  # drop the opening (and closing) fence line
+        lines = txt.split("\n")
+        if lines and lines[-1].strip().startswith("```"):
+            txt = "\n".join(lines[1:-1])
+        else:
+            txt = "\n".join(lines[1:])
+    a, b = txt.find("{"), txt.rfind("}")  # outermost JSON object span
+    if not (0 <= a < b):
+        return []
+    try:
+        obj = json.loads(txt[a:b + 1])
+    except json.JSONDecodeError:  # invalid JSON yields no cookies
+        return []
+    if not isinstance(obj, dict):
+        return []
+    arr = obj.get("cookies") or obj.get("Cookies") or []
+    if not isinstance(arr, list):
+        return []
+    out: list[dict] = []
+    for item in arr[:300]:  # cap to sanity
+        if not isinstance(item, dict):
+            continue
+        name = str(item.get("name") or "").strip()  # tolerate non-str JSON
+        if not name or len(name) < 2 or len(name) > 80:
+            continue
+        if re.fullmatch(r"[\s\-_.]+", name):  # punctuation-only garbage
+            continue
+        out.append({
+            "name": name[:80],
+            "category": str(item.get("category") or "").strip()[:60],
+            "purpose": str(item.get("purpose") or "").strip()[:200],
+            "duration": str(item.get("duration") or "").strip()[:60],
+            "type": str(item.get("type") or "").strip()[:30],
+            "vendor": str(item.get("vendor") or "").strip()[:80],
+        })
+    return out