refactor: split cookie_screenshot_ocr.py (642 → 290 + 353 LOC)
CI / detect-changes (push) Successful in 7s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / build-sha-integrity (push) Failing after 4s
CI / validate-canonical-controls (push) Successful in 11s
CI / loc-budget (push) Failing after 14s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 2m19s
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 29s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
CI / detect-changes (push) Successful in 7s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / build-sha-integrity (push) Failing after 4s
CI / validate-canonical-controls (push) Successful in 11s
CI / loc-budget (push) Failing after 14s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 2m19s
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 29s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
CI hard-cap 500 LOC. cookie_screenshot_ocr.py war auf 642 gewachsen,
also gesplittet:
- cookie_screenshot_ocr_engines.py (353 LOC, NEU)
OCR-Engine-Funktionen: _slice_screenshot, Vision-LLM (qwen2.5vl),
PaddleOCR, Tesseract, parse_ocr_cookie_table, parse_vision_response,
Konstanten VISION_MODEL/OLLAMA_URL/VISION_PROMPT.
- cookie_screenshot_ocr.py (290 LOC, REWRITE)
Orchestration: capture_cookie_evidence_slices, _ocr_one_slice,
ocr_slices_extract_cookies, capture_cookie_screenshot,
extract_cookies_via_vision, cookies_to_vendor_records.
Re-Exports der Engine-Funktionen für Backward-Kompat.
Einziger externer Importer (_phase_d1_vendors_raw.py) braucht keinen
Code-Change — Public-API stabil.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,232 @@
|
||||
'use client'
|
||||
|
||||
/**
|
||||
* Strukturierter Editor fuer JSONB-Conditions:
|
||||
* { kind: 'all'|'any', clauses: [{field, op, value}] }
|
||||
*
|
||||
* Wird im RuleEditor verwendet. Reine Praesentations-Komponente — Parent
|
||||
* verwaltet State.
|
||||
*/
|
||||
|
||||
import type {
|
||||
ClauseOperator, RuleClause, RuleCondition,
|
||||
} from '../_types'
|
||||
import { OPERATOR_LABELS, PROFILE_FIELDS } from '../_types'
|
||||
|
||||
/** Props for the condition editor. The component keeps no state; the parent owns `value`. */
interface Props {
  /** Condition being edited: a combinator (`kind`) plus its list of clauses. */
  value: RuleCondition
  /** Called with a newly built condition object on every user change. */
  onChange: (next: RuleCondition) => void
  /** When true, every input is disabled and the add/remove buttons are not rendered. */
  readOnly?: boolean
}

/**
 * Structured editor for a rule condition shaped like
 * `{ kind: 'all' | 'any', clauses: [{ field, op, value }] }`.
 *
 * Every change is reported through `onChange` as a new object; `value` itself
 * is never mutated.
 */
export default function ConditionBuilder({ value, onChange, readOnly }: Props) {
  const setKind = (kind: 'all' | 'any') => onChange({ ...value, kind })

  const setClause = (idx: number, clause: RuleClause) => {
    // Copy before writing so the parent's array is left untouched.
    const next = [...value.clauses]
    next[idx] = clause
    onChange({ ...value, clauses: next })
  }

  // A new clause starts on the first profile field with an equality check and an empty value.
  const addClause = () =>
    onChange({
      ...value,
      clauses: [
        ...value.clauses,
        { field: PROFILE_FIELDS[0].key, op: 'eq', value: '' },
      ],
    })

  const removeClause = (idx: number) =>
    onChange({ ...value, clauses: value.clauses.filter((_, i) => i !== idx) })

  return (
    <div className="space-y-2">
      <div className="flex items-center gap-2">
        <span className="text-xs text-gray-600">Bedingung:</span>
        <select
          className="text-xs px-2 py-1 border border-gray-300 rounded"
          value={value.kind}
          disabled={readOnly}
          onChange={(e) => setKind(e.target.value as 'all' | 'any')}
        >
          <option value="all">ALLE Klauseln müssen zutreffen (AND)</option>
          <option value="any">MIND. EINE Klausel trifft zu (OR)</option>
        </select>
      </div>

      {value.clauses.length === 0 && (
        <div className="text-xs text-gray-500 italic px-1">
          Keine Klauseln — Regel gilt für jedes Profil.
        </div>
      )}

      <ul className="space-y-1">
        {value.clauses.map((clause, idx) => (
          <li key={idx} className="flex items-start gap-1 p-1.5 bg-gray-50 rounded border border-gray-200">
            <ClauseRow
              clause={clause}
              onChange={(c) => setClause(idx, c)}
              readOnly={!!readOnly}
            />
            {!readOnly && (
              <button
                // Explicit type: a bare <button> defaults to "submit" inside a <form>.
                type="button"
                className="text-xs px-1.5 py-0.5 text-rose-700 hover:bg-rose-50 rounded"
                onClick={() => removeClause(idx)}
                title="Klausel entfernen"
                aria-label="Klausel entfernen"
              >
                ×
              </button>
            )}
          </li>
        ))}
      </ul>

      {!readOnly && (
        <button
          type="button"
          className="text-xs px-2 py-1 border border-gray-300 rounded text-gray-700 hover:bg-gray-50"
          onClick={addClause}
        >
          + Klausel hinzufügen
        </button>
      )}
    </div>
  )
}
|
||||
|
||||
/** Operators offered in the operator dropdown for a profile field, chosen by the field's type. */
function operatorsForField(field: typeof PROFILE_FIELDS[number]): ClauseOperator[] {
  switch (field.type) {
    case 'enum':
      return ['eq', 'neq', 'in', 'not_in', 'exists', 'truthy', 'falsy']
    case 'boolean':
      return ['truthy', 'falsy', 'eq', 'neq']
    case 'number':
      return ['eq', 'neq', 'gt', 'gte', 'lt', 'lte']
    default:
      return ['eq', 'neq', 'in', 'not_in', 'exists']
  }
}

/**
 * One editable clause: field select, operator select and (when the operator
 * needs one) a value input.
 *
 * Field and operator changes also normalise the rest of the clause so the
 * stored clause always matches what the controls display:
 * - a new field keeps the operator only if that field offers it, and drops a
 *   value that belonged to a different field type or enum;
 * - on enum fields, switching between single-value and multi-value operators
 *   converts the value between string and string array.
 *
 * @param clause   Clause to render.
 * @param onChange Receives the full updated clause.
 * @param readOnly Disables all controls when true.
 */
function ClauseRow({
  clause, onChange, readOnly,
}: {
  clause: RuleClause
  onChange: (c: RuleClause) => void
  readOnly: boolean
}) {
  // Unknown field keys fall back to the first profile field.
  const field = PROFILE_FIELDS.find((f) => f.key === clause.field) || PROFILE_FIELDS[0]
  const operators = operatorsForField(field)

  const requiresValue = !['exists', 'truthy', 'falsy'].includes(clause.op)
  const multiValue = clause.op === 'in' || clause.op === 'not_in'

  const handleFieldChange = (nextKey: string) => {
    const nextField = PROFILE_FIELDS.find((f) => f.key === nextKey) || PROFILE_FIELDS[0]
    const allowed = operatorsForField(nextField)
    const op = allowed.includes(clause.op) ? clause.op : (allowed[0] ?? clause.op)

    // Enum options are field-specific and other types need a different value
    // shape, so only carry the value over between same-typed non-enum fields.
    const keepValue = nextField.type === field.type && nextField.type !== 'enum'
    const nextIsMultiEnum =
      nextField.type === 'enum' && !!nextField.options && (op === 'in' || op === 'not_in')
    const value = keepValue ? clause.value : nextIsMultiEnum ? [] : ''

    onChange({ ...clause, field: nextKey, op, value })
  }

  const handleOperatorChange = (op: ClauseOperator) => {
    let value = clause.value
    // Only enum fields with options render a multi-select; text fields keep a plain string.
    if (field.type === 'enum' && field.options) {
      const toMulti = op === 'in' || op === 'not_in'
      if (toMulti && !Array.isArray(value)) {
        value = typeof value === 'string' && value !== '' ? [value] : []
      } else if (!toMulti && Array.isArray(value)) {
        value = typeof value[0] === 'string' ? value[0] : ''
      }
    }
    onChange({ ...clause, op, value })
  }

  return (
    <div className="flex-1 grid grid-cols-12 gap-1 items-center text-xs">
      <select
        className="col-span-4 px-1 py-0.5 border border-gray-300 rounded bg-white truncate"
        value={clause.field}
        disabled={readOnly}
        onChange={(e) => handleFieldChange(e.target.value)}
      >
        {PROFILE_FIELDS.map((f) => (
          <option key={f.key} value={f.key}>{f.label} ({f.key})</option>
        ))}
      </select>

      <select
        className="col-span-3 px-1 py-0.5 border border-gray-300 rounded bg-white"
        value={clause.op}
        disabled={readOnly}
        onChange={(e) => handleOperatorChange(e.target.value as ClauseOperator)}
      >
        {operators.map((op) => (
          <option key={op} value={op}>{OPERATOR_LABELS[op]}</option>
        ))}
      </select>

      <div className="col-span-5">
        {requiresValue && (
          <ValueInput
            field={field}
            multi={multiValue}
            value={clause.value}
            onChange={(v) => onChange({ ...clause, value: v })}
            readOnly={readOnly}
          />
        )}
      </div>
    </div>
  )
}
|
||||
|
||||
/**
 * Value control for a clause, chosen by the field type:
 * enum with options → select (multi-select when `multi`), number → numeric
 * input, boolean → true/false select, anything else → text input.
 *
 * An unset value is stored as `''` (the value new clauses start with). Number
 * and boolean controls show that state as empty instead of displaying `0` or
 * `false`, so the control never shows a value that is not actually stored.
 *
 * @param field    Profile field definition the value belongs to.
 * @param multi    True for operators taking a list (`in` / `not_in`).
 * @param value    Current clause value; its runtime type is checked before use.
 * @param onChange Receives the new value (string, string[], number, boolean or `''`).
 * @param readOnly Disables the control when true.
 */
function ValueInput({
  field, multi, value, onChange, readOnly,
}: {
  field: typeof PROFILE_FIELDS[number]
  multi: boolean
  value: unknown
  onChange: (v: unknown) => void
  readOnly: boolean
}) {
  if (field.type === 'enum' && field.options) {
    if (multi) {
      const selected = Array.isArray(value) ? (value as string[]) : []
      return (
        <select
          multiple
          className="w-full px-1 py-0.5 border border-gray-300 rounded bg-white h-16"
          value={selected}
          disabled={readOnly}
          onChange={(e) => {
            const opts = Array.from(e.target.selectedOptions, (o) => o.value)
            onChange(opts)
          }}
        >
          {field.options.map((o) => (
            <option key={o.value} value={o.value}>{o.label}</option>
          ))}
        </select>
      )
    }
    return (
      <select
        className="w-full px-1 py-0.5 border border-gray-300 rounded bg-white"
        value={typeof value === 'string' ? value : ''}
        disabled={readOnly}
        onChange={(e) => onChange(e.target.value)}
      >
        <option value="">— wählen —</option>
        {field.options.map((o) => (
          <option key={o.value} value={o.value}>{o.label}</option>
        ))}
      </select>
    )
  }

  if (field.type === 'number') {
    return (
      <input
        type="number"
        className="w-full px-1 py-0.5 border border-gray-300 rounded"
        // Show an empty box for non-numeric values rather than a misleading 0.
        value={typeof value === 'number' && !Number.isNaN(value) ? value : ''}
        disabled={readOnly}
        // Number('') is 0, so a cleared input is mapped back to the unset value.
        onChange={(e) => onChange(e.target.value === '' ? '' : Number(e.target.value))}
      />
    )
  }

  if (field.type === 'boolean') {
    // Accept real booleans and their string spellings; anything else counts as unset.
    const selected =
      value === true || value === 'true'
        ? 'true'
        : value === false || value === 'false'
          ? 'false'
          : ''
    return (
      <select
        className="w-full px-1 py-0.5 border border-gray-300 rounded bg-white"
        value={selected}
        disabled={readOnly}
        onChange={(e) => onChange(e.target.value === '' ? '' : e.target.value === 'true')}
      >
        <option value="">— wählen —</option>
        <option value="true">true</option>
        <option value="false">false</option>
      </select>
    )
  }

  return (
    <input
      type="text"
      className="w-full px-1 py-0.5 border border-gray-300 rounded"
      value={typeof value === 'string' ? value : ''}
      disabled={readOnly}
      onChange={(e) => onChange(e.target.value)}
    />
  )
}
|
||||
@@ -0,0 +1,414 @@
|
||||
'use client'
|
||||
|
||||
/**
|
||||
* Rechte Spalte: Detail-Editor fuer die ausgewaehlte Regel.
|
||||
*
|
||||
* - Zeigt Live-Version + offenen Draft (falls vorhanden)
|
||||
* - Erlaubt Draft-Edit (classification, conditions, source_citation, rationale)
|
||||
* - Buttons: "Neuen Draft starten" (kopiert von Live), "Einreichen" (mit Pflicht
|
||||
* change_summary-Modal), "Intern freigeben" (DSB), "Publish" (= Mandanten-Freigabe)
|
||||
* - Versionshistorie + Approval-Trail unten als Akkordeon
|
||||
*/
|
||||
|
||||
import { useEffect, useMemo, useState } from 'react'
|
||||
import type {
|
||||
ApprovalHistoryEntry, Classification, Rule, RuleCondition, RuleVersion,
|
||||
} from '../_types'
|
||||
import { CLASSIFICATION_LABELS, STATUS_LABELS } from '../_types'
|
||||
import ConditionBuilder from './ConditionBuilder'
|
||||
|
||||
/** Props for the rule detail editor. All persistence is delegated to the callbacks. */
interface Props {
  /** Rule whose versions are shown and edited. */
  rule: Rule
  /** All versions of the rule; the live one and an open draft/review are looked up here. */
  versions: RuleVersion[]
  /** Approval trail entries shown in the history section. */
  history: ApprovalHistoryEntry[]
  /** Creates a new draft from the given content. */
  onCreateDraft: (payload: {
    classification: Classification
    conditions: RuleCondition
    source_citation: string
    rationale?: string | null
  }) => Promise<void>
  /** Saves edits to an existing draft version. */
  onUpdateDraft: (versionId: string, patch: {
    classification?: Classification
    conditions?: RuleCondition
    source_citation?: string
    rationale?: string | null
  }) => Promise<void>
  /** Submits a draft for internal review together with a mandatory change summary. */
  onSubmitForReview: (versionId: string, changeSummary: string) => Promise<void>
  /** Approves a version that is in review. */
  onApprove: (versionId: string) => Promise<void>
  /** Publishes a version that is in review. */
  onPublish: (versionId: string) => Promise<void>
  /** Rejects a version that is in review, with a mandatory reason. */
  onReject: (versionId: string, reason: string) => Promise<void>
}

/**
 * Detail editor for the selected rule.
 *
 * Shows the live version and an open draft (status `draft` or `review`) if one
 * exists. Fields are editable only while the open version has status `draft`.
 * In `review` status the approve / publish / reject actions are offered instead.
 *
 * Every lifecycle callback is awaited: while one is running the action buttons
 * are disabled, a rejected promise is shown as an inline error, and the
 * submit/reject dialogs stay open with their text so nothing typed is lost.
 */
export default function RuleEditor({
  rule, versions, history,
  onCreateDraft, onUpdateDraft,
  onSubmitForReview, onApprove, onPublish, onReject,
}: Props) {
  const liveVersion = useMemo(
    () => versions.find((v) => v.is_live) || null,
    [versions],
  )
  const draftVersion = useMemo(
    () => versions.find((v) => ['draft', 'review'].includes(v.status)) || null,
    [versions],
  )

  // Edit state
  const [classification, setClassification] = useState<Classification>('required')
  const [conditions, setConditions] = useState<RuleCondition>({ kind: 'all', clauses: [] })
  const [sourceCitation, setSourceCitation] = useState('')
  const [rationale, setRationale] = useState('')

  // Dialog state
  const [showSubmit, setShowSubmit] = useState(false)
  const [changeSummary, setChangeSummary] = useState('')
  const [showHistory, setShowHistory] = useState(false)
  const [rejectReason, setRejectReason] = useState('')
  const [showReject, setShowReject] = useState(false)

  // Async action state
  const [busy, setBusy] = useState(false)
  const [actionError, setActionError] = useState<string | null>(null)

  // Keep the edit state in sync with the displayed version (an open draft wins over live).
  // The effect is keyed on the version id only, so a re-render with the same
  // version does not overwrite edits in progress.
  const sourceVersion = draftVersion || liveVersion
  useEffect(() => {
    if (sourceVersion) {
      setClassification(sourceVersion.classification)
      setConditions(sourceVersion.conditions)
      setSourceCitation(sourceVersion.source_citation)
      setRationale(sourceVersion.rationale || '')
    } else {
      // No version at all: reset to defaults so values from a previously
      // displayed version are not shown.
      setClassification('required')
      setConditions({ kind: 'all', clauses: [] })
      setSourceCitation('')
      setRationale('')
    }
    setActionError(null)
  }, [sourceVersion?.id])

  const isDraftMode = !!draftVersion && draftVersion.status === 'draft'
  const isReviewMode = !!draftVersion && draftVersion.status === 'review'
  const readOnly = !isDraftMode

  /**
   * Runs a lifecycle callback with busy/error bookkeeping.
   * Resolves to true on success and false if the callback rejected.
   */
  const runAction = async (action: () => Promise<void>): Promise<boolean> => {
    setBusy(true)
    setActionError(null)
    try {
      await action()
      return true
    } catch (err: unknown) {
      setActionError(err instanceof Error ? err.message : String(err))
      return false
    } finally {
      setBusy(false)
    }
  }

  // A new draft starts as a copy of the live version, or with defaults if there is none.
  const handleCreateDraft = () => {
    void runAction(() =>
      onCreateDraft({
        classification: liveVersion?.classification || 'recommended',
        conditions: liveVersion?.conditions || { kind: 'all', clauses: [] },
        source_citation: liveVersion?.source_citation || '',
        rationale: liveVersion?.rationale,
      }),
    )
  }

  const handleSaveDraft = () => {
    if (!draftVersion) return
    const versionId = draftVersion.id
    void runAction(() =>
      onUpdateDraft(versionId, {
        classification, conditions, source_citation: sourceCitation, rationale,
      }),
    )
  }

  // NOTE(review): submitting does not save pending field edits first — confirm
  // whether callers are expected to press "Draft speichern" beforehand.
  const handleSubmit = async () => {
    const summary = changeSummary.trim()
    if (!draftVersion || !summary) return
    const versionId = draftVersion.id
    const ok = await runAction(() => onSubmitForReview(versionId, summary))
    if (!ok) return // keep the dialog and its text on failure
    setShowSubmit(false)
    setChangeSummary('')
  }

  const handleReject = async () => {
    const reason = rejectReason.trim()
    if (!draftVersion || !reason) return
    const versionId = draftVersion.id
    const ok = await runAction(() => onReject(versionId, reason))
    if (!ok) return // keep the dialog and its text on failure
    setShowReject(false)
    setRejectReason('')
  }

  return (
    <div className="h-full flex flex-col overflow-hidden bg-white">
      <header className="px-5 py-3 border-b border-gray-200">
        <div className="flex items-baseline justify-between gap-3">
          <div className="min-w-0">
            <h2 className="text-base font-semibold text-gray-800 truncate">{rule.title}</h2>
            <div className="text-xs text-gray-500">
              <code>{rule.document_type}</code> · {rule.rule_key}
            </div>
          </div>
          <div className="flex items-center gap-2 text-xs text-gray-600">
            {liveVersion && (
              <span>
                Live: v{liveVersion.version_number} (
                <code>{liveVersion.classification}</code>)
              </span>
            )}
            {draftVersion && (
              <span className="px-1.5 py-0.5 bg-amber-100 text-amber-800 rounded border border-amber-300">
                Draft v{draftVersion.version_number} · {STATUS_LABELS[draftVersion.status]}
              </span>
            )}
          </div>
        </div>
      </header>

      <div className="flex-1 overflow-y-auto p-5 space-y-4">
        {!draftVersion && (
          <div className="bg-amber-50 border border-amber-200 rounded p-3 flex items-center justify-between">
            <span className="text-sm text-amber-800">
              Kein offener Draft. Starte einen neuen Draft, um die Regel zu ändern.
            </span>
            <button
              type="button"
              className="px-3 py-1.5 text-sm bg-amber-600 text-white rounded hover:bg-amber-700 disabled:opacity-50"
              disabled={busy}
              onClick={handleCreateDraft}
            >
              + Neuen Draft starten
            </button>
          </div>
        )}

        {/* Classification */}
        <section>
          <label className="text-xs font-medium text-gray-700 block mb-1">
            Klassifikation
          </label>
          <select
            className="text-sm px-2 py-1 border border-gray-300 rounded"
            value={classification}
            disabled={readOnly}
            onChange={(e) => setClassification(e.target.value as Classification)}
          >
            {(['required', 'recommended', 'optional'] as const).map((c) => (
              <option key={c} value={c}>{CLASSIFICATION_LABELS[c]}</option>
            ))}
          </select>
        </section>

        {/* Condition */}
        <section>
          <label className="text-xs font-medium text-gray-700 block mb-1">
            Bedingung
          </label>
          <ConditionBuilder
            value={conditions}
            onChange={setConditions}
            readOnly={readOnly}
          />
        </section>

        {/* Source citation (required for submitting) */}
        <section>
          <label className="text-xs font-medium text-gray-700 block mb-1">
            Quelle / Norm-Citation <span className="text-rose-600">*</span>
          </label>
          <input
            type="text"
            className="w-full text-sm px-2 py-1.5 border border-gray-300 rounded"
            placeholder="z.B. § 12 HinSchG, Art. 28 DSGVO, EuGH C-311/18"
            value={sourceCitation}
            disabled={readOnly}
            onChange={(e) => setSourceCitation(e.target.value)}
          />
        </section>

        {/* Rationale */}
        <section>
          <label className="text-xs font-medium text-gray-700 block mb-1">
            Begründung / Rationale (optional)
          </label>
          <textarea
            className="w-full text-sm px-2 py-1.5 border border-gray-300 rounded"
            rows={3}
            placeholder="Anwalts-Kommentar, warum die Regel so klassifiziert ist…"
            value={rationale}
            disabled={readOnly}
            onChange={(e) => setRationale(e.target.value)}
          />
        </section>

        {/* Version history */}
        <section>
          <button
            type="button"
            className="text-xs text-gray-600 hover:text-gray-800"
            onClick={() => setShowHistory((v) => !v)}
          >
            {showHistory ? '▾' : '▸'} Versionshistorie + Approval-Trail ({versions.length} Versionen)
          </button>
          {showHistory && (
            <HistoryList versions={versions} history={history} />
          )}
        </section>
      </div>

      {/* Footer actions */}
      <footer className="px-5 py-3 border-t border-gray-200 bg-gray-50 flex items-center gap-2 flex-wrap">
        {isDraftMode && (
          <>
            <button
              type="button"
              className="px-3 py-1.5 text-sm border border-gray-300 rounded text-gray-700 hover:bg-white disabled:opacity-50"
              disabled={busy}
              onClick={handleSaveDraft}
            >
              Draft speichern
            </button>
            <button
              type="button"
              className="px-3 py-1.5 text-sm bg-amber-600 text-white rounded hover:bg-amber-700 disabled:opacity-50"
              disabled={busy || !sourceCitation.trim()}
              onClick={() => setShowSubmit(true)}
              title={!sourceCitation.trim() ? 'Source Citation ist Pflicht' : ''}
            >
              Zur internen Prüfung einreichen
            </button>
          </>
        )}
        {isReviewMode && draftVersion && (
          <>
            <button
              type="button"
              className="px-3 py-1.5 text-sm bg-emerald-600 text-white rounded hover:bg-emerald-700 disabled:opacity-50"
              disabled={busy}
              onClick={() => { void runAction(() => onApprove(draftVersion.id)) }}
            >
              Intern freigeben → Mandant
            </button>
            <button
              type="button"
              className="px-3 py-1.5 text-sm bg-blue-600 text-white rounded hover:bg-blue-700 disabled:opacity-50"
              disabled={busy}
              onClick={() => { void runAction(() => onPublish(draftVersion.id)) }}
              title="Wird sofort live (Test-Modus)"
            >
              Publish (sofort live)
            </button>
            <button
              type="button"
              className="px-3 py-1.5 text-sm border border-rose-300 text-rose-700 rounded hover:bg-rose-50 disabled:opacity-50"
              disabled={busy}
              onClick={() => setShowReject(true)}
            >
              Ablehnen
            </button>
          </>
        )}
        {actionError && (
          <div role="alert" className="w-full text-xs text-rose-700">
            {actionError}
          </div>
        )}
      </footer>

      {showSubmit && (
        <SubmitDialog
          value={changeSummary}
          onChange={setChangeSummary}
          onCancel={() => setShowSubmit(false)}
          onSubmit={() => { void handleSubmit() }}
        />
      )}

      {showReject && (
        <RejectDialog
          value={rejectReason}
          onChange={setRejectReason}
          onCancel={() => { setShowReject(false); setRejectReason('') }}
          onSubmit={() => { void handleReject() }}
        />
      )}
    </div>
  )
}
|
||||
|
||||
/**
 * Read-only history panel: one card per rule version, followed by the
 * approval trail as a flat list. Timestamps use the `de-DE` locale.
 *
 * @param versions Versions to list, in the order given.
 * @param history  Approval trail entries, in the order given.
 */
function HistoryList({ versions, history }: { versions: RuleVersion[]; history: ApprovalHistoryEntry[] }) {
  const versionCards = versions.map((v) => {
    const createdAt = new Date(v.created_at).toLocaleString('de-DE')
    return (
      <li key={v.id} className="bg-white border border-gray-200 rounded p-2">
        <div className="flex items-center gap-2">
          <span className="font-medium">v{v.version_number}</span>
          <span className="px-1.5 py-0.5 bg-gray-100 rounded">{STATUS_LABELS[v.status]}</span>
          {v.is_live && <span className="text-emerald-700">● Live</span>}
          <span className="text-gray-500 ml-auto">
            {createdAt}
          </span>
        </div>
        {v.change_summary && (
          <div className="mt-1 text-gray-600">Änderung: {v.change_summary}</div>
        )}
        {v.source_citation && (
          <div className="mt-0.5 text-gray-500">Quelle: {v.source_citation}</div>
        )}
      </li>
    )
  })

  // Approver and comment are appended only when present.
  const trailRows = history.map((h) => (
    <li key={h.id} className="text-gray-600">
      {new Date(h.created_at).toLocaleString('de-DE')} · {h.action}
      {h.approver && ` · ${h.approver}`}
      {h.comment && ` — ${h.comment}`}
    </li>
  ))

  return (
    <div className="mt-2 space-y-2 text-xs">
      <div>
        <div className="font-medium text-gray-700 mb-1">Versionen:</div>
        <ul className="space-y-1">{versionCards}</ul>
      </div>
      <div>
        <div className="font-medium text-gray-700 mb-1">Approval-Trail:</div>
        <ul className="space-y-0.5">{trailRows}</ul>
      </div>
    </div>
  )
}
|
||||
|
||||
/**
 * Modal asking for the mandatory change summary before a draft is submitted
 * for internal review.
 *
 * Controlled component: the text lives in the parent. Clicking the backdrop or
 * pressing Escape cancels. The submit button stays disabled while the text is
 * blank.
 *
 * @param value    Current change summary text.
 * @param onChange Receives the text on every edit.
 * @param onCancel Called when the dialog is dismissed.
 * @param onSubmit Called when the user confirms.
 */
function SubmitDialog({
  value, onChange, onCancel, onSubmit,
}: {
  value: string
  onChange: (s: string) => void
  onCancel: () => void
  onSubmit: () => void
}) {
  return (
    <div
      className="fixed inset-0 bg-black/30 z-50 flex items-center justify-center"
      onClick={onCancel}
      // Focus starts in the textarea, so its keydown events bubble up to here.
      onKeyDown={(e) => { if (e.key === 'Escape') onCancel() }}
    >
      <div
        role="dialog"
        aria-modal="true"
        aria-labelledby="rule-submit-dialog-title"
        className="bg-white rounded-lg shadow-xl w-[520px]"
        // Clicks inside the panel must not reach the backdrop's cancel handler.
        onClick={(e) => e.stopPropagation()}
      >
        <header className="px-5 py-3 border-b border-gray-200">
          <h3 id="rule-submit-dialog-title" className="font-semibold">Zur internen Prüfung einreichen</h3>
        </header>
        <div className="p-5">
          <label htmlFor="rule-submit-dialog-summary" className="text-xs font-medium text-gray-700">
            Was wurde geändert? <span className="text-rose-600">*</span>
          </label>
          <textarea
            id="rule-submit-dialog-summary"
            autoFocus
            rows={4}
            className="w-full mt-1 text-sm px-2 py-1.5 border border-gray-300 rounded"
            placeholder="z.B. Schwelle auf 50 MA angehoben (BAG-Urteil X)"
            value={value}
            onChange={(e) => onChange(e.target.value)}
          />
        </div>
        <footer className="px-5 py-3 border-t border-gray-200 flex justify-end gap-2">
          <button type="button" className="px-3 py-1.5 text-sm text-gray-600" onClick={onCancel}>Abbrechen</button>
          <button
            type="button"
            className="px-4 py-1.5 text-sm bg-amber-600 text-white rounded disabled:opacity-50"
            disabled={!value.trim()}
            onClick={onSubmit}
          >
            Einreichen
          </button>
        </footer>
      </div>
    </div>
  )
}
|
||||
|
||||
/**
 * Modal asking for the mandatory reason before a draft under review is rejected.
 *
 * Controlled component: the text lives in the parent. Clicking the backdrop or
 * pressing Escape cancels. The reject button stays disabled while the text is
 * blank.
 *
 * @param value    Current rejection reason text.
 * @param onChange Receives the text on every edit.
 * @param onCancel Called when the dialog is dismissed.
 * @param onSubmit Called when the user confirms the rejection.
 */
function RejectDialog({
  value, onChange, onCancel, onSubmit,
}: {
  value: string
  onChange: (s: string) => void
  onCancel: () => void
  onSubmit: () => void
}) {
  return (
    <div
      className="fixed inset-0 bg-black/30 z-50 flex items-center justify-center"
      onClick={onCancel}
      // Focus starts in the textarea, so its keydown events bubble up to here.
      onKeyDown={(e) => { if (e.key === 'Escape') onCancel() }}
    >
      <div
        role="dialog"
        aria-modal="true"
        aria-labelledby="rule-reject-dialog-title"
        className="bg-white rounded-lg shadow-xl w-[480px]"
        // Clicks inside the panel must not reach the backdrop's cancel handler.
        onClick={(e) => e.stopPropagation()}
      >
        <header className="px-5 py-3 border-b border-gray-200">
          <h3 id="rule-reject-dialog-title" className="font-semibold">Draft ablehnen</h3>
        </header>
        <div className="p-5">
          <label htmlFor="rule-reject-dialog-reason" className="text-xs font-medium text-gray-700">
            Ablehnungsgrund <span className="text-rose-600">*</span>
          </label>
          <textarea
            id="rule-reject-dialog-reason"
            autoFocus
            rows={3}
            className="w-full mt-1 text-sm px-2 py-1.5 border border-gray-300 rounded"
            value={value}
            onChange={(e) => onChange(e.target.value)}
          />
        </div>
        <footer className="px-5 py-3 border-t border-gray-200 flex justify-end gap-2">
          <button type="button" className="px-3 py-1.5 text-sm text-gray-600" onClick={onCancel}>Abbrechen</button>
          <button
            type="button"
            className="px-4 py-1.5 text-sm bg-rose-600 text-white rounded disabled:opacity-50"
            disabled={!value.trim()}
            onClick={onSubmit}
          >
            Ablehnen
          </button>
        </footer>
      </div>
    </div>
  )
}
|
||||
@@ -0,0 +1,111 @@
|
||||
'use client'
|
||||
|
||||
/**
|
||||
* Linke Spalte: Liste der globalen Empfehlungs-Regeln.
|
||||
*
|
||||
* Filterbar nach document_type. Klassifikations-Chip + Live-Indikator.
|
||||
*/
|
||||
|
||||
import { useMemo, useState } from 'react'
|
||||
import type { Rule, RuleVersion } from '../_types'
|
||||
import { CLASSIFICATION_LABELS, STATUS_LABELS } from '../_types'
|
||||
|
||||
/** Props for the rule list column. */
interface Props {
  /** All rules available for selection. */
  rules: Rule[]
  /**
   * Version to summarise per rule, keyed by rule id. A missing entry renders
   * the "ohne Live-Version" placeholder. Presumably the parent passes the live
   * version here — confirm against the caller.
   */
  versionsByRule: Record<string, RuleVersion | undefined>
  /** Id of the currently selected rule, or null when nothing is selected. */
  selectedRuleId: string | null
  /** Called with the rule id when a row is clicked. */
  onSelectRule: (ruleId: string) => void
}

/**
 * Searchable list of rules.
 *
 * The search box matches case-insensitively against title, rule key and
 * document type. Each row shows a classification chip and a short version
 * summary when a version is known for that rule.
 */
export default function RuleList({
  rules, versionsByRule, selectedRuleId, onSelectRule,
}: Props) {
  const [filter, setFilter] = useState('')

  const filtered = useMemo(() => {
    // Trim before matching so surrounding spaces in the search box cannot
    // turn a valid query into zero hits.
    const q = filter.trim().toLowerCase()
    if (!q) return rules
    return rules.filter((r) =>
      [r.title, r.rule_key, r.document_type].some((text) => text.toLowerCase().includes(q)),
    )
  }, [rules, filter])

  return (
    <div className="h-full flex flex-col overflow-hidden border-r border-gray-200 bg-gray-50">
      <div className="p-3 border-b border-gray-200 bg-white">
        <input
          type="text"
          placeholder="Suchen (Titel, Key, Doc-Type)…"
          value={filter}
          onChange={(e) => setFilter(e.target.value)}
          className="w-full text-sm px-2 py-1.5 border border-gray-300 rounded"
        />
        <div className="text-xs text-gray-500 mt-1">
          {filtered.length} von {rules.length} Regeln
        </div>
      </div>

      <ul className="flex-1 overflow-y-auto">
        {filtered.map((rule) => {
          const live = versionsByRule[rule.id]
          const isSelected = rule.id === selectedRuleId
          return (
            <li key={rule.id}>
              <button
                type="button"
                aria-current={isSelected ? 'true' : undefined}
                className={`w-full text-left px-3 py-2 border-b border-gray-100 hover:bg-white ${
                  isSelected ? 'bg-white border-l-4 border-l-amber-500' : ''
                }`}
                onClick={() => onSelectRule(rule.id)}
              >
                <div className="flex items-center gap-2 mb-0.5">
                  {live && (
                    <ClassificationChip classification={live.classification} />
                  )}
                  {!live && (
                    <span className="px-1.5 py-0.5 text-xs rounded bg-gray-200 text-gray-600">
                      ohne Live-Version
                    </span>
                  )}
                </div>
                <div className="text-sm font-medium text-gray-800 truncate">
                  {rule.title}
                </div>
                <div className="text-xs text-gray-500 truncate">
                  <code>{rule.document_type}</code> · {rule.rule_key}
                </div>
                {live && (
                  <div className="text-[10px] text-gray-500 mt-0.5">
                    v{live.version_number} · {STATUS_LABELS[live.status]}
                    {live.is_live && (
                      <span className="ml-1 inline-block w-1.5 h-1.5 bg-emerald-500 rounded-full" />
                    )}
                  </div>
                )}
              </button>
            </li>
          )
        })}
        {filtered.length === 0 && (
          <li className="px-3 py-4 text-sm text-gray-500 italic">
            Keine Regeln gefunden.
          </li>
        )}
      </ul>
    </div>
  )
}
|
||||
|
||||
/**
 * Small coloured badge showing a rule classification.
 *
 * @param classification Selects both the colour scheme and the label text.
 */
function ClassificationChip({ classification }: { classification: 'required' | 'recommended' | 'optional' }) {
  // Tailwind colour classes per classification.
  const colorMap = {
    required: 'bg-rose-100 text-rose-800 border-rose-300',
    recommended: 'bg-amber-100 text-amber-800 border-amber-300',
    optional: 'bg-slate-100 text-slate-700 border-slate-300',
  } as const

  const tone = colorMap[classification]
  const label = CLASSIFICATION_LABELS[classification]

  return (
    <span className={`px-1.5 py-0.5 text-[10px] font-medium rounded border ${tone}`}>
      {label}
    </span>
  )
}
|
||||
@@ -0,0 +1,183 @@
|
||||
/**
|
||||
* Hook fuer Template-Rule-Editor: laedt Regeln/Versions/History und exponiert
|
||||
* Lifecycle-Actions (submit/approve/publish/reject) + Tenant-Override-CRUD.
|
||||
*
|
||||
* Alle API-Calls gehen ueber /api/sdk/v1/compliance/* (Next.js-Proxy zum
|
||||
* backend-compliance).
|
||||
*/
|
||||
|
||||
import { useCallback } from 'react'
|
||||
import type {
|
||||
ApprovalHistoryEntry,
|
||||
Classification,
|
||||
Rule,
|
||||
RuleCondition,
|
||||
RuleVersion,
|
||||
TenantRuleOverride,
|
||||
} from '../_types'
|
||||
|
||||
/** Relative base path prepended to every URL this module requests. */
const API_BASE = '/api/sdk/v1/compliance'
|
||||
|
||||
/**
 * Thin JSON wrapper around `fetch`.
 *
 * - Sends `Content-Type: application/json` unless the caller supplies its own
 *   content type. Caller headers may be a plain object, a `Headers` instance
 *   or an array of tuples.
 * - Resolves with the parsed JSON body, or `undefined` for 204 and for any
 *   other successful response with an empty body.
 *
 * @param url  Request URL.
 * @param init Optional fetch options; `headers` are merged as described above.
 * @returns The parsed response body typed as `T` (not validated at runtime).
 * @throws Error with message `"<status>: <body or status text>"` for non-2xx
 *   responses; SyntaxError if a non-empty success body is not valid JSON.
 */
async function req<T>(url: string, init?: RequestInit): Promise<T> {
  // Normalise through Headers: spreading a Headers instance or tuple array
  // into an object literal would silently drop the caller's headers.
  const headers = new Headers(init?.headers)
  if (!headers.has('Content-Type')) {
    headers.set('Content-Type', 'application/json')
  }

  const res = await fetch(url, { ...init, headers })

  if (!res.ok) {
    const detail = await res.text().catch(() => '')
    // Fall back to the status text so the message is never just "500: ".
    throw new Error(`${res.status}: ${detail || res.statusText}`)
  }
  if (res.status === 204) return undefined as T

  // Read as text first: calling json() on an empty body throws.
  const raw = await res.text()
  return (raw ? JSON.parse(raw) : undefined) as T
}
|
||||
|
||||
export function useRuleEditorActions() {
|
||||
const listRules = useCallback(
|
||||
(documentType?: string) => {
|
||||
const q = documentType ? `?document_type=${encodeURIComponent(documentType)}` : ''
|
||||
return req<Rule[]>(`${API_BASE}/template-rules${q}`)
|
||||
},
|
||||
[],
|
||||
)
|
||||
|
||||
const getRule = useCallback(
|
||||
(ruleId: string) => req<Rule>(`${API_BASE}/template-rules/${ruleId}`),
|
||||
[],
|
||||
)
|
||||
|
||||
const listVersions = useCallback(
|
||||
(ruleId: string) => req<RuleVersion[]>(`${API_BASE}/template-rules/${ruleId}/versions`),
|
||||
[],
|
||||
)
|
||||
|
||||
const getVersion = useCallback(
|
||||
(versionId: string) => req<RuleVersion>(`${API_BASE}/template-rule-versions/${versionId}`),
|
||||
[],
|
||||
)
|
||||
|
||||
const createDraftVersion = useCallback(
|
||||
(
|
||||
ruleId: string,
|
||||
payload: {
|
||||
classification: Classification
|
||||
conditions: RuleCondition
|
||||
source_citation: string
|
||||
rationale?: string | null
|
||||
created_by?: string | null
|
||||
},
|
||||
) =>
|
||||
req<RuleVersion>(`${API_BASE}/template-rules/${ruleId}/versions`, {
|
||||
method: 'POST',
|
||||
body: JSON.stringify({
|
||||
rule_id: ruleId,
|
||||
...payload,
|
||||
}),
|
||||
}),
|
||||
[],
|
||||
)
|
||||
|
||||
const updateDraftVersion = useCallback(
|
||||
(
|
||||
versionId: string,
|
||||
patch: {
|
||||
classification?: Classification
|
||||
conditions?: RuleCondition
|
||||
source_citation?: string
|
||||
rationale?: string | null
|
||||
change_summary?: string | null
|
||||
},
|
||||
) =>
|
||||
req<RuleVersion>(`${API_BASE}/template-rule-versions/${versionId}`, {
|
||||
method: 'PATCH',
|
||||
body: JSON.stringify(patch),
|
||||
}),
|
||||
[],
|
||||
)
|
||||
|
||||
const submitForReview = useCallback(
|
||||
(
|
||||
versionId: string,
|
||||
payload: { change_summary: string; submitter?: string; comment?: string },
|
||||
) =>
|
||||
req<RuleVersion>(`${API_BASE}/template-rule-versions/${versionId}/submit-review`, {
|
||||
method: 'POST',
|
||||
body: JSON.stringify(payload),
|
||||
}),
|
||||
[],
|
||||
)
|
||||
|
||||
const approveVersion = useCallback(
|
||||
(versionId: string, payload: { approver?: string; comment?: string } = {}) =>
|
||||
req<RuleVersion>(`${API_BASE}/template-rule-versions/${versionId}/approve`, {
|
||||
method: 'POST',
|
||||
body: JSON.stringify(payload),
|
||||
}),
|
||||
[],
|
||||
)
|
||||
|
||||
// POSTs to `/template-rule-versions/{versionId}/publish`; `payload` defaults to `{}`.
const publishVersion = useCallback(
  (versionId: string, payload: { approver?: string; comment?: string } = {}) => {
    const url = `${API_BASE}/template-rule-versions/${versionId}/publish`
    return req<RuleVersion>(url, { method: 'POST', body: JSON.stringify(payload) })
  },
  [],
)
|
||||
|
||||
// POSTs to `/template-rule-versions/{versionId}/reject`; a rejection reason is mandatory.
const rejectVersion = useCallback(
  (
    versionId: string,
    payload: { rejection_reason: string; rejector?: string; comment?: string },
  ) => {
    const url = `${API_BASE}/template-rule-versions/${versionId}/reject`
    return req<RuleVersion>(url, { method: 'POST', body: JSON.stringify(payload) })
  },
  [],
)
|
||||
|
||||
// Requests `/template-rule-versions/{versionId}/approval-history`,
// typed as ApprovalHistoryEntry[].
const getApprovalHistory = useCallback((versionId: string) => {
  const url = `${API_BASE}/template-rule-versions/${versionId}/approval-history`
  return req<ApprovalHistoryEntry[]>(url)
}, [])
|
||||
|
||||
// Requests `/tenant-rule-overrides`, typed as TenantRuleOverride[].
const listOverrides = useCallback(() => {
  const url = `${API_BASE}/tenant-rule-overrides`
  return req<TenantRuleOverride[]>(url)
}, [])
|
||||
|
||||
// POSTs an override payload to `/tenant-rule-overrides`.
const upsertOverride = useCallback(
  (payload: {
    rule_id: string
    override_classification: Classification | null
    reason: string
    created_by?: string
  }) => {
    const body = JSON.stringify(payload)
    return req<TenantRuleOverride>(`${API_BASE}/tenant-rule-overrides`, {
      method: 'POST',
      body,
    })
  },
  [],
)
|
||||
|
||||
// Sends DELETE to `/tenant-rule-overrides/{overrideId}`; no response body is expected.
const deleteOverride = useCallback((overrideId: string) => {
  const url = `${API_BASE}/tenant-rule-overrides/${overrideId}`
  return req<void>(url, { method: 'DELETE' })
}, [])
|
||||
|
||||
// Public surface of the hook. The object literal is rebuilt on every call;
// only the individual callbacks are memoised.
return {
  listRules,
  getRule,
  listVersions,
  getVersion,
  createDraftVersion,
  updateDraftVersion,
  submitForReview,
  approveVersion,
  publishVersion,
  rejectVersion,
  getApprovalHistory,
  listOverrides,
  upsertOverride,
  deleteOverride,
}
|
||||
}
|
||||
@@ -0,0 +1,246 @@
|
||||
/**
|
||||
* Types fuer den Template-Rule-Editor (SDK).
|
||||
*
|
||||
* Spiegeln die Pydantic-Modelle aus
|
||||
* backend-compliance/compliance/schemas/template_rule.py.
|
||||
*/
|
||||
|
||||
/** Classification level a rule version assigns. */
export type Classification =
  | 'required'
  | 'recommended'
  | 'optional'
|
||||
|
||||
/** Status values a rule version can carry. */
export type RuleStatus =
  | 'draft'
  | 'review'
  | 'approved'
  | 'published'
  | 'archived'
  | 'rejected'
|
||||
|
||||
/** Comparison operators available for a single condition clause. */
export type ClauseOperator =
  | 'eq'
  | 'neq'
  | 'in'
  | 'not_in'
  | 'gt'
  | 'gte'
  | 'lt'
  | 'lte'
  | 'exists'
  | 'truthy'
  | 'falsy'
|
||||
|
||||
/** One comparison inside a rule condition. */
export interface RuleClause {
  /** Name of the field the clause inspects. */
  field: string
  /** Operator applied to the field. */
  op: ClauseOperator
  /** Comparison operand; optional and untyped. */
  value?: unknown
}
|
||||
|
||||
/** A group of clauses combined with either 'all' or 'any'. */
export interface RuleCondition {
  /** How the clauses are combined. */
  kind: 'all' | 'any'
  /** Clauses that make up the condition. */
  clauses: RuleClause[]
}
|
||||
|
||||
/** A rule record as returned by the backend (snake_case wire format). */
export interface Rule {
  id: string
  rule_key: string
  document_type: string
  title: string
  /** Id of the version referenced as current, or null if there is none. */
  current_version_id: string | null
  created_at: string
  updated_at: string | null
}
|
||||
|
||||
/** One version of a rule, including its workflow audit fields. */
export interface RuleVersion {
  // Identity
  id: string
  rule_id: string
  version_number: number

  // Workflow state
  status: RuleStatus
  is_live: boolean

  // Editable content
  classification: Classification
  conditions: RuleCondition
  source_citation: string
  rationale: string | null
  change_summary: string | null

  // Who/when for each workflow step; null when not set
  created_by: string | null
  submitted_by: string | null
  submitted_at: string | null
  approved_by: string | null
  approved_at: string | null
  published_by: string | null
  published_at: string | null
  rejected_by: string | null
  rejected_at: string | null
  rejection_reason: string | null

  // Record timestamps
  created_at: string
  updated_at: string | null
}
|
||||
|
||||
/** One entry in a version's approval history. */
export interface ApprovalHistoryEntry {
  id: string
  version_id: string
  /** Name of the recorded action (free-form string). */
  action: string
  approver: string | null
  comment: string | null
  created_at: string
}
|
||||
|
||||
/** A tenant-specific override record for one rule. */
export interface TenantRuleOverride {
  id: string
  tenant_id: string
  rule_id: string
  /** Overriding classification; may be null. */
  override_classification: Classification | null
  reason: string
  created_by: string | null
  created_at: string
  updated_at: string | null
}
|
||||
|
||||
// ---- Profil-Felder fuer Condition-Builder ----
|
||||
|
||||
/** Describes one profile field that can be selected in the condition builder. */
export interface ProfileFieldOption {
  /** Key used in the profile. */
  key: string
  /** Label shown in the UI. */
  label: string
  /** Category used for grouping. */
  category: 'org' | 'proc' | 'prod' | 'comp' | 'tech' | 'compliance'
  /** Expected data type. */
  type: 'string' | 'number' | 'boolean' | 'enum'
  /** For enum fields: the possible values with their labels. */
  options?: { value: string; label: string }[]
}
|
||||
|
||||
/** Builds a fresh yes/no option list so no two fields share an array instance. */
function yesNoOptions(): { value: string; label: string }[] {
  return [{ value: 'yes', label: 'Ja' }, { value: 'no', label: 'Nein' }]
}

/** Builds an enum field whose options are yes/no. */
function yesNoField(
  key: string,
  label: string,
  category: ProfileFieldOption['category'],
): ProfileFieldOption {
  return { key, label, category, type: 'enum', options: yesNoOptions() }
}

/**
 * Profile fields offered by the condition builder.
 *
 * The original note describes these as the 17 profile fields used by the 33
 * initial rules, ported from templateRecommendations.ts, with
 * `compliance_depth_level` added. The list below has 18 entries.
 */
export const PROFILE_FIELDS: ProfileFieldOption[] = [
  {
    key: 'compliance_depth_level',
    label: 'Compliance-Tiefe',
    category: 'compliance',
    type: 'enum',
    options: [
      { value: 'L1', label: 'L1 — Lean Startup' },
      { value: 'L2', label: 'L2 — Standard' },
      { value: 'L3', label: 'L3 — Strict' },
      { value: 'L4', label: 'L4 — Zertifizierungsbereit' },
    ],
  },
  {
    key: 'org_employee_count',
    label: 'Mitarbeiterzahl',
    category: 'org',
    type: 'enum',
    options: [
      { value: 'none', label: 'Keine' },
      { value: '1_9', label: '1–9' },
      { value: '10_49', label: '10–49' },
      { value: '50_249', label: '50–249' },
      { value: '250_999', label: '250–999' },
      { value: '1000_plus', label: '1000+' },
    ],
  },
  yesNoField('org_has_employees', 'Hat Mitarbeiter', 'org'),
  {
    key: 'org_business_model',
    label: 'Geschäftsmodell',
    category: 'org',
    type: 'enum',
    options: [
      { value: 'b2b_saas', label: 'B2B SaaS' },
      { value: 'b2c_shop', label: 'B2C Shop' },
      { value: 'platform', label: 'Plattform' },
      { value: 'marketplace', label: 'Marktplatz' },
      { value: 'social', label: 'Social Media' },
      { value: 'saas', label: 'SaaS' },
      { value: 'media', label: 'Media' },
      { value: 'manufacturing', label: 'Maschinenbau' },
      { value: 'other', label: 'Sonstiges' },
    ],
  },
  yesNoField('org_has_social_media', 'Hat Social Media', 'org'),
  yesNoField('org_has_video_conferencing', 'Hat Video-Konferenzen', 'org'),
  {
    key: 'org_cert_target',
    label: 'Zertifizierungsziel',
    category: 'org',
    type: 'enum',
    options: [
      { value: 'none', label: 'Keines' },
      { value: 'iso27001', label: 'ISO 27001' },
      { value: 'iso27701', label: 'ISO 27701' },
      { value: 'tisax', label: 'TISAX' },
    ],
  },
  {
    key: 'proc_ai_usage',
    label: 'KI-Nutzung',
    category: 'proc',
    type: 'enum',
    options: [
      { value: 'none', label: 'Keine' },
      { value: 'limited', label: 'Begrenzt' },
      { value: 'extensive', label: 'Umfangreich' },
    ],
  },
  { key: 'proc_uses_ai_tools', label: 'Nutzt KI-Tools', category: 'proc', type: 'boolean' },
  yesNoField('proc_byod_allowed', 'BYOD erlaubt', 'proc'),
  yesNoField('proc_dsfa_required', 'DSFA erforderlich', 'proc'),
  yesNoField('prod_webshop', 'Webshop', 'prod'),
  yesNoField('prod_ugc_platform', 'UGC-Plattform', 'prod'),
  yesNoField('prod_consent_management', 'Consent Management', 'prod'),
  yesNoField('comp_has_processors', 'Auftragsverarbeiter', 'comp'),
  yesNoField('comp_vendor_management', 'Vendor-Management', 'comp'),
  {
    key: 'comp_dsfa_processes',
    label: 'DSFA-Prozesse',
    category: 'comp',
    type: 'enum',
    options: [
      { value: 'required', label: 'Erforderlich' },
      { value: 'optional', label: 'Optional' },
    ],
  },
  {
    key: 'tech_third_country',
    label: 'Drittland-Transfer',
    category: 'tech',
    type: 'enum',
    options: [
      { value: 'no', label: 'Nein' },
      { value: 'us_dpf_only', label: 'Nur US-DPF' },
      { value: 'adequate_only', label: 'Nur Angemessenheitsbeschluss' },
      { value: 'yes_us', label: 'Ja, USA' },
      { value: 'yes_other', label: 'Ja, Sonstige' },
    ],
  },
]
|
||||
|
||||
|
||||
/** UI label (German) for every clause operator; key order is the declaration order. */
export const OPERATOR_LABELS: Record<ClauseOperator, string> = {
  // equality / membership
  eq: 'gleich (=)',
  neq: 'ungleich (≠)',
  in: 'in Liste',
  not_in: 'nicht in Liste',
  // ordering
  gt: 'größer (>)',
  gte: 'größer/gleich (≥)',
  lt: 'kleiner (<)',
  lte: 'kleiner/gleich (≤)',
  // presence
  exists: 'existiert',
  truthy: 'ist gesetzt',
  falsy: 'ist leer',
}
|
||||
|
||||
/** UI label (German) for every classification value. */
export const CLASSIFICATION_LABELS: Record<Classification, string> = {
  required: 'Pflicht',
  recommended: 'Empfohlen',
  optional: 'Optional',
}
|
||||
|
||||
/** UI label (German) for every rule status. */
export const STATUS_LABELS: Record<RuleStatus, string> = {
  draft: 'Entwurf',
  review: 'In Prüfung',
  approved: 'Freigegeben',
  published: 'Live',
  archived: 'Archiviert',
  rejected: 'Abgelehnt',
}
|
||||
@@ -0,0 +1,205 @@
|
||||
'use client'
|
||||
|
||||
/**
|
||||
* Template Rule Editor — Editorial-UI fuer Anwaelte/DSBs.
|
||||
*
|
||||
* Architektur:
|
||||
* - Links: RuleList mit Filter
|
||||
* - Rechts: RuleEditor mit Klassifikation, Condition-Builder, Source-Citation,
|
||||
* Approval-Workflow (draft → review → approved → published)
|
||||
*
|
||||
* Backend: /api/sdk/v1/compliance/template-rules + /template-rule-versions/*
|
||||
*/
|
||||
|
||||
import { useEffect, useState, useCallback } from 'react'
|
||||
import { useSDK } from '@/lib/sdk'
|
||||
import StepHeader from '@/components/sdk/StepHeader/StepHeader'
|
||||
import { useRuleEditorActions } from './_hooks/useRuleEditorActions'
|
||||
import type {
|
||||
ApprovalHistoryEntry, Classification, Rule, RuleCondition, RuleVersion,
|
||||
} from './_types'
|
||||
import RuleList from './_components/RuleList'
|
||||
import RuleEditor from './_components/RuleEditor'
|
||||
|
||||
/** Turns any thrown value into a message that can be shown in the error banner. */
function toErrorMessage(err: unknown): string {
  return err instanceof Error ? err.message : String(err)
}

/**
 * Page component of the template rule editor.
 *
 * Left column: `RuleList`. Right column: `RuleEditor` for the selected rule,
 * fed with that rule's versions and the approval history of its live version.
 * Every workflow handler performs one backend action and then reloads the
 * selected rule's data; publishing additionally reloads the rule list.
 */
export default function TemplateRuleEditorPage() {
  useSDK()

  const actions = useRuleEditorActions()

  const [rules, setRules] = useState<Rule[]>([])
  const [liveVersionsByRule, setLiveVersionsByRule] = useState<Record<string, RuleVersion | undefined>>({})
  const [selectedRuleId, setSelectedRuleId] = useState<string | null>(null)
  const [selectedVersions, setSelectedVersions] = useState<RuleVersion[]>([])
  const [selectedHistory, setSelectedHistory] = useState<ApprovalHistoryEntry[]>([])
  const [loading, setLoading] = useState(true)
  const [error, setError] = useState<string | null>(null)

  // Loads all rules plus, per rule, its live version (if any).
  const loadRules = useCallback(async () => {
    setLoading(true)
    setError(null)
    try {
      const list = await actions.listRules()
      setRules(list)

      // Fetch the versions of all rules in parallel. A failure for a single
      // rule only leaves that rule without a live version.
      const entries = await Promise.all(
        list.map(async (rule): Promise<[string, RuleVersion | undefined]> => {
          try {
            const versions = await actions.listVersions(rule.id)
            return [rule.id, versions.find((v) => v.is_live)]
          } catch {
            return [rule.id, undefined]
          }
        }),
      )
      const byRule: Record<string, RuleVersion | undefined> = {}
      for (const [ruleId, live] of entries) {
        byRule[ruleId] = live
      }
      setLiveVersionsByRule(byRule)

      // Functional update: decide against the selection at commit time, so a
      // rule the user picked while the request was in flight is not replaced.
      const firstRule = list[0]
      if (firstRule) {
        setSelectedRuleId((current) => current || firstRule.id)
      }
    } catch (e) {
      setError(toErrorMessage(e))
    } finally {
      setLoading(false)
    }
  }, [actions])

  // Loads versions and approval history for the selected rule.
  // `isStale` lets the caller discard results that arrive after the selection
  // has changed; by default results are always applied.
  const loadSelected = useCallback(
    async (isStale: () => boolean = () => false) => {
      if (!selectedRuleId) {
        setSelectedVersions([])
        setSelectedHistory([])
        return
      }
      try {
        const versions = await actions.listVersions(selectedRuleId)
        if (isStale()) return
        setSelectedVersions(versions)

        const live = versions.find((v) => v.is_live)
        const history = live ? await actions.getApprovalHistory(live.id) : []
        if (isStale()) return
        setSelectedHistory(history)
      } catch (e) {
        if (!isStale()) setError(toErrorMessage(e))
      }
    },
    [actions, selectedRuleId],
  )

  // Run once on mount. NOTE(review): `actions` appears to be a new object on
  // every render, so adding `loadRules` to the dependency list would re-run
  // this effect after each render — confirm before changing.
  // eslint-disable-next-line react-hooks/exhaustive-deps
  useEffect(() => { void loadRules() }, [])

  // Reload on selection change; the cleanup marks the in-flight load as stale.
  useEffect(() => {
    let stale = false
    void loadSelected(() => stale)
    return () => { stale = true }
    // eslint-disable-next-line react-hooks/exhaustive-deps
  }, [selectedRuleId])

  // Shared workflow wrapper: clear the previous error, run the action, reload.
  const runAction = async (action: () => Promise<unknown>, reloadRules = false) => {
    setError(null)
    try {
      await action()
      if (reloadRules) await loadRules()
      await loadSelected()
    } catch (e) {
      setError(toErrorMessage(e))
    }
  }

  const handleCreateDraft = async (payload: {
    classification: Classification
    conditions: RuleCondition
    source_citation: string
    rationale?: string | null
  }) => {
    if (!selectedRuleId) return
    await runAction(() => actions.createDraftVersion(selectedRuleId, payload))
  }

  const handleUpdateDraft = async (versionId: string, patch: {
    classification?: Classification
    conditions?: RuleCondition
    source_citation?: string
    rationale?: string | null
  }) => {
    await runAction(() => actions.updateDraftVersion(versionId, patch))
  }

  const handleSubmitForReview = async (versionId: string, changeSummary: string) => {
    await runAction(() => actions.submitForReview(versionId, { change_summary: changeSummary }))
  }

  const handleApprove = async (versionId: string) => {
    await runAction(() => actions.approveVersion(versionId))
  }

  // Publishing changes which version is live, so the rule list is reloaded too.
  const handlePublish = async (versionId: string) => {
    await runAction(() => actions.publishVersion(versionId), true)
  }

  const handleReject = async (versionId: string, reason: string) => {
    await runAction(() => actions.rejectVersion(versionId, { rejection_reason: reason }))
  }

  const selectedRule = rules.find((r) => r.id === selectedRuleId)

  return (
    <div className="h-full flex flex-col bg-white">
      <StepHeader
        stepId="template-rule-editor"
        title="Empfehlungs-Regeln"
        description="Editorial-UI für profilbasierte Dokument-Empfehlungen. Anwälte/DSBs editieren globale Regeln mit Approval-Workflow + Quellen-Attribution."
      />
      {error && (
        <div className="px-5 py-2 bg-rose-50 border-b border-rose-200 text-sm text-rose-800">
          {error}
        </div>
      )}
      {loading && (
        <div className="p-5 text-sm text-gray-500">Lade Regeln…</div>
      )}
      {!loading && (
        <div className="flex-1 grid grid-cols-[320px_1fr] overflow-hidden">
          <RuleList
            rules={rules}
            versionsByRule={liveVersionsByRule}
            selectedRuleId={selectedRuleId}
            onSelectRule={setSelectedRuleId}
          />
          {selectedRule ? (
            <RuleEditor
              rule={selectedRule}
              versions={selectedVersions}
              history={selectedHistory}
              onCreateDraft={handleCreateDraft}
              onUpdateDraft={handleUpdateDraft}
              onSubmitForReview={handleSubmitForReview}
              onApprove={handleApprove}
              onPublish={handlePublish}
              onReject={handleReject}
            />
          ) : (
            <div className="h-full grid place-items-center text-sm text-gray-500">
              Wähle links eine Regel zum Bearbeiten.
            </div>
          )}
        </div>
      )}
    </div>
  )
}
|
||||
@@ -494,4 +494,18 @@ export const SDK_STEPS: SDKStep[] = [
|
||||
prerequisiteSteps: [],
|
||||
isOptional: true,
|
||||
},
|
||||
{
|
||||
id: 'template-rule-editor',
|
||||
seq: 5000,
|
||||
phase: 2,
|
||||
package: 'betrieb',
|
||||
order: 13,
|
||||
name: 'Empfehlungs-Regeln',
|
||||
nameShort: 'Regeln',
|
||||
description: 'Editorial-UI fuer profilbasierte Dokument-Empfehlungen (Anwalt/DSB)',
|
||||
url: '/sdk/template-rule-editor',
|
||||
checkpointId: 'CP-RULES',
|
||||
prerequisiteSteps: [],
|
||||
isOptional: true,
|
||||
},
|
||||
]
|
||||
|
||||
@@ -1,336 +1,49 @@
|
||||
"""Screenshot-basierte Cookie-Extraktion mit Tesseract-OCR.
|
||||
"""Screenshot-basierte Cookie-Extraktion (Orchestration).
|
||||
|
||||
Pipeline:
|
||||
1. consent-tester macht Full-Page-Screenshot (Banner akzeptiert,
|
||||
Accordions ausgeklappt, Timestamp eingebrannt) → PNG b64
|
||||
2. Tesseract OCR (lang=deu, psm=4) → Rohtext mit Tabellen-Reihen
|
||||
3. _parse_ocr_cookie_table(text) → strukturierte Liste {name, category,
|
||||
purpose, duration, type, vendor}
|
||||
3. parse_ocr_cookie_table(text) → strukturierte Liste
|
||||
|
||||
Funktioniert site-unabhaengig — egal welches CMP, egal welche Sprache
|
||||
(Tesseract kann viele), egal welches DOM-Layout. Timestamp im Bild =
|
||||
Beweis was wir zum Scan-Zeitpunkt wirklich gesehen haben.
|
||||
Phase-1-Split (2026-06-06): Engine-Funktionen
|
||||
(_slice_screenshot / vision-OCR / paddle / tesseract / parse) leben
|
||||
jetzt in `cookie_screenshot_ocr_engines.py`. Re-Exports halten die
|
||||
Public-API stabil — externe Importer (`_phase_d1_vendors_raw.py`)
|
||||
brauchen keinen Code-Change.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import base64 as _b64
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
|
||||
import httpx
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
from .cookie_screenshot_ocr_engines import ( # noqa: F401 (re-exports)
|
||||
OLLAMA_URL,
|
||||
VISION_MODEL,
|
||||
VISION_PROMPT,
|
||||
_PADDLE_OCR,
|
||||
_call_vision_on_slice,
|
||||
_slice_screenshot,
|
||||
ocr_screenshot_via_paddle,
|
||||
ocr_screenshot_via_tesseract,
|
||||
ocr_screenshot_via_vision_slices,
|
||||
parse_ocr_cookie_table,
|
||||
parse_vision_response,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
CONSENT_TESTER_URL = os.getenv(
|
||||
"CONSENT_TESTER_URL", "http://bp-compliance-consent-tester:8094"
|
||||
)
|
||||
VISION_MODEL = os.getenv("COOKIE_VISION_MODEL", "qwen2.5vl:32b")
|
||||
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
|
||||
|
||||
|
||||
def _slice_screenshot(png_bytes: bytes, slice_h: int = 1500,
|
||||
max_slices: int = 25) -> list[str]:
|
||||
"""Cut a tall full-page screenshot into 1280×slice_h slices and return
|
||||
each as base64-encoded PNG. Vision models choke on 25k-tall images
|
||||
(resampled down to ~1024 → unreadable text); slicing keeps DPI."""
|
||||
if not png_bytes:
|
||||
return []
|
||||
try:
|
||||
from PIL import Image
|
||||
from io import BytesIO
|
||||
except ImportError:
|
||||
return []
|
||||
img = Image.open(BytesIO(png_bytes)).convert("RGB")
|
||||
w, h = img.size
|
||||
n = min((h + slice_h - 1) // slice_h, max_slices)
|
||||
out: list[str] = []
|
||||
for i in range(n):
|
||||
top = i * slice_h
|
||||
bot = min((i + 1) * slice_h, h)
|
||||
chunk = img.crop((0, top, w, bot))
|
||||
buf = BytesIO()
|
||||
chunk.save(buf, format="PNG", optimize=True)
|
||||
out.append(_b64.b64encode(buf.getvalue()).decode("ascii"))
|
||||
return out
|
||||
|
||||
|
||||
async def _call_vision_on_slice(b64_png: str, timeout_s: float = 240.0) -> str:
    """Send one base64 PNG slice to the vision model and return its reply text.

    The prompt asks for one pipe-separated line per cookie row (plain text,
    not JSON). Posts to `{OLLAMA_URL}/api/chat` with `VISION_MODEL`.

    Returns:
        The `message.content` string of the response, or "" when the request
        fails for any reason (the failure is logged at debug level).
    """
    instruction = (
        "Du siehst einen Bildausschnitt einer Cookie-Richtlinien-Tabelle. "
        "Liste ALLE Tabellen-Zeilen wortwoertlich auf, eine Zeile pro "
        "Cookie. Jede Zeile soll enthalten: Cookie-Name, Kategorie, "
        "Zweck, Speicherdauer, Art (Permanent/Session). "
        "Format: '<Name> | <Kategorie> | <Zweck> | <Dauer> | <Art>'. "
        "KEINE Cookies erfinden, nur was im Bild steht. Nur die Tabellen-"
        "Zeilen, keine Erklaerungen."
    )
    user_message = {"role": "user", "content": instruction, "images": [b64_png]}
    request_body = {
        "model": VISION_MODEL,
        "stream": False,
        "messages": [user_message],
        "options": {"temperature": 0.05, "num_predict": 4000},
    }
    try:
        async with httpx.AsyncClient(timeout=timeout_s) as client:
            endpoint = f"{OLLAMA_URL.rstrip('/')}/api/chat"
            response = await client.post(endpoint, json=request_body)
            response.raise_for_status()
            message = response.json().get("message") or {}
            return message.get("content", "") or ""
    except Exception as e:
        # Best effort: a failed slice yields no text rather than aborting the run.
        logger.debug("vision slice failed: %s", e)
        return ""
|
||||
|
||||
|
||||
async def ocr_screenshot_via_vision_slices(png_bytes: bytes,
                                           max_slices: int = 20) -> str:
    """Slice the screenshot, vision-OCR every slice, and join the results.

    Args:
        png_bytes: Encoded screenshot bytes.
        max_slices: Maximum number of 1500px-high slices to process.

    Returns:
        The non-empty per-slice texts joined with newlines, or "" when the
        screenshot yields no slices.
    """
    slices = _slice_screenshot(png_bytes, slice_h=1500, max_slices=max_slices)
    if not slices:
        return ""
    logger.info("Vision-slicing: %d slices → vision-OCR (model=%s)",
                len(slices), VISION_MODEL)
    # Run slices SEQUENTIALLY: ollama is single-GPU and loading the same
    # model for parallel requests causes OOM + thrashing on Mac Mini.
    parts: list[str] = []
    for i, s in enumerate(slices):
        txt = await _call_vision_on_slice(s)
        if txt:
            parts.append(txt)
        logger.info("Vision-slice %d/%d: %d chars", i + 1, len(slices),
                    len(txt))
    full = "\n".join(parts)
    logger.info("Vision-OCR slicing total: %d chars from %d slices",
                len(full), len(slices))
    return full
|
||||
|
||||
|
||||
def ocr_screenshot_via_paddle(png_bytes: bytes) -> str:
    """Run PaddleOCR over a screenshot and return the recognised text.

    The image is processed in full-width slices of 3000px height so that very
    tall pages are OCR'd chunk by chunk. The PaddleOCR instance is created on
    first use and kept in the module-level `_PADDLE_OCR` for later calls.

    Returns:
        Recognised text lines joined with newlines. "" when the input is
        empty, a required package (numpy, Pillow, paddleocr) is missing, the
        image cannot be decoded, or PaddleOCR cannot be initialised. A slice
        whose OCR call fails is skipped.
    """
    if not png_bytes:
        return ""
    try:
        # numpy is imported inside the guard as well, so a missing numpy is
        # reported like the other optional dependencies instead of raising.
        import numpy as np
        from PIL import Image
        from io import BytesIO
        from paddleocr import PaddleOCR
    except ImportError as e:
        logger.warning("PaddleOCR / PIL not available: %s", e)
        return ""

    try:
        img = Image.open(BytesIO(png_bytes)).convert("RGB")
    except Exception as e:
        logger.warning("PIL open failed: %s", e)
        return ""

    w, h = img.size
    slice_h = 3000
    n_slices = (h + slice_h - 1) // slice_h
    logger.info("PaddleOCR: %dx%d screenshot → %d slices of %d high",
                w, h, n_slices, slice_h)

    # Global OCR instance reused — initial init is ~10s.
    global _PADDLE_OCR
    if globals().get("_PADDLE_OCR") is None:
        try:
            _PADDLE_OCR = PaddleOCR(use_angle_cls=False, lang="german",
                                    show_log=False)
        except Exception as e:
            logger.warning("PaddleOCR init failed: %s", e)
            return ""

    parts: list[str] = []
    for i in range(n_slices):
        top = i * slice_h
        bot = min((i + 1) * slice_h, h)
        arr = np.array(img.crop((0, top, w, bot)))
        try:
            result = _PADDLE_OCR.ocr(arr, cls=False)
        except Exception as e:
            logger.warning("PaddleOCR slice %d failed: %s", i, e)
            continue
        # PaddleOCR returns list-of-lines where each line is
        # [bbox, (text, conf)] — variable nesting depending on version.
        if not result:
            continue
        for page in result:
            if not page:
                continue
            for line in page:
                if not line:
                    continue
                try:
                    if isinstance(line, list) and len(line) >= 2:
                        txt = line[1][0] if isinstance(line[1], (list, tuple)) else str(line[1])
                    else:
                        txt = str(line)
                    if txt:
                        parts.append(txt)
                except Exception:
                    # Malformed entry: skip it, keep the rest of the page.
                    continue

    full_text = "\n".join(parts)
    logger.info("PaddleOCR: extracted %d lines / %d chars from %d slices",
                len(parts), len(full_text), n_slices)
    return full_text
|
||||
|
||||
|
||||
_PADDLE_OCR = None
|
||||
|
||||
|
||||
# ── Tesseract-based parser ────────────────────────────────────────────
|
||||
|
||||
def ocr_screenshot_via_tesseract(png_bytes: bytes,
                                 lang: str = "deu",
                                 psm: int = 4) -> str:
    """OCR a screenshot with Tesseract and return whitespace-normalised text.

    Three substitutions are applied in order: runs of spaces/tabs become one
    space, newlines not followed by a blank line become a space, and any run
    of two or more whitespace characters becomes one space. The last step
    also collapses whatever newlines are left.

    Args:
        png_bytes: Encoded screenshot bytes.
        lang: Tesseract language code.
        psm: Tesseract page-segmentation mode, passed as `--psm <psm>`.

    Returns:
        The normalised text, or "" when the input is empty, pytesseract or
        Pillow is missing, or OCR fails.
    """
    if not png_bytes:
        return ""
    try:
        import pytesseract
        from PIL import Image
        from io import BytesIO
        import re as _re
    except ImportError as e:
        logger.warning("tesseract/PIL not available: %s", e)
        return ""

    # Applied strictly in this order.
    normalisation_steps = (
        (r"[ \t]+", " "),
        (r"\n(?!\s*\n)", " "),
        (r"\s{2,}", " "),
    )
    try:
        image = Image.open(BytesIO(png_bytes)).convert("RGB")
        text = pytesseract.image_to_string(image, lang=lang,
                                           config=f"--psm {psm}")
        for regex, replacement in normalisation_steps:
            text = _re.sub(regex, replacement, text)
        width, height = image.size[0], image.size[1]
        logger.info(
            "Tesseract OCR: %d chars / %d words (image %dx%d)",
            len(text), len(text.split()), width, height,
        )
        return text
    except Exception as e:
        logger.warning("Tesseract OCR failed: %s (%s)",
                       str(e) or "(no msg)", type(e).__name__)
        return ""
|
||||
|
||||
|
||||
# Category anchor tokens that follow the cookie name in the typical column
# layout: [NAME] [CATEGORY] [PURPOSE] [DURATION] [TYPE]
_CATEGORY_ANCHORS = (
    r"Funktionscookie", r"Trackingcookie",
    r"Tracking Cookies?", r"Session Cookies?",
    r"Funktional", r"Marketing", r"Analytics", r"Necessary",
    r"Werbung", r"Personalisierung", r"Statistik",
    r"Notwendig", r"Erforderlich",
)

# Any anchor, optionally followed by a parenthesised remark.
_CATEGORY_PATTERN = "(?:" + "|".join(_CATEGORY_ANCHORS) + r")(?:\s*\([^)]*\))?"

# Cookie name: a letter followed by 2-60 characters from [A-Za-z0-9_.-]
# (3 to 61 characters in total), optionally followed by a `<...>` placeholder.
# A trailing underscore is allowed so that names broken across a column wrap
# (e.g. `VWD6_ENSIGHTEN_PRIVACY_`) are still captured as a fragment.
_COOKIE_NAME_RE = (
    r"(?:[A-Za-z][\w\-.]{2,60}|[A-Za-z][\w\-.]{2,60}<[^>]+>)"
)

# Words that can precede a category token without being a cookie name.
_NON_COOKIE_WORDS = frozenset((
    "name", "art", "zweck", "dauer", "kategorie", "anbieter",
    "cookie", "cookies", "name des cookies",
    "this", "dieser", "diese", "alle", "und", "von", "der",
    "die", "das", "ein", "eine", "session", "permanent",
    "category",
))

# How many characters after "<name> <category>" are searched for the
# purpose / duration / type of the same row.
_ROW_WINDOW_CHARS = 300


def _looks_like_cookie_name(name: str) -> bool:
    """Return True when `name` has the shape of a technical cookie id."""
    if not name or len(name) < 3 or name.lower() in _NON_COOKIE_WORDS:
        return False
    has_marker = any(c in name for c in "_-.<>")
    is_caps = name.upper() == name
    is_camel = (any(c.isupper() for c in name[1:])
                and any(c.islower() for c in name))
    # A plain lowercase/capitalised word without marker is ordinary prose.
    return has_marker or is_caps or is_camel


def parse_ocr_cookie_table(text: str) -> list[dict]:
    """Extract cookie records from OCR text via "<name> <category>" anchors.

    Each accepted anchor starts a row. The text between the anchor and the
    next accepted anchor (at most `_ROW_WINDOW_CHARS` characters) is searched
    for a duration and a cookie type; whatever precedes the first of them is
    the purpose.

    The previous single-regex version used `[^A-Z]{0,300}?` under
    `re.IGNORECASE` for the purpose. That class rejects every letter, and the
    lazy quantifier in front of an optional tail never expands, so purpose was
    always empty and duration/type were only found directly after the
    category.

    Cookie names are never spell-corrected — `awsalb` stays `awsalb`.

    Args:
        text: OCR output; inputs shorter than 200 characters are ignored.

    Returns:
        One dict per distinct (case-insensitive) cookie name, in order of
        first appearance, with keys name, category, purpose, duration, type
        and vendor. `vendor` is always "" — it is not inferred here.
    """
    if not text or len(text) < 200:
        return []
    import re as _re

    head_re = _re.compile(
        rf"(?P<name>{_COOKIE_NAME_RE})\s+(?P<category>{_CATEGORY_PATTERN})",
        _re.IGNORECASE,
    )
    duration_re = _re.compile(
        r"\d+(?:[.,]\s*)?\s*(?:Tage|Jahre?|Monate?|Minuten|Stunden|Sekunden)\.?",
        _re.IGNORECASE,
    )
    type_re = _re.compile(
        r"Permanent/Protokoll|Session\s*Cookie|Persistent\s*Cookie",
        _re.IGNORECASE,
    )

    # Only plausible names start a row; rejected anchors (e.g. "Jahre Session
    # Cookie") must not cut the look-ahead window of the preceding row.
    heads = [m for m in head_re.finditer(text)
             if _looks_like_cookie_name((m.group("name") or "").strip())]

    seen_names: set[str] = set()
    out: list[dict] = []
    for idx, head in enumerate(heads):
        name = head.group("name").strip()
        key = name.lower()
        if key in seen_names:
            continue
        seen_names.add(key)

        row_limit = heads[idx + 1].start() if idx + 1 < len(heads) else len(text)
        window = text[head.end():min(row_limit, head.end() + _ROW_WINDOW_CHARS)]
        duration = duration_re.search(window)
        cookie_type = type_re.search(window)
        purpose_end = min(
            (hit.start() for hit in (duration, cookie_type) if hit),
            default=len(window),
        )
        out.append({
            "name": name[:80],
            "category": (head.group("category") or "").strip()[:60],
            "purpose": window[:purpose_end].strip()[:200],
            "duration": duration.group(0).strip()[:60] if duration else "",
            "type": cookie_type.group(0).strip()[:30] if cookie_type else "",
            "vendor": "",
        })
    logger.info("parse_ocr_cookie_table: %d unique cookies extracted", len(out))
    return out
|
||||
|
||||
|
||||
_VISION_PROMPT = (
|
||||
"Du analysierst einen Screenshot einer Cookie-Richtlinie. Auf der Seite "
|
||||
"ist eine Tabelle mit Cookies aufgelistet. Spalten sind ueblicherweise: "
|
||||
"Name des Cookies, Kategorie (z.B. 'Funktional', 'Marketing', "
|
||||
"'Analytics'), Verwendungszweck, Speicherdauer, Art des Cookies "
|
||||
"(z.B. 'Permanent', 'Session').\n\n"
|
||||
"Extrahiere ALLE Cookies aus dem Bild. Wenn die Tabelle abgeschnitten "
|
||||
"ist, extrahiere alles was sichtbar ist. KEINE Cookies erfinden, KEINE "
|
||||
"Halluzinationen.\n\n"
|
||||
"Antworte als reines JSON-Objekt im Format:\n"
|
||||
'{"cookies": [\n'
|
||||
' {"name": "<Cookie-Name exakt>", "category": "<Kategorie>", '
|
||||
'"purpose": "<Kurzfassung Zweck max 120 chars>", '
|
||||
'"duration": "<Speicherdauer mit Einheit>", '
|
||||
'"type": "<Permanent|Session|...>", '
|
||||
'"vendor": "<Anbieter falls bekannt, sonst leer>"}\n'
|
||||
"]}\n\n"
|
||||
"Nur JSON, kein Erklaerungstext, keine Code-Fences."
|
||||
)
|
||||
# Backward-compat: some callers may import _parse_vision_response
|
||||
_parse_vision_response = parse_vision_response
|
||||
|
||||
|
||||
async def capture_cookie_evidence_slices(
|
||||
@@ -414,9 +127,7 @@ async def capture_cookie_evidence_slices(
|
||||
|
||||
|
||||
def _ocr_one_slice(s: dict) -> tuple[dict, list[dict]]:
|
||||
"""Helper for parallel execution: tesseract + parse for one slice.
|
||||
Returns (slice_metadata_summary, cookies)."""
|
||||
import base64 as _b64
|
||||
"""Helper for parallel execution: tesseract + parse for one slice."""
|
||||
try:
|
||||
png = _b64.b64decode(s.get("png_b64", ""))
|
||||
except Exception:
|
||||
@@ -440,10 +151,6 @@ def ocr_slices_extract_cookies(
|
||||
ThreadPoolExecutor with 4 workers yields ~4x speedup on multi-core
|
||||
machines (M4 Pro has plenty). Sequential 32 slices = ~60s, parallel
|
||||
~15s.
|
||||
|
||||
Returns (cookies, stats) where stats has:
|
||||
per_slice: [{idx, cookies_found, ts, top_y, bot_y}]
|
||||
total_raw, total_unique, slices
|
||||
"""
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
@@ -451,7 +158,6 @@ def ocr_slices_extract_cookies(
|
||||
return [], {"per_slice": [], "total_raw": 0,
|
||||
"total_unique": 0, "slices": 0}
|
||||
|
||||
# Keep slice order so the per-slice report is sequential.
|
||||
with ThreadPoolExecutor(max_workers=max_workers) as ex:
|
||||
results = list(ex.map(_ocr_one_slice, slices))
|
||||
|
||||
@@ -474,7 +180,8 @@ def ocr_slices_extract_cookies(
|
||||
}
|
||||
logger.info(
|
||||
"ocr_slices_extract_cookies (parallel=%d): %d slices → %d raw → %d unique",
|
||||
max_workers, stats["slices"], stats["total_raw"], stats["total_unique"],
|
||||
max_workers, stats["slices"], stats["total_raw"],
|
||||
stats["total_unique"],
|
||||
)
|
||||
return all_cookies, stats
|
||||
|
||||
@@ -482,11 +189,7 @@ def ocr_slices_extract_cookies(
|
||||
async def capture_cookie_screenshot(
|
||||
cookie_url: str, check_id: str = "", timeout_s: float = 60.0,
|
||||
) -> dict:
|
||||
"""Trigger consent-tester to capture full-page screenshot of cookie URL.
|
||||
|
||||
Returns dict with png_b64, captured_at, url, width_px, height_px etc.
|
||||
Empty png_b64 on error.
|
||||
"""
|
||||
"""Trigger consent-tester to capture full-page screenshot of cookie URL."""
|
||||
if not cookie_url:
|
||||
return {"png_b64": "", "error": "no url"}
|
||||
try:
|
||||
@@ -514,11 +217,7 @@ async def capture_cookie_screenshot(
|
||||
async def extract_cookies_via_vision(
|
||||
png_b64: str, timeout_s: float = 240.0,
|
||||
) -> list[dict]:
|
||||
"""Call Ollama llama3.2-vision with the screenshot + extraction prompt.
|
||||
|
||||
Returns list of {name, category, purpose, duration, type, vendor}.
|
||||
Empty list on failure.
|
||||
"""
|
||||
"""Call Ollama vision model with the screenshot + extraction prompt."""
|
||||
if not png_b64:
|
||||
return []
|
||||
payload = {
|
||||
@@ -527,13 +226,10 @@ async def extract_cookies_via_vision(
|
||||
"format": "json",
|
||||
"messages": [{
|
||||
"role": "user",
|
||||
"content": _VISION_PROMPT,
|
||||
"content": VISION_PROMPT,
|
||||
"images": [png_b64],
|
||||
}],
|
||||
"options": {
|
||||
"temperature": 0.05,
|
||||
"num_predict": 8000,
|
||||
},
|
||||
"options": {"temperature": 0.05, "num_predict": 8000},
|
||||
}
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=timeout_s) as c:
|
||||
@@ -543,7 +239,7 @@ async def extract_cookies_via_vision(
|
||||
)
|
||||
r.raise_for_status()
|
||||
content = (r.json().get("message") or {}).get("content", "") or ""
|
||||
cookies = _parse_vision_response(content)
|
||||
cookies = parse_vision_response(content)
|
||||
logger.info(
|
||||
"Vision-OCR extracted %d cookies (model=%s, response_len=%d)",
|
||||
len(cookies), VISION_MODEL, len(content),
|
||||
@@ -557,59 +253,11 @@ async def extract_cookies_via_vision(
|
||||
return []
|
||||
|
||||
|
||||
def _parse_vision_response(content: str) -> list[dict]:
|
||||
"""Be lenient: code fences, leading prose, partial JSON."""
|
||||
if not content:
|
||||
return []
|
||||
txt = content.strip()
|
||||
if txt.startswith("```"):
|
||||
lines = txt.split("\n")
|
||||
if lines and lines[-1].strip().startswith("```"):
|
||||
txt = "\n".join(lines[1:-1])
|
||||
else:
|
||||
txt = "\n".join(lines[1:])
|
||||
a, b = txt.find("{"), txt.rfind("}")
|
||||
if not (0 <= a < b):
|
||||
return []
|
||||
try:
|
||||
obj = json.loads(txt[a:b + 1])
|
||||
except json.JSONDecodeError:
|
||||
return []
|
||||
if not isinstance(obj, dict):
|
||||
return []
|
||||
arr = obj.get("cookies") or obj.get("Cookies") or []
|
||||
if not isinstance(arr, list):
|
||||
return []
|
||||
out: list[dict] = []
|
||||
for item in arr[:300]: # cap to sanity
|
||||
if not isinstance(item, dict):
|
||||
continue
|
||||
name = (item.get("name") or "").strip()
|
||||
if not name or len(name) < 2 or len(name) > 80:
|
||||
continue
|
||||
# Strip obvious garbage
|
||||
if re.fullmatch(r"[\s\-_.]+", name):
|
||||
continue
|
||||
out.append({
|
||||
"name": name[:80],
|
||||
"category": (item.get("category") or "").strip()[:60],
|
||||
"purpose": (item.get("purpose") or "").strip()[:200],
|
||||
"duration": (item.get("duration") or "").strip()[:60],
|
||||
"type": (item.get("type") or "").strip()[:30],
|
||||
"vendor": (item.get("vendor") or "").strip()[:80],
|
||||
})
|
||||
return out
|
||||
|
||||
|
||||
def cookies_to_vendor_records(
|
||||
cookies: list[dict], guess_vendor_fn=None,
|
||||
) -> list[dict]:
|
||||
"""Aggregate OCR-extracted cookies into vendor records compatible with
|
||||
cmp_vendors-schema. guess_vendor_fn: optional callable name → vendor.
|
||||
|
||||
Each cookie's vendor field is used; if empty, we fall back to
|
||||
guess_vendor_fn (e.g. _guess_vendor from cookies_table_parser).
|
||||
"""
|
||||
cmp_vendors-schema. guess_vendor_fn: optional callable name → vendor."""
|
||||
by_vendor: dict[str, dict] = {}
|
||||
for c in cookies:
|
||||
v_name = (c.get("vendor") or "").strip()
|
||||
|
||||
@@ -0,0 +1,353 @@
|
||||
"""OCR-Engine-Funktionen für cookie_screenshot_ocr (Phase-1 Split).
|
||||
|
||||
Aus dem Hauptmodul ausgelagert, damit es unter dem 500-LOC-Hard-Cap bleibt:
|
||||
- PIL-basiertes _slice_screenshot (zerteilt PNG in subimages)
|
||||
- Vision-LLM-OCR (ollama qwen2.5vl:32b)
|
||||
- PaddleOCR fallback
|
||||
- Tesseract OCR (Hauptpfad)
|
||||
- Anchor-basierter Parser parse_ocr_cookie_table
|
||||
- _parse_vision_response (JSON-Toleranz für Vision-Output)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import base64 as _b64
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
|
||||
import httpx
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Ollama vision model used for screenshot OCR; override via COOKIE_VISION_MODEL.
VISION_MODEL = os.getenv("COOKIE_VISION_MODEL", "qwen2.5vl:32b")
# Base URL of the Ollama server; override via OLLAMA_URL.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")

_PADDLE_OCR = None  # lazy-initialised PaddleOCR instance
|
||||
|
||||
|
||||
# ── 1. Screenshot-Slicing für Vision-Models ────────────────────────
|
||||
|
||||
def _slice_screenshot(png_bytes: bytes, slice_h: int = 1500,
|
||||
max_slices: int = 25) -> list[str]:
|
||||
"""Cut a tall full-page screenshot into 1280×slice_h slices and return
|
||||
each as base64-encoded PNG. Vision models choke on 25k-tall images
|
||||
(resampled down to ~1024 → unreadable text); slicing keeps DPI."""
|
||||
if not png_bytes:
|
||||
return []
|
||||
try:
|
||||
from PIL import Image
|
||||
from io import BytesIO
|
||||
except ImportError:
|
||||
return []
|
||||
img = Image.open(BytesIO(png_bytes)).convert("RGB")
|
||||
w, h = img.size
|
||||
n = min((h + slice_h - 1) // slice_h, max_slices)
|
||||
out: list[str] = []
|
||||
for i in range(n):
|
||||
top = i * slice_h
|
||||
bot = min((i + 1) * slice_h, h)
|
||||
chunk = img.crop((0, top, w, bot))
|
||||
buf = BytesIO()
|
||||
chunk.save(buf, format="PNG", optimize=True)
|
||||
out.append(_b64.b64encode(buf.getvalue()).decode("ascii"))
|
||||
return out
|
||||
|
||||
|
||||
# ── 2. Vision-LLM-OCR ──────────────────────────────────────────────
|
||||
|
||||
async def _call_vision_on_slice(b64_png: str,
                                timeout_s: float = 240.0) -> str:
    """Ask the vision model to transcribe the cookie-table rows of one slice.

    The model is prompted for plain pipe-separated rows, not JSON.

    Args:
        b64_png: Base64-encoded PNG of the slice.
        timeout_s: Timeout passed to the HTTP client.

    Returns:
        The content of the model's reply message, or "" when the request
        fails for any reason or the reply carries no content.
    """
    instructions = "".join((
        "Du siehst einen Bildausschnitt einer Cookie-Richtlinien-Tabelle. ",
        "Liste ALLE Tabellen-Zeilen wortwoertlich auf, eine Zeile pro ",
        "Cookie. Jede Zeile soll enthalten: Cookie-Name, Kategorie, ",
        "Zweck, Speicherdauer, Art (Permanent/Session). ",
        "Format: '<Name> | <Kategorie> | <Zweck> | <Dauer> | <Art>'. ",
        "KEINE Cookies erfinden, nur was im Bild steht. Nur die Tabellen-",
        "Zeilen, keine Erklaerungen.",
    ))
    user_message = {
        "role": "user",
        "content": instructions,
        "images": [b64_png],
    }
    request_body = {
        "model": VISION_MODEL,
        "stream": False,
        "messages": [user_message],
        "options": {"temperature": 0.05, "num_predict": 4000},
    }
    try:
        async with httpx.AsyncClient(timeout=timeout_s) as client:
            endpoint = f"{OLLAMA_URL.rstrip('/')}/api/chat"
            response = await client.post(endpoint, json=request_body)
            response.raise_for_status()
            message = response.json().get("message") or {}
            return message.get("content", "") or ""
    except Exception as exc:
        # Best effort: a failed slice contributes no text.
        logger.debug("vision slice failed: %s", exc)
        return ""
|
||||
|
||||
|
||||
async def ocr_screenshot_via_vision_slices(png_bytes: bytes,
                                           max_slices: int = 20) -> str:
    """Slice the screenshot, vision-OCR every slice and join the text.

    Slices are 1500 px high and are sent to the model one after another.

    Args:
        png_bytes: Encoded full-page screenshot.
        max_slices: Maximum number of slices to process.

    Returns:
        The non-empty slice transcriptions joined with newlines, or "" when
        the screenshot could not be sliced or no slice produced text.
    """
    slices = _slice_screenshot(png_bytes, slice_h=1500,
                               max_slices=max_slices)
    if not slices:
        return ""
    total = len(slices)
    logger.info("Vision-slicing: %d slices → vision-OCR (model=%s)",
                total, VISION_MODEL)
    parts: list[str] = []
    for i, s in enumerate(slices, start=1):
        txt = await _call_vision_on_slice(s)
        # Log every slice, including empty ones: a failed request is only
        # reported at debug level by _call_vision_on_slice.
        logger.info("Vision-slice %d/%d: %d chars", i, total, len(txt))
        if txt:
            parts.append(txt)
    full = "\n".join(parts)
    # Report how many slices actually contributed text, not just how many
    # were attempted.
    logger.info("Vision-OCR slicing total: %d chars from %d/%d slices",
                len(full), len(parts), total)
    return full
|
||||
|
||||
|
||||
# ── 3. PaddleOCR (fallback) ────────────────────────────────────────
|
||||
|
||||
def _paddle_line_text(line) -> str:
    """Return the recognised text of one PaddleOCR result line ("" if none).

    Presumably a line is ``[box, (text, confidence)]`` — TODO confirm against
    the installed PaddleOCR version; anything else is stringified as-is.
    """
    if isinstance(line, (list, tuple)) and len(line) >= 2:
        payload = line[1]
        if isinstance(payload, (list, tuple)):
            text = payload[0] if payload else None
        else:
            text = payload
    else:
        text = line
    # Always hand back a str so the final "\n".join cannot fail.
    return "" if text is None else str(text)


def ocr_screenshot_via_paddle(png_bytes: bytes) -> str:
    """Run PaddleOCR over a full-page screenshot and return the text.

    Tall screenshots are processed in full-width strips of 3000 px height.
    The PaddleOCR instance is created on first use and cached in the
    module-level ``_PADDLE_OCR``.

    Args:
        png_bytes: Encoded screenshot.

    Returns:
        All recognised lines joined with newlines. "" when the input is
        empty, a dependency (PaddleOCR, Pillow, numpy) is missing, the image
        cannot be opened or PaddleOCR fails to initialise. A strip that
        fails is logged and skipped.
    """
    global _PADDLE_OCR
    if not png_bytes:
        return ""
    try:
        # numpy is imported inside the guard so a missing install is reported
        # like the other optional dependencies instead of raising.
        import numpy as np
        from PIL import Image
        from io import BytesIO
        from paddleocr import PaddleOCR
    except ImportError as e:
        logger.warning("PaddleOCR / PIL not available: %s", e)
        return ""

    try:
        img = Image.open(BytesIO(png_bytes)).convert("RGB")
    except Exception as e:
        logger.warning("PIL open failed: %s", e)
        return ""

    w, h = img.size
    slice_h = 3000
    n_slices = (h + slice_h - 1) // slice_h
    logger.info("PaddleOCR: %dx%d screenshot → %d slices of %d high",
                w, h, n_slices, slice_h)

    if _PADDLE_OCR is None:
        try:
            _PADDLE_OCR = PaddleOCR(use_angle_cls=False, lang="german",
                                    show_log=False)
        except Exception as e:
            logger.warning("PaddleOCR init failed: %s", e)
            return ""

    parts: list[str] = []
    for i in range(n_slices):
        top = i * slice_h
        bot = min(top + slice_h, h)
        arr = np.array(img.crop((0, top, w, bot)))
        try:
            result = _PADDLE_OCR.ocr(arr, cls=False)
        except Exception as e:
            logger.warning("PaddleOCR slice %d failed: %s", i, e)
            continue
        for page in result or ():
            for line in page or ():
                if not line:
                    continue
                txt = _paddle_line_text(line)
                if txt:
                    parts.append(txt)

    full_text = "\n".join(parts)
    logger.info("PaddleOCR: extracted %d lines / %d chars from %d slices",
                len(parts), len(full_text), n_slices)
    return full_text
|
||||
|
||||
|
||||
# ── 4. Tesseract OCR (Hauptpfad) ───────────────────────────────────
|
||||
|
||||
def ocr_screenshot_via_tesseract(png_bytes: bytes,
                                 lang: str = "deu",
                                 psm: int = 4) -> str:
    """OCR a full-page screenshot with Tesseract and flatten the whitespace.

    Args:
        png_bytes: Encoded screenshot.
        lang: Tesseract language code.
        psm: Tesseract page-segmentation mode; 4 treats the page as a single
            column of text of variable sizes, which suits cookie tables.

    Returns:
        The recognised text after three whitespace-normalisation passes, or
        "" when the input is empty, pytesseract/Pillow is missing, or OCR
        fails.
    """
    if not png_bytes:
        return ""
    try:
        import re as _re
        from io import BytesIO
        from PIL import Image
        import pytesseract
    except ImportError as e:
        logger.warning("tesseract/PIL not available: %s", e)
        return ""

    # Applied in order: squeeze blanks/tabs, fold line breaks that are not
    # followed by a blank line, then collapse any remaining whitespace run.
    passes = (
        (r"[ \t]+", " "),
        (r"\n(?!\s*\n)", " "),
        (r"\s{2,}", " "),
    )
    try:
        img = Image.open(BytesIO(png_bytes)).convert("RGB")
        flat = pytesseract.image_to_string(img, lang=lang,
                                           config=f"--psm {psm}")
        for regex, replacement in passes:
            flat = _re.sub(regex, replacement, flat)
        width, height = img.size[0], img.size[1]
        logger.info(
            "Tesseract OCR: %d chars / %d words (image %dx%d)",
            len(flat), len(flat.split()), width, height,
        )
        return flat
    except Exception as e:
        reason = str(e) or "(no msg)"
        logger.warning("Tesseract OCR failed: %s (%s)",
                       reason, type(e).__name__)
        return ""
|
||||
|
||||
|
||||
# ── 5. Anchor-basierter Parser ─────────────────────────────────────
|
||||
|
||||
# Category words that mark the second column of a cookie-table row.
_CATEGORY_ANCHORS = (
    r"Funktionscookie", r"Trackingcookie",
    r"Tracking Cookies?", r"Session Cookies?",
    r"Funktional", r"Marketing", r"Analytics", r"Necessary",
    r"Werbung", r"Personalisierung", r"Statistik",
    r"Notwendig", r"Erforderlich",
)
# A category anchor, optionally followed by a parenthesised remark.
_CATEGORY_PATTERN = ("(?:" + "|".join(_CATEGORY_ANCHORS)
                     + r")(?:\s*\([^)]*\))?")
# Cookie-name token, optionally with a "<placeholder>" suffix. A leading
# underscore is allowed so names like "_ga" are kept intact.
_COOKIE_NAME_RE = r"(?:[A-Za-z_][\w\-.]{2,60}(?:<[^>]+>)?)"

# Row start: "<name> <category>".
_OCR_ROW_RE = re.compile(
    rf"(?P<name>{_COOKIE_NAME_RE})\s+(?P<category>{_CATEGORY_PATTERN})",
    re.IGNORECASE,
)
# Row end: optional duration directly followed by the cookie type.
_OCR_TAIL_RE = re.compile(
    r"(?:(?P<duration>\d+(?:[.,]\s*)?\s*"
    r"(?:Tage|Jahre?|Monate?|Minuten|Stunden|Sekunden)\.?)\s*)?"
    r"(?P<type>Permanent/Protokoll|Session\s*Cookie|Persistent\s*Cookie)",
    re.IGNORECASE,
)
# Header words and filler words that must never be taken as a cookie name.
_OCR_NON_NAMES = frozenset((
    "name", "art", "zweck", "dauer", "kategorie", "anbieter",
    "cookie", "cookies", "name des cookies",
    "this", "dieser", "diese", "alle", "und", "von", "der",
    "die", "das", "ein", "eine", "session", "permanent",
    "category",
))
# Maximum number of characters after the category inspected for one row.
_OCR_ROW_WINDOW = 400


def _is_plausible_cookie_name(name: str) -> bool:
    """Heuristic filter separating cookie names from ordinary words."""
    if len(name) < 3 or name.lower() in _OCR_NON_NAMES:
        return False
    # Pure punctuation/underscore runs are OCR noise (same check as
    # parse_vision_response).
    if re.fullmatch(r"[\s\-_.]+", name):
        return False
    # A plain word without separator, all-caps or inner capital (e.g.
    # "verwendet", "Nutzers") is most likely prose, not a cookie name.
    has_marker = any(c in name for c in "_-.<>")
    is_caps = name.upper() == name
    is_camel = (any(c.isupper() for c in name[1:])
                and any(c.islower() for c in name))
    return has_marker or is_caps or is_camel


def parse_ocr_cookie_table(text: str) -> list[dict]:
    """Extract cookie records from flattened OCR text.

    A row is recognised by a plausible cookie name followed by a category
    anchor. The text between that anchor and the next accepted row (capped
    at ``_OCR_ROW_WINDOW`` characters) is searched for an optional duration
    plus cookie type; whatever precedes them is the purpose. Previously a
    lazy group in front of an optional tail always matched empty, so the
    purpose was never captured. Cookie names are returned exactly as read,
    never corrected.

    Args:
        text: OCR text; inputs shorter than 200 characters are ignored.

    Returns:
        One dict per unique (case-insensitive) cookie name, in order of
        appearance, with keys ``name``, ``category``, ``purpose``,
        ``duration``, ``type`` and ``vendor`` (always "").
    """
    if not text or len(text) < 200:
        return []
    # Only accepted names act as row boundaries; prose such as
    # "für Marketing" must not cut a purpose short.
    rows = [m for m in _OCR_ROW_RE.finditer(text)
            if _is_plausible_cookie_name(m.group("name"))]
    seen_names: set[str] = set()
    out: list[dict] = []
    for idx, m in enumerate(rows):
        name = m.group("name")
        key = name.lower()
        if key in seen_names:
            continue
        seen_names.add(key)
        next_row = rows[idx + 1].start() if idx + 1 < len(rows) else len(text)
        segment = text[m.end():min(next_row, m.end() + _OCR_ROW_WINDOW)]
        tail = _OCR_TAIL_RE.search(segment)
        purpose = segment[:tail.start()] if tail else segment
        out.append({
            "name": name[:80],
            "category": (m.group("category") or "").strip()[:60],
            "purpose": purpose.strip()[:200],
            "duration": ((tail.group("duration") or "") if tail
                         else "").strip()[:60],
            "type": (tail.group("type") if tail else "").strip()[:30],
            "vendor": "",
        })
    logger.info("parse_ocr_cookie_table: %d unique cookies extracted",
                len(out))
    return out
|
||||
|
||||
|
||||
# ── 6. Vision-Response-Parser ──────────────────────────────────────
|
||||
|
||||
# German instruction text for the vision model: describe the expected table
# columns, forbid invented cookies, and request a bare JSON object of the form
# {"cookies": [{name, category, purpose, duration, type, vendor}, ...]}.
VISION_PROMPT = (
    "Du analysierst einen Screenshot einer Cookie-Richtlinie. Auf der Seite "
    "ist eine Tabelle mit Cookies aufgelistet. Spalten sind ueblicherweise: "
    "Name des Cookies, Kategorie (z.B. 'Funktional', 'Marketing', "
    "'Analytics'), Verwendungszweck, Speicherdauer, Art des Cookies "
    "(z.B. 'Permanent', 'Session').\n\n"
    "Extrahiere ALLE Cookies aus dem Bild. Wenn die Tabelle abgeschnitten "
    "ist, extrahiere alles was sichtbar ist. KEINE Cookies erfinden, KEINE "
    "Halluzinationen.\n\n"
    "Antworte als reines JSON-Objekt im Format:\n"
    '{"cookies": [\n'
    ' {"name": "<Cookie-Name exakt>", "category": "<Kategorie>", '
    '"purpose": "<Kurzfassung Zweck max 120 chars>", '
    '"duration": "<Speicherdauer mit Einheit>", '
    '"type": "<Permanent|Session|...>", '
    '"vendor": "<Anbieter falls bekannt, sonst leer>"}\n'
    "]}\n\n"
    "Nur JSON, kein Erklaerungstext, keine Code-Fences."
)
|
||||
|
||||
|
||||
def _vision_field(value, limit: int) -> str:
    """Normalise one optional record field to a trimmed, length-capped str.

    Strings are stripped; plain numbers (e.g. ``"duration": 2``) are
    stringified instead of crashing on ``.strip()``; everything else
    (None, bool, list, dict) becomes "".
    """
    if isinstance(value, str):
        return value.strip()[:limit]
    if isinstance(value, (int, float)) and not isinstance(value, bool) and value:
        return str(value)[:limit]
    return ""


def parse_vision_response(content: str) -> list[dict]:
    """Parse the vision model's reply into cookie records.

    Tolerates a surrounding Markdown code fence and prose before or after
    the JSON object. The object must carry the list under ``cookies`` (or
    ``Cookies``); at most the first 300 entries are considered.

    Args:
        content: Raw message content returned by the model.

    Returns:
        Dicts with keys ``name``, ``category``, ``purpose``, ``duration``,
        ``type`` and ``vendor``. Entries that are not objects, or whose name
        is not a string of 2-80 characters containing something other than
        whitespace/``-``/``_``/``.``, are dropped. [] when no JSON object
        can be decoded.
    """
    if not content:
        return []
    txt = content.strip()
    if txt.startswith("```"):
        # Drop the opening fence line and, if present, the closing one.
        lines = txt.split("\n")
        if lines and lines[-1].strip().startswith("```"):
            txt = "\n".join(lines[1:-1])
        else:
            txt = "\n".join(lines[1:])
    # Take the outermost {...} span so leading/trailing prose is ignored.
    a, b = txt.find("{"), txt.rfind("}")
    if not (0 <= a < b):
        return []
    try:
        obj = json.loads(txt[a:b + 1])
    except json.JSONDecodeError:
        return []
    if not isinstance(obj, dict):
        return []
    arr = obj.get("cookies") or obj.get("Cookies") or []
    if not isinstance(arr, list):
        return []
    out: list[dict] = []
    for item in arr[:300]:
        if not isinstance(item, dict):
            continue
        raw_name = item.get("name")
        # A non-string name is model noise; skip the row instead of raising.
        if not isinstance(raw_name, str):
            continue
        name = raw_name.strip()
        if not name or len(name) < 2 or len(name) > 80:
            continue
        if re.fullmatch(r"[\s\-_.]+", name):
            continue
        out.append({
            "name": name[:80],
            "category": _vision_field(item.get("category"), 60),
            "purpose": _vision_field(item.get("purpose"), 200),
            "duration": _vision_field(item.get("duration"), 60),
            "type": _vision_field(item.get("type"), 30),
            "vendor": _vision_field(item.get("vendor"), 80),
        })
    return out
|
||||
Reference in New Issue
Block a user