refactor: split cookie_screenshot_ocr.py (642 → 290 + 353 LOC)
CI / detect-changes (push) Successful in 7s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / build-sha-integrity (push) Failing after 4s
CI / validate-canonical-controls (push) Successful in 11s
CI / loc-budget (push) Failing after 14s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 2m19s
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 29s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped

CI hard-cap 500 LOC. cookie_screenshot_ocr.py war auf 642 gewachsen,
also gesplittet:

  - cookie_screenshot_ocr_engines.py (353 LOC, NEU)
    OCR-Engine-Funktionen: _slice_screenshot, Vision-LLM (qwen2.5vl),
    PaddleOCR, Tesseract, parse_ocr_cookie_table, parse_vision_response,
    Konstanten VISION_MODEL/OLLAMA_URL/VISION_PROMPT.

  - cookie_screenshot_ocr.py (290 LOC, REWRITE)
    Orchestration: capture_cookie_evidence_slices, _ocr_one_slice,
    ocr_slices_extract_cookies, capture_cookie_screenshot,
    extract_cookies_via_vision, cookies_to_vendor_records.
    Re-Exports der Engine-Funktionen für Backward-Kompat.

Einziger externer Importer (_phase_d1_vendors_raw.py) braucht keinen
Code-Change — Public-API stabil.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-06-06 23:35:33 +02:00
parent ff796fb480
commit 02879a2c3a
9 changed files with 1790 additions and 384 deletions
@@ -0,0 +1,232 @@
'use client'
/**
* Strukturierter Editor fuer JSONB-Conditions:
* { kind: 'all'|'any', clauses: [{field, op, value}] }
*
* Wird im RuleEditor verwendet. Reine Praesentations-Komponente — Parent
* verwaltet State.
*/
import type {
ClauseOperator, RuleClause, RuleCondition,
} from '../_types'
import { OPERATOR_LABELS, PROFILE_FIELDS } from '../_types'
/** Props of ConditionBuilder. The component holds no state; the parent owns the condition. */
interface Props {
/** Condition currently displayed. */
value: RuleCondition
/** Called with a new condition object after every edit. */
onChange: (next: RuleCondition) => void
/** When true, the selects are disabled and the add/remove buttons are not rendered. */
readOnly?: boolean
}
/**
 * Controlled editor for a rule condition of the shape `{ kind, clauses }`.
 *
 * Renders the all/any selector, one row per clause and, unless `readOnly`
 * is set, buttons to add and remove clauses. Every edit is reported through
 * `onChange` as a new condition object; `value` itself is never mutated.
 *
 * @param value    Condition to display.
 * @param onChange Receives the updated condition after each edit.
 * @param readOnly Disables the selector and hides the add/remove buttons.
 */
export default function ConditionBuilder({ value, onChange, readOnly }: Props) {
  const setKind = (kind: 'all' | 'any') => onChange({ ...value, kind })

  const setClause = (idx: number, clause: RuleClause) => {
    // Copy first so the array owned by the parent is not mutated in place.
    const next = [...value.clauses]
    next[idx] = clause
    onChange({ ...value, clauses: next })
  }

  // New clauses start on the first profile field with an "eq" comparison and an empty value.
  const addClause = () =>
    onChange({
      ...value,
      clauses: [
        ...value.clauses,
        { field: PROFILE_FIELDS[0].key, op: 'eq', value: '' },
      ],
    })

  const removeClause = (idx: number) =>
    onChange({ ...value, clauses: value.clauses.filter((_, i) => i !== idx) })

  return (
    <div className="space-y-2">
      <div className="flex items-center gap-2">
        <span className="text-xs text-gray-600">Bedingung:</span>
        <select
          className="text-xs px-2 py-1 border border-gray-300 rounded"
          value={value.kind}
          disabled={readOnly}
          onChange={(e) => setKind(e.target.value as 'all' | 'any')}
        >
          <option value="all">ALLE Klauseln müssen zutreffen (AND)</option>
          <option value="any">MIND. EINE Klausel trifft zu (OR)</option>
        </select>
      </div>
      {value.clauses.length === 0 && (
        <div className="text-xs text-gray-500 italic px-1">
          Keine Klauseln – Regel gilt für jedes Profil.
        </div>
      )}
      <ul className="space-y-1">
        {value.clauses.map((clause, idx) => (
          // Clauses carry no id, so the position is the only available key.
          <li key={idx} className="flex items-start gap-1 p-1.5 bg-gray-50 rounded border border-gray-200">
            <ClauseRow
              clause={clause}
              onChange={(c) => setClause(idx, c)}
              readOnly={!!readOnly}
            />
            {!readOnly && (
              <button
                // Explicit type: a bare <button> would submit an enclosing <form>.
                type="button"
                className="text-xs px-1.5 py-0.5 text-rose-700 hover:bg-rose-50 rounded"
                onClick={() => removeClause(idx)}
                title="Klausel entfernen"
                aria-label="Klausel entfernen"
              >
                ×
              </button>
            )}
          </li>
        ))}
      </ul>
      {!readOnly && (
        <button
          type="button"
          className="text-xs px-2 py-1 border border-gray-300 rounded text-gray-700 hover:bg-gray-50"
          onClick={addClause}
        >
          + Klausel hinzufügen
        </button>
      )}
    </div>
  )
}
/** Operators for which no value input is rendered. */
const VALUELESS_OPERATORS: readonly ClauseOperator[] = ['exists', 'truthy', 'falsy']

/** Returns the operators offered for a profile field, chosen by the field's declared type. */
function operatorsForField(field: typeof PROFILE_FIELDS[number]): ClauseOperator[] {
  switch (field.type) {
    case 'enum':
      return ['eq', 'neq', 'in', 'not_in', 'exists', 'truthy', 'falsy']
    case 'boolean':
      return ['truthy', 'falsy', 'eq', 'neq']
    case 'number':
      return ['eq', 'neq', 'gt', 'gte', 'lt', 'lte']
    default:
      return ['eq', 'neq', 'in', 'not_in', 'exists']
  }
}

/**
 * Coerces a clause value into exactly what ValueInput displays for the given
 * field/operator pair, so the stored clause never differs from what is on screen.
 */
function coerceClauseValue(
  field: typeof PROFILE_FIELDS[number],
  op: ClauseOperator,
  value: unknown,
): unknown {
  // No input is rendered for these operators; keep whatever was stored.
  if (VALUELESS_OPERATORS.includes(op)) return value

  if (field.type === 'enum' && field.options) {
    const allowed = field.options.map((o) => o.value)
    if (op === 'in' || op === 'not_in') {
      // Multi-select: carry over a former single selection, drop unknown entries.
      const candidates: unknown[] = Array.isArray(value) ? value : [value]
      return candidates.filter(
        (v): v is string => typeof v === 'string' && allowed.includes(v),
      )
    }
    const single: unknown = Array.isArray(value) ? value[0] : value
    return typeof single === 'string' && allowed.includes(single) ? single : ''
  }
  if (field.type === 'number') return typeof value === 'number' ? value : 0
  if (field.type === 'boolean') return Boolean(value)
  return typeof value === 'string' ? value : ''
}

/**
 * One editable clause: field select, operator select and (for operators that
 * take a value) the matching value input.
 *
 * Changing the field or the operator also normalises the other parts of the
 * clause: an operator the new field does not offer is replaced by the field's
 * first operator, and the value is coerced to the shape the value input shows.
 * Without this the selects would display one thing while the clause stored another.
 *
 * @param clause   Clause to display.
 * @param onChange Receives the updated clause after each edit.
 * @param readOnly Disables all inputs of the row.
 */
function ClauseRow({
  clause, onChange, readOnly,
}: {
  clause: RuleClause
  onChange: (c: RuleClause) => void
  readOnly: boolean
}) {
  // Unknown field keys fall back to the first profile field for operator/input selection.
  const field = PROFILE_FIELDS.find((f) => f.key === clause.field) || PROFILE_FIELDS[0]
  const operators = operatorsForField(field)
  const requiresValue = !VALUELESS_OPERATORS.includes(clause.op)
  const multiValue = clause.op === 'in' || clause.op === 'not_in'

  const handleFieldChange = (key: string) => {
    const nextField = PROFILE_FIELDS.find((f) => f.key === key) || PROFILE_FIELDS[0]
    const nextOperators = operatorsForField(nextField)
    const op = nextOperators.includes(clause.op) ? clause.op : nextOperators[0]
    onChange({
      ...clause,
      field: key,
      op,
      value: coerceClauseValue(nextField, op, clause.value),
    })
  }

  const handleOperatorChange = (op: ClauseOperator) =>
    onChange({ ...clause, op, value: coerceClauseValue(field, op, clause.value) })

  return (
    <div className="flex-1 grid grid-cols-12 gap-1 items-center text-xs">
      <select
        className="col-span-4 px-1 py-0.5 border border-gray-300 rounded bg-white truncate"
        value={clause.field}
        disabled={readOnly}
        onChange={(e) => handleFieldChange(e.target.value)}
      >
        {PROFILE_FIELDS.map((f) => (
          <option key={f.key} value={f.key}>{f.label} ({f.key})</option>
        ))}
      </select>
      <select
        className="col-span-3 px-1 py-0.5 border border-gray-300 rounded bg-white"
        value={clause.op}
        disabled={readOnly}
        onChange={(e) => handleOperatorChange(e.target.value as ClauseOperator)}
      >
        {operators.map((op) => (
          <option key={op} value={op}>{OPERATOR_LABELS[op]}</option>
        ))}
      </select>
      <div className="col-span-5">
        {requiresValue && (
          <ValueInput
            field={field}
            multi={multiValue}
            value={clause.value}
            onChange={(v) => onChange({ ...clause, value: v })}
            readOnly={readOnly}
          />
        )}
      </div>
    </div>
  )
}
/**
 * Value widget of a clause, selected by the field definition:
 * enum fields with options render a select (multi-select when `multi` is set),
 * number fields a numeric input, boolean fields a true/false select, and
 * everything else a plain text input.
 *
 * Values of an unexpected runtime type are displayed as the widget's neutral
 * value (empty selection, 0, 'false' or '').
 */
function ValueInput(props: {
  field: typeof PROFILE_FIELDS[number]
  multi: boolean
  value: unknown
  onChange: (v: unknown) => void
  readOnly: boolean
}) {
  const { field, multi, value, onChange: emit, readOnly: locked } = props
  // Only enum fields that actually declare options get a select.
  const choices = field.type === 'enum' ? field.options : undefined

  if (choices && multi) {
    const picked = Array.isArray(value) ? (value as string[]) : []
    return (
      <select
        multiple
        disabled={locked}
        value={picked}
        className="w-full px-1 py-0.5 border border-gray-300 rounded bg-white h-16"
        onChange={(event) => {
          const chosen = Array.from(event.target.selectedOptions).map((opt) => opt.value)
          emit(chosen)
        }}
      >
        {choices.map((choice) => (
          <option key={choice.value} value={choice.value}>{choice.label}</option>
        ))}
      </select>
    )
  }

  if (choices) {
    const current = typeof value === 'string' ? value : ''
    return (
      <select
        disabled={locked}
        value={current}
        className="w-full px-1 py-0.5 border border-gray-300 rounded bg-white"
        onChange={(event) => emit(event.target.value)}
      >
        <option value=""> wählen </option>
        {choices.map((choice) => (
          <option key={choice.value} value={choice.value}>{choice.label}</option>
        ))}
      </select>
    )
  }

  switch (field.type) {
    case 'number': {
      const shown = typeof value === 'number' ? value : 0
      return (
        <input
          type="number"
          disabled={locked}
          value={shown}
          className="w-full px-1 py-0.5 border border-gray-300 rounded"
          onChange={(event) => emit(Number(event.target.value))}
        />
      )
    }
    case 'boolean': {
      const shown = value ? 'true' : 'false'
      return (
        <select
          disabled={locked}
          value={shown}
          className="w-full px-1 py-0.5 border border-gray-300 rounded bg-white"
          onChange={(event) => emit(event.target.value === 'true')}
        >
          <option value="true">true</option>
          <option value="false">false</option>
        </select>
      )
    }
    default: {
      // Strings and enum fields without options end up here.
      const shown = typeof value === 'string' ? value : ''
      return (
        <input
          type="text"
          disabled={locked}
          value={shown}
          className="w-full px-1 py-0.5 border border-gray-300 rounded"
          onChange={(event) => emit(event.target.value)}
        />
      )
    }
  }
}
@@ -0,0 +1,414 @@
'use client'
/**
* Rechte Spalte: Detail-Editor fuer die ausgewaehlte Regel.
*
* - Zeigt Live-Version + offenen Draft (falls vorhanden)
* - Erlaubt Draft-Edit (classification, conditions, source_citation, rationale)
* - Buttons: "Neuen Draft starten" (kopiert von Live), "Einreichen" (mit Pflicht
* change_summary-Modal), "Intern freigeben" (DSB), "Publish" (= Mandanten-Freigabe)
* - Versionshistorie + Approval-Trail unten als Akkordeon
*/
import { useEffect, useMemo, useState } from 'react'
import type {
ApprovalHistoryEntry, Classification, Rule, RuleCondition, RuleVersion,
} from '../_types'
import { CLASSIFICATION_LABELS, STATUS_LABELS } from '../_types'
import ConditionBuilder from './ConditionBuilder'
/**
 * Props of RuleEditor. All data and all lifecycle actions come from the
 * parent; the editor only keeps local form and dialog state.
 */
interface Props {
/** Rule whose title, document type and key are shown in the header. */
rule: Rule
/** Versions of the rule; the live version and the open draft/review version are looked up in this list. */
versions: RuleVersion[]
/** Approval-trail entries rendered in the history accordion. */
history: ApprovalHistoryEntry[]
/** Invoked by the "Neuen Draft starten" button with values copied from the live version (or defaults if there is none). */
onCreateDraft: (payload: {
classification: Classification
conditions: RuleCondition
source_citation: string
rationale?: string | null
}) => Promise<void>
/** Invoked by "Draft speichern" with the id of the open draft and the current form values. */
onUpdateDraft: (versionId: string, patch: {
classification?: Classification
conditions?: RuleCondition
source_citation?: string
rationale?: string | null
}) => Promise<void>
/** Invoked from the submit dialog with the trimmed, non-empty change summary. */
onSubmitForReview: (versionId: string, changeSummary: string) => Promise<void>
/** Invoked by the approve button while the open version is in review. */
onApprove: (versionId: string) => Promise<void>
/** Invoked by the publish button while the open version is in review. */
onPublish: (versionId: string) => Promise<void>
/** Invoked from the reject dialog with the trimmed, non-empty reason. */
onReject: (versionId: string, reason: string) => Promise<void>
}
/**
 * Detail editor for the selected rule.
 *
 * Shows the live version and the open draft/review version (if any), lets the
 * user edit classification, conditions, source citation and rationale while a
 * draft is open, and offers the lifecycle actions (create draft, save, submit
 * for review, approve, publish, reject). All persistence is delegated to the
 * callbacks passed in by the parent.
 *
 * The form is re-initialised whenever the rule or the id of the source version
 * (open draft first, otherwise live version) changes. If there is no source
 * version at all, the form is reset to its defaults so that values of a
 * previously selected rule are never shown for a rule without versions.
 * Unsaved local edits are discarded on such a re-initialisation.
 */
export default function RuleEditor({
  rule, versions, history,
  onCreateDraft, onUpdateDraft,
  onSubmitForReview, onApprove, onPublish, onReject,
}: Props) {
  const liveVersion = useMemo(
    () => versions.find((v) => v.is_live) || null,
    [versions],
  )
  // "Open" version: the first one that is still a draft or in review.
  const draftVersion = useMemo(
    () => versions.find((v) => ['draft', 'review'].includes(v.status)) || null,
    [versions],
  )

  // Form state
  const [classification, setClassification] = useState<Classification>('required')
  const [conditions, setConditions] = useState<RuleCondition>({ kind: 'all', clauses: [] })
  const [sourceCitation, setSourceCitation] = useState('')
  const [rationale, setRationale] = useState('')

  // Dialog / accordion state
  const [showSubmit, setShowSubmit] = useState(false)
  const [changeSummary, setChangeSummary] = useState('')
  const [showHistory, setShowHistory] = useState(false)
  const [rejectReason, setRejectReason] = useState('')
  const [showReject, setShowReject] = useState(false)

  // Sync the form with the source version (the open draft takes precedence).
  const sourceVersion = draftVersion || liveVersion
  useEffect(() => {
    // Fall back to the initial defaults when there is no version, otherwise the
    // form would keep displaying the values of the previously selected rule.
    setClassification(sourceVersion?.classification ?? 'required')
    setConditions(sourceVersion?.conditions ?? { kind: 'all', clauses: [] })
    setSourceCitation(sourceVersion?.source_citation ?? '')
    setRationale(sourceVersion?.rationale || '')
    // Keyed on ids only, on purpose: a reload that returns the same version
    // must not overwrite edits the user has not saved yet.
  }, [rule.id, sourceVersion?.id])

  const isDraftMode = !!draftVersion && draftVersion.status === 'draft'
  const isReviewMode = !!draftVersion && draftVersion.status === 'review'
  // The form is editable only while a version in status "draft" is open.
  const readOnly = !isDraftMode

  const handleCreateDraft = () => {
    // A new draft starts as a copy of the live version (or with defaults).
    onCreateDraft({
      classification: liveVersion?.classification || 'recommended',
      conditions: liveVersion?.conditions || { kind: 'all', clauses: [] },
      source_citation: liveVersion?.source_citation || '',
      rationale: liveVersion?.rationale,
    })
  }

  const handleSaveDraft = () => {
    if (!draftVersion) return
    onUpdateDraft(draftVersion.id, {
      classification, conditions, source_citation: sourceCitation, rationale,
    })
  }

  const handleSubmit = () => {
    if (!draftVersion || !changeSummary.trim()) return
    onSubmitForReview(draftVersion.id, changeSummary.trim())
    setShowSubmit(false)
    setChangeSummary('')
  }

  return (
    <div className="h-full flex flex-col overflow-hidden bg-white">
      <header className="px-5 py-3 border-b border-gray-200">
        <div className="flex items-baseline justify-between gap-3">
          <div className="min-w-0">
            <h2 className="text-base font-semibold text-gray-800 truncate">{rule.title}</h2>
            <div className="text-xs text-gray-500">
              <code>{rule.document_type}</code> · {rule.rule_key}
            </div>
          </div>
          <div className="flex items-center gap-2 text-xs text-gray-600">
            {liveVersion && (
              <span>
                Live: v{liveVersion.version_number} (
                <code>{liveVersion.classification}</code>)
              </span>
            )}
            {draftVersion && (
              <span className="px-1.5 py-0.5 bg-amber-100 text-amber-800 rounded border border-amber-300">
                Draft v{draftVersion.version_number} · {STATUS_LABELS[draftVersion.status]}
              </span>
            )}
          </div>
        </div>
      </header>
      <div className="flex-1 overflow-y-auto p-5 space-y-4">
        {!draftVersion && (
          <div className="bg-amber-50 border border-amber-200 rounded p-3 flex items-center justify-between">
            <span className="text-sm text-amber-800">
              Kein offener Draft. Starte einen neuen Draft, um die Regel zu ändern.
            </span>
            <button
              // Explicit type on every button: a bare <button> submits an enclosing <form>.
              type="button"
              className="px-3 py-1.5 text-sm bg-amber-600 text-white rounded hover:bg-amber-700"
              onClick={handleCreateDraft}
            >
              + Neuen Draft starten
            </button>
          </div>
        )}
        {/* Classification */}
        <section>
          <label className="text-xs font-medium text-gray-700 block mb-1">
            Klassifikation
          </label>
          <select
            className="text-sm px-2 py-1 border border-gray-300 rounded"
            value={classification}
            disabled={readOnly}
            onChange={(e) => setClassification(e.target.value as Classification)}
          >
            {(['required', 'recommended', 'optional'] as const).map((c) => (
              <option key={c} value={c}>{CLASSIFICATION_LABELS[c]}</option>
            ))}
          </select>
        </section>
        {/* Condition */}
        <section>
          <label className="text-xs font-medium text-gray-700 block mb-1">
            Bedingung
          </label>
          <ConditionBuilder
            value={conditions}
            onChange={setConditions}
            readOnly={readOnly}
          />
        </section>
        {/* Source citation (mandatory for submitting) */}
        <section>
          <label className="text-xs font-medium text-gray-700 block mb-1">
            Quelle / Norm-Citation <span className="text-rose-600">*</span>
          </label>
          <input
            type="text"
            className="w-full text-sm px-2 py-1.5 border border-gray-300 rounded"
            placeholder="z.B. § 12 HinSchG, Art. 28 DSGVO, EuGH C-311/18"
            value={sourceCitation}
            disabled={readOnly}
            onChange={(e) => setSourceCitation(e.target.value)}
          />
        </section>
        {/* Rationale */}
        <section>
          <label className="text-xs font-medium text-gray-700 block mb-1">
            Begründung / Rationale (optional)
          </label>
          <textarea
            className="w-full text-sm px-2 py-1.5 border border-gray-300 rounded"
            rows={3}
            placeholder="Anwalts-Kommentar, warum die Regel so klassifiziert ist…"
            value={rationale}
            disabled={readOnly}
            onChange={(e) => setRationale(e.target.value)}
          />
        </section>
        {/* Version history */}
        <section>
          <button
            type="button"
            className="text-xs text-gray-600 hover:text-gray-800"
            onClick={() => setShowHistory((v) => !v)}
          >
            {showHistory ? '▾' : '▸'} Versionshistorie + Approval-Trail ({versions.length} Versionen)
          </button>
          {showHistory && (
            <HistoryList versions={versions} history={history} />
          )}
        </section>
      </div>
      {/* Footer actions */}
      <footer className="px-5 py-3 border-t border-gray-200 bg-gray-50 flex items-center gap-2 flex-wrap">
        {isDraftMode && (
          <>
            <button
              type="button"
              className="px-3 py-1.5 text-sm border border-gray-300 rounded text-gray-700 hover:bg-white"
              onClick={handleSaveDraft}
            >
              Draft speichern
            </button>
            <button
              type="button"
              className="px-3 py-1.5 text-sm bg-amber-600 text-white rounded hover:bg-amber-700 disabled:opacity-50"
              disabled={!sourceCitation.trim()}
              onClick={() => setShowSubmit(true)}
              title={!sourceCitation.trim() ? 'Source Citation ist Pflicht' : ''}
            >
              Zur internen Prüfung einreichen
            </button>
          </>
        )}
        {isReviewMode && (
          <>
            <button
              type="button"
              className="px-3 py-1.5 text-sm bg-emerald-600 text-white rounded hover:bg-emerald-700"
              onClick={() => draftVersion && onApprove(draftVersion.id)}
            >
              Intern freigeben Mandant
            </button>
            <button
              type="button"
              className="px-3 py-1.5 text-sm bg-blue-600 text-white rounded hover:bg-blue-700"
              onClick={() => draftVersion && onPublish(draftVersion.id)}
              title="Wird sofort live (Test-Modus)"
            >
              Publish (sofort live)
            </button>
            <button
              type="button"
              className="px-3 py-1.5 text-sm border border-rose-300 text-rose-700 rounded hover:bg-rose-50"
              onClick={() => setShowReject(true)}
            >
              Ablehnen
            </button>
          </>
        )}
      </footer>
      {showSubmit && (
        <SubmitDialog
          value={changeSummary}
          onChange={setChangeSummary}
          onCancel={() => setShowSubmit(false)}
          onSubmit={handleSubmit}
        />
      )}
      {showReject && (
        <RejectDialog
          value={rejectReason}
          onChange={setRejectReason}
          onCancel={() => { setShowReject(false); setRejectReason('') }}
          onSubmit={() => {
            if (!draftVersion || !rejectReason.trim()) return
            onReject(draftVersion.id, rejectReason.trim())
            setShowReject(false); setRejectReason('')
          }}
        />
      )}
    </div>
  )
}
/**
 * Read-only listing of all versions of a rule followed by its approval trail.
 *
 * Each version shows number, status label, an optional live marker, creation
 * time and - when present - change summary and source citation. Each trail
 * entry shows time, action and - when present - approver and comment.
 *
 * @param versions Versions to list, rendered in the given order.
 * @param history  Approval-trail entries, rendered in the given order.
 */
function HistoryList({ versions, history }: { versions: RuleVersion[]; history: ApprovalHistoryEntry[] }) {
  return (
    <div className="mt-2 space-y-2 text-xs">
      <div>
        <div className="font-medium text-gray-700 mb-1">Versionen:</div>
        <ul className="space-y-1">
          {versions.map((v) => (
            <li key={v.id} className="bg-white border border-gray-200 rounded p-2">
              <div className="flex items-center gap-2">
                <span className="font-medium">v{v.version_number}</span>
                <span className="px-1.5 py-0.5 bg-gray-100 rounded">{STATUS_LABELS[v.status]}</span>
                {v.is_live && <span className="text-emerald-700"> Live</span>}
                <span className="text-gray-500 ml-auto">
                  {new Date(v.created_at).toLocaleString('de-DE')}
                </span>
              </div>
              {v.change_summary && (
                <div className="mt-1 text-gray-600">Änderung: {v.change_summary}</div>
              )}
              {v.source_citation && (
                <div className="mt-0.5 text-gray-500">Quelle: {v.source_citation}</div>
              )}
            </li>
          ))}
        </ul>
      </div>
      <div>
        <div className="font-medium text-gray-700 mb-1">Approval-Trail:</div>
        <ul className="space-y-0.5">
          {history.map((h) => (
            <li key={h.id} className="text-gray-600">
              {new Date(h.created_at).toLocaleString('de-DE')} · {h.action}
              {h.approver && ` · ${h.approver}`}
              {/* Separator added: the comment used to run straight into the preceding text. */}
              {h.comment && ` – ${h.comment}`}
            </li>
          ))}
        </ul>
      </div>
    </div>
  )
}
/**
 * Modal asking for the mandatory change summary before a draft is submitted
 * for internal review. Clicking the backdrop or "Abbrechen" cancels; the
 * submit button stays disabled while the summary is blank.
 */
function SubmitDialog(props: {
  value: string
  onChange: (s: string) => void
  onCancel: () => void
  onSubmit: () => void
}) {
  const { value: summary, onChange: setSummary, onCancel: dismiss, onSubmit: confirm } = props
  const summaryIsBlank = summary.trim() === ''

  return (
    <div onClick={dismiss} className="fixed inset-0 bg-black/30 z-50 flex items-center justify-center">
      {/* Clicks inside the panel must not reach the backdrop handler. */}
      <div onClick={(evt) => evt.stopPropagation()} className="bg-white rounded-lg shadow-xl w-[520px]">
        <header className="px-5 py-3 border-b border-gray-200">
          <h3 className="font-semibold">Zur internen Prüfung einreichen</h3>
        </header>
        <div className="p-5">
          <label className="text-xs font-medium text-gray-700">
            Was wurde geändert? <span className="text-rose-600">*</span>
          </label>
          <textarea
            autoFocus
            rows={4}
            value={summary}
            onChange={(evt) => setSummary(evt.target.value)}
            placeholder="z.B. Schwelle auf 50 MA angehoben (BAG-Urteil X)"
            className="w-full mt-1 text-sm px-2 py-1.5 border border-gray-300 rounded"
          />
        </div>
        <footer className="px-5 py-3 border-t border-gray-200 flex justify-end gap-2">
          <button onClick={dismiss} className="px-3 py-1.5 text-sm text-gray-600">Abbrechen</button>
          <button
            disabled={summaryIsBlank}
            onClick={confirm}
            className="px-4 py-1.5 text-sm bg-amber-600 text-white rounded disabled:opacity-50"
          >
            Einreichen
          </button>
        </footer>
      </div>
    </div>
  )
}
/**
 * Modal asking for the mandatory reason before a draft is rejected. Clicking
 * the backdrop or "Abbrechen" cancels; the reject button stays disabled while
 * the reason is blank.
 */
function RejectDialog(props: {
  value: string
  onChange: (s: string) => void
  onCancel: () => void
  onSubmit: () => void
}) {
  const { value: reason, onChange: setReason, onCancel: dismiss, onSubmit: confirm } = props
  const reasonIsBlank = reason.trim() === ''

  return (
    <div onClick={dismiss} className="fixed inset-0 bg-black/30 z-50 flex items-center justify-center">
      {/* Clicks inside the panel must not reach the backdrop handler. */}
      <div onClick={(evt) => evt.stopPropagation()} className="bg-white rounded-lg shadow-xl w-[480px]">
        <header className="px-5 py-3 border-b border-gray-200">
          <h3 className="font-semibold">Draft ablehnen</h3>
        </header>
        <div className="p-5">
          <label className="text-xs font-medium text-gray-700">
            Ablehnungsgrund <span className="text-rose-600">*</span>
          </label>
          <textarea
            autoFocus
            rows={3}
            value={reason}
            onChange={(evt) => setReason(evt.target.value)}
            className="w-full mt-1 text-sm px-2 py-1.5 border border-gray-300 rounded"
          />
        </div>
        <footer className="px-5 py-3 border-t border-gray-200 flex justify-end gap-2">
          <button onClick={dismiss} className="px-3 py-1.5 text-sm text-gray-600">Abbrechen</button>
          <button
            disabled={reasonIsBlank}
            onClick={confirm}
            className="px-4 py-1.5 text-sm bg-rose-600 text-white rounded disabled:opacity-50"
          >
            Ablehnen
          </button>
        </footer>
      </div>
    </div>
  )
}
@@ -0,0 +1,111 @@
'use client'
/**
* Linke Spalte: Liste der globalen Empfehlungs-Regeln.
*
* Filterbar nach document_type. Klassifikations-Chip + Live-Indikator.
*/
import { useMemo, useState } from 'react'
import type { Rule, RuleVersion } from '../_types'
import { CLASSIFICATION_LABELS, STATUS_LABELS } from '../_types'
/** Props of RuleList. */
interface Props {
/** All rules; the search box filters this list locally. */
rules: Rule[]
/** Version to display per rule id; rules without an entry get the "ohne Live-Version" badge. */
versionsByRule: Record<string, RuleVersion | undefined>
/** Id of the highlighted rule, or null when nothing is selected. */
selectedRuleId: string | null
/** Called with the rule id when a list entry is clicked. */
onSelectRule: (ruleId: string) => void
}
/**
 * Searchable list of rules.
 *
 * The search text is matched case-insensitively as a substring of the rule's
 * title, rule key or document type. Surrounding whitespace of the search text
 * is ignored, so a query such as "dsgvo " still matches.
 *
 * @param rules          Rules to list.
 * @param versionsByRule Version shown per rule id (classification chip, version line).
 * @param selectedRuleId Id of the highlighted rule.
 * @param onSelectRule   Called with the rule id when an entry is clicked.
 */
export default function RuleList({
  rules, versionsByRule, selectedRuleId, onSelectRule,
}: Props) {
  const [filter, setFilter] = useState('')

  const filtered = useMemo(() => {
    // Trim once and use the trimmed text for both the emptiness check and the
    // match; matching the untrimmed text made padded queries find nothing.
    const q = filter.trim().toLowerCase()
    if (!q) return rules
    return rules.filter(
      (r) =>
        r.title.toLowerCase().includes(q) ||
        r.rule_key.toLowerCase().includes(q) ||
        r.document_type.toLowerCase().includes(q),
    )
  }, [rules, filter])

  return (
    <div className="h-full flex flex-col overflow-hidden border-r border-gray-200 bg-gray-50">
      <div className="p-3 border-b border-gray-200 bg-white">
        <input
          type="text"
          placeholder="Suchen (Titel, Key, Doc-Type)…"
          value={filter}
          onChange={(e) => setFilter(e.target.value)}
          className="w-full text-sm px-2 py-1.5 border border-gray-300 rounded"
        />
        <div className="text-xs text-gray-500 mt-1">
          {filtered.length} von {rules.length} Regeln
        </div>
      </div>
      <ul className="flex-1 overflow-y-auto">
        {filtered.map((rule) => {
          const live = versionsByRule[rule.id]
          const isSelected = rule.id === selectedRuleId
          return (
            <li key={rule.id}>
              <button
                // Explicit type: a bare <button> would submit an enclosing <form>.
                type="button"
                className={`w-full text-left px-3 py-2 border-b border-gray-100 hover:bg-white ${
                  isSelected ? 'bg-white border-l-4 border-l-amber-500' : ''
                }`}
                onClick={() => onSelectRule(rule.id)}
              >
                <div className="flex items-center gap-2 mb-0.5">
                  {live && (
                    <ClassificationChip classification={live.classification} />
                  )}
                  {!live && (
                    <span className="px-1.5 py-0.5 text-xs rounded bg-gray-200 text-gray-600">
                      ohne Live-Version
                    </span>
                  )}
                </div>
                <div className="text-sm font-medium text-gray-800 truncate">
                  {rule.title}
                </div>
                <div className="text-xs text-gray-500 truncate">
                  <code>{rule.document_type}</code> · {rule.rule_key}
                </div>
                {live && (
                  <div className="text-[10px] text-gray-500 mt-0.5">
                    v{live.version_number} · {STATUS_LABELS[live.status]}
                    {live.is_live && (
                      <span className="ml-1 inline-block w-1.5 h-1.5 bg-emerald-500 rounded-full" />
                    )}
                  </div>
                )}
              </button>
            </li>
          )
        })}
        {filtered.length === 0 && (
          <li className="px-3 py-4 text-sm text-gray-500 italic">
            Keine Regeln gefunden.
          </li>
        )}
      </ul>
    </div>
  )
}
/** Small colored badge showing the label of a classification. */
function ClassificationChip(props: { classification: 'required' | 'recommended' | 'optional' }) {
  const level = props.classification
  // Tailwind color classes per classification.
  const palette = {
    required: 'bg-rose-100 text-rose-800 border-rose-300',
    recommended: 'bg-amber-100 text-amber-800 border-amber-300',
    optional: 'bg-slate-100 text-slate-700 border-slate-300',
  } as const
  const chipClass = `px-1.5 py-0.5 text-[10px] font-medium rounded border ${palette[level]}`

  return <span className={chipClass}>{CLASSIFICATION_LABELS[level]}</span>
}
@@ -0,0 +1,183 @@
/**
* Hook fuer Template-Rule-Editor: laedt Regeln/Versions/History und exponiert
* Lifecycle-Actions (submit/approve/publish/reject) + Tenant-Override-CRUD.
*
* Alle API-Calls gehen ueber /api/sdk/v1/compliance/* (Next.js-Proxy zum
* backend-compliance).
*/
import { useCallback } from 'react'
import type {
ApprovalHistoryEntry,
Classification,
Rule,
RuleCondition,
RuleVersion,
TenantRuleOverride,
} from '../_types'
/** Common URL prefix of every endpoint called by this hook. */
const API_BASE = '/api/sdk/v1/compliance'
/**
 * Performs a JSON request and returns the parsed response body.
 *
 * - `Content-Type: application/json` is sent unless the caller supplies its
 *   own `Content-Type`. Caller headers may be given in any form `fetch`
 *   accepts (plain object, tuple array or `Headers` instance).
 * - A non-2xx response rejects with `Error("<status>: <body>")`; if the body
 *   is empty or unreadable, the status text is used instead.
 * - A 204 response or an empty 2xx body resolves to `undefined`.
 *
 * @param url  Request URL.
 * @param init Optional `fetch` options; `headers` are merged as described above.
 * @returns The parsed JSON body, typed as `T` without runtime validation.
 * @throws Error on a non-2xx status; SyntaxError if a non-empty body is not valid JSON.
 */
async function req<T>(url: string, init?: RequestInit): Promise<T> {
  // Headers normalises all accepted header shapes; spreading a Headers
  // instance or tuple array into an object literal would silently lose them.
  const headers = new Headers(init?.headers)
  if (!headers.has('Content-Type')) headers.set('Content-Type', 'application/json')

  const res = await fetch(url, { ...init, headers })
  if (!res.ok) {
    const detail = await res.text().catch(() => '')
    throw new Error(`${res.status}: ${detail || res.statusText}`)
  }
  if (res.status === 204) return undefined as T

  // Read as text first: res.json() throws on an empty body.
  const raw = await res.text()
  return (raw.trim() ? JSON.parse(raw) : undefined) as T
}
/** Percent-encodes an id so that it occupies exactly one URL path segment. */
const pathSegment = (id: string): string => encodeURIComponent(id)

/**
 * Hook exposing the API calls of the template-rule editor: reading rules,
 * versions and approval history, the version lifecycle actions
 * (create/update draft, submit, approve, publish, reject) and CRUD for
 * tenant rule overrides.
 *
 * Every id is percent-encoded before it is placed into the URL path, in line
 * with how `listRules` already encodes its query value; an id containing
 * `/`, `?` or `#` can therefore no longer redirect the request to another
 * endpoint.
 *
 * Each function is memoised with an empty dependency list and keeps its
 * identity across renders. The returned object itself is a new literal on
 * every call, so hook dependency arrays should list the individual functions
 * rather than the object.
 *
 * @returns An object holding the action functions; each returns the promise of `req`.
 */
export function useRuleEditorActions() {
  // ---- rules ----
  const listRules = useCallback(
    (documentType?: string) => {
      const q = documentType ? `?document_type=${encodeURIComponent(documentType)}` : ''
      return req<Rule[]>(`${API_BASE}/template-rules${q}`)
    },
    [],
  )
  const getRule = useCallback(
    (ruleId: string) => req<Rule>(`${API_BASE}/template-rules/${pathSegment(ruleId)}`),
    [],
  )

  // ---- versions ----
  const listVersions = useCallback(
    (ruleId: string) =>
      req<RuleVersion[]>(`${API_BASE}/template-rules/${pathSegment(ruleId)}/versions`),
    [],
  )
  const getVersion = useCallback(
    (versionId: string) =>
      req<RuleVersion>(`${API_BASE}/template-rule-versions/${pathSegment(versionId)}`),
    [],
  )
  const createDraftVersion = useCallback(
    (
      ruleId: string,
      payload: {
        classification: Classification
        conditions: RuleCondition
        source_citation: string
        rationale?: string | null
        created_by?: string | null
      },
    ) =>
      req<RuleVersion>(`${API_BASE}/template-rules/${pathSegment(ruleId)}/versions`, {
        method: 'POST',
        // The rule id is sent in the body as well as in the path.
        body: JSON.stringify({
          rule_id: ruleId,
          ...payload,
        }),
      }),
    [],
  )
  const updateDraftVersion = useCallback(
    (
      versionId: string,
      patch: {
        classification?: Classification
        conditions?: RuleCondition
        source_citation?: string
        rationale?: string | null
        change_summary?: string | null
      },
    ) =>
      req<RuleVersion>(`${API_BASE}/template-rule-versions/${pathSegment(versionId)}`, {
        method: 'PATCH',
        body: JSON.stringify(patch),
      }),
    [],
  )

  // ---- lifecycle ----
  const submitForReview = useCallback(
    (
      versionId: string,
      payload: { change_summary: string; submitter?: string; comment?: string },
    ) =>
      req<RuleVersion>(
        `${API_BASE}/template-rule-versions/${pathSegment(versionId)}/submit-review`,
        {
          method: 'POST',
          body: JSON.stringify(payload),
        },
      ),
    [],
  )
  const approveVersion = useCallback(
    (versionId: string, payload: { approver?: string; comment?: string } = {}) =>
      req<RuleVersion>(`${API_BASE}/template-rule-versions/${pathSegment(versionId)}/approve`, {
        method: 'POST',
        body: JSON.stringify(payload),
      }),
    [],
  )
  const publishVersion = useCallback(
    (versionId: string, payload: { approver?: string; comment?: string } = {}) =>
      req<RuleVersion>(`${API_BASE}/template-rule-versions/${pathSegment(versionId)}/publish`, {
        method: 'POST',
        body: JSON.stringify(payload),
      }),
    [],
  )
  const rejectVersion = useCallback(
    (
      versionId: string,
      payload: { rejection_reason: string; rejector?: string; comment?: string },
    ) =>
      req<RuleVersion>(`${API_BASE}/template-rule-versions/${pathSegment(versionId)}/reject`, {
        method: 'POST',
        body: JSON.stringify(payload),
      }),
    [],
  )
  const getApprovalHistory = useCallback(
    (versionId: string) =>
      req<ApprovalHistoryEntry[]>(
        `${API_BASE}/template-rule-versions/${pathSegment(versionId)}/approval-history`,
      ),
    [],
  )

  // ---- tenant overrides ----
  const listOverrides = useCallback(
    () => req<TenantRuleOverride[]>(`${API_BASE}/tenant-rule-overrides`),
    [],
  )
  const upsertOverride = useCallback(
    (payload: {
      rule_id: string
      override_classification: Classification | null
      reason: string
      created_by?: string
    }) =>
      req<TenantRuleOverride>(`${API_BASE}/tenant-rule-overrides`, {
        method: 'POST',
        body: JSON.stringify(payload),
      }),
    [],
  )
  const deleteOverride = useCallback(
    (overrideId: string) =>
      req<void>(`${API_BASE}/tenant-rule-overrides/${pathSegment(overrideId)}`, {
        method: 'DELETE',
      }),
    [],
  )

  return {
    listRules, getRule,
    listVersions, getVersion,
    createDraftVersion, updateDraftVersion,
    submitForReview, approveVersion, publishVersion, rejectVersion,
    getApprovalHistory,
    listOverrides, upsertOverride, deleteOverride,
  }
}
@@ -0,0 +1,246 @@
/**
* Types fuer den Template-Rule-Editor (SDK).
*
* Spiegeln die Pydantic-Modelle aus
* backend-compliance/compliance/schemas/template_rule.py.
*/
/** Classification a rule version (or a tenant override) assigns. */
export type Classification = 'required' | 'recommended' | 'optional'
/** Workflow status of a rule version. */
export type RuleStatus =
| 'draft' | 'review' | 'approved' | 'published' | 'archived' | 'rejected'
/** Operator of a single condition clause. */
export type ClauseOperator =
| 'eq' | 'neq' | 'in' | 'not_in'
| 'gt' | 'gte' | 'lt' | 'lte'
| 'exists' | 'truthy' | 'falsy'
/** One comparison inside a condition. */
export interface RuleClause {
/** Key of the field the clause tests. */
field: string
/** Comparison operator. */
op: ClauseOperator
/** Comparison value; optional and untyped, so consumers must narrow it before use. */
value?: unknown
}
/** A list of clauses combined with "all" or "any". */
export interface RuleCondition {
kind: 'all' | 'any'
clauses: RuleClause[]
}
/** A rule as returned by the template-rules endpoints. */
export interface Rule {
id: string
rule_key: string
document_type: string
title: string
// NOTE(review): presumably the id of the rule's current/live version - confirm against the backend schema.
current_version_id: string | null
// Timestamps arrive as strings (parsed with `new Date(...)` where displayed).
created_at: string
updated_at: string | null
}
/**
 * One version of a rule: its content (classification, conditions, citation,
 * rationale), its workflow status and per-step audit fields.
 */
export interface RuleVersion {
id: string
rule_id: string
version_number: number
/** Workflow status of this version. */
status: RuleStatus
/** Whether this version is flagged as live. */
is_live: boolean
classification: Classification
conditions: RuleCondition
source_citation: string
rationale: string | null
change_summary: string | null
// Audit fields: who/when per workflow step; null when not set.
created_by: string | null
submitted_by: string | null
submitted_at: string | null
approved_by: string | null
approved_at: string | null
published_by: string | null
published_at: string | null
rejected_by: string | null
rejected_at: string | null
rejection_reason: string | null
created_at: string
updated_at: string | null
}
/** One entry of a version's approval trail. */
export interface ApprovalHistoryEntry {
id: string
version_id: string
/** Name of the action; a free-form string on the client side. */
action: string
approver: string | null
comment: string | null
created_at: string
}
/** Tenant-specific override of a rule's classification. */
export interface TenantRuleOverride {
id: string
tenant_id: string
rule_id: string
// NOTE(review): the meaning of null is not visible in this file - confirm against the backend schema.
override_classification: Classification | null
reason: string
created_by: string | null
created_at: string
updated_at: string | null
}
// ---- Profile fields for the condition builder ----
/** Describes one profile field that can be referenced by a condition clause. */
export interface ProfileFieldOption {
  /** Key used in the profile */
  key: string
  /** Label shown in the UI */
  label: string
  /** Category used for grouping */
  category: 'org' | 'proc' | 'prod' | 'comp' | 'tech' | 'compliance'
  /** Expected data type */
  type: 'string' | 'number' | 'boolean' | 'enum'
  /** For enum fields: the possible values with their labels */
  options?: { value: string; label: string }[]
}

/**
 * Profile fields offered by the condition builder (18 entries).
 *
 * According to the original note these are the fields used by the initial
 * rules, ported from templateRecommendations.ts, with
 * `compliance_depth_level` added. The first entry is also the field a newly
 * added clause starts with.
 *
 * The labels of the employee-count ranges are written with an en dash
 * ("1–9", "10–49", ...); without the dash they read as the plain numbers
 * 19, 1049, 50249 and 250999.
 */
export const PROFILE_FIELDS: ProfileFieldOption[] = [
  {
    key: 'compliance_depth_level',
    label: 'Compliance-Tiefe',
    category: 'compliance', type: 'enum',
    options: [
      { value: 'L1', label: 'L1 — Lean Startup' },
      { value: 'L2', label: 'L2 — Standard' },
      { value: 'L3', label: 'L3 — Strict' },
      { value: 'L4', label: 'L4 — Zertifizierungsbereit' },
    ],
  },
  {
    key: 'org_employee_count',
    label: 'Mitarbeiterzahl',
    category: 'org', type: 'enum',
    options: [
      { value: 'none', label: 'Keine' },
      { value: '1_9', label: '1–9' },
      { value: '10_49', label: '10–49' },
      { value: '50_249', label: '50–249' },
      { value: '250_999', label: '250–999' },
      { value: '1000_plus', label: '1000+' },
    ],
  },
  {
    key: 'org_has_employees', label: 'Hat Mitarbeiter', category: 'org', type: 'enum',
    options: [{ value: 'yes', label: 'Ja' }, { value: 'no', label: 'Nein' }],
  },
  {
    key: 'org_business_model', label: 'Geschäftsmodell', category: 'org', type: 'enum',
    options: [
      { value: 'b2b_saas', label: 'B2B SaaS' },
      { value: 'b2c_shop', label: 'B2C Shop' },
      { value: 'platform', label: 'Plattform' },
      { value: 'marketplace', label: 'Marktplatz' },
      { value: 'social', label: 'Social Media' },
      { value: 'saas', label: 'SaaS' },
      { value: 'media', label: 'Media' },
      { value: 'manufacturing', label: 'Maschinenbau' },
      { value: 'other', label: 'Sonstiges' },
    ],
  },
  {
    key: 'org_has_social_media', label: 'Hat Social Media', category: 'org', type: 'enum',
    options: [{ value: 'yes', label: 'Ja' }, { value: 'no', label: 'Nein' }],
  },
  {
    key: 'org_has_video_conferencing', label: 'Hat Video-Konferenzen', category: 'org', type: 'enum',
    options: [{ value: 'yes', label: 'Ja' }, { value: 'no', label: 'Nein' }],
  },
  {
    key: 'org_cert_target', label: 'Zertifizierungsziel', category: 'org', type: 'enum',
    options: [
      { value: 'none', label: 'Keines' },
      { value: 'iso27001', label: 'ISO 27001' },
      { value: 'iso27701', label: 'ISO 27701' },
      { value: 'tisax', label: 'TISAX' },
    ],
  },
  {
    key: 'proc_ai_usage', label: 'KI-Nutzung', category: 'proc', type: 'enum',
    options: [
      { value: 'none', label: 'Keine' },
      { value: 'limited', label: 'Begrenzt' },
      { value: 'extensive', label: 'Umfangreich' },
    ],
  },
  {
    key: 'proc_uses_ai_tools', label: 'Nutzt KI-Tools', category: 'proc', type: 'boolean',
  },
  {
    key: 'proc_byod_allowed', label: 'BYOD erlaubt', category: 'proc', type: 'enum',
    options: [{ value: 'yes', label: 'Ja' }, { value: 'no', label: 'Nein' }],
  },
  {
    key: 'proc_dsfa_required', label: 'DSFA erforderlich', category: 'proc', type: 'enum',
    options: [{ value: 'yes', label: 'Ja' }, { value: 'no', label: 'Nein' }],
  },
  {
    key: 'prod_webshop', label: 'Webshop', category: 'prod', type: 'enum',
    options: [{ value: 'yes', label: 'Ja' }, { value: 'no', label: 'Nein' }],
  },
  {
    key: 'prod_ugc_platform', label: 'UGC-Plattform', category: 'prod', type: 'enum',
    options: [{ value: 'yes', label: 'Ja' }, { value: 'no', label: 'Nein' }],
  },
  {
    key: 'prod_consent_management', label: 'Consent Management', category: 'prod', type: 'enum',
    options: [{ value: 'yes', label: 'Ja' }, { value: 'no', label: 'Nein' }],
  },
  {
    key: 'comp_has_processors', label: 'Auftragsverarbeiter', category: 'comp', type: 'enum',
    options: [{ value: 'yes', label: 'Ja' }, { value: 'no', label: 'Nein' }],
  },
  {
    key: 'comp_vendor_management', label: 'Vendor-Management', category: 'comp', type: 'enum',
    options: [{ value: 'yes', label: 'Ja' }, { value: 'no', label: 'Nein' }],
  },
  {
    key: 'comp_dsfa_processes', label: 'DSFA-Prozesse', category: 'comp', type: 'enum',
    options: [{ value: 'required', label: 'Erforderlich' }, { value: 'optional', label: 'Optional' }],
  },
  {
    key: 'tech_third_country', label: 'Drittland-Transfer', category: 'tech', type: 'enum',
    options: [
      { value: 'no', label: 'Nein' },
      { value: 'us_dpf_only', label: 'Nur US-DPF' },
      { value: 'adequate_only', label: 'Nur Angemessenheitsbeschluss' },
      { value: 'yes_us', label: 'Ja, USA' },
      { value: 'yes_other', label: 'Ja, Sonstige' },
    ],
  },
]
/**
 * German display labels for the clause operators, keyed by `ClauseOperator`.
 * Typing the table as `Record<ClauseOperator, string>` makes the compiler
 * demand an entry for every operator in the union.
 */
export const OPERATOR_LABELS: Record<ClauseOperator, string> = {
eq: 'gleich (=)',
neq: 'ungleich (≠)',
in: 'in Liste',
not_in: 'nicht in Liste',
gt: 'größer (>)',
gte: 'größer/gleich (≥)',
lt: 'kleiner (<)',
lte: 'kleiner/gleich (≤)',
exists: 'existiert',
truthy: 'ist gesetzt',
falsy: 'ist leer',
}
/** German display labels for each `Classification` value. */
export const CLASSIFICATION_LABELS: Record<Classification, string> = {
required: 'Pflicht',
recommended: 'Empfohlen',
optional: 'Optional',
}
/** German display labels for each `RuleStatus` value of a rule version. */
export const STATUS_LABELS: Record<RuleStatus, string> = {
draft: 'Entwurf',
review: 'In Prüfung',
approved: 'Freigegeben',
published: 'Live',
archived: 'Archiviert',
rejected: 'Abgelehnt',
}
@@ -0,0 +1,205 @@
'use client'
/**
* Template Rule Editor — Editorial-UI fuer Anwaelte/DSBs.
*
* Architektur:
* - Links: RuleList mit Filter
* - Rechts: RuleEditor mit Klassifikation, Condition-Builder, Source-Citation,
* Approval-Workflow (draft → review → approved → published)
*
* Backend: /api/sdk/v1/compliance/template-rules + /template-rule-versions/*
*/
import { useEffect, useState, useCallback } from 'react'
import { useSDK } from '@/lib/sdk'
import StepHeader from '@/components/sdk/StepHeader/StepHeader'
import { useRuleEditorActions } from './_hooks/useRuleEditorActions'
import type {
ApprovalHistoryEntry, Classification, Rule, RuleCondition, RuleVersion,
} from './_types'
import RuleList from './_components/RuleList'
import RuleEditor from './_components/RuleEditor'
/**
 * Turns an arbitrary thrown value into a message for the error banner.
 * Thrown values are `unknown`; casting them to `Error` would yield
 * `undefined` for non-Error throws and silently hide the failure.
 */
function describeError(e: unknown): string {
  if (e instanceof Error && e.message) return e.message
  if (typeof e === 'string' && e.length > 0) return e
  return 'Unbekannter Fehler'
}

/**
 * Page component of the template rule editor.
 *
 * Left column: the rule list with the live version of each rule.
 * Right column: the editor for the selected rule (versions, approval history
 * and the workflow actions draft → review → approved → published).
 *
 * All backend access goes through `useRuleEditorActions()`.
 */
export default function TemplateRuleEditorPage() {
  useSDK()
  const actions = useRuleEditorActions()
  const [rules, setRules] = useState<Rule[]>([])
  const [liveVersionsByRule, setLiveVersionsByRule] = useState<Record<string, RuleVersion | undefined>>({})
  const [selectedRuleId, setSelectedRuleId] = useState<string | null>(null)
  const [selectedVersions, setSelectedVersions] = useState<RuleVersion[]>([])
  const [selectedHistory, setSelectedHistory] = useState<ApprovalHistoryEntry[]>([])
  const [loading, setLoading] = useState(true)
  const [error, setError] = useState<string | null>(null)

  // Loads all rules plus, per rule, the version flagged `is_live`.
  const loadRules = useCallback(async () => {
    setLoading(true)
    setError(null)
    try {
      const list = await actions.listRules()
      setRules(list)
      const byRule: Record<string, RuleVersion | undefined> = {}
      // Fetch the versions of all rules in parallel. A failure for a single
      // rule only leaves that rule without a live version.
      await Promise.all(
        list.map(async (r) => {
          try {
            const versions = await actions.listVersions(r.id)
            byRule[r.id] = versions.find((v) => v.is_live)
          } catch {
            byRule[r.id] = undefined
          }
        }),
      )
      setLiveVersionsByRule(byRule)
      const first = list[0]
      if (first) {
        // Functional update: decide on the latest selection instead of the
        // value captured when this callback was created.
        setSelectedRuleId((current) => current || first.id)
      }
    } catch (e) {
      setError(describeError(e))
    } finally {
      setLoading(false)
    }
  }, [actions])

  // Loads the versions of the selected rule and the approval history of its
  // live version (empty history when no version is live).
  const loadSelected = useCallback(async () => {
    if (!selectedRuleId) {
      setSelectedVersions([])
      setSelectedHistory([])
      return
    }
    try {
      const versions = await actions.listVersions(selectedRuleId)
      setSelectedVersions(versions)
      const live = versions.find((v) => v.is_live)
      if (live) {
        const history = await actions.getApprovalHistory(live.id)
        setSelectedHistory(history)
      } else {
        setSelectedHistory([])
      }
    } catch (e) {
      setError(describeError(e))
    }
  }, [actions, selectedRuleId])

  // Run once on mount / whenever the selection changes. The loader callbacks
  // are deliberately left out of the dependency lists, as in the original.
  useEffect(() => { void loadRules() }, [])
  useEffect(() => { void loadSelected() }, [selectedRuleId])

  /**
   * Shared wrapper for all workflow actions: clears a stale error banner,
   * runs the action, refreshes the affected data and reports failures.
   */
  const runAction = async (
    action: () => Promise<unknown>,
    options: { reloadRules?: boolean } = {},
  ): Promise<void> => {
    setError(null)
    try {
      await action()
      if (options.reloadRules) {
        await loadRules()
      }
      await loadSelected()
    } catch (e) {
      setError(describeError(e))
    }
  }

  const handleCreateDraft = async (payload: {
    classification: Classification
    conditions: RuleCondition
    source_citation: string
    rationale?: string | null
  }) => {
    if (!selectedRuleId) return
    await runAction(() => actions.createDraftVersion(selectedRuleId, payload))
  }

  const handleUpdateDraft = async (versionId: string, patch: {
    classification?: Classification
    conditions?: RuleCondition
    source_citation?: string
    rationale?: string | null
  }) => {
    await runAction(() => actions.updateDraftVersion(versionId, patch))
  }

  const handleSubmitForReview = async (versionId: string, changeSummary: string) => {
    await runAction(() => actions.submitForReview(versionId, { change_summary: changeSummary }))
  }

  const handleApprove = async (versionId: string) => {
    await runAction(() => actions.approveVersion(versionId))
  }

  // Publishing changes which version is live, so the rule list is reloaded too.
  const handlePublish = async (versionId: string) => {
    await runAction(() => actions.publishVersion(versionId), { reloadRules: true })
  }

  const handleReject = async (versionId: string, reason: string) => {
    await runAction(() => actions.rejectVersion(versionId, { rejection_reason: reason }))
  }

  const selectedRule = rules.find((r) => r.id === selectedRuleId)

  return (
    <div className="h-full flex flex-col bg-white">
      <StepHeader
        stepId="template-rule-editor"
        title="Empfehlungs-Regeln"
        description="Editorial-UI für profilbasierte Dokument-Empfehlungen. Anwälte/DSBs editieren globale Regeln mit Approval-Workflow + Quellen-Attribution."
      />
      {error && (
        <div className="px-5 py-2 bg-rose-50 border-b border-rose-200 text-sm text-rose-800">
          {error}
        </div>
      )}
      {loading && (
        <div className="p-5 text-sm text-gray-500">Lade Regeln</div>
      )}
      {!loading && (
        <div className="flex-1 grid grid-cols-[320px_1fr] overflow-hidden">
          <RuleList
            rules={rules}
            versionsByRule={liveVersionsByRule}
            selectedRuleId={selectedRuleId}
            onSelectRule={setSelectedRuleId}
          />
          {selectedRule ? (
            <RuleEditor
              rule={selectedRule}
              versions={selectedVersions}
              history={selectedHistory}
              onCreateDraft={handleCreateDraft}
              onUpdateDraft={handleUpdateDraft}
              onSubmitForReview={handleSubmitForReview}
              onApprove={handleApprove}
              onPublish={handlePublish}
              onReject={handleReject}
            />
          ) : (
            <div className="h-full grid place-items-center text-sm text-gray-500">
              Wähle links eine Regel zum Bearbeiten.
            </div>
          )}
        </div>
      )}
    </div>
  )
}
@@ -494,4 +494,18 @@ export const SDK_STEPS: SDKStep[] = [
prerequisiteSteps: [],
isOptional: true,
},
{
id: 'template-rule-editor',
seq: 5000,
phase: 2,
package: 'betrieb',
order: 13,
name: 'Empfehlungs-Regeln',
nameShort: 'Regeln',
description: 'Editorial-UI fuer profilbasierte Dokument-Empfehlungen (Anwalt/DSB)',
url: '/sdk/template-rule-editor',
checkpointId: 'CP-RULES',
prerequisiteSteps: [],
isOptional: true,
},
]
@@ -1,336 +1,49 @@
"""Screenshot-basierte Cookie-Extraktion mit Tesseract-OCR.
"""Screenshot-basierte Cookie-Extraktion (Orchestration).
Pipeline:
1. consent-tester macht Full-Page-Screenshot (Banner akzeptiert,
Accordions ausgeklappt, Timestamp eingebrannt) → PNG b64
2. Tesseract OCR (lang=deu, psm=4) → Rohtext mit Tabellen-Reihen
3. _parse_ocr_cookie_table(text) → strukturierte Liste {name, category,
purpose, duration, type, vendor}
3. parse_ocr_cookie_table(text) → strukturierte Liste
Funktioniert site-unabhaengig — egal welches CMP, egal welche Sprache
(Tesseract kann viele), egal welches DOM-Layout. Timestamp im Bild =
Beweis was wir zum Scan-Zeitpunkt wirklich gesehen haben.
Phase-1-Split (2026-06-06): Engine-Funktionen
(_slice_screenshot / vision-OCR / paddle / tesseract / parse) leben
jetzt in `cookie_screenshot_ocr_engines.py`. Re-Exports halten die
Public-API stabil — externe Importer (`_phase_d1_vendors_raw.py`)
brauchen keinen Code-Change.
"""
from __future__ import annotations
import base64 as _b64
import json
import logging
import os
import re
import httpx
logger = logging.getLogger(__name__)
from .cookie_screenshot_ocr_engines import ( # noqa: F401 (re-exports)
OLLAMA_URL,
VISION_MODEL,
VISION_PROMPT,
_PADDLE_OCR,
_call_vision_on_slice,
_slice_screenshot,
ocr_screenshot_via_paddle,
ocr_screenshot_via_tesseract,
ocr_screenshot_via_vision_slices,
parse_ocr_cookie_table,
parse_vision_response,
)
logger = logging.getLogger(__name__)
CONSENT_TESTER_URL = os.getenv(
"CONSENT_TESTER_URL", "http://bp-compliance-consent-tester:8094"
)
VISION_MODEL = os.getenv("COOKIE_VISION_MODEL", "qwen2.5vl:32b")
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
def _slice_screenshot(png_bytes: bytes, slice_h: int = 1500,
max_slices: int = 25) -> list[str]:
"""Cut a tall full-page screenshot into 1280×slice_h slices and return
each as base64-encoded PNG. Vision models choke on 25k-tall images
(resampled down to ~1024 → unreadable text); slicing keeps DPI."""
if not png_bytes:
return []
try:
from PIL import Image
from io import BytesIO
except ImportError:
return []
img = Image.open(BytesIO(png_bytes)).convert("RGB")
w, h = img.size
n = min((h + slice_h - 1) // slice_h, max_slices)
out: list[str] = []
for i in range(n):
top = i * slice_h
bot = min((i + 1) * slice_h, h)
chunk = img.crop((0, top, w, bot))
buf = BytesIO()
chunk.save(buf, format="PNG", optimize=True)
out.append(_b64.b64encode(buf.getvalue()).decode("ascii"))
return out
async def _call_vision_on_slice(b64_png: str, timeout_s: float = 240.0) -> str:
"""Ask the vision model to dump all cookie-row text from one slice
as raw text (NOT JSON). We parse it downstream with parse_flat regex."""
prompt = (
"Du siehst einen Bildausschnitt einer Cookie-Richtlinien-Tabelle. "
"Liste ALLE Tabellen-Zeilen wortwoertlich auf, eine Zeile pro "
"Cookie. Jede Zeile soll enthalten: Cookie-Name, Kategorie, "
"Zweck, Speicherdauer, Art (Permanent/Session). "
"Format: '<Name> | <Kategorie> | <Zweck> | <Dauer> | <Art>'. "
"KEINE Cookies erfinden, nur was im Bild steht. Nur die Tabellen-"
"Zeilen, keine Erklaerungen."
)
payload = {
"model": VISION_MODEL,
"stream": False,
"messages": [{
"role": "user", "content": prompt, "images": [b64_png],
}],
"options": {"temperature": 0.05, "num_predict": 4000},
}
try:
async with httpx.AsyncClient(timeout=timeout_s) as c:
r = await c.post(f"{OLLAMA_URL.rstrip('/')}/api/chat", json=payload)
r.raise_for_status()
return (r.json().get("message") or {}).get("content", "") or ""
except Exception as e:
logger.debug("vision slice failed: %s", e)
return ""
async def ocr_screenshot_via_vision_slices(png_bytes: bytes,
max_slices: int = 20) -> str:
"""Slice + vision-OCR each slice + concatenate. Returns raw text that
can be fed to parse_flat_cookie_text."""
slices = _slice_screenshot(png_bytes, slice_h=1500, max_slices=max_slices)
if not slices:
return ""
logger.info("Vision-slicing: %d slices → vision-OCR (model=%s)",
len(slices), VISION_MODEL)
import asyncio as _aio
# Run slices SEQUENTIALLY: ollama is single-GPU and loading the same
# model for parallel requests causes OOM + thrashing on Mac Mini.
parts: list[str] = []
for i, s in enumerate(slices):
txt = await _call_vision_on_slice(s)
if txt:
parts.append(txt)
logger.info("Vision-slice %d/%d: %d chars", i + 1, len(slices),
len(txt))
full = "\n".join(parts)
logger.info("Vision-OCR slicing total: %d chars from %d slices",
len(full), len(slices))
return full
def ocr_screenshot_via_paddle(png_bytes: bytes) -> str:
"""Run PaddleOCR over the full-page screenshot, returning the
concatenated text. Deterministic, no LLM halluzination.
Splits tall screenshots into 1280x3000 slices so OCR works in chunks
without OOM on large pages (VW cookie-page is ~25k px tall).
"""
if not png_bytes:
return ""
try:
from PIL import Image
from io import BytesIO
from paddleocr import PaddleOCR
except ImportError as e:
logger.warning("PaddleOCR / PIL not available: %s", e)
return ""
try:
img = Image.open(BytesIO(png_bytes)).convert("RGB")
except Exception as e:
logger.warning("PIL open failed: %s", e)
return ""
w, h = img.size
slice_h = 3000
n_slices = (h + slice_h - 1) // slice_h
logger.info("PaddleOCR: %dx%d screenshot → %d slices of %d high",
w, h, n_slices, slice_h)
# Global OCR instance reused — initial init is ~10s.
global _PADDLE_OCR
if "_PADDLE_OCR" not in globals() or _PADDLE_OCR is None:
try:
_PADDLE_OCR = PaddleOCR(use_angle_cls=False, lang="german",
show_log=False)
except Exception as e:
logger.warning("PaddleOCR init failed: %s", e)
return ""
parts: list[str] = []
import numpy as np
for i in range(n_slices):
top = i * slice_h
bot = min((i + 1) * slice_h, h)
crop = img.crop((0, top, w, bot))
arr = np.array(crop)
try:
result = _PADDLE_OCR.ocr(arr, cls=False)
except Exception as e:
logger.warning("PaddleOCR slice %d failed: %s", i, e)
continue
# PaddleOCR returns list-of-lines where each line is
# [bbox, (text, conf)] — variable nesting depending on version.
if not result:
continue
for page in result:
if not page: continue
for line in page:
if not line: continue
try:
if isinstance(line, list) and len(line) >= 2:
txt = line[1][0] if isinstance(line[1], (list, tuple)) else str(line[1])
else:
txt = str(line)
if txt: parts.append(txt)
except Exception:
continue
full_text = "\n".join(parts)
logger.info("PaddleOCR: extracted %d lines / %d chars from %d slices",
len(parts), len(full_text), n_slices)
return full_text
_PADDLE_OCR = None
# ── Tesseract-based parser ────────────────────────────────────────────
def ocr_screenshot_via_tesseract(png_bytes: bytes,
lang: str = "deu",
psm: int = 4) -> str:
"""Run Tesseract OCR on a full-page screenshot. Returns normalized text
where multi-newline paragraphs are collapsed but blank lines preserved
(helps anchor-based parsing).
psm=4 means single column of text of variable sizes (cookie-tables).
"""
if not png_bytes:
return ""
try:
import pytesseract
from PIL import Image
from io import BytesIO
import re as _re
except ImportError as e:
logger.warning("tesseract/PIL not available: %s", e)
return ""
try:
img = Image.open(BytesIO(png_bytes)).convert("RGB")
raw = pytesseract.image_to_string(img, lang=lang,
config=f"--psm {psm}")
# Collapse intra-paragraph newlines so OCR cells flow on one line.
norm = _re.sub(r"[ \t]+", " ", raw)
norm = _re.sub(r"\n(?!\s*\n)", " ", norm)
norm = _re.sub(r"\s{2,}", " ", norm)
logger.info(
"Tesseract OCR: %d chars / %d words (image %dx%d)",
len(norm), len(norm.split()), img.size[0], img.size[1],
)
return norm
except Exception as e:
logger.warning("Tesseract OCR failed: %s (%s)",
str(e) or "(no msg)", type(e).__name__)
return ""
# Kategorie-Anchor-Tokens that ALWAYS follow the Cookie-Name in the
# typical column layout: [NAME] [KATEGORIE] [ZWECK] [DAUER] [ART]
_CATEGORY_ANCHORS = (
r"Funktionscookie", r"Trackingcookie",
r"Tracking Cookies?", r"Session Cookies?",
r"Funktional", r"Marketing", r"Analytics", r"Necessary",
r"Werbung", r"Personalisierung", r"Statistik",
r"Notwendig", r"Erforderlich",
)
_CATEGORY_PATTERN = "(?:" + "|".join(_CATEGORY_ANCHORS) + r")(?:\s*\([^)]*\))?"
# Cookie-Name: alphanum + underscore + dash + dot. Wir erlauben optional
# einen Suffix-Underscore (Spalten-Umbruch bei VW: `VWD6_ENSIGHTEN_PRIVACY_`
# als Name-Fragment). Mind. 3, max. 60 chars.
_COOKIE_NAME_RE = (
r"(?:[A-Za-z][\w\-.]{2,60}|[A-Za-z][\w\-.]{2,60}<[^>]+>)"
)
def parse_ocr_cookie_table(text: str) -> list[dict]:
"""Extract cookie-records from Tesseract-OCR text using anchor-based
pattern: <name> <category> <purpose...> <duration> <type>.
Returns list of {name, category, purpose, duration, type}. Vendor is
NOT inferred here — caller maps via _guess_vendor.
KEINE Cookie-Namens-Korrektur — `awsalb` bleibt `awsalb`, nicht
`awesome`. Falsche Korrektur waere ein Compliance-Verlust.
"""
if not text or len(text) < 200:
return []
import re as _re
# Pattern: capture name + anchor category, then up to 250 chars
# forward to grab duration + type tokens.
pattern = _re.compile(
rf"(?P<name>{_COOKIE_NAME_RE})\s+"
rf"(?P<category>{_CATEGORY_PATTERN})"
rf"(?P<rest>[^A-Z]{{0,300}}?)"
rf"(?:(?P<duration>\d+(?:[.,]\s*)?\s*(?:Tage|Jahre?|Monate?|Minuten|Stunden|Sekunden)\.?)?\s*"
rf"(?P<type>Permanent/Protokoll|Session\s*Cookie|Persistent\s*Cookie|Persistent\s*cookie))?",
_re.IGNORECASE | _re.DOTALL,
)
seen_names: set[str] = set()
out: list[dict] = []
for m in pattern.finditer(text):
name = (m.group("name") or "").strip()
# Filter obvious garbage (UI strings, navigation, common words)
if not name or len(name) < 3:
continue
nl = name.lower()
if nl in seen_names:
continue
# Reject common non-cookie words. Cookie-Namen sind technische IDs:
# haben oft Unterstrich/Bindestrich/Camel-Case oder sind kurze IDs.
if nl in ("name", "art", "zweck", "dauer", "kategorie", "anbieter",
"cookie", "cookies", "name des cookies",
"this", "dieser", "diese", "alle", "und", "von", "der",
"die", "das", "ein", "eine", "session", "permanent",
"category"):
continue
# Cookie-Namen sollen kein reines Lower-Word sein OHNE _ oder -
# (z.B. "verwendet" wuerde sonst matchen)
has_marker = any(c in name for c in "_-.<>")
is_caps = name.upper() == name and len(name) >= 3
is_camel = any(c.isupper() for c in name[1:]) and any(c.islower() for c in name)
if not (has_marker or is_caps or is_camel):
# Lowercase word ohne Marker → vermutlich kein Cookie-Name
continue
seen_names.add(nl)
out.append({
"name": name[:80],
"category": (m.group("category") or "").strip()[:60],
"purpose": (m.group("rest") or "").strip()[:200],
"duration": (m.group("duration") or "").strip()[:60],
"type": (m.group("type") or "").strip()[:30],
"vendor": "",
})
logger.info("parse_ocr_cookie_table: %d unique cookies extracted", len(out))
return out
_VISION_PROMPT = (
"Du analysierst einen Screenshot einer Cookie-Richtlinie. Auf der Seite "
"ist eine Tabelle mit Cookies aufgelistet. Spalten sind ueblicherweise: "
"Name des Cookies, Kategorie (z.B. 'Funktional', 'Marketing', "
"'Analytics'), Verwendungszweck, Speicherdauer, Art des Cookies "
"(z.B. 'Permanent', 'Session').\n\n"
"Extrahiere ALLE Cookies aus dem Bild. Wenn die Tabelle abgeschnitten "
"ist, extrahiere alles was sichtbar ist. KEINE Cookies erfinden, KEINE "
"Halluzinationen.\n\n"
"Antworte als reines JSON-Objekt im Format:\n"
'{"cookies": [\n'
' {"name": "<Cookie-Name exakt>", "category": "<Kategorie>", '
'"purpose": "<Kurzfassung Zweck max 120 chars>", '
'"duration": "<Speicherdauer mit Einheit>", '
'"type": "<Permanent|Session|...>", '
'"vendor": "<Anbieter falls bekannt, sonst leer>"}\n'
"]}\n\n"
"Nur JSON, kein Erklaerungstext, keine Code-Fences."
)
# Backward-compat: some callers may import _parse_vision_response
_parse_vision_response = parse_vision_response
async def capture_cookie_evidence_slices(
@@ -414,9 +127,7 @@ async def capture_cookie_evidence_slices(
def _ocr_one_slice(s: dict) -> tuple[dict, list[dict]]:
"""Helper for parallel execution: tesseract + parse for one slice.
Returns (slice_metadata_summary, cookies)."""
import base64 as _b64
"""Helper for parallel execution: tesseract + parse for one slice."""
try:
png = _b64.b64decode(s.get("png_b64", ""))
except Exception:
@@ -440,10 +151,6 @@ def ocr_slices_extract_cookies(
ThreadPoolExecutor with 4 workers yields ~4x speedup on multi-core
machines (M4 Pro has plenty). Sequential 32 slices = ~60s, parallel
~15s.
Returns (cookies, stats) where stats has:
per_slice: [{idx, cookies_found, ts, top_y, bot_y}]
total_raw, total_unique, slices
"""
from concurrent.futures import ThreadPoolExecutor
@@ -451,7 +158,6 @@ def ocr_slices_extract_cookies(
return [], {"per_slice": [], "total_raw": 0,
"total_unique": 0, "slices": 0}
# Keep slice order so the per-slice report is sequential.
with ThreadPoolExecutor(max_workers=max_workers) as ex:
results = list(ex.map(_ocr_one_slice, slices))
@@ -474,7 +180,8 @@ def ocr_slices_extract_cookies(
}
logger.info(
"ocr_slices_extract_cookies (parallel=%d): %d slices → %d raw → %d unique",
max_workers, stats["slices"], stats["total_raw"], stats["total_unique"],
max_workers, stats["slices"], stats["total_raw"],
stats["total_unique"],
)
return all_cookies, stats
@@ -482,11 +189,7 @@ def ocr_slices_extract_cookies(
async def capture_cookie_screenshot(
cookie_url: str, check_id: str = "", timeout_s: float = 60.0,
) -> dict:
"""Trigger consent-tester to capture full-page screenshot of cookie URL.
Returns dict with png_b64, captured_at, url, width_px, height_px etc.
Empty png_b64 on error.
"""
"""Trigger consent-tester to capture full-page screenshot of cookie URL."""
if not cookie_url:
return {"png_b64": "", "error": "no url"}
try:
@@ -514,11 +217,7 @@ async def capture_cookie_screenshot(
async def extract_cookies_via_vision(
png_b64: str, timeout_s: float = 240.0,
) -> list[dict]:
"""Call Ollama llama3.2-vision with the screenshot + extraction prompt.
Returns list of {name, category, purpose, duration, type, vendor}.
Empty list on failure.
"""
"""Call Ollama vision model with the screenshot + extraction prompt."""
if not png_b64:
return []
payload = {
@@ -527,13 +226,10 @@ async def extract_cookies_via_vision(
"format": "json",
"messages": [{
"role": "user",
"content": _VISION_PROMPT,
"content": VISION_PROMPT,
"images": [png_b64],
}],
"options": {
"temperature": 0.05,
"num_predict": 8000,
},
"options": {"temperature": 0.05, "num_predict": 8000},
}
try:
async with httpx.AsyncClient(timeout=timeout_s) as c:
@@ -543,7 +239,7 @@ async def extract_cookies_via_vision(
)
r.raise_for_status()
content = (r.json().get("message") or {}).get("content", "") or ""
cookies = _parse_vision_response(content)
cookies = parse_vision_response(content)
logger.info(
"Vision-OCR extracted %d cookies (model=%s, response_len=%d)",
len(cookies), VISION_MODEL, len(content),
@@ -557,59 +253,11 @@ async def extract_cookies_via_vision(
return []
def _parse_vision_response(content: str) -> list[dict]:
"""Be lenient: code fences, leading prose, partial JSON."""
if not content:
return []
txt = content.strip()
if txt.startswith("```"):
lines = txt.split("\n")
if lines and lines[-1].strip().startswith("```"):
txt = "\n".join(lines[1:-1])
else:
txt = "\n".join(lines[1:])
a, b = txt.find("{"), txt.rfind("}")
if not (0 <= a < b):
return []
try:
obj = json.loads(txt[a:b + 1])
except json.JSONDecodeError:
return []
if not isinstance(obj, dict):
return []
arr = obj.get("cookies") or obj.get("Cookies") or []
if not isinstance(arr, list):
return []
out: list[dict] = []
for item in arr[:300]: # cap to sanity
if not isinstance(item, dict):
continue
name = (item.get("name") or "").strip()
if not name or len(name) < 2 or len(name) > 80:
continue
# Strip obvious garbage
if re.fullmatch(r"[\s\-_.]+", name):
continue
out.append({
"name": name[:80],
"category": (item.get("category") or "").strip()[:60],
"purpose": (item.get("purpose") or "").strip()[:200],
"duration": (item.get("duration") or "").strip()[:60],
"type": (item.get("type") or "").strip()[:30],
"vendor": (item.get("vendor") or "").strip()[:80],
})
return out
def cookies_to_vendor_records(
cookies: list[dict], guess_vendor_fn=None,
) -> list[dict]:
"""Aggregate OCR-extracted cookies into vendor records compatible with
cmp_vendors-schema. guess_vendor_fn: optional callable name → vendor.
Each cookie's vendor field is used; if empty, we fall back to
guess_vendor_fn (e.g. _guess_vendor from cookies_table_parser).
"""
cmp_vendors-schema. guess_vendor_fn: optional callable name → vendor."""
by_vendor: dict[str, dict] = {}
for c in cookies:
v_name = (c.get("vendor") or "").strip()
@@ -0,0 +1,353 @@
"""OCR-Engine-Funktionen für cookie_screenshot_ocr (Phase-1 Split).
Aus dem Hauptmodul ausgelagert, damit es unter dem 500-LOC-Hard-Cap bleibt:
- PIL-basiertes _slice_screenshot (zerteilt PNG in subimages)
- Vision-LLM-OCR (ollama qwen2.5vl:32b)
- PaddleOCR fallback
- Tesseract OCR (Hauptpfad)
- Anchor-basierter Parser parse_ocr_cookie_table
- _parse_vision_response (JSON-Toleranz für Vision-Output)
"""
from __future__ import annotations
import base64 as _b64
import json
import logging
import os
import re
import httpx
logger = logging.getLogger(__name__)
# Model name sent as "model" in the Ollama chat requests; override via
# the COOKIE_VISION_MODEL environment variable.
VISION_MODEL = os.getenv("COOKIE_VISION_MODEL", "qwen2.5vl:32b")
# Base URL of the Ollama HTTP API ("/api/chat" is appended by the callers);
# override via the OLLAMA_URL environment variable.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
# Module-wide PaddleOCR instance; stays None until ocr_screenshot_via_paddle
# creates it on first use and is reused by later calls.
_PADDLE_OCR = None # lazy-initialised PaddleOCR instance
# ── 1. Screenshot-Slicing für Vision-Models ────────────────────────
def _slice_screenshot(png_bytes: bytes, slice_h: int = 1500,
                      max_slices: int = 25) -> list[str]:
    """Cut a tall screenshot into horizontal bands of at most ``slice_h`` px.

    Vision models downsample very tall images until the text becomes
    unreadable; feeding them full-width bands keeps the resolution.

    Args:
        png_bytes: Encoded image data (anything Pillow can open).
        slice_h: Height of each band in pixels; the last band may be shorter.
        max_slices: Upper bound on the number of bands. Bands beyond it are
            dropped (a warning is logged so the loss is visible).

    Returns:
        The bands from top to bottom, each as a base64-encoded PNG string.
        An empty list when there is no input, when ``slice_h`` or
        ``max_slices`` is not positive, when Pillow is missing, or when the
        image cannot be decoded.
    """
    if not png_bytes or slice_h <= 0 or max_slices <= 0:
        return []
    try:
        from PIL import Image
        from io import BytesIO
    except ImportError:
        return []
    # Same best-effort contract as the other engines in this module: an
    # undecodable image yields "nothing" instead of an exception.
    try:
        img = Image.open(BytesIO(png_bytes)).convert("RGB")
    except Exception as e:
        logger.warning("PIL open failed: %s", e)
        return []
    w, h = img.size
    needed = (h + slice_h - 1) // slice_h  # ceil(h / slice_h)
    n = min(needed, max_slices)
    if needed > n:
        logger.warning(
            "screenshot needs %d slices but max_slices=%d; bottom %d px dropped",
            needed, max_slices, h - n * slice_h,
        )
    out: list[str] = []
    for i in range(n):
        top = i * slice_h
        bot = min(top + slice_h, h)
        buf = BytesIO()
        img.crop((0, top, w, bot)).save(buf, format="PNG", optimize=True)
        out.append(_b64.b64encode(buf.getvalue()).decode("ascii"))
    return out
# ── 2. Vision-LLM-OCR ──────────────────────────────────────────────
async def _call_vision_on_slice(b64_png: str,
                                timeout_s: float = 240.0) -> str:
    """Send one base64 PNG slice to the vision model and return its answer.

    The model is asked for plain pipe-separated table rows (not JSON); the
    text is returned unparsed. Any failure (transport error, non-2xx status,
    unexpected response shape) is logged at debug level and reported as an
    empty string.

    Args:
        b64_png: Base64-encoded image passed in the request's "images" list.
        timeout_s: Timeout handed to the HTTP client.

    Returns:
        The "content" of the response's "message", or "" when unavailable.
    """
    instruction = (
        "Du siehst einen Bildausschnitt einer Cookie-Richtlinien-Tabelle. "
        "Liste ALLE Tabellen-Zeilen wortwoertlich auf, eine Zeile pro "
        "Cookie. Jede Zeile soll enthalten: Cookie-Name, Kategorie, "
        "Zweck, Speicherdauer, Art (Permanent/Session). "
        "Format: '<Name> | <Kategorie> | <Zweck> | <Dauer> | <Art>'. "
        "KEINE Cookies erfinden, nur was im Bild steht. Nur die Tabellen-"
        "Zeilen, keine Erklaerungen."
    )
    user_message = {
        "role": "user", "content": instruction, "images": [b64_png],
    }
    request_body = {
        "model": VISION_MODEL,
        "stream": False,
        "messages": [user_message],
        "options": {"temperature": 0.05, "num_predict": 4000},
    }
    try:
        async with httpx.AsyncClient(timeout=timeout_s) as client:
            endpoint = f"{OLLAMA_URL.rstrip('/')}/api/chat"
            response = await client.post(endpoint, json=request_body)
            response.raise_for_status()
            message = response.json().get("message") or {}
            return message.get("content", "") or ""
    except Exception as exc:
        logger.debug("vision slice failed: %s", exc)
        return ""
async def ocr_screenshot_via_vision_slices(png_bytes: bytes,
                                           max_slices: int = 20) -> str:
    """Run vision-OCR over a screenshot band by band and join the answers.

    The screenshot is cut into 1500 px high bands (at most ``max_slices``);
    each band is sent to the vision model one after the other, and the
    non-empty answers are joined with newlines in band order.

    Returns:
        The concatenated raw text, or "" when no band could be produced.
    """
    encoded_slices = _slice_screenshot(png_bytes, slice_h=1500,
                                       max_slices=max_slices)
    if not encoded_slices:
        return ""
    total = len(encoded_slices)
    logger.info("Vision-slicing: %d slices → vision-OCR (model=%s)",
                total, VISION_MODEL)
    answers: list[str] = []
    for position, encoded in enumerate(encoded_slices, start=1):
        answer = await _call_vision_on_slice(encoded)
        if not answer:
            # Empty answers (failed or blank band) are skipped silently.
            continue
        answers.append(answer)
        logger.info("Vision-slice %d/%d: %d chars", position, total,
                    len(answer))
    merged = "\n".join(answers)
    logger.info("Vision-OCR slicing total: %d chars from %d slices",
                len(merged), total)
    return merged
# ── 3. PaddleOCR (fallback) ────────────────────────────────────────
def ocr_screenshot_via_paddle(png_bytes: bytes) -> str:
    """Run PaddleOCR over a full-page screenshot and return the text.

    The image is processed in full-width bands of 3000 px height so that
    very tall pages are recognised chunk by chunk. The PaddleOCR instance
    is created on first use and kept in the module-level ``_PADDLE_OCR``.

    Args:
        png_bytes: Encoded image data (anything Pillow can open).

    Returns:
        All recognised text fragments joined with newlines, in band order.
        "" when there is no input, when a dependency (numpy, Pillow,
        paddleocr) is missing, or when the image or the OCR engine cannot be
        initialised. A band whose OCR call fails is skipped with a warning.
    """
    if not png_bytes:
        return ""
    try:
        # numpy is imported here as well so that a missing install is
        # reported like the other optional dependencies instead of raising.
        import numpy as np
        from PIL import Image
        from io import BytesIO
        from paddleocr import PaddleOCR
    except ImportError as e:
        logger.warning("PaddleOCR / PIL / numpy not available: %s", e)
        return ""
    try:
        img = Image.open(BytesIO(png_bytes)).convert("RGB")
    except Exception as e:
        logger.warning("PIL open failed: %s", e)
        return ""
    w, h = img.size
    slice_h = 3000
    n_slices = (h + slice_h - 1) // slice_h  # ceil(h / slice_h)
    logger.info("PaddleOCR: %dx%d screenshot → %d slices of %d high",
                w, h, n_slices, slice_h)
    global _PADDLE_OCR
    if _PADDLE_OCR is None:
        try:
            _PADDLE_OCR = PaddleOCR(use_angle_cls=False, lang="german",
                                    show_log=False)
        except Exception as e:
            logger.warning("PaddleOCR init failed: %s", e)
            return ""
    parts: list[str] = []
    for i in range(n_slices):
        top = i * slice_h
        bot = min(top + slice_h, h)
        arr = np.array(img.crop((0, top, w, bot)))
        try:
            result = _PADDLE_OCR.ocr(arr, cls=False)
        except Exception as e:
            logger.warning("PaddleOCR slice %d failed: %s", i, e)
            continue
        if not result:
            continue
        for page in result:
            if not page:
                continue
            for line in page:
                if not line:
                    continue
                try:
                    # Expected shape per line: [bbox, (text, conf)]. Lists
                    # and tuples are both accepted; anything else is
                    # stringified as a last resort.
                    if isinstance(line, (list, tuple)) and len(line) >= 2:
                        txt = (line[1][0]
                               if isinstance(line[1], (list, tuple))
                               else str(line[1]))
                    else:
                        txt = str(line)
                    if txt:
                        # Coerce so the final join cannot fail on a
                        # non-string fragment.
                        parts.append(str(txt))
                except Exception:
                    continue
    full_text = "\n".join(parts)
    logger.info("PaddleOCR: extracted %d lines / %d chars from %d slices",
                len(parts), len(full_text), n_slices)
    return full_text
# ── 4. Tesseract OCR (Hauptpfad) ───────────────────────────────────
def ocr_screenshot_via_tesseract(png_bytes: bytes,
                                 lang: str = "deu",
                                 psm: int = 4) -> str:
    """OCR a full-page screenshot with Tesseract and normalise whitespace.

    Args:
        png_bytes: Encoded image data (anything Pillow can open).
        lang: Tesseract language code.
        psm: Tesseract page segmentation mode, passed as ``--psm``
            (4 = single column of text of variable sizes).

    Returns:
        The recognised text after three whitespace passes (collapse runs of
        spaces/tabs, replace newlines not followed by a blank line, collapse
        remaining whitespace runs). "" when there is no input, when
        pytesseract/Pillow are missing, or when OCR fails.
    """
    if not png_bytes:
        return ""
    try:
        import pytesseract
        from io import BytesIO
        from PIL import Image
    except ImportError as exc:
        logger.warning("tesseract/PIL not available: %s", exc)
        return ""
    # Applied in this order; each pass works on the previous pass's output.
    whitespace_passes = (
        (r"[ \t]+", " "),
        (r"\n(?!\s*\n)", " "),
        (r"\s{2,}", " "),
    )
    try:
        image = Image.open(BytesIO(png_bytes)).convert("RGB")
        text = pytesseract.image_to_string(image, lang=lang,
                                           config=f"--psm {psm}")
        for expression, replacement in whitespace_passes:
            text = re.sub(expression, replacement, text)
        width, height = image.size[0], image.size[1]
        logger.info(
            "Tesseract OCR: %d chars / %d words (image %dx%d)",
            len(text), len(text.split()), width, height,
        )
        return text
    except Exception as exc:
        logger.warning("Tesseract OCR failed: %s (%s)",
                       str(exc) or "(no msg)", type(exc).__name__)
        return ""
# ── 5. Anchor-basierter Parser ─────────────────────────────────────
# Category tokens that follow the cookie name in the expected column layout
# [NAME] [CATEGORY] [PURPOSE] [DURATION] [TYPE]; they anchor each table row.
_CATEGORY_ANCHORS = (
    r"Funktionscookie", r"Trackingcookie",
    r"Tracking Cookies?", r"Session Cookies?",
    r"Funktional", r"Marketing", r"Analytics", r"Necessary",
    r"Werbung", r"Personalisierung", r"Statistik",
    r"Notwendig", r"Erforderlich",
)
_CATEGORY_PATTERN = ("(?:" + "|".join(_CATEGORY_ANCHORS)
                     + r")(?:\s*\([^)]*\))?")
# Cookie name: letter or underscore first (so `_ga`, `_gid`, `_fbp` keep their
# leading underscore), then word chars / dash / dot, optionally a `<...>`
# placeholder suffix.
_COOKIE_NAME_RE = (
    r"(?:[A-Za-z_][\w\-.]{2,60}|[A-Za-z_][\w\-.]{2,60}<[^>]+>)"
)
# Compiled once at import time instead of on every call.
_ROW_HEAD_RE = re.compile(
    rf"(?P<name>{_COOKIE_NAME_RE})\s+(?P<category>{_CATEGORY_PATTERN})",
    re.IGNORECASE,
)
_DURATION_RE = re.compile(
    r"\d+(?:[.,]\s*)?\s*"
    r"(?:Tage|Jahre?|Monate?|Minuten|Stunden|Sekunden)\.?",
    re.IGNORECASE,
)
_TYPE_RE = re.compile(
    r"Permanent/Protokoll|Session\s*Cookie|Persistent\s*Cookie",
    re.IGNORECASE,
)
# How many characters after the category are inspected for one row.
_ROW_WINDOW = 300
# Column headers and filler words that can precede a category token.
_NON_COOKIE_WORDS = frozenset((
    "name", "art", "zweck", "dauer", "kategorie", "anbieter",
    "cookie", "cookies", "name des cookies",
    "this", "dieser", "diese", "alle", "und", "von", "der",
    "die", "das", "ein", "eine", "session", "permanent",
    "category",
))


def _looks_like_cookie_name(name: str) -> bool:
    """Tell technical identifiers apart from ordinary words."""
    name = name.strip()
    if len(name) < 3 or not any(c.isalnum() for c in name):
        return False
    if name.lower() in _NON_COOKIE_WORDS:
        return False
    has_marker = any(c in "_-.<>" for c in name)
    is_caps = name.upper() == name
    is_camel = (any(c.isupper() for c in name[1:])
                and any(c.islower() for c in name))
    # A plain lowercase/capitalised word without marker is prose, not a name.
    return has_marker or is_caps or is_camel


def _split_row_tail(tail: str) -> tuple[str, str, str]:
    """Split the text after a category into (purpose, duration, type)."""
    type_match = _TYPE_RE.search(tail)
    before_type = tail[:type_match.start()] if type_match else tail
    # The duration column sits right before the type column, so the LAST
    # duration in front of the type wins; earlier ones belong to the purpose.
    durations = list(_DURATION_RE.finditer(before_type))
    duration_match = durations[-1] if durations else None
    if duration_match is not None:
        purpose_end = duration_match.start()
    elif type_match is not None:
        purpose_end = type_match.start()
    else:
        purpose_end = len(tail)
    return (
        tail[:purpose_end].strip(),
        duration_match.group(0).strip() if duration_match else "",
        type_match.group(0).strip() if type_match else "",
    )


def parse_ocr_cookie_table(text: str) -> list[dict]:
    """Extract cookie records from OCR text of a cookie table.

    Rows are located by the pattern ``<name> <category>``. The text between
    one row head and the next (at most ``_ROW_WINDOW`` characters) is then
    searched for the duration and type tokens; what precedes them is the
    purpose. The previous single-regex version could never capture the
    purpose and captured duration/type only when they directly followed the
    category.

    Cookie names are never "corrected" — `awsalb` stays `awsalb`.

    Args:
        text: OCR output; inputs shorter than 200 characters are ignored.

    Returns:
        One dict per unique (case-insensitive) cookie name, first occurrence
        wins, with the keys name, category, purpose, duration, type and
        vendor. ``vendor`` is always "" here.
    """
    if not text or len(text) < 200:
        return []
    # Only plausible names act as row boundaries; otherwise prose such as
    # "Online Marketing" inside a purpose would cut the row short.
    rows = [m for m in _ROW_HEAD_RE.finditer(text)
            if _looks_like_cookie_name(m.group("name") or "")]
    seen_names: set[str] = set()
    out: list[dict] = []
    for idx, m in enumerate(rows):
        name = m.group("name").strip()
        key = name.lower()
        if key in seen_names:
            continue
        seen_names.add(key)
        window_end = min(m.end() + _ROW_WINDOW, len(text))
        if idx + 1 < len(rows):
            window_end = min(window_end, rows[idx + 1].start())
        purpose, duration, cookie_type = _split_row_tail(
            text[m.end():window_end])
        out.append({
            "name": name[:80],
            "category": (m.group("category") or "").strip()[:60],
            "purpose": purpose[:200],
            "duration": duration[:60],
            "type": cookie_type[:30],
            "vendor": "",
        })
    logger.info("parse_ocr_cookie_table: %d unique cookies extracted",
                len(out))
    return out
# ── 6. Vision-Response-Parser ──────────────────────────────────────
# German instruction text for the vision model: extract every cookie visible
# in the screenshot and answer with a single JSON object of the form
# {"cookies": [{name, category, purpose, duration, type, vendor}, ...]}.
# The text is runtime data — do not translate or reflow it.
VISION_PROMPT = (
"Du analysierst einen Screenshot einer Cookie-Richtlinie. Auf der Seite "
"ist eine Tabelle mit Cookies aufgelistet. Spalten sind ueblicherweise: "
"Name des Cookies, Kategorie (z.B. 'Funktional', 'Marketing', "
"'Analytics'), Verwendungszweck, Speicherdauer, Art des Cookies "
"(z.B. 'Permanent', 'Session').\n\n"
"Extrahiere ALLE Cookies aus dem Bild. Wenn die Tabelle abgeschnitten "
"ist, extrahiere alles was sichtbar ist. KEINE Cookies erfinden, KEINE "
"Halluzinationen.\n\n"
"Antworte als reines JSON-Objekt im Format:\n"
'{"cookies": [\n'
' {"name": "<Cookie-Name exakt>", "category": "<Kategorie>", '
'"purpose": "<Kurzfassung Zweck max 120 chars>", '
'"duration": "<Speicherdauer mit Einheit>", '
'"type": "<Permanent|Session|...>", '
'"vendor": "<Anbieter falls bekannt, sonst leer>"}\n'
"]}\n\n"
"Nur JSON, kein Erklaerungstext, keine Code-Fences."
)
def parse_vision_response(content: str) -> list[dict]:
    """Leniently parse the vision model's JSON answer into cookie records.

    Tolerates a surrounding code fence, prose around the JSON object and
    non-string field values (models sometimes emit e.g. a numeric duration
    or ``null``); such values are stringified or treated as empty instead of
    aborting the whole parse.

    Args:
        content: Raw model output.

    Returns:
        Up to 300 dicts with the keys name, category, purpose, duration,
        type and vendor (each trimmed and length-capped). Entries without a
        usable name (shorter than 2 or longer than 80 characters, or made up
        only of whitespace/dash/underscore/dot) are dropped. An empty list
        when no JSON object with a "cookies" list can be decoded.
    """
    if not content:
        return []

    def as_text(value) -> str:
        # Falsy values and containers carry no usable text; everything else
        # (str, int, float, ...) is stringified so .strip() cannot fail.
        if not value or isinstance(value, (dict, list)):
            return ""
        return str(value).strip()

    txt = content.strip()
    if txt.startswith("```"):
        # Drop the opening fence line and, if present, the closing one.
        lines = txt.split("\n")
        if lines and lines[-1].strip().startswith("```"):
            txt = "\n".join(lines[1:-1])
        else:
            txt = "\n".join(lines[1:])
    # Cut from the first "{" to the last "}" to skip leading/trailing prose.
    a, b = txt.find("{"), txt.rfind("}")
    if not (0 <= a < b):
        return []
    try:
        obj = json.loads(txt[a:b + 1])
    except json.JSONDecodeError:
        return []
    if not isinstance(obj, dict):
        return []
    arr = obj.get("cookies") or obj.get("Cookies") or []
    if not isinstance(arr, list):
        return []
    out: list[dict] = []
    for item in arr[:300]:  # sanity cap on the number of entries
        if not isinstance(item, dict):
            continue
        name = as_text(item.get("name"))
        if not name or len(name) < 2 or len(name) > 80:
            continue
        if re.fullmatch(r"[\s\-_.]+", name):
            continue
        out.append({
            "name": name[:80],
            "category": as_text(item.get("category"))[:60],
            "purpose": as_text(item.get("purpose"))[:200],
            "duration": as_text(item.get("duration"))[:60],
            "type": as_text(item.get("type"))[:30],
            "vendor": as_text(item.get("vendor"))[:80],
        })
    return out