feat(b15): AI-Act Rechtsgrundlage-Check (GT AI-ACT-RISK-001)

Erkennt: LLM/GPAI-System (Vertex AI, OpenAI/GPT, Claude) wird in DSE oder Cookie-Doc auf Art. 6 Abs. 1 lit. f (berechtigtes Interesse) gestützt — statt auf lit. a (Einwilligung).

GT-Anker (Elli AI-ACT-RISK-001): Vertex-AI-Chatbot mit lit. f deklariert. Bei LLM-Prompt/Output-Logging + US-Transfer + Profiling-Ähnlichkeit ist die Interessenabwägung fragwürdig.

Heuristik:
- KB-basiert (chat_providers.json filter: ai_capable + LLM-Type-Hint)
- LLM-Vendor-Aliases inkl. Marken-Familien (PaLM, Gemini, GPT-4, ChatGPT, Claude 3, Azure OpenAI)
- Absatz-Boundary-Scope: Provider + lit. f im selben Absatz
- Negativ-Filter: wenn lit. a / Einwilligung ebenfalls im Absatz → kein Finding (Side-Purpose-Erwähnung)
- Dedup pro (doc_type, provider_id)

Severity: MEDIUM.
Norm: DSGVO Art. 6 Abs. 1 lit. a vs lit. f + AI Act Art. 50 + 51.
Tests: 17/17 grün.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-07 00:15:08 +02:00
parent 6aad774fc1
commit b0b7f80914
5 changed files with 361 additions and 0 deletions
@@ -0,0 +1,64 @@
+"""B15 wiring — AI Act legal-basis check for LLM vendors.
+
+Appends its findings to `state["extra_findings"]` and renders a V2 block
+(`ai_legal_basis_html`).
+"""
+
+from __future__ import annotations
+
+import html
+import logging
+
+from compliance.services.ai_legal_basis_check import check_ai_legal_basis
+
+logger = logging.getLogger(__name__)
+
+
+def run_b15(state: dict) -> None:
+    """Run the B15 check and publish its results on ``state``.
+
+    When the check yields findings they are appended to
+    ``state["extra_findings"]`` and their rendered HTML is stored under
+    ``state["ai_legal_basis_html"]``. With no findings ``state`` is left
+    untouched.
+    """
+    findings = check_ai_legal_basis(state)
+    if findings:
+        merged = state.get("extra_findings") or []
+        merged.extend(findings)
+        state["extra_findings"] = merged
+        state["ai_legal_basis_html"] = _render(findings)
+        logger.info("B15 ai-legal-basis: %d finding(s)", len(findings))
+
+
+def _render(findings: list[dict]) -> str:
+    """Render B15 findings as a single inline-styled HTML block.
+
+    Args:
+        findings: Finding dicts. The keys read are ``severity``,
+            ``check_id``, ``title``, ``norm``, ``provider``, ``doc_type``,
+            ``evidence`` and ``action``. Missing or falsy values render as
+            an empty string (``?`` for provider and doc type).
+
+    Returns:
+        One wrapper ``<div>`` holding a heading and one card per finding.
+        Every value taken from a finding is HTML-escaped, including the
+        severity label.
+    """
+    cards = []
+    for f in findings:
+        sev = (f.get("severity") or "").upper()
+        # Only HIGH gets the red accent; any other severity is amber.
+        color = "#dc2626" if sev == "HIGH" else "#f59e0b"
+        meta = (
+            "<div style='font-size:12px;color:#475569;margin-top:6px;'>"
+            f"<em>Provider: {html.escape(f.get('provider') or '?')} · "
+            f"Doc: {html.escape(f.get('doc_type') or '?')}</em></div>"
+        )
+        cards.append(
+            f"<div style='margin:12px 0;padding:14px;background:#fff;"
+            f"border-left:3px solid {color};border-radius:4px;'>"
+            f"<div style='font-weight:600;color:{color};font-size:14px;'>"
+            # The severity is finding data like any other field: escape it.
+            f"{html.escape(sev)} · {html.escape(f.get('check_id') or '')}</div>"
+            f"<div style='font-size:14px;margin-top:4px;'>"
+            f"<strong>{html.escape(f.get('title') or '')}</strong></div>"
+            f"<div style='font-size:12px;color:#64748b;margin-top:2px;'>"
+            f"{html.escape(f.get('norm') or '')}</div>"
+            f"{meta}"
+            f"<div style='font-size:12px;color:#475569;margin-top:6px;'>"
+            f"<em>{html.escape(f.get('evidence') or '')}</em></div>"
+            f"<div style='font-size:13px;margin-top:8px;background:#dcfce7;"
+            f"padding:8px 10px;border-radius:4px;'>"
+            f"<strong>→ Empfehlung:</strong> "
+            f"{html.escape(f.get('action') or '')}</div>"
+            "</div>"
+        )
+    return (
+        "<div style='margin:24px 0;padding:16px;border-left:4px solid #f59e0b;"
+        "background:#fffbeb;border-radius:4px;'>"
+        "<h2 style='margin:0 0 8px;color:#92400e;font-size:16px;'>"
+        "🤖 AI-Act Rechtsgrundlage (LLM-Vendor auf berechtigtem Interesse)"
+        "</h2>"
+        + "".join(cards) +
+        "</div>"
+    )
@@ -25,6 +25,7 @@ from ._b9b10_wiring import run_b9b10
 from ._b12_wiring import run_b12
 from ._b13_wiring import run_b13
 from ._b14_wiring import run_b14
+from ._b15_wiring import run_b15
 from ._constants import _compliance_check_jobs
 from ._phase_a_resolve import run_phase_a
 from ._phase_b_profile_check import run_phase_b
@@ -74,6 +75,7 @@ async def run_compliance_check(check_id: str, req) -> None:
        run_b12(state)  # Chatbot-Cookie-Klassifikation (B11 ist in B9B10)
        run_b13(state)  # Widerrufsbelehrung-Reachability (B2C-Pflicht)
        run_b14(state)  # Widersprüchliche Speicherdauer im selben Doc
+        run_b15(state)  # AI-Act Rechtsgrundlage (LLM-Vendor auf lit. f)
        # Phase D-3 top/mid/bot: Step 5 HTML blocks
        await run_phase_d3_top(state)
        await run_phase_d3_mid(state)
@@ -0,0 +1,184 @@
+"""B15 — AI Act legal-basis check for LLM vendors.
+
+Detects: an LLM/GPAI system (Vertex AI, OpenAI/GPT, Claude) that the
+privacy policy (DSE) or cookie document bases on Art. 6 Abs. 1 lit. f
+(legitimate interest) instead of lit. a (consent).
+
+Normative argument (the check's rationale, as stated by its author):
+  - LLMs process prompts + outputs as personal data
+  - often involves a US transfer (Vertex / OpenAI / Anthropic)
+  - LLM logging resembles profiling
+  → DSK + EDPB line: consent is the cleaner legal basis; the lit. f
+    balancing of interests is questionable given these factors.
+
+Norm: DSGVO Art. 6 Abs. 1 lit. a vs lit. f + AI Act Art. 50 + 51.
+
+Ground-truth anchor: Elli AI-ACT-RISK-001 — Vertex AI chatbot declared
+under lit. f.
+
+Heuristic:
+  1. Split each document into paragraphs at blank lines (\\n\\s*\\n).
+  2. Per paragraph: does it mention an LLM provider AND
+     "berechtigtes Interesse" / "lit. f" / "legitimate interest"?
+  3. If the same paragraph also names consent / lit. a → skip it.
+  4. Otherwise → MEDIUM finding, at most one per (doc_type, provider).
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+import re
+
+logger = logging.getLogger(__name__)
+
+# Chat-provider knowledge base, resolved relative to this module's
+# directory so the lookup does not depend on the working directory.
+_KB_PATH = os.path.join(
+    os.path.dirname(__file__),
+    "specialist_agents", "_kb", "chat_providers.json",
+)
+
+
+def _load_llm_providers() -> list[dict]:
+    """Return KB entries marked ai_capable AND that look LLM-based.
+
+    Not every chat platform with 'ai_capable=true' is an LLM vendor in
+    the AI-Act-Art. 50 sense. The list is tightened with a substring
+    check of a few LLM hints against the entry's 'type' and 'company'
+    fields, so that ordinary chat widgets that only ROUTE to AI don't
+    trigger this finding.
+
+    Returns:
+        A list of ``{"id": <provider id>, "data": <KB entry>}`` dicts.
+        The list is empty when the KB file cannot be read or parsed, or
+        when its structure is not the expected ``{"providers": {...}}``.
+        This function runs at import time, so a broken KB must never
+        raise here.
+    """
+    try:
+        with open(_KB_PATH, encoding="utf-8") as f:
+            kb = json.load(f)
+    except (OSError, ValueError) as e:
+        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
+        logger.warning("AI-legal-basis KB load failed: %s", e)
+        return []
+    providers = kb.get("providers") if isinstance(kb, dict) else None
+    if not isinstance(providers, dict):
+        if providers is not None or not isinstance(kb, dict):
+            logger.warning(
+                "AI-legal-basis KB load failed: %s",
+                "unexpected structure (no 'providers' mapping)",
+            )
+        return []
+    out: list[dict] = []
+    llm_type_hints = ("ai-chatbot", "conversational-ai",
+                      "ai chatbot", "llm", "gpt", "claude", "vertex")
+    for pid, prov in providers.items():
+        # Skip malformed entries instead of failing the whole import.
+        if not isinstance(prov, dict) or not prov.get("ai_capable"):
+            continue
+        type_str = str(prov.get("type") or "").lower()
+        company = str(prov.get("company") or "").lower()
+        if (any(h in type_str for h in llm_type_hints)
+                or any(h in company for h in llm_type_hints)):
+            out.append({"id": pid, "data": prov})
+    return out
+
+
+# Evaluated once at import time. If the KB cannot be loaded this is an
+# empty list, in which case no provider is ever matched.
+_LLM_PROVIDERS = _load_llm_providers()
+
+# Aliases that appear in DSE-prose for each provider. Built from KB +
+# common-knowledge synonyms (Google's "Vertex AI" is also referenced as
+# "Google Cloud AI" / "PaLM" / "Gemini" / "Generative AI").
+# Keys are looked up by KB provider id; values must be lowercase because
+# they are matched against the lowercased paragraph text.
+_LLM_NAME_ALIASES: dict[str, list[str]] = {
+    "vertex_ai_chatbot": [
+        "vertex ai", "vertex-ai", "vertexai", "google cloud ai",
+        "google generative ai", "google palm", "palm 2", "gemini",
+    ],
+    "openai_chatbot": [
+        "openai", "open ai", "gpt-3", "gpt-4", "gpt 3", "gpt 4",
+        "chatgpt", "chat gpt", "azure openai",
+    ],
+    "anthropic_claude": [
+        "anthropic", "claude 3", "claude-3", "claude.ai", "claude ai",
+    ],
+}
+
+
+# Wording that declares Art. 6 Abs. 1 lit. f (legitimate interest).
+_LIT_F_PATTERNS = (
+    re.compile(r"berechtigt(?:e[snm]?)?\s+interess", re.IGNORECASE),
+    # German citation style, including the common "S. 1" / "Satz 1" and
+    # "Buchst." / "Buchstabe" variants:
+    # "Art. 6 Abs. 1 lit. f", "Art. 6 Abs. 1 S. 1 lit. f", "... Buchst. f".
+    re.compile(
+        r"Art\.?\s*6\s*Abs\.?\s*1\s*(?:S(?:atz|\.)?\s*1\s*)?"
+        r"(?:(?:lit|Buchst(?:abe)?)\.?\s*)?f\b",
+        re.IGNORECASE,
+    ),
+    # Bracketed style: "Art. 6(1)(f)", "Art. 6 (1) f", "Art. 6(1) lit. f".
+    # The trailing \b keeps "Art. 6(1) for ..." / "... für" from matching.
+    re.compile(
+        r"Art\.?\s*6\s*(?:Abs\.?\s*1)?\s*\(\s*1\s*\)\s*"
+        r"(?:\(\s*|lit\.?\s*)?f\b",
+        re.IGNORECASE,
+    ),
+    re.compile(r"legitimate\s+interest", re.IGNORECASE),
+)
+
+
+# Wording that declares Art. 6 Abs. 1 lit. a (consent). Mirrors the
+# citation styles recognised for lit. f so both checks see the same forms.
+_LIT_A_PATTERNS = (
+    re.compile(r"einwilligung", re.IGNORECASE),
+    re.compile(
+        r"Art\.?\s*6\s*Abs\.?\s*1\s*(?:S(?:atz|\.)?\s*1\s*)?"
+        r"(?:(?:lit|Buchst(?:abe)?)\.?\s*)?a\b",
+        re.IGNORECASE,
+    ),
+    # Bracketed style. Unlike lit. f, the "(" or "lit." marker is required
+    # here so the English article "a" ("Art. 6(1) a contract") is not
+    # mistaken for the letter.
+    re.compile(
+        r"Art\.?\s*6\s*(?:Abs\.?\s*1)?\s*\(\s*1\s*\)\s*"
+        r"(?:\(\s*|lit\.?\s*)a\b",
+        re.IGNORECASE,
+    ),
+    re.compile(r"\bconsent\b", re.IGNORECASE),
+)
+
+
+def _paragraph_split(text: str) -> list[str]:
+    """Split ``text`` at blank lines into stripped, non-empty paragraphs.
+
+    A falsy ``text`` (``None`` or ``""``) yields an empty list.
+    """
+    paragraphs = []
+    for chunk in re.split(r"\n\s*\n", text or ""):
+        stripped = chunk.strip()
+        if stripped:
+            paragraphs.append(stripped)
+    return paragraphs
+
+
+def _has_lit_f(paragraph: str) -> bool:
+    """Return True if any lit. f (legitimate interest) pattern matches."""
+    for pattern in _LIT_F_PATTERNS:
+        if pattern.search(paragraph):
+            return True
+    return False
+
+
+def _has_lit_a(paragraph: str) -> bool:
+    """Return True if any lit. a (consent) pattern matches."""
+    for pattern in _LIT_A_PATTERNS:
+        if pattern.search(paragraph):
+            return True
+    return False
+
+
+def _find_llm_mention(paragraph: str) -> tuple[str, str] | None:
+    """Return the first known LLM provider mentioned in ``paragraph``.
+
+    Matching is case-insensitive. For each loaded provider, its aliases
+    and its KB company name are tried. An alias must start at a word
+    boundary, so it is not found inside a longer word (e.g. "anthropic"
+    inside "philanthropic"). Trailing characters stay unrestricted so
+    suffixed forms such as "GPT-4o" or a genitive "OpenAIs" still match.
+
+    Returns:
+        ``(provider_id, display_name)`` for the first match, where the
+        display name is the KB company name (falling back to the id), or
+        ``None`` when no provider is mentioned. Only the first matching
+        provider is reported even if the paragraph names several.
+    """
+    p_lc = paragraph.lower()
+    for prov in _LLM_PROVIDERS:
+        aliases = _LLM_NAME_ALIASES.get(prov["id"]) or []
+        # also include the company name directly
+        # NOTE(review): a very generic company name would match broadly —
+        # confirm the KB company values are specific enough.
+        aliases = aliases + [(prov["data"].get("company") or "").lower()]
+        for alias in aliases:
+            if alias and re.search(r"(?<!\w)" + re.escape(alias), p_lc):
+                return prov["id"], prov["data"].get("company") or prov["id"]
+    return None
+
+
+def check_ai_legal_basis(state: dict) -> list[dict]:
+    """Report LLM providers whose paragraph relies on Art. 6 Abs. 1 lit. f.
+
+    Reads ``state["doc_texts"]`` for the ``"dse"`` and ``"cookie"``
+    documents and inspects them paragraph by paragraph. A paragraph
+    produces a finding when it mentions an LLM provider and lit. f
+    wording but no consent / lit. a wording. At most one finding is
+    emitted per (doc_type, provider id).
+
+    Returns:
+        A list of finding dicts (empty when nothing was detected).
+    """
+    results: list[dict] = []
+    reported: set[tuple[str, str]] = set()
+    texts = state.get("doc_texts") or {}
+    for doc_type in ("dse", "cookie"):
+        body = texts.get(doc_type) or ""
+        if not body:
+            continue
+        for para in _paragraph_split(body):
+            hit = _find_llm_mention(para)
+            # A paragraph that also names consent is assumed to use
+            # lit. f only for a side purpose (e.g. analytics logging),
+            # so it is skipped to avoid noise.
+            if not hit or not _has_lit_f(para) or _has_lit_a(para):
+                continue
+            prov_id, prov_company = hit
+            dedup_key = (doc_type, prov_id)
+            if dedup_key in reported:
+                continue
+            reported.add(dedup_key)
+            title = (
+                f"LLM-System '{prov_company}' auf Art. 6 Abs. 1 lit. f "
+                "gestützt statt auf Einwilligung"
+            )
+            norm = "DSGVO Art. 6 Abs. 1 lit. a vs lit. f + AI Act Art. 50 + 51"
+            evidence = (
+                "LLM-Provider in einem Absatz erwähnt, der berechtigtes "
+                "Interesse / lit. f als Rechtsgrundlage angibt. Bei "
+                "Prompt-/Output-Logging mit US-Transfer und Profiling-"
+                "Ähnlichkeit ist die Interessenabwägung fragwürdig."
+            )
+            action = (
+                f"Rechtsgrundlage für {prov_company} auf Art. 6 Abs. 1 "
+                "lit. a (Einwilligung) umstellen. Pre-Interaction-"
+                "Consent + AI-Act Art. 50 Disclosure am Chat-UI "
+                "einrichten."
+            )
+            results.append({
+                "check_id": "AI-LEGAL-BASIS-001",
+                "severity": "MEDIUM",
+                "severity_reason": "questionable_basis",
+                "doc_type": doc_type,
+                "provider": prov_company,
+                "title": title,
+                "norm": norm,
+                "evidence": evidence,
+                "action": action,
+            })
+    if results:
+        logger.info("B15 ai-legal-basis: %d finding(s)", len(results))
+    return results
@@ -52,6 +52,8 @@ def compose_v2(state: dict) -> str:
        state.get("widerruf_reach_html", ""),
        # B14 Widersprüchliche Speicherdauer im selben Doc
        state.get("retention_conflict_html", ""),
+        # B15 AI-Act Rechtsgrundlage (LLM-Vendor auf lit. f)
+        state.get("ai_legal_basis_html", ""),
        # Browser-Matrix (Stage 1.c)
        state.get("browser_matrix_html", ""),
        # All legacy build_*_html() wrapped in V2 sections — preserves
@@ -0,0 +1,109 @@
+"""Tests for B15 AI-Act Rechtsgrundlage-Check (GT AI-ACT-RISK-001)."""
+
+from compliance.services.ai_legal_basis_check import (
+    _find_llm_mention,
+    _has_lit_a,
+    _has_lit_f,
+    check_ai_legal_basis,
+)
+
+
+class TestLitFDetection:
+    """lit. f wording is recognised in German and English phrasing."""
+
+    def test_berechtigtes_interesse(self):
+        text = "Wir verarbeiten auf Grundlage des berechtigten Interesses."
+        assert _has_lit_f(text)
+
+    def test_lit_f_short(self):
+        text = "Rechtsgrundlage: Art. 6 Abs. 1 lit. f DSGVO"
+        assert _has_lit_f(text)
+
+    def test_lit_f_english(self):
+        text = "Legal basis: legitimate interest."
+        assert _has_lit_f(text)
+
+    def test_lit_a_not_lit_f(self):
+        # Consent-only wording must not count as lit. f.
+        text = "Verarbeitung nur mit Einwilligung."
+        assert not _has_lit_f(text)
+
+
+class TestLitADetection:
+    """lit. a / consent wording is recognised in German and English."""
+
+    def test_einwilligung(self):
+        text = "Verarbeitung auf Grundlage einer Einwilligung."
+        assert _has_lit_a(text)
+
+    def test_lit_a_short(self):
+        text = "Art. 6 Abs. 1 lit. a DSGVO"
+        assert _has_lit_a(text)
+
+    def test_lit_a_english(self):
+        text = "Processing requires consent."
+        assert _has_lit_a(text)
+
+
+class TestLLMMention:
+    """Provider mentions resolve to the expected KB provider id."""
+
+    def test_vertex_ai(self):
+        hit = _find_llm_mention("Wir nutzen Google Vertex AI für den Chatbot.")
+        assert hit is not None
+        provider_id = hit[0]
+        assert provider_id == "vertex_ai_chatbot"
+
+    def test_openai(self):
+        hit = _find_llm_mention("Der Bot basiert auf ChatGPT von OpenAI.")
+        assert hit is not None
+        provider_id = hit[0]
+        assert provider_id == "openai_chatbot"
+
+    def test_anthropic(self):
+        hit = _find_llm_mention("Antworten via Anthropic Claude 3.")
+        assert hit is not None
+        provider_id = hit[0]
+        assert provider_id == "anthropic_claude"
+
+    def test_no_llm_mention(self):
+        # iAdvize is ai_capable=true in the KB but is not an LLM vendor,
+        # so the LLM hint filter is expected to drop it.
+        hit = _find_llm_mention("Live-Chat mit iAdvize SAS.")
+        assert hit is None
+
+
+class TestCheckAILegalBasis:
+    """End-to-end behaviour of check_ai_legal_basis on small documents."""
+
+    @staticmethod
+    def _check(**doc_texts):
+        # Build the minimal state the check reads and run it.
+        return check_ai_legal_basis({"doc_texts": doc_texts})
+
+    def test_vertex_ai_lit_f_finding(self):
+        findings = self._check(dse=(
+            "Wir setzen für den AI Assistant Google Vertex AI ein. "
+            "Rechtsgrundlage ist Art. 6 Abs. 1 lit. f DSGVO "
+            "(berechtigtes Interesse)."
+        ))
+        assert len(findings) == 1
+        finding = findings[0]
+        assert finding["check_id"] == "AI-LEGAL-BASIS-001"
+        assert finding["severity"] == "MEDIUM"
+        assert "Vertex" in finding["provider"]
+
+    def test_vertex_ai_with_consent_no_finding(self):
+        # Consent named in the same paragraph suppresses the finding.
+        findings = self._check(dse=(
+            "Vertex AI verarbeitet nur nach vorheriger Einwilligung "
+            "(Art. 6 Abs. 1 lit. a). Optional auch berechtigtes Interesse "
+            "(lit. f) für Server-Logs."
+        ))
+        assert findings == []
+
+    def test_no_llm_no_finding(self):
+        findings = self._check(dse=(
+            "Wir nutzen iAdvize Live-Chat auf Grundlage des "
+            "berechtigten Interesses."
+        ))
+        assert findings == []
+
+    def test_llm_without_lit_f_no_finding(self):
+        findings = self._check(
+            dse="OpenAI / GPT verarbeitet nur mit Einwilligung.",
+        )
+        assert findings == []
+
+    def test_dedup_same_provider_per_doc(self):
+        paragraphs = [
+            "Vertex AI auf berechtigtem Interesse.",
+            "Vertex AI nochmals mit berechtigtem Interesse genannt.",
+        ]
+        findings = self._check(dse="\n\n".join(paragraphs))
+        assert len(findings) == 1
+
+    def test_separate_doc_types_dedup_per_doc(self):
+        findings = self._check(
+            dse="OpenAI / GPT auf berechtigtem Interesse.",
+            cookie="OpenAI ChatGPT auf Grundlage des berechtigten Interesses.",
+        )
+        # one finding per (doc, provider)
+        assert len(findings) == 2
+        assert {f["doc_type"] for f in findings} == {"dse", "cookie"}