b0b7f80914
Erkennt: LLM/GPAI-System (Vertex AI, OpenAI/GPT, Claude) wird in
DSE oder Cookie-Doc auf Art. 6 Abs. 1 lit. f (berechtigtes Interesse)
gestützt — statt auf lit. a (Einwilligung).
GT-Anker (Elli AI-ACT-RISK-001): Vertex-AI-Chatbot mit lit. f
deklariert. Bei LLM-Prompt/Output-Logging + US-Transfer +
Profiling-Ähnlichkeit ist Interessenabwägung fragwürdig.
Heuristik:
- KB-basiert (chat_providers.json filter: ai_capable + LLM-Type-Hint)
- LLM-Vendor-Aliases inkl. Marken-Familien (PaLM, Gemini, GPT-4,
ChatGPT, Claude 3, Azure OpenAI)
- Absatz-Boundary-Scope: Provider + lit. f im selben Absatz
- Negativ-Filter: wenn lit. a / Einwilligung ebenfalls im Absatz →
kein Finding (Side-Purpose-Erwähnung)
- Dedup pro (doc_type, provider_id)
Severity: MEDIUM.
Norm: DSGVO Art. 6 Abs. 1 lit. a vs lit. f + AI Act Art. 50 + 51.
Tests: 17/17 grün.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
185 lines
6.6 KiB
Python
185 lines
6.6 KiB
Python
"""B15 — AI-Act Rechtsgrundlage-Check für LLM-Vendors.
|
|
|
|
Detects: an LLM/GPAI system (Vertex AI, OpenAI/GPT, Claude) that the
privacy policy (DSE) or cookie document grounds on Art. 6(1)(f) GDPR
(legitimate interest) instead of Art. 6(1)(a) GDPR (consent).

Legal argument:
- LLMs process prompts and outputs as personal data
- frequently involves a US transfer (Vertex / OpenAI / Anthropic)
- LLM logging resembles profiling
-> DSK + EDPB line: consent is the cleaner legal basis; the
   balancing of interests under lit. f is questionable when these
   factors are present.

Norm: GDPR Art. 6(1)(a) vs. (f) + AI Act Art. 50 + 51.

Ground-truth anchor: Elli AI-ACT-RISK-001 - Vertex AI chatbot declared
under lit. f.

Heuristic:
1. Split each document into paragraphs at blank lines.
2. Per paragraph: does it mention an LLM provider AND
   "berechtigtes Interesse" / "lit. f" / "legitimate interest"?
3. If the same paragraph also names consent / lit. a, skip it (the
   lit. f reference is then likely about a side purpose).
4. Otherwise emit one MEDIUM finding per (document type, provider).
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import os
|
|
import re
|
|
|
|
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Location of the chat-provider knowledge base (JSON), resolved relative to
# the directory that contains this file.
_KB_PATH = os.path.join(
    os.path.dirname(__file__), "specialist_agents", "_kb", "chat_providers.json"
)
|
|
|
|
|
|
def _load_llm_providers() -> list[dict]:
    """Return the KB providers that are ``ai_capable`` and look LLM-based.

    Not every chat platform with ``ai_capable=true`` is an LLM vendor in
    the AI Act Art. 50 sense. The list is therefore narrowed with a
    case-insensitive substring check of the provider's ``type`` and
    ``company`` fields against a small set of LLM hints, so that ordinary
    chat widgets that only ROUTE to AI do not trigger the finding.

    The loader is best-effort: an unreadable file, invalid JSON or an
    unexpected JSON shape is logged as a warning and yields an empty
    list instead of raising (this function runs at import time).

    Returns:
        A list of ``{"id": <provider id>, "data": <provider entry>}``
        dicts in KB order.
    """
    try:
        with open(_KB_PATH, encoding="utf-8") as f:
            kb = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        logger.warning("AI-legal-basis KB load failed: %s", e)
        return []

    providers = kb.get("providers") if isinstance(kb, dict) else None
    if not providers:
        # Missing / empty provider table: nothing to report, as before.
        if not isinstance(kb, dict):
            logger.warning(
                "AI-legal-basis KB has unexpected top-level type: %s",
                type(kb).__name__,
            )
        return []
    if not isinstance(providers, dict):
        logger.warning(
            "AI-legal-basis KB 'providers' has unexpected type: %s",
            type(providers).__name__,
        )
        return []

    llm_type_hints = ("ai-chatbot", "conversational-ai",
                      "ai chatbot", "llm", "gpt", "claude", "vertex")
    out: list[dict] = []
    for pid, prov in providers.items():
        # Skip malformed (non-dict) entries instead of crashing on .get().
        if not isinstance(prov, dict) or not prov.get("ai_capable"):
            continue
        # Only string fields can carry a hint; anything else is ignored.
        haystacks = [
            value.lower()
            for value in (prov.get("type"), prov.get("company"))
            if isinstance(value, str)
        ]
        if any(hint in text for text in haystacks for hint in llm_type_hints):
            out.append({"id": pid, "data": prov})
    return out
|
|
|
|
|
|
# LLM-capable KB providers, resolved once when this module is imported.
# The list is empty when the KB file cannot be read or parsed.
_LLM_PROVIDERS = _load_llm_providers()
|
|
|
|
# Spellings under which each KB provider id shows up in privacy-policy
# prose, keyed by provider id. All entries are lower-case. The lists combine
# KB names with common-knowledge synonyms: Google's "Vertex AI", for example,
# is also referred to as "Google Cloud AI" / "PaLM" / "Gemini" /
# "Generative AI".
_LLM_NAME_ALIASES: dict[str, list[str]] = {
    "vertex_ai_chatbot": [
        "vertex ai",
        "vertex-ai",
        "vertexai",
        "google cloud ai",
        "google generative ai",
        "google palm",
        "palm 2",
        "gemini",
    ],
    "openai_chatbot": [
        "openai",
        "open ai",
        "gpt-3",
        "gpt-4",
        "gpt 3",
        "gpt 4",
        "chatgpt",
        "chat gpt",
        "azure openai",
    ],
    "anthropic_claude": [
        "anthropic",
        "claude 3",
        "claude-3",
        "claude.ai",
        "claude ai",
    ],
}
|
|
|
|
|
|
# Case-insensitive patterns that indicate "legitimate interest"
# (Art. 6(1)(f) GDPR) is named as the legal basis.
_LIT_F_PATTERNS = (
    # "berechtigte / berechtigtes / berechtigten / berechtigtem /
    # berechtigter Interesse(n)".
    re.compile(r"berechtigt(?:e[snmr]?)?\s+interess", re.IGNORECASE),
    # German citation: "Art. 6 Abs. 1 [S. 1] [lit. | Buchst.] f".
    re.compile(
        r"Art(?:ikel|\.)?\s*6\s*Abs\.?\s*1\s*(?:(?:S\.|Satz)\s*1\s*)?"
        r"(?:(?:lit\.?|Buchst(?:abe|\.)?)\s*)?f\b",
        re.IGNORECASE,
    ),
    # Parenthesised citation: "Art. 6(1)(f)", "Article 6 (1) lit. f".
    # The trailing \b keeps ordinary words such as "for" after "Art. 6(1)"
    # from being read as the letter f.
    re.compile(
        r"Art(?:icle|ikel|\.)?\s*6\s*(?:Abs\.?\s*1)?\s*\(\s*1\s*\)\s*"
        r"(?:lit\.?\s*)?\(?f\b",
        re.IGNORECASE,
    ),
    re.compile(r"legitimate\s+interest", re.IGNORECASE),
)
|
|
|
|
|
|
# Case-insensitive patterns that indicate consent (Art. 6(1)(a) GDPR) is
# named in the text.
_LIT_A_PATTERNS = (
    re.compile(r"einwilligung", re.IGNORECASE),
    # German citation: "Art. 6 Abs. 1 [S. 1] [lit. | Buchst.] a".
    re.compile(
        r"Art(?:ikel|\.)?\s*6\s*Abs\.?\s*1\s*(?:(?:S\.|Satz)\s*1\s*)?"
        r"(?:(?:lit\.?|Buchst(?:abe|\.)?)\s*)?a\b",
        re.IGNORECASE,
    ),
    # Parenthesised citation: "Art. 6(1)(a)", "Article 6 (1) lit. a".
    # The letter must be bracketed or introduced by "lit." here, because a
    # bare "a" after "Art. 6(1)" is usually the English article.
    re.compile(
        r"Art(?:icle|ikel|\.)?\s*6\s*(?:Abs\.?\s*1)?\s*\(\s*1\s*\)\s*"
        r"(?:lit\.?\s*a\b|\(\s*a\s*\))",
        re.IGNORECASE,
    ),
    re.compile(r"\bconsent\b", re.IGNORECASE),
)
|
|
|
|
|
|
def _paragraph_split(text: str) -> list[str]:
    """Split *text* into paragraphs at blank lines.

    A paragraph boundary is a newline, optional whitespace, and another
    newline. Each paragraph is stripped of surrounding whitespace and
    empty results are dropped. A falsy *text* yields an empty list.
    """
    stripped_chunks = (
        chunk.strip() for chunk in re.split(r"\n\s*\n", text or "")
    )
    return [chunk for chunk in stripped_chunks if chunk]
|
|
|
|
|
|
def _has_lit_f(paragraph: str) -> bool:
    """Return True if any pattern in ``_LIT_F_PATTERNS`` matches *paragraph*."""
    for pattern in _LIT_F_PATTERNS:
        if pattern.search(paragraph):
            return True
    return False
|
|
|
|
|
|
def _has_lit_a(paragraph: str) -> bool:
    """Return True if any pattern in ``_LIT_A_PATTERNS`` matches *paragraph*."""
    for pattern in _LIT_A_PATTERNS:
        if pattern.search(paragraph):
            return True
    return False
|
|
|
|
|
|
def _find_llm_mention(paragraph: str) -> tuple[str, str] | None:
    """Return the first known LLM provider mentioned in *paragraph*.

    Providers are tried in ``_LLM_PROVIDERS`` order. For each one, the
    aliases from ``_LLM_NAME_ALIASES`` and then the provider's lower-cased
    ``company`` name are searched case-insensitively.

    An alias only counts when it starts at the beginning of the text or
    after a non-word character, so "anthropic" does not match inside
    "philanthropic". The end of the alias is deliberately left open so
    that version suffixes ("GPT-4o") and inflected forms ("Anthropics")
    are still recognised.

    Returns:
        ``(provider_id, display_name)`` where ``display_name`` is the
        provider's ``company`` value or, if that is empty, the provider
        id; ``None`` when no provider is mentioned.
    """
    p_lc = (paragraph or "").lower()
    if not p_lc:
        return None
    for prov in _LLM_PROVIDERS:
        company = prov["data"].get("company") or ""
        # Curated aliases first, then the company name itself.
        aliases = list(_LLM_NAME_ALIASES.get(prov["id"]) or [])
        aliases.append(company.lower())
        for alias in aliases:
            if not alias:
                continue
            # Left-hand boundary only; see the docstring for the rationale.
            if re.search(r"(?<!\w)" + re.escape(alias), p_lc):
                return prov["id"], company or prov["id"]
    return None
|
|
|
|
|
|
def _build_ai_legal_basis_finding(doc_type: str, prov_company: str) -> dict:
    """Assemble the AI-LEGAL-BASIS-001 finding for one document/provider pair."""
    return {
        "check_id": "AI-LEGAL-BASIS-001",
        "severity": "MEDIUM",
        "severity_reason": "questionable_basis",
        "doc_type": doc_type,
        "provider": prov_company,
        "title": (
            f"LLM-System '{prov_company}' auf Art. 6 Abs. 1 lit. f "
            "gestützt statt auf Einwilligung"
        ),
        "norm": (
            "DSGVO Art. 6 Abs. 1 lit. a vs lit. f + "
            "AI Act Art. 50 + 51"
        ),
        "evidence": (
            "LLM-Provider in einem Absatz erwähnt, der berechtigtes "
            "Interesse / lit. f als Rechtsgrundlage angibt. Bei "
            "Prompt-/Output-Logging mit US-Transfer und Profiling-"
            "Ähnlichkeit ist die Interessenabwägung fragwürdig."
        ),
        "action": (
            f"Rechtsgrundlage für {prov_company} auf Art. 6 Abs. 1 "
            "lit. a (Einwilligung) umstellen. Pre-Interaction-"
            "Consent + AI-Act Art. 50 Disclosure am Chat-UI "
            "einrichten."
        ),
    }


def check_ai_legal_basis(state: dict) -> list[dict]:
    """Flag LLM providers that a document grounds on Art. 6(1)(f) GDPR.

    Reads ``state["doc_texts"]`` and scans the ``"dse"`` and ``"cookie"``
    texts paragraph by paragraph. A paragraph produces a finding when it
    mentions a known LLM provider, contains a legitimate-interest
    reference, and does not also contain a consent reference.

    Args:
        state: Mapping with an optional ``"doc_texts"`` entry that maps
            document type to its text.

    Returns:
        A list of finding dicts, at most one per (document type,
        provider id) pair, in order of first occurrence.
    """
    doc_texts = state.get("doc_texts") or {}
    findings: list[dict] = []
    reported: set[tuple[str, str]] = set()

    for doc_type in ("dse", "cookie"):
        text = doc_texts.get(doc_type) or ""
        if not text:
            continue

        for para in _paragraph_split(text):
            mention = _find_llm_mention(para)
            # A consent reference in the same paragraph suggests the lit. f
            # reference is about a side purpose (e.g. analytics logging),
            # so such paragraphs are skipped to avoid noise.
            if not mention or not _has_lit_f(para) or _has_lit_a(para):
                continue

            prov_id, prov_company = mention
            dedup_key = (doc_type, prov_id)
            if dedup_key in reported:
                continue
            reported.add(dedup_key)
            findings.append(
                _build_ai_legal_basis_finding(doc_type, prov_company)
            )

    if findings:
        logger.info("B15 ai-legal-basis: %d finding(s)", len(findings))
    return findings
|