"""B15 — AI-Act Rechtsgrundlage-Check für LLM-Vendors. Erkennt: LLM/GPAI-System (Vertex AI, OpenAI/GPT, Claude) wird in DSE/Cookie-Doc auf Art. 6 Abs. 1 lit. f (berechtigtes Interesse) gestützt — statt auf lit. a (Einwilligung). Norm-Argument: - LLMs verarbeiten Prompts + Outputs als personenbezogene Daten - oft US-Transfer (Vertex / OpenAI / Anthropic) - LLM-Logging hat Profiling-Ähnlichkeit → DSK + EDPB-Linie: Einwilligung ist sauberere Rechtsgrundlage, lit. f-Interessenabwägung ist bei diesen Faktoren fragwürdig. Norm: DSGVO Art. 6 Abs. 1 lit. a vs lit. f + AI Act Art. 50 + 51. GT-Anker: Elli AI-ACT-RISK-001 — Vertex-AI-Chatbot mit lit. f deklariert. Heuristik: 1. Absatz-Splitting (\\n\\n). 2. Pro Absatz: enthält Mention eines LLM-Providers UND "berechtigtes Interesse" / "lit. f" / "legitimate interest"? 3. Wenn ja → Finding MEDIUM. """ from __future__ import annotations import json import logging import os import re logger = logging.getLogger(__name__) _KB_PATH = os.path.join( os.path.dirname(__file__), "specialist_agents", "_kb", "chat_providers.json", ) def _load_llm_providers() -> list[dict]: """Return KB entries marked ai_capable AND that look LLM-based. Not every chat-platform with 'ai_capable=true' is an LLM-vendor in the AI-Act-Art. 50 sense. We tighten the list with a name regex (LLM/GPT/Claude/Vertex/Gemini) plus a 'type' substring check so that ordinary chat widgets that only ROUTE to AI don't trigger this finding. """ try: with open(_KB_PATH, encoding="utf-8") as f: kb = json.load(f) except Exception as e: logger.warning("AI-legal-basis KB load failed: %s", e) return [] out: list[dict] = [] llm_type_hints = ("ai-chatbot", "conversational-ai", "ai chatbot", "llm", "gpt", "claude", "vertex") for pid, prov in (kb.get("providers") or {}).items(): if not prov.get("ai_capable"): continue type_str = (prov.get("type") or "").lower() company = (prov.get("company") or "").lower() if (any(h in type_str for h in llm_type_hints) or any(h in company for h in llm_type_hints)): out.append({"id": pid, "data": prov}) return out _LLM_PROVIDERS = _load_llm_providers() # Aliases that appear in DSE-prose for each provider. Built from KB + # common-knowledge synonyms (Google's "Vertex AI" is also referenced as # "Google Cloud AI" / "PaLM" / "Gemini" / "Generative AI"). _LLM_NAME_ALIASES: dict[str, list[str]] = { "vertex_ai_chatbot": [ "vertex ai", "vertex-ai", "vertexai", "google cloud ai", "google generative ai", "google palm", "palm 2", "gemini", ], "openai_chatbot": [ "openai", "open ai", "gpt-3", "gpt-4", "gpt 3", "gpt 4", "chatgpt", "chat gpt", "azure openai", ], "anthropic_claude": [ "anthropic", "claude 3", "claude-3", "claude.ai", "claude ai", ], } _LIT_F_PATTERNS = ( re.compile(r"berechtigt(?:e[snm]?)?\s+interess", re.IGNORECASE), re.compile(r"Art\.?\s*6\s*Abs\.?\s*1\s*(?:lit\.?\s*)?f\b", re.IGNORECASE), re.compile(r"Art\.?\s*6\s*(?:Abs\.?\s*1)?\s*\(\s*1\s*\)\s*\(?f", re.IGNORECASE), re.compile(r"legitimate\s+interest", re.IGNORECASE), ) _LIT_A_PATTERNS = ( re.compile(r"einwilligung", re.IGNORECASE), re.compile(r"Art\.?\s*6\s*Abs\.?\s*1\s*(?:lit\.?\s*)?a\b", re.IGNORECASE), re.compile(r"\bconsent\b", re.IGNORECASE), ) def _paragraph_split(text: str) -> list[str]: return [p.strip() for p in re.split(r"\n\s*\n", text or "") if p.strip()] def _has_lit_f(paragraph: str) -> bool: return any(p.search(paragraph) for p in _LIT_F_PATTERNS) def _has_lit_a(paragraph: str) -> bool: return any(p.search(paragraph) for p in _LIT_A_PATTERNS) def _find_llm_mention(paragraph: str) -> tuple[str, str] | None: p_lc = paragraph.lower() for prov in _LLM_PROVIDERS: aliases = _LLM_NAME_ALIASES.get(prov["id"]) or [] # also include the company name directly aliases = aliases + [(prov["data"].get("company") or "").lower()] for alias in aliases: if alias and alias in p_lc: return prov["id"], prov["data"].get("company") or prov["id"] return None def check_ai_legal_basis(state: dict) -> list[dict]: """Emit findings when an LLM provider is mentioned in a paragraph that grounds processing on Art. 6 Abs. 1 lit. f.""" doc_texts = state.get("doc_texts") or {} findings: list[dict] = [] seen: set[tuple[str, str]] = set() for doc_type in ("dse", "cookie"): text = doc_texts.get(doc_type) or "" if not text: continue for para in _paragraph_split(text): mention = _find_llm_mention(para) if not mention: continue if not _has_lit_f(para): continue # If the same paragraph ALSO names lit. a / Einwilligung, # the lit. f reference is likely about a side-purpose # (e.g. analytics-Logging) — skip to avoid noise. if _has_lit_a(para): continue prov_id, prov_company = mention key = (doc_type, prov_id) if key in seen: continue seen.add(key) findings.append({ "check_id": "AI-LEGAL-BASIS-001", "severity": "MEDIUM", "severity_reason": "questionable_basis", "doc_type": doc_type, "provider": prov_company, "title": ( f"LLM-System '{prov_company}' auf Art. 6 Abs. 1 lit. f " "gestützt statt auf Einwilligung" ), "norm": ( "DSGVO Art. 6 Abs. 1 lit. a vs lit. f + " "AI Act Art. 50 + 51" ), "evidence": ( "LLM-Provider in einem Absatz erwähnt, der berechtigtes " "Interesse / lit. f als Rechtsgrundlage angibt. Bei " "Prompt-/Output-Logging mit US-Transfer und Profiling-" "Ähnlichkeit ist die Interessenabwägung fragwürdig." ), "action": ( f"Rechtsgrundlage für {prov_company} auf Art. 6 Abs. 1 " "lit. a (Einwilligung) umstellen. Pre-Interaction-" "Consent + AI-Act Art. 50 Disclosure am Chat-UI " "einrichten." ), }) if findings: logger.info("B15 ai-legal-basis: %d finding(s)", len(findings)) return findings