# breakpilot-compliance/backend-compliance/compliance/services/ai_legal_basis_check.py

"""B15 — AI-Act Rechtsgrundlage-Check für LLM-Vendors.

Erkennt: LLM/GPAI-System (Vertex AI, OpenAI/GPT, Claude) wird in
DSE/Cookie-Doc auf Art. 6 Abs. 1 lit. f (berechtigtes Interesse)
gestützt — statt auf lit. a (Einwilligung).

Norm-Argument:
  - LLMs verarbeiten Prompts + Outputs als personenbezogene Daten
  - oft US-Transfer (Vertex / OpenAI / Anthropic)
  - LLM-Logging hat Profiling-Ähnlichkeit
  → DSK + EDPB-Linie: Einwilligung ist sauberere Rechtsgrundlage,
    lit. f-Interessenabwägung ist bei diesen Faktoren fragwürdig.

Norm: DSGVO Art. 6 Abs. 1 lit. a vs lit. f + AI Act Art. 50 + 51.

GT-Anker: Elli AI-ACT-RISK-001 — Vertex-AI-Chatbot mit lit. f
deklariert.

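Heuristic:
  1. Split the document into paragraphs on blank lines.
  2. Per paragraph: does it mention an LLM provider AND
     "berechtigtes Interesse" / "lit. f" / "legitimate interest"?
  3. If yes, and the same paragraph does not also name consent
     (Einwilligung / lit. a) → MEDIUM finding, reported at most once
     per document type and provider.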
"""

from __future__ import annotations

import json
import logging
import os
import re

# Module-scoped logger, named after this module.
logger = logging.getLogger(__name__)

# Location of the chat-provider knowledge base: the "_kb" folder below
# "specialist_agents", resolved relative to the directory of this file.
_KB_PATH = os.path.join(
    os.path.join(os.path.dirname(__file__), "specialist_agents", "_kb"),
    "chat_providers.json",
)


def _load_llm_providers() -> list[dict]:
    """Load the chat-provider KB and return the entries that look LLM-based.

    An entry is kept when it is flagged ``ai_capable`` AND at least one
    LLM hint substring occurs in its lower-cased ``type`` or ``company``
    field. The extra hint check keeps ordinary chat widgets that are
    merely marked ``ai_capable`` from triggering the finding.

    Loading is best-effort: an unreadable file, invalid JSON or JSON of
    an unexpected shape is logged as a warning and yields an empty list,
    so a broken KB never raises out of this function.

    Returns:
        A list of ``{"id": <provider key>, "data": <provider entry>}``
        dicts, in KB order.
    """
    try:
        with open(_KB_PATH, encoding="utf-8") as f:
            kb = json.load(f)
    except Exception as e:
        # Deliberately broad: this check degrades to "no providers"
        # instead of failing when the KB cannot be read or parsed.
        logger.warning("AI-legal-basis KB load failed: %s", e)
        return []

    # Valid JSON can still have the wrong shape; validate before using
    # dict methods so a malformed KB does not raise AttributeError.
    if not isinstance(kb, dict):
        logger.warning("AI-legal-basis KB load failed: %s",
                       "top-level JSON value is not an object")
        return []
    providers = kb.get("providers") or {}
    if not isinstance(providers, dict):
        logger.warning("AI-legal-basis KB load failed: %s",
                       "'providers' is not an object")
        return []

    llm_type_hints = ("ai-chatbot", "conversational-ai",
                      "ai chatbot", "llm", "gpt", "claude", "vertex")
    out: list[dict] = []
    for pid, prov in providers.items():
        # Skip malformed entries and providers not marked AI-capable.
        if not isinstance(prov, dict) or not prov.get("ai_capable"):
            continue
        # str() guards against non-string values in the KB.
        type_str = str(prov.get("type") or "").lower()
        company = str(prov.get("company") or "").lower()
        if any(h in type_str or h in company for h in llm_type_hints):
            out.append({"id": pid, "data": prov})
    return out


# Resolved once, when this module is imported. The loader returns an
# empty list when the KB file cannot be read or parsed.
_LLM_PROVIDERS = _load_llm_providers()

# Name variants under which each provider shows up in privacy-policy
# prose, keyed by the provider id used in the KB. Entries are written
# in lower case because they are compared against lower-cased paragraph
# text. Beyond the product name itself this lists well-known synonyms
# (Google's "Vertex AI" is also referred to as "Google Cloud AI",
# "PaLM", "Gemini" or "Generative AI").
_LLM_NAME_ALIASES: dict[str, list[str]] = {
    # Google
    "vertex_ai_chatbot": [
        "vertex ai",
        "vertex-ai",
        "vertexai",
        "google cloud ai",
        "google generative ai",
        "google palm",
        "palm 2",
        "gemini",
    ],
    # OpenAI (including the Azure-hosted offering)
    "openai_chatbot": [
        "openai",
        "open ai",
        "gpt-3",
        "gpt-4",
        "gpt 3",
        "gpt 4",
        "chatgpt",
        "chat gpt",
        "azure openai",
    ],
    # Anthropic
    "anthropic_claude": [
        "anthropic",
        "claude 3",
        "claude-3",
        "claude.ai",
        "claude ai",
    ],
}


# Case-insensitive patterns that indicate "legitimate interest"
# (Art. 6(1)(f) GDPR) is named as the legal basis.
_LIT_F_PATTERNS = (
    # German wording in all inflections: berechtigtes / berechtigten /
    # berechtigtem / berechtigter / berechtigte Interesse(n).
    re.compile(r"berechtigt(?:e[snmr]?)?\s+interess", re.IGNORECASE),
    # German citation style: "Art. 6 Abs. 1 lit. f", optionally with
    # "S. 1" / "Satz 1" in between and "Buchst." instead of "lit.".
    re.compile(
        r"Art(?:ikel|\.)?\s*6\s*Abs(?:atz|\.)?\s*1\s*"
        r"(?:(?:S\.|Satz)\s*1\s*)?"
        r"(?:(?:lit\.?|Buchst(?:abe|\.)?)\s*)?f\b",
        re.IGNORECASE,
    ),
    # Parenthesised style: "Art. 6(1)(f)", "Art. 6 (1) lit. f". The
    # trailing \b keeps "Art. 6 (1) for ..." from matching.
    re.compile(
        r"Art(?:icle|ikel|\.)?\s*6\s*(?:Abs\.?\s*1)?\s*\(\s*1\s*\)\s*"
        r"(?:lit\.?\s*)?\(?f\b",
        re.IGNORECASE,
    ),
    # English wording.
    re.compile(r"legitimate\s+interest", re.IGNORECASE),
)


# Case-insensitive patterns that indicate consent (Art. 6(1)(a) GDPR)
# is named in the text. Kept parallel to the lit. f patterns so both
# legal bases are recognised in the same citation styles.
_LIT_A_PATTERNS = (
    # German wording (also inside compounds such as
    # "Einwilligungserklärung").
    re.compile(r"einwilligung", re.IGNORECASE),
    # German citation style: "Art. 6 Abs. 1 lit. a", optionally with
    # "S. 1" / "Satz 1" in between and "Buchst." instead of "lit.".
    re.compile(
        r"Art(?:ikel|\.)?\s*6\s*Abs(?:atz|\.)?\s*1\s*"
        r"(?:(?:S\.|Satz)\s*1\s*)?"
        r"(?:(?:lit\.?|Buchst(?:abe|\.)?)\s*)?a\b",
        re.IGNORECASE,
    ),
    # Parenthesised style: "Art. 6(1)(a)", "Art. 6 (1) lit. a". A
    # bracket or "lit." is required so the English article "a" after
    # "Art. 6 (1)" does not count as a citation.
    re.compile(
        r"Art(?:icle|ikel|\.)?\s*6\s*\(\s*1\s*\)\s*(?:lit\.?\s*\(?|\()a\b",
        re.IGNORECASE,
    ),
    # English wording.
    re.compile(r"\bconsent\b", re.IGNORECASE),
)


def _paragraph_split(text: str) -> list[str]:
    """Break *text* into paragraphs separated by blank lines.

    A separator is a newline, optional whitespace, and another newline.
    Each paragraph is stripped of surrounding whitespace and empty
    pieces are dropped. A falsy *text* gives an empty list.
    """
    paragraphs: list[str] = []
    for chunk in re.split(r"\n\s*\n", text or ""):
        cleaned = chunk.strip()
        if cleaned:
            paragraphs.append(cleaned)
    return paragraphs


def _has_lit_f(paragraph: str) -> bool:
    """Report whether *paragraph* matches any pattern in ``_LIT_F_PATTERNS``."""
    for pattern in _LIT_F_PATTERNS:
        if pattern.search(paragraph):
            return True
    return False


def _has_lit_a(paragraph: str) -> bool:
    """Report whether *paragraph* matches any pattern in ``_LIT_A_PATTERNS``."""
    for pattern in _LIT_A_PATTERNS:
        if pattern.search(paragraph):
            return True
    return False


def _find_llm_mention(paragraph: str) -> tuple[str, str] | None:
    """Return the first known LLM provider named in *paragraph*.

    Providers are tried in ``_LLM_PROVIDERS`` order. For each one, the
    aliases registered under its id in ``_LLM_NAME_ALIASES`` are checked
    first, then the provider's own ``company`` value. Matching is a
    case-insensitive substring test on whitespace-normalised text, so a
    multi-word alias such as "vertex ai" is still found when the name is
    split by a line wrap, a non-breaking space or repeated spaces.

    Returns:
        ``(provider_id, display_name)`` for the first match, where
        ``display_name`` is the company name or, if that is empty, the
        provider id. ``None`` when no provider is mentioned.
    """
    # Collapse every run of whitespace to a single space.
    haystack = " ".join((paragraph or "").lower().split())
    if not haystack:
        return None
    for prov in _LLM_PROVIDERS:
        company = prov["data"].get("company") or ""
        # Registered aliases first, then the company name itself.
        candidates = list(_LLM_NAME_ALIASES.get(prov["id"]) or [])
        candidates.append(company)
        for alias in candidates:
            needle = " ".join(alias.lower().split())
            if needle and needle in haystack:
                return prov["id"], company or prov["id"]
    return None


def _build_ai_legal_basis_finding(doc_type: str, prov_company: str) -> dict:
    """Assemble the AI-LEGAL-BASIS-001 finding for one provider in one document."""
    return {
        "check_id": "AI-LEGAL-BASIS-001",
        "severity": "MEDIUM",
        "severity_reason": "questionable_basis",
        "doc_type": doc_type,
        "provider": prov_company,
        "title": (
            f"LLM-System '{prov_company}' auf Art. 6 Abs. 1 lit. f "
            "gestützt statt auf Einwilligung"
        ),
        "norm": (
            "DSGVO Art. 6 Abs. 1 lit. a vs lit. f + "
            "AI Act Art. 50 + 51"
        ),
        "evidence": (
            "LLM-Provider in einem Absatz erwähnt, der berechtigtes "
            "Interesse / lit. f als Rechtsgrundlage angibt. Bei "
            "Prompt-/Output-Logging mit US-Transfer und Profiling-"
            "Ähnlichkeit ist die Interessenabwägung fragwürdig."
        ),
        "action": (
            f"Rechtsgrundlage für {prov_company} auf Art. 6 Abs. 1 "
            "lit. a (Einwilligung) umstellen. Pre-Interaction-"
            "Consent + AI-Act Art. 50 Disclosure am Chat-UI "
            "einrichten."
        ),
    }


def check_ai_legal_basis(state: dict) -> list[dict]:
    """Flag LLM providers whose paragraph relies on Art. 6 Abs. 1 lit. f.

    Reads the "dse" and "cookie" texts from ``state["doc_texts"]`` and
    inspects each paragraph. A paragraph produces a finding when it
    mentions a known LLM provider, names legitimate interest / lit. f,
    and does not also name consent / lit. a.

    Returns:
        A list of finding dicts, at most one per (document type,
        provider) pair; empty when nothing is flagged.
    """
    texts_by_type = state.get("doc_texts") or {}
    results: list[dict] = []
    reported: set[tuple[str, str]] = set()

    for doc_type in ("dse", "cookie"):
        body = texts_by_type.get(doc_type) or ""
        if not body:
            continue
        for paragraph in _paragraph_split(body):
            hit = _find_llm_mention(paragraph)
            # A paragraph that also names consent / lit. a is skipped:
            # its lit. f reference is likely about a side purpose
            # (e.g. analytics logging), so reporting it would be noise.
            flagged = (
                hit
                and _has_lit_f(paragraph)
                and not _has_lit_a(paragraph)
            )
            if not flagged:
                continue
            prov_id, prov_company = hit
            dedupe_key = (doc_type, prov_id)
            if dedupe_key not in reported:
                reported.add(dedupe_key)
                results.append(
                    _build_ai_legal_basis_finding(doc_type, prov_company)
                )

    if results:
        logger.info("B15 ai-legal-basis: %d finding(s)", len(results))
    return results