# breakpilot-compliance/backend-compliance/compliance/services/legal_basis_validator.py

"""
Legal Basis Validator — checks if the correct DSGVO legal basis (lit. a-f)
is used for each processing purpose in the privacy policy.

Common mistakes:
- Cookie tracking on lit. f (legitimate interest) instead of lit. a (consent)
- Marketing emails on lit. f instead of lit. a
- Analytics on lit. b (contract) — incorrect overextension
- Klarna credit check without Art. 22 reference
"""

import logging
import re
from dataclasses import dataclass

logger = logging.getLogger(__name__)


@dataclass
class LitFinding:
    """A single legal-basis problem detected for one processing purpose.

    Instances are created by ``validate_legal_bases``; all fields are plain
    strings.
    """

    # Purpose identifier, i.e. a key of CORRECT_BASIS such as "cookie_tracking".
    purpose: str
    # Basis wording found in the policy text, or "(fehlt)" when a mandatory
    # notice is missing.
    stated_basis: str
    # Basis the rule table considers correct for this purpose.
    correct_basis: str
    # Severity label; the validator in this module emits "HIGH".
    severity: str
    # Human-readable description of the problem (German).
    text: str
    # Case law / statute backing the finding.
    legal_ref: str
    # Excerpt of the policy text surrounding the purpose mention; the
    # validator truncates it to 300 characters.
    original_text: str = ""


# Purpose → correct legal basis mapping
# Based on: DSK Kurzpapiere, Planet49 (EuGH C-673/17), BGH Cookie-Urteil
#
# Rule schema (one entry per purpose ID):
#   "correct"         – label of the legal basis that should be used.
#   "wrong_patterns"  – lowercase substrings; finding one of them in the
#                       lowercased text around a purpose mention means an
#                       unsuitable basis is stated.
#   "detect_patterns" – regular expressions (written in lowercase) whose match
#                       marks the purpose as mentioned in the policy.
#   "ref"             – legal reference quoted in the finding.
#   "must_contain"    – optional; lowercase substrings of which at least one
#                       has to appear near the purpose mention.
#
# German terms are listed both with umlauts and in ASCII transliteration
# ("ä"/"ae", "ü"/"ue"), because policies use either spelling.
CORRECT_BASIS: dict[str, dict] = {
    "cookie_tracking": {
        "correct": "lit. a (Einwilligung)",
        "wrong_patterns": ["berechtigtes interesse", "lit. f", "lit.f", "legitimate interest"],
        "detect_patterns": ["cookie", "tracking", "pixel", "analytics.*cookie"],
        "ref": "EuGH C-673/17 (Planet49), §25 TDDDG",
    },
    "web_analytics": {
        "correct": "lit. a (Einwilligung)",
        # Both lit. f and lit. b (contract performance) are unsuitable here.
        "wrong_patterns": ["berechtigtes interesse", "lit. f", "lit.f",
                           "vertragserfuellung", "vertragserfüllung", "lit. b", "lit.b"],
        "detect_patterns": ["google analytics", "webanalyse", "web analytics", "reichweitenmessung",
                            "nutzungsanalyse", "hotjar", "matomo"],
        "ref": "DSK Orientierungshilfe Telemedien, §25 TDDDG",
    },
    "marketing_email": {
        "correct": "lit. a (Einwilligung)",
        "wrong_patterns": ["berechtigtes interesse", "lit. f", "lit.f"],
        "detect_patterns": ["newsletter", "marketing.*mail", "werbe.*mail", "werbe.*email",
                            "marketing.*email", "werbliche.*kommunikation"],
        "ref": "Art. 7 DSGVO, §7 UWG (Double Opt-In)",
    },
    "remarketing": {
        "correct": "lit. a (Einwilligung)",
        "wrong_patterns": ["berechtigtes interesse", "lit. f", "lit.f"],
        "detect_patterns": ["remarketing", "retargeting", "personalisierte werbung",
                            "personalized advertising", "custom audience"],
        "ref": "§25 TDDDG, EuGH C-673/17",
    },
    "credit_check": {
        "correct": "lit. b/f + Art. 22 DSGVO Hinweis",
        "wrong_patterns": [],  # Not about wrong basis, but missing Art. 22
        "detect_patterns": ["bonitaet", "bonität", "kreditprüfung", "kreditpruefung",
                            "schufa", "auskunftei", "klarna.*rechnung", "ratenzahlung"],
        "ref": "Art. 22 DSGVO (automatisierte Einzelentscheidung)",
        "must_contain": ["art. 22", "art.22", "automatisierte entscheidung",
                         "automated decision", "einzelentscheidung"],
    },
    "social_media_embed": {
        "correct": "lit. a (Einwilligung)",
        "wrong_patterns": ["berechtigtes interesse", "lit. f", "lit.f"],
        "detect_patterns": ["facebook.*plugin", "social.*plugin", "like.*button",
                            "share.*button", "instagram.*embed", "twitter.*embed"],
        "ref": "EuGH C-40/17 (Fashion ID), 2-Klick-Loesung",
    },
    "session_recording": {
        "correct": "lit. a (Einwilligung)",
        "wrong_patterns": ["berechtigtes interesse", "lit. f", "lit.f"],
        "detect_patterns": ["session.?recording", "session.?replay", "heatmap",
                            "mouseflow", "hotjar.*recording", "clarity.*recording",
                            "fullstory", "lucky orange"],
        "ref": "§25 TDDDG, Aufzeichnung von Nutzerverhalten",
    },
}


# Characters of policy text captured before / after each purpose mention.
_CONTEXT_CHARS_BEFORE = 100
_CONTEXT_CHARS_AFTER = 200
# Maximum length of the excerpt stored on a finding.
_EXCERPT_CHARS = 300


def _purpose_contexts(dse_text: str, detect_patterns: list[str]) -> list[str]:
    """Return the text surrounding every mention of a purpose.

    Patterns are searched case-insensitively in the original text rather than
    in a lowercased copy: ``str.lower()`` can change the length of a string,
    so offsets taken from a lowercased copy do not reliably index the
    original. Contexts are ordered by pattern, then by position in the text.
    """
    return [
        dse_text[max(0, match.start() - _CONTEXT_CHARS_BEFORE):
                 match.end() + _CONTEXT_CHARS_AFTER].strip()
        for pattern in detect_patterns
        for match in re.finditer(pattern, dse_text, re.IGNORECASE)
    ]


def validate_legal_bases(dse_text: str) -> list[LitFinding]:
    """Check if correct legal bases are used in the privacy policy.

    For each purpose in ``CORRECT_BASIS`` every mention in the policy is
    located and the text around it (100 characters before, 200 after) is
    inspected:

    * If any of these contexts contains one of the purpose's
      ``wrong_patterns``, a "wrong legal basis" finding is produced for the
      first such context.
    * If the rule defines ``must_contain`` and none of the contexts contains
      one of the required phrases, a "mandatory notice missing" finding is
      produced.

    Args:
        dse_text: Full text of the privacy policy.

    Returns:
        The findings in rule-table order; at most one wrong-basis finding and
        one missing-notice finding per purpose. Empty if nothing was detected
        or ``dse_text`` is empty.
    """
    findings: list[LitFinding] = []
    if not dse_text:
        return findings

    for purpose_id, rules in CORRECT_BASIS.items():
        # Step 1: Is this purpose mentioned in the DSE?
        contexts = _purpose_contexts(dse_text, rules["detect_patterns"])
        if not contexts:
            continue
        contexts_lower = [context.lower() for context in contexts]

        # Step 2: Check if a wrong legal basis is stated near any mention.
        # The purpose is often named several times (intro, dedicated section),
        # so every mention is examined, not only the first one.
        wrong_hit = next(
            (
                (context, wrong)
                for context, context_lower in zip(contexts, contexts_lower)
                for wrong in rules["wrong_patterns"]
                if wrong in context_lower
            ),
            None,
        )
        if wrong_hit is not None:
            context, wrong = wrong_hit
            findings.append(LitFinding(
                purpose=purpose_id,
                stated_basis=wrong,
                correct_basis=rules["correct"],
                severity="HIGH",
                text=f"Falsche Rechtsgrundlage: '{_purpose_label(purpose_id)}' nutzt "
                     f"'{wrong}' statt '{rules['correct']}'",
                legal_ref=rules["ref"],
                original_text=context[:_EXCERPT_CHARS],
            ))

        # Step 3: Special check — must_contain (e.g., Art. 22 for credit checks)
        if "must_contain" in rules:
            has_required = any(
                req in context_lower
                for context_lower in contexts_lower
                for req in rules["must_contain"]
            )
            if not has_required:
                findings.append(LitFinding(
                    purpose=purpose_id,
                    stated_basis="(fehlt)",
                    correct_basis=rules["correct"],
                    severity="HIGH",
                    text=f"Pflichthinweis fehlt: '{_purpose_label(purpose_id)}' erwaehnt "
                         f"keine automatisierte Entscheidungsfindung ({rules['ref']})",
                    legal_ref=rules["ref"],
                    # Quote the first mention as the reference excerpt.
                    original_text=contexts[0][:_EXCERPT_CHARS],
                ))

    return findings


def _purpose_label(purpose_id: str) -> str:
    """Translate a purpose ID into its German display label.

    Unknown IDs are returned unchanged.
    """
    display_names = dict(
        cookie_tracking="Cookie-Tracking",
        web_analytics="Webanalyse",
        marketing_email="Marketing-Emails/Newsletter",
        remarketing="Remarketing/Retargeting",
        credit_check="Bonitaetspruefung",
        social_media_embed="Social Media Einbindung",
        session_recording="Session Recording/Heatmaps",
    )
    try:
        return display_names[purpose_id]
    except KeyError:
        # No label registered: fall back to the raw identifier.
        return purpose_id