""" Legal Basis Validator — checks if the correct DSGVO legal basis (lit. a-f) is used for each processing purpose in the privacy policy. ⚠️ TECHNISCHE SCHULD / HARDCODED KNOWLEDGE: Dieses Modul enthält hartkodierte Rechtsgrundlagen-Zuordnungen (CORRECT_BASIS dict). Das ist ein TEMPORAERER Fallback bis die Control Library entsprechende Controls hat. MITTELFRISTIGES ZIEL: Dieses Dict durch RAG/Control-Library-Abfragen ersetzen. Neue Controls sollten in der Pipeline generiert werden, z.B.: "Cookie-Tracking erfordert Art. 6(1)(a) Einwilligung (EuGH C-673/17 Planet49)" → canonical_controls mit scope_conditions + legal_ref BIS DAHIN: Dieses Dict wird als Fallback genutzt mit einem Warning-Log wenn es herangezogen wird. Bei jedem neuen Gesetz/Urteil muss SOWOHL die Pipeline als auch dieses Dict aktualisiert werden — oder besser: das Dict entfernen und nur noch Controls nutzen. Erstellt: 2026-04-29 | Review-Datum: 2026-07-01 | Owner: Agent-Team Common mistakes detected: - Cookie tracking on lit. f (legitimate interest) instead of lit. a (consent) - Marketing emails on lit. f instead of lit. a - Analytics on lit. b (contract) — incorrect overextension - Klarna credit check without Art. 22 reference """ import logging import re from dataclasses import dataclass logger = logging.getLogger(__name__) @dataclass class LitFinding: purpose: str stated_basis: str correct_basis: str severity: str text: str legal_ref: str original_text: str = "" # Purpose → correct legal basis mapping # Based on: DSK Kurzpapiere, Planet49 (EuGH C-673/17), BGH Cookie-Urteil CORRECT_BASIS: dict[str, dict] = { "cookie_tracking": { "correct": "lit. a (Einwilligung)", "wrong_patterns": ["berechtigtes interesse", "lit. f", "lit.f", "legitimate interest"], "detect_patterns": ["cookie", "tracking", "pixel", "analytics.*cookie"], "ref": "EuGH C-673/17 (Planet49), §25 TDDDG", }, "web_analytics": { "correct": "lit. a (Einwilligung)", "wrong_patterns": ["berechtigtes interesse", "lit. f", "lit.f", "vertragserfuellung", "lit. b", "lit.b"], "detect_patterns": ["google analytics", "webanalyse", "web analytics", "reichweitenmessung", "nutzungsanalyse", "hotjar", "matomo"], "ref": "DSK Orientierungshilfe Telemedien, §25 TDDDG", }, "marketing_email": { "correct": "lit. a (Einwilligung)", "wrong_patterns": ["berechtigtes interesse", "lit. f", "lit.f"], "detect_patterns": ["newsletter", "marketing.*mail", "werbe.*mail", "werbe.*email", "marketing.*email", "werbliche.*kommunikation"], "ref": "Art. 7 DSGVO, §7 UWG (Double Opt-In)", }, "remarketing": { "correct": "lit. a (Einwilligung)", "wrong_patterns": ["berechtigtes interesse", "lit. f", "lit.f"], "detect_patterns": ["remarketing", "retargeting", "personalisierte werbung", "personalized advertising", "custom audience"], "ref": "§25 TDDDG, EuGH C-673/17", }, "credit_check": { "correct": "lit. b/f + Art. 22 DSGVO Hinweis", "wrong_patterns": [], # Not about wrong basis, but missing Art. 22 "detect_patterns": ["bonitaet", "bonität", "kreditprüfung", "kreditpruefung", "schufa", "auskunftei", "klarna.*rechnung", "ratenzahlung"], "ref": "Art. 22 DSGVO (automatisierte Einzelentscheidung)", "must_contain": ["art. 22", "art.22", "automatisierte entscheidung", "automated decision", "einzelentscheidung"], }, "social_media_embed": { "correct": "lit. a (Einwilligung)", "wrong_patterns": ["berechtigtes interesse", "lit. f", "lit.f"], "detect_patterns": ["facebook.*plugin", "social.*plugin", "like.*button", "share.*button", "instagram.*embed", "twitter.*embed"], "ref": "EuGH C-40/17 (Fashion ID), 2-Klick-Loesung", }, "session_recording": { "correct": "lit. a (Einwilligung)", "wrong_patterns": ["berechtigtes interesse", "lit. f", "lit.f"], "detect_patterns": ["session.?recording", "session.?replay", "heatmap", "mouseflow", "hotjar.*recording", "clarity.*recording", "fullstory", "lucky orange"], "ref": "§25 TDDDG, Aufzeichnung von Nutzerverhalten", }, } def validate_legal_bases(dse_text: str) -> list[LitFinding]: """Check if correct legal bases are used in the privacy policy. ⚠️ Uses HARDCODED CORRECT_BASIS dict as fallback. TODO: Replace with RAG/Control Library query when lit-mapping Controls exist. """ logger.warning( "legal_basis_validator: Using HARDCODED rules (CORRECT_BASIS dict). " "This should be replaced with Control Library queries. Review date: 2026-07-01" ) findings = [] text_lower = dse_text.lower() for purpose_id, rules in CORRECT_BASIS.items(): # Step 1: Is this purpose mentioned in the DSE? purpose_found = False matched_text = "" for pattern in rules["detect_patterns"]: match = re.search(pattern, text_lower) if match: purpose_found = True # Extract surrounding context (200 chars) start = max(0, match.start() - 100) end = min(len(text_lower), match.end() + 200) matched_text = dse_text[start:end].strip() break if not purpose_found: continue context_lower = matched_text.lower() # Step 2: Check if wrong legal basis is stated for wrong in rules["wrong_patterns"]: if wrong in context_lower: findings.append(LitFinding( purpose=purpose_id, stated_basis=wrong, correct_basis=rules["correct"], severity="HIGH", text=f"Falsche Rechtsgrundlage: '{_purpose_label(purpose_id)}' nutzt " f"'{wrong}' statt '{rules['correct']}'", legal_ref=rules["ref"], original_text=matched_text[:300], )) break # Step 3: Special check — must_contain (e.g., Art. 22 for credit checks) if "must_contain" in rules: has_required = any(req in context_lower for req in rules["must_contain"]) if not has_required: findings.append(LitFinding( purpose=purpose_id, stated_basis="(fehlt)", correct_basis=rules["correct"], severity="HIGH", text=f"Pflichthinweis fehlt: '{_purpose_label(purpose_id)}' erwaehnt " f"keine automatisierte Entscheidungsfindung ({rules['ref']})", legal_ref=rules["ref"], original_text=matched_text[:300], )) return findings def _purpose_label(purpose_id: str) -> str: """German label for purpose ID.""" labels = { "cookie_tracking": "Cookie-Tracking", "web_analytics": "Webanalyse", "marketing_email": "Marketing-Emails/Newsletter", "remarketing": "Remarketing/Retargeting", "credit_check": "Bonitaetspruefung", "social_media_embed": "Social Media Einbindung", "session_recording": "Session Recording/Heatmaps", } return labels.get(purpose_id, purpose_id)