""" Lit-Mapping Validation Test — verifies that BOTH the hardcoded dict AND the Control Library detect the same legal basis errors. If both produce the same results, we can safely delete the dict. Test cases use deliberately WRONG legal basis assignments that are common mistakes on real websites. """ import asyncio import json import os import sys # Add parent to path for imports sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) # ═══════════════════════════════════════════════════════════════ # TEST CASES — Deliberately wrong DSE text blocks # ═══════════════════════════════════════════════════════════════ TEST_CASES = [ { "id": "cookie_tracking_wrong_litf", "description": "Cookie-Tracking auf lit. f statt lit. a", "dse_text": ( "Wir setzen Cookies und aehnliche Tracking-Technologien ein, " "um die Nutzung unserer Website zu analysieren. Die Verarbeitung " "erfolgt auf Grundlage unseres berechtigten Interesses gemaess " "Art. 6 Abs. 1 lit. f DSGVO an der Optimierung unseres Angebots." ), "expected_finding": True, "expected_purpose": "cookie_tracking", "correct_basis": "lit. a (Einwilligung)", "wrong_basis": "lit. f", }, { "id": "analytics_wrong_litb", "description": "Google Analytics auf lit. b (Vertragserfuellung) statt lit. a", "dse_text": ( "Wir nutzen Google Analytics zur Webanalyse. Die Datenverarbeitung " "erfolgt auf Basis der Vertragserfuellung gemaess Art. 6 Abs. 1 lit. b DSGVO, " "da die Analyse fuer die Erbringung unserer Dienste erforderlich ist." ), "expected_finding": True, "expected_purpose": "web_analytics", "correct_basis": "lit. a (Einwilligung)", "wrong_basis": "lit. b", }, { "id": "newsletter_wrong_litf", "description": "Newsletter auf lit. f statt lit. a", "dse_text": ( "Wir versenden regelmaessig Newsletter mit Informationen zu unseren Produkten. " "Die Verarbeitung Ihrer E-Mail-Adresse erfolgt auf Grundlage unseres " "berechtigten Interesses gemaess Art. 6 Abs. 1 lit. f DSGVO an der " "Direktwerbung fuer eigene aehnliche Produkte." ), "expected_finding": True, "expected_purpose": "marketing_email", "correct_basis": "lit. a (Einwilligung)", "wrong_basis": "lit. f", }, { "id": "remarketing_wrong_litf", "description": "Remarketing/Retargeting auf lit. f statt lit. a", "dse_text": ( "Wir setzen Remarketing-Technologien ein, um Ihnen auf anderen Websites " "personalisierte Werbung anzuzeigen. Die Verarbeitung basiert auf unserem " "berechtigten Interesse an effektiver Werbung (Art. 6 Abs. 1 lit. f DSGVO)." ), "expected_finding": True, "expected_purpose": "remarketing", "correct_basis": "lit. a (Einwilligung)", "wrong_basis": "lit. f", }, { "id": "klarna_missing_art22", "description": "Klarna Bonitaetspruefung ohne Art. 22 Hinweis", "dse_text": ( "Bei Auswahl der Zahlungsart Rechnung ueber Klarna wird eine " "Bonitaetspruefung durchgefuehrt. Klarna AB, Stockholm, Schweden, " "uebermittelt Ihre Daten an Auskunfteien. Rechtsgrundlage ist " "Art. 6 Abs. 1 lit. b DSGVO (Vertragserfuellung)." ), "expected_finding": True, "expected_purpose": "credit_check", "correct_basis": "lit. b/f + Art. 22 DSGVO Hinweis", "wrong_basis": "(fehlt)", }, { "id": "session_recording_wrong_litf", "description": "Session Recording (Hotjar) auf lit. f statt lit. a", "dse_text": ( "Wir nutzen Hotjar zur Analyse des Nutzerverhaltens mittels Session Recording " "und Heatmaps. Die Aufzeichnung der Nutzersitzungen erfolgt auf Grundlage " "unseres berechtigten Interesses (Art. 6 Abs. 1 lit. f DSGVO)." ), "expected_finding": True, "expected_purpose": "session_recording", "correct_basis": "lit. a (Einwilligung)", "wrong_basis": "lit. f", }, { "id": "payment_correct_litb", "description": "Zahlung korrekt auf lit. b — sollte KEIN Finding sein", "dse_text": ( "Die Verarbeitung Ihrer Zahlungsdaten durch unseren Zahlungsdienstleister " "Stripe erfolgt auf Grundlage der Vertragserfuellung gemaess " "Art. 6 Abs. 1 lit. b DSGVO." ), "expected_finding": False, "expected_purpose": None, "correct_basis": "lit. b (Vertragserfuellung)", "wrong_basis": None, }, { "id": "analytics_correct_lita", "description": "Analytics korrekt auf lit. a — sollte KEIN Finding sein", "dse_text": ( "Wir setzen Google Analytics nur mit Ihrer ausdruecklichen Einwilligung " "gemaess Art. 6 Abs. 1 lit. a DSGVO ein. Sie koennen Ihre Einwilligung " "jederzeit widerrufen." ), "expected_finding": False, "expected_purpose": None, "correct_basis": "lit. a (Einwilligung)", "wrong_basis": None, }, ] def test_hardcoded_dict(): """Test the hardcoded CORRECT_BASIS dict against test cases.""" from compliance.services.legal_basis_validator import validate_legal_bases print("\n" + "=" * 70) print("TEST 1: Hartkodiertes Dict (legal_basis_validator.py)") print("=" * 70) passed = 0 failed = 0 for tc in TEST_CASES: findings = validate_legal_bases(tc["dse_text"]) has_finding = len(findings) > 0 if has_finding == tc["expected_finding"]: status = "PASS" passed += 1 else: status = "FAIL" failed += 1 print(f" [{status}] {tc['id']}: {tc['description']}") if has_finding: for f in findings: print(f" → {f.text[:80]}") elif tc["expected_finding"]: print(f" → ERWARTET: Finding fuer {tc['expected_purpose']}, aber KEINS gefunden") print(f"\n Ergebnis: {passed} bestanden, {failed} fehlgeschlagen\n") return passed, failed def test_control_library(): """Test the Control Library against the same test cases. Queries canonical_controls for lit-mapping controls and checks if they would detect the same errors. """ try: import asyncpg except ImportError: print("\n SKIP: asyncpg nicht installiert — Control Library Test uebersprungen") return 0, 0 db_url = os.environ.get( "COMPLIANCE_DATABASE_URL", os.environ.get("DATABASE_URL", ""), ) if not db_url: print("\n SKIP: Keine DATABASE_URL — Control Library Test uebersprungen") return 0, 0 print("\n" + "=" * 70) print("TEST 2: Control Library (canonical_controls)") print("=" * 70) async def _run(): pool = await asyncpg.create_pool(db_url, min_size=1, max_size=2) passed = 0 failed = 0 try: async with pool.acquire() as conn: # Fetch lit-mapping relevant controls controls = await conn.fetch(""" SELECT control_id, title, objective, requirements FROM compliance.canonical_controls WHERE ( title ILIKE '%einwilligung%tracking%' OR title ILIKE '%rechtsgrundlage%cookie%' OR title ILIKE '%consent%cookie%' OR title ILIKE '%einwilligung%cookie%' OR title ILIKE '%art. 22%' OR title ILIKE '%automatisierte%entscheidung%' OR requirements ILIKE '%lit. a%tracking%' OR requirements ILIKE '%einwilligung%analytics%' ) AND release_state = 'published' LIMIT 50 """) print(f" Gefundene Lit-Mapping Controls: {len(controls)}") for c in controls[:10]: print(f" [{c['control_id']}] {c['title'][:60]}") if not controls: print(" WARNUNG: Keine Lit-Mapping Controls in der DB!") return 0, 0 # For each test case, check if a control would catch it for tc in TEST_CASES: text_lower = tc["dse_text"].lower() matched_control = None for c in controls: title_lower = (c["title"] or "").lower() req_lower = (c["requirements"] or "").lower() obj_lower = (c["objective"] or "").lower() # Check if this control is relevant for this test case relevant = False if tc["expected_purpose"] == "cookie_tracking": relevant = "cookie" in title_lower or "tracking" in title_lower elif tc["expected_purpose"] == "web_analytics": relevant = "analytics" in title_lower or "tracking" in title_lower elif tc["expected_purpose"] == "marketing_email": relevant = "newsletter" in title_lower or "marketing" in title_lower elif tc["expected_purpose"] == "remarketing": relevant = "remarketing" in title_lower or "retargeting" in title_lower elif tc["expected_purpose"] == "credit_check": relevant = "art. 22" in title_lower or "bonitaet" in title_lower elif tc["expected_purpose"] == "session_recording": relevant = "recording" in title_lower or "heatmap" in title_lower if relevant: # Check if the control requires consent (lit. a) requires_consent = ( "einwilligung" in req_lower or "consent" in req_lower or "lit. a" in req_lower ) if requires_consent and tc["expected_finding"]: matched_control = c break has_match = matched_control is not None # For negative test cases (no finding expected), no match = correct if not tc["expected_finding"]: correct = not has_match else: correct = has_match if correct: status = "PASS" passed += 1 else: status = "FAIL" failed += 1 print(f" [{status}] {tc['id']}: {tc['description']}") if matched_control: print(f" → Control: [{matched_control['control_id']}] {matched_control['title'][:60]}") elif tc["expected_finding"]: print(f" → KEIN passender Control gefunden!") finally: await pool.close() print(f"\n Ergebnis: {passed} bestanden, {failed} fehlgeschlagen\n") return passed, failed return asyncio.run(_run()) def test_comparison(): """Compare results: Dict vs. Control Library.""" print("\n" + "=" * 70) print("VERGLEICH: Dict vs. Control Library") print("=" * 70) dict_passed, dict_failed = test_hardcoded_dict() ctrl_passed, ctrl_failed = test_control_library() print("\n" + "=" * 70) print("ZUSAMMENFASSUNG") print("=" * 70) print(f" Dict: {dict_passed}/{dict_passed + dict_failed} bestanden") print(f" Control Library: {ctrl_passed}/{ctrl_passed + ctrl_failed} bestanden") if ctrl_passed >= dict_passed and ctrl_failed == 0: print("\n ✓ Control Library deckt alle Faelle ab → Dict kann entfernt werden") elif ctrl_passed > 0: print("\n ⚠ Control Library deckt teilweise ab → Dict als Fallback behalten") else: print("\n ✗ Control Library deckt nichts ab → Dict wird noch gebraucht") print("=" * 70) if __name__ == "__main__": test_comparison()