""" Category Tester — tests individual cookie consent categories. Tests each category in isolation: only "Statistics" on, only "Marketing" on, etc. Detects miscategorization: e.g., Facebook Pixel loading when only Statistics is enabled. """ import logging from dataclasses import dataclass, field from playwright.async_api import Page, BrowserContext from services.banner_detector import BannerInfo, click_button from services.script_analyzer import find_tracking_services, Violation logger = logging.getLogger(__name__) # Which tracking service belongs to which consent category SERVICE_CATEGORY_MAP: dict[str, str] = { # Statistics / Analytics "Google Analytics": "statistics", "Matomo": "statistics", "Plausible Analytics": "statistics", "Hotjar": "statistics", "Microsoft Clarity": "statistics", "etracker": "statistics", "Heap Analytics": "statistics", "Amplitude": "statistics", "Mixpanel": "statistics", "PostHog": "statistics", "Mouseflow": "statistics", "Crazy Egg": "statistics", "Lucky Orange": "statistics", "FullStory": "statistics", # Marketing / Advertising "Meta/Facebook Pixel": "marketing", "Google Ads": "marketing", "Google Ads/DoubleClick": "marketing", "TikTok Pixel": "marketing", "LinkedIn Insight": "marketing", "Pinterest Tag": "marketing", "Criteo": "marketing", "Taboola": "marketing", "Outbrain": "marketing", "Amazon Ads": "marketing", "Bing/Microsoft Ads": "marketing", "Salesforce Pardot": "marketing", # Functional "Intercom": "functional", "Zendesk": "functional", "Tidio Chat": "functional", "Crisp Chat": "functional", "LiveChat": "functional", "Freshdesk/Freshchat": "functional", "HelpScout Beacon": "functional", } CATEGORY_LABELS = { "statistics": "Statistik / Analytics", "marketing": "Marketing / Werbung", "functional": "Funktional / Komfort", "social_media": "Social Media", } # CMP-specific category selectors CMP_CATEGORY_CONFIG: dict[str, dict] = { "Cookiebot": { "settings_button": "#CybotCookiebotDialogBodyButtonDetails", "save_button": "#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowallSelection", "categories": { "statistics": "#CybotCookiebotDialogBodyLevelButtonStatistics", "marketing": "#CybotCookiebotDialogBodyLevelButtonMarketing", "preferences": "#CybotCookiebotDialogBodyLevelButtonPreferences", }, }, "OneTrust": { "settings_button": "#onetrust-pc-btn-handler, .ot-sdk-show-settings", "save_button": ".save-preference-btn-handler, #onetrust-accept-btn-handler", "categories": { "statistics": ".ot-switch[data-ot-category='C0002'] input, #ot-group-id-C0002", "marketing": ".ot-switch[data-ot-category='C0004'] input, #ot-group-id-C0004", "functional": ".ot-switch[data-ot-category='C0003'] input, #ot-group-id-C0003", }, }, "Usercentrics": { "settings_button": "[data-testid='uc-more-information-button'], button:has-text('Mehr Informationen')", "save_button": "[data-testid='uc-save-button']", "categories": { "statistics": "[data-testid='uc-category-statistics'] input", "marketing": "[data-testid='uc-category-marketing'] input", "functional": "[data-testid='uc-category-functional'] input", }, }, "Didomi": { "settings_button": "#didomi-notice-learn-more-button, .didomi-learn-more-button", "save_button": ".didomi-components-button--primary:has-text('Auswahl speichern'), #didomi-notice-agree-button", "categories": { "statistics": "[data-purpose='analytics_purposes'] input, [data-purpose='measure'] input", "marketing": "[data-purpose='advertising_purposes'] input, [data-purpose='ads'] input", }, }, } # Generic category keywords for fallback detection CATEGORY_KEYWORDS = { "statistics": ["statistik", "analytics", "analyse", "statistics", "messung", "reichweite"], "marketing": ["marketing", "werbung", "advertising", "targeting", "remarketing", "anzeigen"], "functional": ["funktional", "functional", "preferences", "praeferenz", "komfort", "einstellungen"], "social_media": ["social media", "soziale medien", "social", "teilen"], } @dataclass class CategoryInfo: name: str label: str selector: str @dataclass class CategoryTestResult: category: str category_label: str scripts_loaded: list[str] = field(default_factory=list) cookies_set: list[str] = field(default_factory=list) tracking_services: list[str] = field(default_factory=list) violations: list[dict] = field(default_factory=list) async def detect_categories(page: Page, banner: BannerInfo) -> list[CategoryInfo]: """Detect available cookie categories in the CMP.""" categories = [] provider = banner.provider # CMP-specific detection config = CMP_CATEGORY_CONFIG.get(provider) if config: # Open settings panel first if config.get("settings_button"): await click_button(page, config["settings_button"], timeout=3000) await page.wait_for_timeout(1000) for cat_name, selector in config.get("categories", {}).items(): try: if await page.locator(selector.split(",")[0].strip()).count() > 0: categories.append(CategoryInfo( name=cat_name, label=CATEGORY_LABELS.get(cat_name, cat_name), selector=selector, )) except Exception: continue # Generic fallback: search for toggle/checkbox elements with category keywords if not categories: try: toggles = await page.evaluate(""" () => { const elements = document.querySelectorAll( 'input[type="checkbox"], [role="switch"], [class*="toggle"], [class*="switch"]' ); return [...elements].map(el => ({ text: (el.closest('label')?.textContent || el.getAttribute('aria-label') || '').trim(), id: el.id || '', selector: el.id ? '#' + el.id : '', })).filter(e => e.text.length > 0); } """) for toggle in (toggles or []): text_lower = toggle["text"].lower() for cat_name, keywords in CATEGORY_KEYWORDS.items(): if any(kw in text_lower for kw in keywords): sel = toggle["selector"] or f'[aria-label*="{toggle["text"][:20]}"]' categories.append(CategoryInfo( name=cat_name, label=toggle["text"][:50], selector=sel, )) break except Exception as e: logger.warning("Generic category detection failed: %s", e) logger.info("Detected %d categories for %s", len(categories), provider) return categories async def test_single_category( context: BrowserContext, url: str, category: CategoryInfo, banner: BannerInfo, wait_ms: int = 5000, ) -> CategoryTestResult: """Test a single category in isolation: enable only this one, disable others.""" result = CategoryTestResult( category=category.name, category_label=category.label, ) try: page = await context.new_page() scripts: list[str] = [] page.on("request", lambda req: _collect(req, scripts)) await page.goto(url, wait_until="networkidle", timeout=20000) await page.wait_for_timeout(2000) config = CMP_CATEGORY_CONFIG.get(banner.provider) if config: # Open settings if config.get("settings_button"): await click_button(page, config["settings_button"], timeout=3000) await page.wait_for_timeout(1000) # Disable ALL categories first for cat_sel in config.get("categories", {}).values(): try: el = page.locator(cat_sel.split(",")[0].strip()).first if await el.is_checked(): await el.click() except Exception: continue # Enable ONLY the target category try: el = page.locator(category.selector.split(",")[0].strip()).first if not await el.is_checked(): await el.click() except Exception: logger.warning("Could not toggle category %s", category.name) # Save selection if config.get("save_button"): await click_button(page, config["save_button"], timeout=3000) await page.wait_for_timeout(wait_ms) # Collect results result.scripts_loaded = _dedup_scripts(scripts) result.cookies_set = [c.get("name", "") for c in await context.cookies()] result.tracking_services = find_tracking_services(result.scripts_loaded) # Find violations: services that don't belong to this category for service in result.tracking_services: expected_cat = SERVICE_CATEGORY_MAP.get(service) if expected_cat and expected_cat != category.name: result.violations.append({ "service": service, "severity": "HIGH", "text": f"{service} laedt bei '{category.label}' — gehoert aber zu '{CATEGORY_LABELS.get(expected_cat, expected_cat)}'", "expected_category": expected_cat, "actual_category": category.name, }) await page.close() except Exception as e: logger.error("Category test failed for %s: %s", category.name, e) return result def _collect(request, scripts: list[str]): if request.resource_type in ("script", "image", "xhr", "fetch"): scripts.append(request.url) def _dedup_scripts(scripts: list[str]) -> list[str]: seen = set() result = [] for url in scripts: domain = url.split("/")[2] if len(url.split("/")) > 2 else url if domain not in seen: seen.add(domain) result.append(url) return result[:30]