"""Cross-Doc Vendor-Consistency Check. Coverage gap discovered against the Elli ground truth (2026-06-06): the DSE declares "Google Vertex AI" for the customer-service chatbot, but `/de/cookies` lists "Iadvize" as the chat provider — a direct contradiction the deterministic pipeline missed. This check looks for cross-doc provider mismatches per service type: service_type keywords searched in DSE / cookie text ────────────── ─────────────────────────────────────── chatbot "chatbot", "AI assistant", "Konversations", "Live-Chat" analytics "Analytics", "Analyse" tag_manager "Tag Manager", "GTM" marketing_pixel "Pixel", "Tracking-Pixel" cdn "CDN", "Content Delivery" consent_mgmt "Consent Management", "CMP" For each service type, extract the provider name(s) mentioned in the DSE and in the cookie/cookie-policy text. When DSE and cookie text disagree → finding with severity HIGH (transparency contradiction). """ from __future__ import annotations import logging import re from dataclasses import dataclass logger = logging.getLogger(__name__) # Known providers per service type. Keep generous; we'd rather # detect Iadvize-vs-Vertex than under-detect. 
_PROVIDERS = {
    "chatbot": [
        ("Google Vertex AI", ["vertex ai", "google vertex", "vertex-ai"]),
        ("OpenAI", ["openai", "gpt-4", "chatgpt"]),
        ("Anthropic Claude", ["anthropic", "claude.ai"]),
        ("Iadvize", ["iadvize", "i-advize"]),
        ("Intercom", ["intercom"]),
        ("Zendesk", ["zendesk"]),
        ("Drift", ["drift.com", "drift chat"]),
        ("Userlike", ["userlike"]),
        ("Tidio", ["tidio"]),
        ("LivePerson", ["liveperson"]),
        ("Salesforce Einstein", ["einstein bot", "salesforce einstein"]),
        ("HubSpot", ["hubspot chat", "hubspot conversation"]),
        ("Microsoft Copilot", ["copilot", "azure openai"]),
        ("Mistral AI", ["mistral ai", "mistral.ai"]),
        ("Hugging Face", ["hugging face", "huggingface"]),
    ],
    "analytics": [
        ("Google Analytics", ["google analytics", "ga4", "_ga ", "_ga,", "_ga\""]),
        ("Matomo", ["matomo", "piwik"]),
        ("Plausible", ["plausible"]),
        ("Etracker", ["etracker"]),
        ("Adobe Analytics", ["adobe analytics", "omniture"]),
        ("Mixpanel", ["mixpanel"]),
        ("Heap", ["heap analytics"]),
        ("Amplitude", ["amplitude.com", "amplitude analytics"]),
    ],
    "tag_manager": [
        ("Google Tag Manager", ["google tag manager", "gtm", "googletagmanager"]),
        ("Matomo Tag Manager", ["matomo tag", "mtm"]),
        ("Tealium", ["tealium"]),
        ("Adobe Launch", ["adobe launch"]),
    ],
    "marketing_pixel": [
        ("Meta Pixel", ["meta pixel", "facebook pixel", "_fbp"]),
        ("LinkedIn Insight Tag", ["linkedin insight"]),
        ("TikTok Pixel", ["tiktok pixel"]),
        ("X Pixel", ["twitter pixel", "x pixel"]),
        ("Pinterest Tag", ["pinterest tag"]),
    ],
    "cdn": [
        ("Cloudflare", ["cloudflare"]),
        ("Akamai", ["akamai"]),
        ("Fastly", ["fastly"]),
        ("AWS CloudFront", ["cloudfront"]),
    ],
    "consent_mgmt": [
        ("Usercentrics", ["usercentrics"]),
        ("OneTrust", ["onetrust", "cookiepro"]),
        ("Cookiebot", ["cookiebot"]),
        ("Sourcepoint", ["sourcepoint"]),
        ("Klaro", ["klaro!"]),
    ],
}


def _keyword_regex(keyword: str) -> str:
    """Return regex source that matches *keyword* on token boundaries.

    Plain substring search reports "Intercom" for "Intercompany" and
    "Anthropic Claude" for "philanthropic". The boundaries are chosen so
    that recall stays as generous as possible:

    * A keyword starting with a letter or digit must not be preceded by a
      letter or digit. An underscore counts as a separator, so cookie names
      such as ``iadvize_vuid`` or ``_gat_gtm`` still match.
    * A single-word keyword ending in a letter must not be followed by
      another letter.
    * Multi-word phrases and keywords ending in a digit or symbol remain
      prefix matches, so "hubspot conversation" still finds
      "HubSpot Conversations", "gpt-4" still finds "GPT-4o", and the
      hand-delimited ``"_ga "`` variants behave exactly like substrings.
    """
    source = re.escape(keyword)
    if keyword[:1].isalnum():
        source = r"(?<![^\W_])" + source
    is_single_word = not any(ch.isspace() for ch in keyword)
    if is_single_word and keyword[-1:].isalpha():
        source += r"(?![^\W\d_])"
    return source


# One compiled alternation per provider, derived from _PROVIDERS once at
# import time. Patterns are written for lowercased text.
_PROVIDER_PATTERNS = {
    service_type: [
        (canonical, re.compile("|".join(_keyword_regex(kw) for kw in kws)))
        for canonical, kws in providers
    ]
    for service_type, providers in _PROVIDERS.items()
}


@dataclass
class ProviderMatch:
    """Record of where one provider was seen for one service type.

    Not referenced by the functions in this section.
    """

    service_type: str  # key of _PROVIDERS, e.g. "chatbot"
    canonical: str  # canonical provider name from _PROVIDERS
    in_dse: bool  # provider was found in the DSE text
    in_cookie: bool  # provider was found in the cookie text


def _find_providers(text: str, service_type: str) -> set[str]:
    """Return the canonical providers of *service_type* named in *text*.

    Matching is case-insensitive (the text is lowercased) and uses the
    token-boundary rules described in ``_keyword_regex``. ``None`` or empty
    text and unknown service types yield an empty set.
    """
    text_lc = (text or "").lower()
    if not text_lc:
        return set()
    return {
        canonical
        for canonical, pattern in _PROVIDER_PATTERNS.get(service_type, [])
        if pattern.search(text_lc)
    }


def check_vendor_consistency(state: dict) -> list[dict]:
    """Compare provider mentions across DSE and cookie-policy text.

    Reads ``state["doc_texts"]["dse"]`` and ``state["doc_texts"]["cookie"]``.
    If either text is missing or empty, no comparison is made and an empty
    list is returned.

    For each service type in ``_PROVIDERS`` at most one finding is produced:

    * ``VENDOR-CONSISTENCY-001`` (HIGH): both documents name providers, but
      the two sets share none.
    * ``VENDOR-CONSISTENCY-002`` (MEDIUM): only the DSE names a provider.
    * ``VENDOR-CONSISTENCY-003`` (HIGH): only the cookie text names one.

    Partially overlapping sets are not reported.

    Returns a list of finding dicts; empty when there are no contradictions.
    """
    doc_texts = state.get("doc_texts") or {}
    dse_text = doc_texts.get("dse") or ""
    cookie_text = doc_texts.get("cookie") or ""
    if not dse_text or not cookie_text:
        return []

    findings: list[dict] = []
    for service_type in _PROVIDERS:
        dse_set = _find_providers(dse_text, service_type)
        cookie_set = _find_providers(cookie_text, service_type)
        if not dse_set and not cookie_set:
            continue

        # Sorted so that findings are deterministic across runs.
        dse_providers = sorted(dse_set)
        cookie_providers = sorted(cookie_set)
        dse_names = ", ".join(dse_providers)
        cookie_names = ", ".join(cookie_providers)
        label = service_type.replace("_", "-").title()

        # Disagreement when both name a provider but no overlap.
        if dse_set and cookie_set and not (dse_set & cookie_set):
            findings.append({
                "check_id": "VENDOR-CONSISTENCY-001",
                "service_type": service_type,
                "severity": "HIGH",
                "severity_reason": "factually_wrong",
                "dse_providers": dse_providers,
                "cookie_providers": cookie_providers,
                "title": (
                    f"{label}: "
                    f"DSE nennt {dse_names} — "
                    f"Cookies-Seite nennt {cookie_names}"
                ),
                "norm": "DSGVO Art. 13 + Art. 5 Abs. 1 lit. a (Transparenz)",
                "action": (
                    "DSE und Cookie-Richtlinie auf denselben Provider "
                    "abgleichen — entweder DSE ist veraltet oder die "
                    "Cookie-Seite nennt einen ausgewechselten Provider."
                ),
            })
        elif dse_set and not cookie_set:
            findings.append({
                "check_id": "VENDOR-CONSISTENCY-002",
                "service_type": service_type,
                "severity": "MEDIUM",
                "severity_reason": "incomplete",
                "dse_providers": dse_providers,
                "cookie_providers": [],
                "title": (
                    f"{label}: "
                    f"DSE nennt {dse_names} — auf der "
                    "Cookies-Seite nicht erwähnt"
                ),
                "norm": "DSGVO Art. 13 + EDPB Cookie-Sweep",
                "action": (
                    f"Provider {dse_names} auf der "
                    "Cookies-Seite ergänzen — Cookie-Tabelle prüfen."
                ),
            })
        elif cookie_set and not dse_set:
            findings.append({
                "check_id": "VENDOR-CONSISTENCY-003",
                "service_type": service_type,
                "severity": "HIGH",
                "severity_reason": "missing",
                "dse_providers": [],
                "cookie_providers": cookie_providers,
                "title": (
                    f"{label}: "
                    f"Cookies-Seite nennt {cookie_names} "
                    "— in DSE nicht deklariert"
                ),
                "norm": "DSGVO Art. 13 Abs. 1 lit. e Empfängerkategorien",
                "action": (
                    f"Provider {cookie_names} in der DSE "
                    "als Empfänger benennen + Zweck + Rechtsgrundlage."
                ),
            })

    if findings:
        logger.info("vendor-consistency: %d findings", len(findings))
    return findings