"""Service Detector — find all third-party services mentioned in legal texts.

Uses the service_registry as the primary detection source, plus a table of
additional service names that are typically mentioned by name in privacy
notices rather than by script URL.

Matching is case-insensitive and whole-word: a keyword only counts when it is
not glued to other word characters, so "x.com" does not fire on "dropbox.com".
Returns a structured list of detected services with their metadata.
"""

import functools
import logging
import re

from compliance.services.service_registry import SERVICE_REGISTRY

logger = logging.getLogger(__name__)

# Build a simple name -> metadata lookup from the registry.
_SERVICE_BY_NAME: dict[str, dict] = {}
_NAME_PATTERNS: list[tuple[str, dict]] = []

for _pattern, _meta in SERVICE_REGISTRY.items():
    name = _meta["name"]
    _SERVICE_BY_NAME[name.lower()] = _meta
    # The lowercased display name doubles as the search keyword.
    _NAME_PATTERNS.append((name.lower(), _meta))

# Additional text-based patterns (services often mentioned by name in DSI,
# not by script URL pattern). Keys are lowercase search keywords.
_EXTRA_TEXT_PATTERNS: dict[str, dict] = {
    "adobe": {
        "id": "adobe",
        "name": "Adobe",
        "category": "tracking",
        "provider": "Adobe Inc.",
        "country": "US",
        "eu_adequate": False,
    },
    "sourcepoint": {
        "id": "sourcepoint",
        "name": "Sourcepoint",
        "category": "cmp",
        "provider": "Sourcepoint Technologies",
        "country": "US",
        "eu_adequate": False,
    },
    "salesforce": {
        "id": "salesforce",
        "name": "Salesforce",
        "category": "crm",
        "provider": "Salesforce Inc.",
        "country": "US",
        "eu_adequate": False,
    },
    "qualtrics": {
        "id": "qualtrics",
        "name": "Qualtrics",
        "category": "survey",
        "provider": "Qualtrics LLC",
        "country": "US",
        "eu_adequate": False,
    },
    "jw player": {
        "id": "jw_player",
        "name": "JW Player",
        "category": "video",
        "provider": "Longtail Ad Solutions",
        "country": "US",
        "eu_adequate": False,
    },
    "omnystudio": {
        "id": "omnystudio",
        "name": "Omnystudio",
        "category": "audio",
        "provider": "Triton Digital",
        "country": "CA",
        "eu_adequate": False,
    },
    "storifyme": {
        "id": "storifyme",
        "name": "Storifyme",
        "category": "content",
        "provider": "Storifyme GmbH",
        "country": "DE",
        "eu_adequate": True,
    },
    "iqd": {
        "id": "iqd",
        "name": "IQD",
        "category": "marketing",
        "provider": "IQ Digital Media Marketing",
        "country": "DE",
        "eu_adequate": True,
    },
    "id5": {
        "id": "id5",
        "name": "ID5",
        "category": "identity",
        "provider": "ID5 Technology Ltd",
        "country": "GB",
        "eu_adequate": True,
    },
    "utiq": {
        "id": "utiq",
        "name": "Utiq",
        "category": "tracking",
        "provider": "Utiq SA/NV",
        "country": "BE",
        "eu_adequate": True,
    },
    "mapbox": {
        "id": "mapbox",
        "name": "Mapbox",
        "category": "maps",
        "provider": "Mapbox Inc.",
        "country": "US",
        "eu_adequate": False,
    },
    "tiktok": {
        "id": "tiktok",
        "name": "TikTok",
        "category": "social",
        "provider": "TikTok Technology Limited",
        "country": "IE",
        "eu_adequate": True,
    },
    "spotify": {
        "id": "spotify",
        "name": "Spotify",
        "category": "audio",
        "provider": "Spotify AB",
        "country": "SE",
        "eu_adequate": True,
    },
    "reddit": {
        "id": "reddit",
        "name": "Reddit",
        "category": "social",
        "provider": "Reddit Inc.",
        "country": "US",
        "eu_adequate": False,
    },
    "bluesky": {
        "id": "bluesky",
        "name": "Bluesky",
        "category": "social",
        "provider": "Bluesky PBLLC",
        "country": "US",
        "eu_adequate": False,
    },
    "giphy": {
        "id": "giphy",
        "name": "Giphy",
        "category": "content",
        "provider": "Meta Platforms",
        "country": "US",
        "eu_adequate": False,
    },
    "imgur": {
        "id": "imgur",
        "name": "Imgur",
        "category": "content",
        "provider": "Imgur Inc.",
        "country": "US",
        "eu_adequate": False,
    },
    "instagram": {
        "id": "instagram",
        "name": "Instagram",
        "category": "social",
        "provider": "Meta Platforms",
        "country": "US",
        "eu_adequate": False,
    },
    "facebook": {
        "id": "facebook",
        "name": "Facebook",
        "category": "social",
        "provider": "Meta Platforms",
        "country": "US",
        "eu_adequate": False,
    },
    "meta platforms": {
        "id": "meta_platforms",
        "name": "Meta Platforms",
        "category": "social",
        "provider": "Meta Platforms Inc.",
        "country": "US",
        "eu_adequate": False,
    },
    "linkedin": {
        "id": "linkedin",
        "name": "LinkedIn",
        "category": "marketing",
        "provider": "LinkedIn Corp.",
        "country": "US",
        "eu_adequate": False,
    },
    "twitter": {
        "id": "twitter",
        "name": "X/Twitter",
        "category": "social",
        "provider": "X Corp.",
        "country": "US",
        "eu_adequate": False,
    },
    "x.com": {
        "id": "x_com",
        "name": "X/Twitter",
        "category": "social",
        "provider": "X Corp.",
        "country": "US",
        "eu_adequate": False,
    },
    "recaptcha": {
        "id": "recaptcha",
        "name": "Google reCAPTCHA",
        "category": "security",
        "provider": "Google LLC",
        "country": "US",
        "eu_adequate": False,
    },
    "xandr": {
        "id": "xandr",
        "name": "Xandr",
        "category": "marketing",
        "provider": "Microsoft/Xandr",
        "country": "US",
        "eu_adequate": False,
    },
    "criteo": {
        "id": "criteo",
        "name": "Criteo",
        "category": "marketing",
        "provider": "Criteo SA",
        "country": "FR",
        "eu_adequate": True,
    },
    "outbrain": {
        "id": "outbrain",
        "name": "Outbrain",
        "category": "marketing",
        "provider": "Outbrain Inc.",
        "country": "US",
        "eu_adequate": False,
    },
    "taboola": {
        "id": "taboola",
        "name": "Taboola",
        "category": "marketing",
        "provider": "Taboola Inc.",
        "country": "US",
        "eu_adequate": False,
    },
    "piano": {
        "id": "piano",
        "name": "Piano",
        "category": "paywall",
        "provider": "Piano Software Inc.",
        "country": "US",
        "eu_adequate": False,
    },
    "microsoft": {
        "id": "microsoft",
        "name": "Microsoft",
        "category": "cloud",
        "provider": "Microsoft Corp.",
        "country": "US",
        "eu_adequate": False,
    },
    "amazon web services": {
        "id": "aws",
        "name": "AWS",
        "category": "cloud",
        "provider": "Amazon Web Services",
        "country": "US",
        "eu_adequate": False,
    },
}


@functools.lru_cache(maxsize=None)
def _keyword_regex(keyword: str) -> re.Pattern[str]:
    """Compile (once per keyword) a whole-word matcher for a lowercase keyword."""
    # Lookarounds instead of \b: they also behave for keywords that begin or
    # end with a non-word character, and they reject "dropbox.com" for "x.com".
    # Words of a multi-word keyword may be separated by any whitespace run,
    # because legal texts frequently wrap in the middle of a service name.
    body = r"\s+".join(re.escape(part) for part in keyword.split())
    return re.compile(rf"(?<!\w){body}(?!\w)")


def _mentions(keyword: str, text_lower: str) -> bool:
    """Return True if *keyword* occurs in *text_lower* as a standalone word."""
    if not keyword.strip():
        # An empty keyword would otherwise "match" every document.
        return False
    return _keyword_regex(keyword).search(text_lower) is not None


def detect_services_in_text(text: str) -> list[dict]:
    """Detect all third-party services mentioned in a legal document text.

    Searches the lowercased text for:
    1. Service names from the service registry (``_NAME_PATTERNS``).
    2. Additional common service names from ``_EXTRA_TEXT_PATTERNS``.

    A keyword must appear as a standalone word; it is not matched inside a
    longer word or domain. NOTE: this also means unhyphenated compounds such
    as "facebookseite" are not detected, while "facebook-seite" is.

    Args:
        text: Raw document text; case is ignored.

    Returns:
        One dict per detected service, in detection order. Each dict is a copy
        of the service metadata plus a ``"source"`` key that is ``"registry"``
        or ``"text_pattern"``. Services are deduplicated by their ``"id"``;
        registry entries win over extra text patterns with the same id.
    """
    text_lower = text.lower()
    found: dict[str, dict] = {}  # id -> metadata (dedup)

    # 1. Registry services (by name)
    for name_lower, meta in _NAME_PATTERNS:
        if not _mentions(name_lower, text_lower):
            continue
        # Registry entries without an explicit id fall back to their name.
        sid = meta.get("id", name_lower)
        if sid not in found:
            found[sid] = {**meta, "source": "registry"}

    # 2. Extra text patterns
    for keyword, meta in _EXTRA_TEXT_PATTERNS.items():
        if not _mentions(keyword, text_lower):
            continue
        sid = meta["id"]
        if sid not in found:
            found[sid] = {**meta, "source": "text_pattern"}

    # 3. Dedup: x.com and twitter are the same service; keep the twitter entry.
    if "x_com" in found and "twitter" in found:
        del found["x_com"]

    logger.info("Detected %d services in text (%d words)", len(found), len(text.split()))
    return list(found.values())