diff --git a/backend-compliance/compliance/services/business_profiler.py b/backend-compliance/compliance/services/business_profiler.py
index 83a8562..32eb4fe 100644
--- a/backend-compliance/compliance/services/business_profiler.py
+++ b/backend-compliance/compliance/services/business_profiler.py
@@ -163,10 +163,16 @@ async def detect_business_profile(documents: dict[str, str]) -> BusinessProfile:
     full_text = "\n".join(documents.values()).lower()
     full_text = full_text.replace("\xad", "")  # strip soft hyphens
 
-    # ── Tracking services ────────────────────────────────────────
-    for pattern, label in _TRACKING_SERVICES.items():
-        if pattern in full_text:
-            profile.detected_services.append(label)
+    # ── Tracking services (use full service detector) ──────────
+    try:
+        from compliance.services.service_detector import detect_services_in_text
+        detected = detect_services_in_text(full_text)
+        profile.detected_services = [s["name"] for s in detected]
+    except Exception:
+        # Fallback to simple keyword list
+        for pattern, label in _TRACKING_SERVICES.items():
+            if pattern in full_text:
+                profile.detected_services.append(label)
 
     # ── Online shop ──────────────────────────────────────────────
     shop_hits = _count_hits(full_text, _ONLINE_SHOP_KEYWORDS)
diff --git a/backend-compliance/compliance/services/profile_extractor.py b/backend-compliance/compliance/services/profile_extractor.py
index 4b5d987..4ac1a8b 100644
--- a/backend-compliance/compliance/services/profile_extractor.py
+++ b/backend-compliance/compliance/services/profile_extractor.py
@@ -64,6 +64,25 @@ def extract_profile_from_documents(
         "regulated_profession_type", ""
     )
 
+    # ── Detected services (full list with metadata) ────────────
+    try:
+        from compliance.services.service_detector import detect_services_in_text
+        detected = detect_services_in_text(all_text)
+        result["detected_services"] = detected
+        # Add non-EU services as scope hint
+        non_eu = [s for s in detected if not s.get("eu_adequate")]
+        if non_eu:
+            # List at most five names; "..." only when the list was truncated
+            shown = ", ".join(s["name"] for s in non_eu[:5])
+            more = "..." if len(non_eu) > 5 else ""
+            result["compliance_scope_hints"].append({
+                "field": "hasThirdCountryTransfer",
+                "value": True,
+                "source": f"{len(non_eu)} Dienste ausserhalb EWR erkannt ({shown}{more})",
+            })
+    except Exception as e:
+        logger.warning("Service detection failed: %s", e)
+
     # ── Scope hints from document content ────────────────────────
     _extract_scope_hints(all_text, result)
 
diff --git a/backend-compliance/compliance/services/service_detector.py b/backend-compliance/compliance/services/service_detector.py
new file mode 100644
index 0000000..7f17e32
--- /dev/null
+++ b/backend-compliance/compliance/services/service_detector.py
@@ -0,0 +1,143 @@
+"""
+Service Detector — find all third-party services mentioned in legal texts.
+
+Uses the service_registry (88+ services) as detection source.
+Works on lowercased text with whole-word keyword matching.
+Returns structured list of detected services with metadata.
+"""
+
+import functools
+import logging
+import re
+
+from compliance.services.service_registry import SERVICE_REGISTRY
+
+logger = logging.getLogger(__name__)
+
+# Build a simple name→metadata lookup from the registry
+_SERVICE_BY_NAME: dict[str, dict] = {}
+_NAME_PATTERNS: list[tuple[str, dict]] = []
+
+for _pattern, _meta in SERVICE_REGISTRY.items():
+    name = _meta["name"]
+    _SERVICE_BY_NAME[name.lower()] = _meta
+    # Also build lowercase search keywords from the name
+    _NAME_PATTERNS.append((name.lower(), _meta))
+
+# Additional text-based patterns (services often mentioned by name in DSI,
+# not by script URL pattern)
+_EXTRA_TEXT_PATTERNS: dict[str, dict] = {
+    "adobe": {"id": "adobe", "name": "Adobe", "category": "tracking",
+        "provider": "Adobe Inc.", "country": "US", "eu_adequate": False},
+    "sourcepoint": {"id": "sourcepoint", "name": "Sourcepoint", "category": "cmp",
+        "provider": "Sourcepoint Technologies", "country": "US", "eu_adequate": False},
+    "salesforce": {"id": "salesforce", "name": "Salesforce", "category": "crm",
+        "provider": "Salesforce Inc.", "country": "US", "eu_adequate": False},
+    "qualtrics": {"id": "qualtrics", "name": "Qualtrics", "category": "survey",
+        "provider": "Qualtrics LLC", "country": "US", "eu_adequate": False},
+    "jw player": {"id": "jw_player", "name": "JW Player", "category": "video",
+        "provider": "Longtail Ad Solutions", "country": "US", "eu_adequate": False},
+    "omnystudio": {"id": "omnystudio", "name": "Omnystudio", "category": "audio",
+        "provider": "Triton Digital", "country": "CA", "eu_adequate": False},
+    "storifyme": {"id": "storifyme", "name": "Storifyme", "category": "content",
+        "provider": "Storifyme GmbH", "country": "DE", "eu_adequate": True},
+    "iqd": {"id": "iqd", "name": "IQD", "category": "marketing",
+        "provider": "IQ Digital Media Marketing", "country": "DE", "eu_adequate": True},
+    "id5": {"id": "id5", "name": "ID5", "category": "identity",
+        "provider": "ID5 Technology Ltd", "country": "GB", "eu_adequate": True},
+    "utiq": {"id": "utiq", "name": "Utiq", "category": "tracking",
+        "provider": "Utiq SA/NV", "country": "BE", "eu_adequate": True},
+    "mapbox": {"id": "mapbox", "name": "Mapbox", "category": "maps",
+        "provider": "Mapbox Inc.", "country": "US", "eu_adequate": False},
+    "tiktok": {"id": "tiktok", "name": "TikTok", "category": "social",
+        "provider": "TikTok Technology Limited", "country": "IE", "eu_adequate": True},
+    "spotify": {"id": "spotify", "name": "Spotify", "category": "audio",
+        "provider": "Spotify AB", "country": "SE", "eu_adequate": True},
+    "reddit": {"id": "reddit", "name": "Reddit", "category": "social",
+        "provider": "Reddit Inc.", "country": "US", "eu_adequate": False},
+    "bluesky": {"id": "bluesky", "name": "Bluesky", "category": "social",
+        "provider": "Bluesky PBLLC", "country": "US", "eu_adequate": False},
+    "giphy": {"id": "giphy", "name": "Giphy", "category": "content",
+        "provider": "Meta Platforms", "country": "US", "eu_adequate": False},
+    "imgur": {"id": "imgur", "name": "Imgur", "category": "content",
+        "provider": "Imgur Inc.", "country": "US", "eu_adequate": False},
+    "instagram": {"id": "instagram", "name": "Instagram", "category": "social",
+        "provider": "Meta Platforms", "country": "US", "eu_adequate": False},
+    "facebook": {"id": "facebook", "name": "Facebook", "category": "social",
+        "provider": "Meta Platforms", "country": "US", "eu_adequate": False},
+    "meta platforms": {"id": "meta_platforms", "name": "Meta Platforms", "category": "social",
+        "provider": "Meta Platforms Inc.", "country": "US", "eu_adequate": False},
+    "linkedin": {"id": "linkedin", "name": "LinkedIn", "category": "marketing",
+        "provider": "LinkedIn Corp.", "country": "US", "eu_adequate": False},
+    "twitter": {"id": "twitter", "name": "X/Twitter", "category": "social",
+        "provider": "X Corp.", "country": "US", "eu_adequate": False},
+    "x.com": {"id": "x_com", "name": "X/Twitter", "category": "social",
+        "provider": "X Corp.", "country": "US", "eu_adequate": False},
+    "recaptcha": {"id": "recaptcha", "name": "Google reCAPTCHA", "category": "security",
+        "provider": "Google LLC", "country": "US", "eu_adequate": False},
+    "xandr": {"id": "xandr", "name": "Xandr", "category": "marketing",
+        "provider": "Microsoft/Xandr", "country": "US", "eu_adequate": False},
+    "criteo": {"id": "criteo", "name": "Criteo", "category": "marketing",
+        "provider": "Criteo SA", "country": "FR", "eu_adequate": True},
+    "outbrain": {"id": "outbrain", "name": "Outbrain", "category": "marketing",
+        "provider": "Outbrain Inc.", "country": "US", "eu_adequate": False},
+    "taboola": {"id": "taboola", "name": "Taboola", "category": "marketing",
+        "provider": "Taboola Inc.", "country": "US", "eu_adequate": False},
+    "piano": {"id": "piano", "name": "Piano", "category": "paywall",
+        "provider": "Piano Software Inc.", "country": "US", "eu_adequate": False},
+    "microsoft": {"id": "microsoft", "name": "Microsoft", "category": "cloud",
+        "provider": "Microsoft Corp.", "country": "US", "eu_adequate": False},
+    "amazon web services": {"id": "aws", "name": "AWS", "category": "cloud",
+        "provider": "Amazon Web Services", "country": "US", "eu_adequate": False},
+}
+
+
+@functools.cache
+def _keyword_regex(keyword: str) -> re.Pattern[str]:
+    """Compile (once per keyword) a regex matching *keyword* as a whole word."""
+    # Lookarounds rather than \b, so keywords that start or end with
+    # punctuation are still anchored correctly.
+    return re.compile(rf"(?<!\w){re.escape(keyword)}(?!\w)")
+
+
+def detect_services_in_text(text: str) -> list[dict]:
+    """Detect all third-party services mentioned in a legal document text.
+
+    Searches for:
+    1. Service names from service_registry (88+ entries)
+    2. Additional common service names from _EXTRA_TEXT_PATTERNS
+
+    Names are matched case-insensitively as whole words/phrases, so a keyword
+    such as "x.com" does not fire on "dropbox.com".
+
+    Returns list of detected service dicts with name, category, country, etc.
+    Each dict is a copy of the metadata plus a "source" key ("registry" or
+    "text_pattern"). Empty input yields an empty list.
+    """
+    if not text:
+        return []
+
+    # Strip soft hyphens (as business_profiler does) so they cannot split a name
+    text_lower = text.lower().replace("\xad", "")
+    found: dict[str, dict] = {}  # id -> metadata (dedup)
+
+    # 1. Registry services (by name)
+    for name_lower, meta in _NAME_PATTERNS:
+        # Search for service name as word (not substring)
+        if name_lower and _keyword_regex(name_lower).search(text_lower):
+            sid = meta.get("id", name_lower)
+            # setdefault keeps the first hit for an id
+            found.setdefault(sid, {**meta, "source": "registry"})
+
+    # 2. Extra text patterns
+    for keyword, meta in _EXTRA_TEXT_PATTERNS.items():
+        if _keyword_regex(keyword).search(text_lower):
+            found.setdefault(meta["id"], {**meta, "source": "text_pattern"})
+
+    # 3. Dedup: x.com and twitter are the same
+    if "x_com" in found and "twitter" in found:
+        del found["x_com"]
+
+    logger.info("Detected %d services in text (%d words)",
+                len(found), len(text.split()))
+    return list(found.values())