# ── Cookiebot ───────────────────────────────────────────────────────

def _extract_cookiebot(d: dict) -> list[dict]:
    """Group Cookiebot cookie declarations into one record per provider.

    Reads ``Categories[*].Cookies[*]``; every key is accepted in PascalCase
    or in its lower/camel-case spelling. The provider is the cookie's
    ``Vendor``, falling back to ``Host``. Cookies without a provider and
    entries that are not dicts are skipped.

    Args:
        d: Decoded Cookiebot payload.

    Returns:
        One vendor dict per distinct provider, in first-seen order. The
        vendor-level fields come from the first cookie seen for that
        provider; all of its cookies are collected under ``"cookies"``.
    """
    vendors: dict[str, dict] = {}
    for cat in d.get("Categories") or d.get("categories") or []:
        if not isinstance(cat, dict):
            continue
        category = cat.get("Name") or cat.get("name") or ""
        for c in cat.get("Cookies") or cat.get("cookies") or []:
            if not isinstance(c, dict):
                continue
            provider = (c.get("Vendor") or c.get("vendor")
                        or c.get("Host") or c.get("host") or "").strip()
            if not provider:
                continue
            raw_purpose = c.get("Purpose") or c.get("purpose")
            cookie = {
                "name": c.get("Name") or c.get("name") or "",
                "purpose": _clean(raw_purpose),
                "expiry": _clean(c.get("Expires") or c.get("expires")),
                "is_third_party": bool(c.get("IsThirdParty")
                                       or c.get("isThirdParty")),
            }
            if provider in vendors:
                vendors[provider]["cookies"].append(cookie)
                continue
            vendors[provider] = {
                "name": provider,
                "country": "",
                # Fall back to the category name when the cookie itself
                # carries no purpose text.
                "purpose": _clean(raw_purpose or category),
                "category": category,
                "opt_out_url": "",
                "privacy_policy_url": (c.get("PrivacyPolicyUrl")
                                       or c.get("policyUrl") or ""),
                "persistence": "",
                "cookies": [cookie],
            }
    return list(vendors.values())


# ── Usercentrics ────────────────────────────────────────────────────

def _extract_usercentrics(d: dict) -> list[dict]:
    """Convert Usercentrics service entries into vendor records.

    Services are read from ``services``, ``dataProcessingServices`` or
    ``settings.services`` (first non-empty wins). Entries that are not dicts
    or that have neither ``name`` nor ``dataProcessor`` are skipped.

    Args:
        d: Decoded Usercentrics payload.

    Returns:
        One vendor dict per service, in payload order. ``"persistence"`` is
        ``cookieMaxAgeSeconds`` expressed in whole days (``"<n> Tage"``, so
        sub-day lifetimes render as ``"0 Tage"``), otherwise the cleaned
        ``retentionPeriodDescription``.
    """
    out: list[dict] = []
    services = (d.get("services") or d.get("dataProcessingServices")
                or (d.get("settings") or {}).get("services") or [])
    for s in services:
        if not isinstance(s, dict):
            continue
        name = s.get("name") or s.get("dataProcessor") or ""
        if not name:
            continue

        max_age = s.get("cookieMaxAgeSeconds")
        persistence = ""
        # bool is a subclass of int; a literal True/False is not a lifetime.
        if (isinstance(max_age, int) and not isinstance(max_age, bool)
                and max_age > 0):
            persistence = f"{max_age // 86400} Tage"

        # dict.get's default only applies to a *missing* key, so an explicit
        # `"urls": null` must be normalised before it is dereferenced.
        urls = s.get("urls")
        if not isinstance(urls, dict):
            urls = {}

        out.append({
            "name": name,
            "country": (s.get("processingCompanyCountry")
                        or s.get("country") or "").strip(),
            "purpose": _clean(s.get("dataPurpose") or s.get("description")),
            "category": (s.get("categorySlug")
                         or s.get("category") or "").strip(),
            "opt_out_url": (s.get("optOutUrl") or "").strip(),
            "privacy_policy_url": (s.get("policyOfProcessorUrl")
                                   or urls.get("privacyPolicy")
                                   or "").strip(),
            "persistence": (persistence
                            or _clean(s.get("retentionPeriodDescription"))),
            "cookies": [],
        })
    return out


# ── Didomi ──────────────────────────────────────────────────────────

def _extract_didomi(d: dict) -> list[dict]:
    """Convert Didomi vendor entries into vendor records.

    Vendors are read from ``app.vendors``, falling back to a top-level
    ``vendors`` key. Entries that are not dicts (for example the string keys
    obtained when ``vendors`` is an object rather than a list) and entries
    without a ``name`` are skipped.

    Args:
        d: Decoded Didomi payload.

    Returns:
        One vendor dict per named vendor entry, in payload order.
    """
    out: list[dict] = []
    app = d.get("app")
    if not isinstance(app, dict):
        # Missing or null "app": only the top-level "vendors" key remains.
        app = {}
    for v in app.get("vendors") or d.get("vendors") or []:
        if not isinstance(v, dict):
            continue
        name = v.get("name") or ""
        if not name:
            continue
        out.append({
            "name": name,
            "country": (v.get("country") or "").strip(),
            "purpose": _clean(v.get("description") or v.get("purpose")),
            "category": (v.get("category") or "").strip(),
            "opt_out_url": (v.get("optOutUrl") or "").strip(),
            "privacy_policy_url": (v.get("policyUrl") or v.get("policy_url")
                                   or "").strip(),
            "persistence": "",
            "cookies": [],
        })
    return out

# ── TrustArc ────────────────────────────────────────────────────────

def _extract_trustarc(d: dict) -> list[dict]:
    """Merge TrustArc vendor entries and per-category cookies by vendor name.

    First, every named entry of ``vendors`` / ``Vendors`` becomes a vendor
    record (a later entry with the same name replaces an earlier one). Then
    each cookie under ``categories[*].cookies[*]`` (either casing) is attached
    to the record whose name equals the cookie's ``provider``; a provider
    without a record gets a minimal one carrying only the name, the cookie's
    category and the cookie itself.

    Names and providers are whitespace-trimmed before they are used as keys,
    so padded spellings merge into one vendor and blank names are skipped.
    Entries that are not dicts are skipped.

    Args:
        d: Decoded TrustArc payload.

    Returns:
        Vendor dicts in first-seen order: listed vendors first, then
        providers that were only discovered through cookies.
    """
    out_by_name: dict[str, dict] = {}

    # Pass 1: explicitly listed vendors.
    for v in d.get("vendors") or d.get("Vendors") or []:
        if not isinstance(v, dict):
            continue
        name = str(v.get("name") or v.get("Name") or "").strip()
        if not name:
            continue
        out_by_name[name] = {
            "name": name,
            "country": (v.get("country") or "").strip(),
            "purpose": _clean(v.get("description") or v.get("Description")),
            "category": (v.get("category") or "").strip(),
            "opt_out_url": (v.get("optOutUrl") or "").strip(),
            "privacy_policy_url": (v.get("policyUrl") or "").strip(),
            "persistence": "",
            "cookies": [],
        }

    # Pass 2: cookies grouped by category, attached to their provider.
    for cat in d.get("categories") or d.get("Categories") or []:
        if not isinstance(cat, dict):
            continue
        cat_name = cat.get("name") or cat.get("Name") or ""
        for c in cat.get("cookies") or cat.get("Cookies") or []:
            if not isinstance(c, dict):
                continue
            provider = str(c.get("provider") or c.get("Provider") or "").strip()
            if not provider:
                continue
            cookie = {
                "name": c.get("name") or c.get("Name") or "",
                "purpose": _clean(c.get("purpose") or c.get("Purpose")),
                "expiry": _clean(c.get("expires") or c.get("Expires")),
                # Every cookie read here is recorded as third-party.
                "is_third_party": True,
            }
            if provider in out_by_name:
                out_by_name[provider]["cookies"].append(cookie)
            else:
                out_by_name[provider] = {
                    "name": provider,
                    "country": "",
                    "purpose": "",
                    "category": cat_name,
                    "opt_out_url": "",
                    "privacy_policy_url": "",
                    "persistence": "",
                    "cookies": [cookie],
                }
    return list(out_by_name.values())
