fab1e35847
Per user request: BMW (and others) put their own services AND external
vendors in the same cookie-policy widget. The VVT-Tabelle now groups
them by Art. 30(1)(d) DSGVO recipient category so the DSB can act on
the right buckets:
- INTERNAL — owner processing for itself ('BMW AG — XYZ')
- GROUP_COMPANY — same brand family, different legal entity ('BMW Bank')
- PROCESSOR — Auftragsverarbeiter, AVV-pflichtig (Adobe, Akamai)
- CONTROLLER — independent / joint controller (Meta Pixel, Google
Ads, LinkedIn — they run their own profiles)
- AUTHORITY — government bodies (rare in cookies)
- OTHER — fallback
New module vendor_classifier.py:
- owner_from_url(url) — derive site-owner token (bmw.de -> 'BMW',
mercedes-benz.de -> 'Mercedes-Benz')
- classify(name, category, owner) — strict 5-tier heuristic:
* INTERNAL: vendor name first-token is '<Owner>' / '<Owner> AG' /
'<Owner> SE' / '<Owner> GmbH' / '<Owner> AG & Co. KG'
* GROUP_COMPANY: starts with '<Owner> ' but isn't '<Owner> AG'
* CONTROLLER: matches a known joint-controller list (Meta, Google
Ads, YouTube, LinkedIn Insight, TikTok, Pinterest, Taboola,
Outbrain, Criteo, Twitter, Reddit, ...)
* PROCESSOR: legal-form suffix in name (GmbH, AG, Inc., A/S,
B.V., S.A., Ltd., LLC, ...)
* OTHER: anything else
vendor_extractor.extract_vendors_from_payloads now takes owner_name:
- Passes it through to classify() for every extracted vendor record
- The route derives owner_name via _company_name_from_url(doc_entries)
- LLM-extracted vendors are classified the same way (so V3 fallback
also produces tagged records)
agent_doc_check_extras.build_vvt_table_html rewritten:
- Buckets vendors by recipient_type
- Renders one section per non-empty bucket, in canonical order
(RECIPIENT_TYPE_SECTIONS), each with section header + count + bad
count + nested table
- Within each section: sorted by compliance_score ascending
- Response JSON cmp_vendors includes recipient_type so the frontend
can later import per-category into the VVT module
Expected BMW result: ~60 INTERNAL rows (BMW AG own services),
~25 PROCESSOR rows (Adobe, Adform, Akamai, AWS, ...), ~5 CONTROLLER
rows (Meta Pixel, Google, LinkedIn, Pinterest, Outbrain, Taboola).
152 lines
5.4 KiB
Python
152 lines
5.4 KiB
Python
"""
|
|
Recipient-type classifier for vendor records (Art. 30(1)(d) DSGVO).

Tags each extracted vendor entry with one of the canonical
RecipientCategoryType values used by the VVT module:

  - INTERNAL      — owner's own department / own system (BMW AG processing
                    for itself, e.g. 'BMW AG — Form Validation')
  - GROUP_COMPANY — parent/subsidiary/sister of the owner (BMW Bank,
                    BMW Motorrad, BMW Financial Services)
  - PROCESSOR     — external Auftragsverarbeiter under AVV (Adobe,
                    Akamai, AWS, Salesforce — they process on behalf)
  - CONTROLLER    — independent / joint controller (Meta Pixel, Google
                    Ads, YouTube — they run their own profiles)
  - AUTHORITY     — government bodies (rare in cookie contexts; never
                    assigned by classify())
  - OTHER         — fallback

Heuristic only — does not query Vault or external sources. A short
site-owner name is derived from the user-submitted URL (e.g. bmw.de ->
'BMW'). Classification compares the vendor name to that owner name.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from urllib.parse import urlparse
|
|
|
|
# Lower-case name fragments of tracking/advertising platforms that usually
# act as INDEPENDENT or JOINT CONTROLLERS instead of processors: they build
# their own cross-site user profiles, and the site owner has little say over
# what happens to the data after collection. classify() looks these up in
# the lower-cased vendor name.
_JOINT_CONTROLLER_HINTS = {
    # Meta family (Meta Pixel, Facebook, Instagram)
    "meta", "facebook", "instagram",
    # Google advertising stack
    "google adverti", "google ads", "doubleclick", "youtube",
    # Advertising arms of cloud vendors (as opposed to AWS / Azure hosting)
    "amazon adverti", "microsoft adverti",
    # Social networks
    "linkedin insight", "linkedin", "tiktok", "pinterest", "twitter",
    "x.com", "snapchat", "reddit", "quora",
    # Content recommendation / retargeting
    "taboola", "outbrain", "criteo",
    # Miscellaneous
    "yandex", "spotify",
}
|
|
|
|
|
|
# Second-level labels that are registry suffixes, not brand names, when they
# sit directly in front of a two-letter country TLD (bmw.co.uk, vw.com.cn).
_GENERIC_SECOND_LEVEL_LABELS = frozenset(
    {"co", "com", "org", "net", "gov", "edu", "ac"}
)


def owner_from_url(url: str) -> str:
    """Derive a short site-owner token from an absolute URL.

    Examples: ``https://www.bmw.de`` -> ``'BMW'``,
    ``https://mercedes-benz.de`` -> ``'Mercedes-Benz'``,
    ``https://deutsche-bahn.de`` -> ``'Deutsche-Bahn'``,
    ``https://www.bmw.co.uk`` -> ``'BMW'``.

    Args:
        url: absolute URL including a scheme (``'://'`` must be present).

    Returns:
        The registrable label of the host, upper-cased when it looks like
        an acronym (at most 4 characters, no hyphen), otherwise with each
        hyphen-separated part capitalised. Returns ``""`` when the URL is
        empty, has no scheme, cannot be parsed, or the host has fewer than
        two labels.
    """
    if not url or "://" not in url:
        return ""
    try:
        # .hostname (unlike .netloc) excludes userinfo and port and is
        # already lower-cased.
        host = urlparse(url).hostname or ""
    except ValueError:
        # urlparse rejects malformed bracketed (IPv6) hosts.
        return ""

    host = host.rstrip(".")  # tolerate a fully-qualified trailing root dot
    if host.startswith("www."):
        host = host[4:]

    labels = host.split(".")
    if len(labels) < 2:
        return ""

    brand = labels[-2]
    # bmw.co.uk / bmw.com.cn: the label before a ccTLD is a generic registry
    # suffix, so the brand is one label further left.
    if (
        len(labels) >= 3
        and len(labels[-1]) == 2
        and brand in _GENERIC_SECOND_LEVEL_LABELS
    ):
        brand = labels[-3]

    # Acronym (<= 4 chars, no hyphen) -> uppercase (BMW, ARD, ZDF).
    if len(brand) <= 4 and "-" not in brand:
        return brand.upper()
    return "-".join(piece.capitalize() for piece in brand.split("-"))
|
|
|
|
|
|
# Legal-form designators, matched as whole tokens in the lower-cased vendor
# name. "(?<![^\W_])" / "(?![^\W_])" mean "not preceded / followed by a
# letter or digit", so 'SAP SE' and 'Akamai Technologies, Inc.' match while
# 'database', 'agency' or 'lab' do not.
_LEGAL_FORM_PATTERN = re.compile(
    r"(?<![^\W_])"
    r"(?:g?gmbh|ag|se|kg|kgaa|ohg|inc|incorporated|ltd|limited|llc|plc"
    r"|corp|corporation|b\.?v|a/s|s\.a|s\.l|s\.r\.l|sarl|oy|ab|as)"
    r"\.?(?![^\W_])"
)

# Suffixes under which the site owner's main legal entity may appear after
# the owner token ('' covers the bare owner name).
_OWNER_LEGAL_FORMS = ("", " ag", " se", " gmbh", " ag & co. kg")


def _starts_word(text: str, stem: str) -> bool:
    """Return True if *stem* occurs in *text* at the beginning of a word.

    Only the left edge is checked so that truncated stems such as
    'google adverti' still match 'google advertising'.
    """
    position = text.find(stem)
    while position != -1:
        if position == 0 or not text[position - 1].isalnum():
            return True
        position = text.find(stem, position + 1)
    return False


def classify(
    vendor_name: str,
    category: str,
    owner_name: str,
) -> str:
    """Return one of INTERNAL / GROUP_COMPANY / CONTROLLER / PROCESSOR / OTHER.

    The checks run in this order and the first hit wins:

    1. INTERNAL — the provider part (text before ' — ') is exactly the
       owner or the owner plus 'AG' / 'SE' / 'GmbH' / 'AG & Co. KG'.
    2. GROUP_COMPANY — the provider part starts with '<owner> ' but is not
       one of the INTERNAL forms (BMW Bank GmbH, BMW Motorrad GmbH).
    3. CONTROLLER — the name contains a known tracking/ad platform hint
       from ``_JOINT_CONTROLLER_HINTS`` at the start of a word.
    4. PROCESSOR — the name contains a legal-form token (GmbH, AG, Inc.,
       Ltd, B.V., ...), i.e. it looks like a registered company.
    5. OTHER — anything else, including an empty name.

    AUTHORITY is never returned by this heuristic.

    Args:
        vendor_name: the provider/processing name as it appears in the
            cookie policy (e.g. 'BMW AG — Form Validation' or 'Adobe Systems
            Software Ireland Limited — Adobe Analytics'). ``None`` or an
            empty string yields 'OTHER'.
        category: canonical category ('marketing', 'necessary',
            'statistics', 'functional'). Accepted for interface
            compatibility; the current heuristic does not read it.
        owner_name: short token derived from the site URL ('BMW',
            'Mercedes-Benz'). An empty string disables the INTERNAL and
            GROUP_COMPANY checks.

    Returns:
        The recipient-type tag as an upper-case string.
    """
    # Collapse whitespace runs (double spaces, NBSP from scraped HTML) so
    # the exact comparisons below are not defeated by formatting noise.
    name = " ".join((vendor_name or "").split())
    if not name:
        return "OTHER"
    lower = name.lower()

    owner = " ".join((owner_name or "").split()).lower()
    if owner:
        # '<provider> — <processing>': only the provider part is compared.
        provider = lower.split(" — ", 1)[0].strip()

        # 1. INTERNAL — provider must BE the owner's main legal entity.
        if provider in {owner + form for form in _OWNER_LEGAL_FORMS}:
            return "INTERNAL"

        # 2. GROUP_COMPANY — same brand family, different legal entity.
        if provider.startswith(f"{owner} "):
            return "GROUP_COMPANY"

    # 3. CONTROLLER — known tracking/ad platforms. Matching at a word start
    #    keeps e.g. 'wix.com' from being mistaken for 'x.com'.
    if any(_starts_word(lower, hint) for hint in _JOINT_CONTROLLER_HINTS):
        return "CONTROLLER"

    # 4. PROCESSOR — a corporate name is most likely an Auftragsverarbeiter
    #    (hosting/CDN/analytics/chat/captcha/CRM).
    if _LEGAL_FORM_PATTERN.search(lower):
        return "PROCESSOR"

    return "OTHER"
|
|
|
|
|
|
# Canonical rendering order for the VVT email table: one
# (recipient_type, German section heading) pair per section.
RECIPIENT_TYPE_SECTIONS: list[tuple[str, str]] = [
    # Owner and its corporate group
    ("INTERNAL", "Eigene Verarbeitung"),
    ("GROUP_COMPANY", "Konzernunternehmen (Mutter/Tochter)"),
    # External recipients
    ("PROCESSOR", "Auftragsverarbeiter (AVV-pflichtig)"),
    ("CONTROLLER", "Eigenverantwortliche Dritte / Joint Controller"),
    ("AUTHORITY", "Behoerden"),
    # Fallback bucket
    ("OTHER", "Sonstige Empfaenger"),
]
|