Files
breakpilot-compliance/backend-compliance/compliance/services/business_profiler.py
T
Benjamin Admin 6c223c7c9b
CI / detect-changes (push) Successful in 10s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 14s
CI / loc-budget (push) Failing after 15s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 2m43s
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 37s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
feat(compliance-check): exec-summary + voll-audit + TDM-respect + cookie-KB-extended + saving-scan-funnel
P1 — Exec-Summary oben im Email-Report (4 KPIs + 2 CTAs, dunkler Gradient)
P3 — no_direct_sales-Flag fuer OEM-Konfigurator-Sites; AGB/Widerruf/Nutzungsbedingungen als
     "NICHT ANWENDBAR" (grau) statt "NICHT GEFUNDEN" (rot)
P5 — Voll-Audit Unification: alle Findings (MC + Pflichtangaben + Vendor +
     Redundanz) in /data/compliance_audits.db.unified_findings; neuer
     /api/compliance/agent/findings/<id> Endpoint + FindingsTab im Audit-UI
     mit Filter + CSV-Export
P7 — Crawl-Hardening: TDM-Reservation-Check (robots.txt / ai.txt / Header /
     Meta) vor jedem Run mit 24h-Cache; HeadlessChrome-UA (Firma noch nicht
     gegruendet — Switch via BREAKPILOT_BRANDED_UA env); per-Domain
     Rate-Limit 1 req/s + max 2 concurrent
P2 — Cookie-Knowledge-DB additiv erweitert (35 -> 74 Cookies): Adobe, Meta,
     Microsoft, LinkedIn, TikTok, HubSpot, Marketo, Salesforce, Hotjar,
     FullStory, Mouseflow, Intercom, Drift, Zendesk, Cloudflare, Stripe,
     OneTrust/Cookiebot/Usercentrics, Matomo, Pinterest, Snapchat, X/Twitter,
     YouTube, Vimeo, Klaviyo, Mailchimp, Mixpanel, Segment, Amplitude,
     Optimizely, Datadog; Wire-in in cookie_function_classifier liefert
     compliance_risk-Label (kritisch/hoch/mittel/gering) pro Vendor
A  — k-Anonymitaets-Helper (benchmark_k_anonymity) fuer P6-Vorbereitung
B  — Cross-Tenant-Domain-Assertion im /findings-Endpoint (expected_domain
     Query-Param -> 403 bei Mismatch)
C  — Saving-Scan-Funnel: /api/compliance/agent/saving-scan/start mit
     Validierung + 24h-Rate-Limit pro Domain + Lead-Persistenz in
     saving_scan_leads + Auto-Discovery via _run_compliance_check; 6 Tests
D  — Risk-Badge im Email-Vendor-Row

Rechtliche Leitplanken (Memory feedback_oem_data_legal.md): nur eigene
Knapp-Bewertungen + Source-Pointer, keine 1:1-Kopien fremder CMP-Texte.
TDM-Opt-Out-Respect nach § 44b UrhG. KEINE Schema-Aenderungen — alles in
Sidecar-SQLite.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 23:48:34 +02:00

374 lines
16 KiB
Python

"""
Business Profiler — detect business model from document texts.
Pure keyword-based detection (deterministic, no LLM). Analyzes
DSE, Impressum, AGB, Widerruf etc. together to build a profile
that drives context-aware compliance checks.
Example:
profile = await detect_business_profile({"dse": "...", "impressum": "..."})
profile.business_type # "b2c"
profile.has_online_shop # True
"""
from __future__ import annotations
import re
from dataclasses import dataclass, field
@dataclass
class BusinessProfile:
    """Result of the keyword-based business-model detection.

    Attributes:
        business_type: "b2b", "b2c", "b2g", "nonprofit" or "unknown".
        industry: A key of ``_INDUSTRY_KEYWORDS`` (e.g. "retail",
            "healthcare", "automotive") or "unknown".
        has_online_shop: Whether enough shop/checkout keywords were found.
        has_editorial_content: Whether enough editorial keywords were found.
        is_regulated_profession: Whether a regulated-profession keyword
            matched.
        regulated_profession_type: A value of ``_REGULATED_PROFESSIONS``
            ("anwalt", "arzt", "steuerberater", "architekt", "notar",
            "apotheker") or "" when none matched.
        needs_odr: Whether the online dispute resolution notice is required.
        detected_services: Display names of detected third-party services.
        confidence: Confidence of ``business_type``.
        no_direct_sales: True when the site itself does NOT conclude a
            direct purchase contract (typical for OEM configurator sites
            such as BMW/Audi/Mercedes, where the contract is concluded with
            the authorised dealer rather than on the manufacturer's
            website). Consequence: terms and conditions, withdrawal policy
            and terms of use are not mandatory on the website; they are
            handed out by the dealer instead.
    """

    # NOTE: field order is part of the positional constructor signature.
    business_type: str = "unknown"
    industry: str = "unknown"
    has_online_shop: bool = False
    has_editorial_content: bool = False
    is_regulated_profession: bool = False
    regulated_profession_type: str = ""
    needs_odr: bool = False
    detected_services: list[str] = field(default_factory=list)
    confidence: float = 0.0
    no_direct_sales: bool = False
# ── Keyword lists ────────────────────────────────────────────────────
# Business-type keyword lists. Each list is scored with _count_hits against
# the lower-cased, merged document text: every entry that occurs as a
# substring adds one point to the respective business-type score.
# ASCII transliterations ("rueckgabe") and umlaut spellings ("rückgabe") are
# separate entries and therefore score separately.

# Consumer-facing (B2C) markers.
# NOTE(review): matching is plain substring, so "shop" also hits "workshop".
_B2C_KEYWORDS = [
    "verbraucher", "warenkorb", "bestellung", "lieferung", "widerruf",
    "shop", "kaufpreis", "rueckgabe", "rückgabe", "endkunde", "kaeufer",
    "käufer", "privatkunde", "zahlungspflichtig bestellen",
]
# Business-customer (B2B) markers.
_B2B_KEYWORDS = [
    # Discriminative — these don't appear in B2C consumer texts
    "geschaeftskunden", "geschäftskunden", "firmenkunde", "b2b",
    "industriekunden", "ausschliesslich gewerblich", "ausschließlich gewerblich",
    "ausschliesslich unternehmer", "ausschließlich unternehmer",
    "kein verbrauchergeschaeft", "kein verbrauchergeschäft",
    # Note: "unternehmen", "beratung", "consulting", "dienstleistung"
    # were removed — they match in any company text and bias toward B2B.
]
# Public-sector (B2G) markers.
_B2G_KEYWORDS = [
    "koerperschaft des oeffentlichen rechts", "körperschaft des öffentlichen rechts",
    "gemeinde", "stadtverwaltung", "landesbehoerde", "landesbehörde",
    "kommunal", "buergerservice", "bürgerservice", "rathaus",
    "landesamt", "bundesamt", "oeffentliche verwaltung", "öffentliche verwaltung",
    "oeffentlicher dienst", "öffentlicher dienst",
]
# Non-profit markers.
# NOTE(review): "verein" is also a substring of "vereinbarung"/"vereinbaren",
# which are common in legal texts — confirm this bias is acceptable.
_NONPROFIT_KEYWORDS = [
    "gemeinnuetzig", "gemeinnützig", "verein", "stiftung", "e.v.",
    "spende", "ehrenamtlich", "satzung",
]
# Maps a lower-case substring keyword to the regulated-profession type it
# indicates. detect_business_profile iterates this dict in insertion order
# and stops at the first keyword found in the Impressum text, so the order of
# entries decides which profession wins when several keywords are present.
_REGULATED_PROFESSIONS = {
    # Lawyer — only specific terms, not "anwalt" on its own
    # (it would otherwise match "Redaktionsanwalt", "Justiziar" etc.)
    "rechtsanwalt": "anwalt",
    "rechtsanwaeltin": "anwalt",
    "rechtsanwältin": "anwalt",
    "kanzlei": "anwalt",
    "rechtsanwaltskammer": "anwalt",
    "zugelassener anwalt": "anwalt",
    # Physician — "praxis" removed (it matches "in der Praxis")
    "arztpraxis": "arzt",
    "zahnarzt": "arzt",
    "facharzt": "arzt",
    "aerztekammer": "arzt",
    "ärztekammer": "arzt",
    "kassenärztlich": "arzt",
    "kassenaerztlich": "arzt",
    # Tax advisor
    "steuerberater": "steuerberater",
    "steuerberaterin": "steuerberater",
    "steuerberaterkammer": "steuerberater",
    # Architect
    # NOTE(review): "architekt" is also a substring of "architektur".
    "architekt": "architekt",
    "architektin": "architekt",
    "architektenkammer": "architekt",
    # Notary
    "notar": "notar",
    "notariat": "notar",
    # Pharmacist
    "apotheke": "apotheker",
    "apotheker": "apotheker",
}
# Checkout/shipping/payment markers. detect_business_profile sets
# has_online_shop when at least 3 distinct entries occur in the merged text.
_ONLINE_SHOP_KEYWORDS = [
    "warenkorb", "checkout", "bestellung", "lieferung", "versand",
    "paypal", "kreditkarte", "klarna", "sofortueberweisung",
    "sofortüberweisung", "zahlungsarten", "versandkosten",
    "lieferzeit", "retour", "paketdienst",
]
# Editorial-content markers. detect_business_profile sets
# has_editorial_content when at least 2 distinct entries occur.
_EDITORIAL_KEYWORDS = [
    "blog", "ratgeber", "news", "redaktion", "artikel", "magazin",
    "beitrag", "kommentar", "podcast", "newsletter", "autor",
]
# Maps an industry id to its substring keywords. For each industry the number
# of distinct keywords found in the merged text is its score; industries are
# evaluated in insertion order, which also decides ties among top scorers.
# NOTE(review): matching is plain substring, so some entries collide:
#   - "audi" (automotive) is contained in "audit" (a consulting keyword),
#   - "bank" (finance) is contained in "datenbank",
#   - "kredit" (finance) is contained in "kreditkarte",
#   - "werkstatt" is listed under both "craft" and "automotive".
_INDUSTRY_KEYWORDS = {
    # "software/cloud/hosting" are often mentioned in privacy texts of any
    # vendor (cloud hosting for newsletters, SaaS tools etc.) without making
    # the company an IT-services vendor itself. Keep the list deliberately
    # narrow: only patterns that strongly suggest IT/SaaS as the core business.
    "it_services": ["saas-anbieter", "software-as-a-service",
                    "ihr saas", "ihre cloud", "hosting-provider",
                    "api-anbieter", "developer-portal"],
    "retail": ["shop", "warenkorb", "versand", "lieferung", "einzelhandel"],
    "healthcare": ["arzt", "praxis", "patient", "gesundheit", "therapie", "klinik"],
    "legal": ["kanzlei", "rechtsanwalt", "mandant", "anwalt"],
    "craft": ["handwerk", "meister", "werkstatt", "montage", "gewerk"],
    "public": ["kommune", "stadtverwaltung", "buergerservice", "bürgerservice", "rathaus"],
    "finance": ["bank", "versicherung", "finanz", "kredit", "anlage"],
    "education": ["schule", "bildung", "unterricht", "lehrplan", "schueler", "schüler"],
    "consulting": ["beratung", "consulting", "schulung", "seminar", "gutachten", "audit",
                   "arbeitssicherheit", "brandschutz", "sicherheitstechnik", "zertifizierung"],
    "manufacturing": ["fertigung", "produktion", "maschinenbau", "anlagenbau", "zulieferer",
                      "werkzeugbau", "spritzguss", "cnc", "industrietechnik"],
    "automotive": ["fahrzeug", "kraftfahrzeug", "kfz", "automobil", "neuwagen",
                   "gebrauchtwagen", "fahrzeugempfehlung", "modellreihe",
                   "modellpalette", "antriebs", "motor", "reifen", "elektroauto",
                   "verbrenner", "hybridfahrzeug", "leasing", "werkstatt",
                   "wartung und reparatur", "probefahrt", "bmw", "mercedes",
                   "audi", "volkswagen", "porsche", "opel"],
    "media": ["redaktion", "verlag", "medien", "journalismus", "presse"],
}
# Terms that indicate "versicherung" / "bank" is only mentioned as a
# §34d/§34c GewO disclosure (Versicherungsvermittler / Finanzanlagenvermittler)
# rather than the core business. Used to suppress false finance matches:
# detect_business_profile re-evaluates the "finance" industry score when at
# least 2 of these terms occur in the merged text.
_VERMITTLER_CONTEXT_TERMS = [
    "versicherungsvermittler", "berufshaftpflichtversicherung",
    "vermittlerregister", "§34d", "§ 34 d", "§34c", "§ 34 c",
    "finanzanlagenvermittler", "ihk muenchen", "ihk münchen",
]
# Fallback service table: lower-case substring pattern -> display name.
# Only consulted by detect_business_profile when importing or calling
# compliance.services.service_detector.detect_services_in_text raises.
_TRACKING_SERVICES = {
    "google analytics": "Google Analytics",
    "google tag manager": "Google Tag Manager",
    "matomo": "Matomo",
    "facebook pixel": "Facebook Pixel",
    "meta pixel": "Meta Pixel",
    "hotjar": "Hotjar",
    "hubspot": "HubSpot",
    "mailchimp": "Mailchimp",
    "linkedin insight": "LinkedIn Insight",
    "google ads": "Google Ads",
    "google adsense": "Google AdSense",
    "google maps": "Google Maps",
    "youtube": "YouTube",
    "vimeo": "Vimeo",
    "cloudflare": "Cloudflare",
    "sentry": "Sentry",
    "intercom": "Intercom",
    "zendesk": "Zendesk",
    "stripe": "Stripe",
    "paypal": "PayPal",
}
# ── Detection logic ──────────────────────────────────────────────────
def _count_hits(text: str, keywords: list[str]) -> int:
return sum(1 for kw in keywords if kw in text)
async def detect_business_profile(documents: dict[str, str]) -> BusinessProfile:
    """Analyze all document texts together to detect the business model.

    Detection is purely keyword based (substring matching on the lower-cased
    texts with soft hyphens removed) and therefore deterministic.

    Args:
        documents: Mapping of doc_type -> text
            (e.g. ``{"dse": "...", "impressum": "..."}``). An entry whose
            text is empty or ``None`` is treated like a missing document.

    Returns:
        The populated BusinessProfile. When *documents* is empty, a profile
        with all default values is returned.
    """
    profile = BusinessProfile()
    if not documents:
        return profile

    # Normalise every document once: tolerate None, lower-case, strip soft
    # hyphens (they would otherwise split keywords).
    texts = {
        doc_type: (text or "").lower().replace("\xad", "")
        for doc_type, text in documents.items()
    }
    # Merge all texts for keyword search.
    full_text = "\n".join(texts.values())

    profile.detected_services = _detect_services(full_text)
    profile.has_online_shop = _count_hits(full_text, _ONLINE_SHOP_KEYWORDS) >= 3
    profile.has_editorial_content = _count_hits(full_text, _EDITORIAL_KEYWORDS) >= 2

    # Regulated profession: only the Impressum is inspected, because keywords
    # like "rechtsanwalt" show up as contact persons in privacy texts without
    # the company being a law firm. Without an Impressum, fall back to the
    # first 2000 characters of the merged text.
    impressum_text = texts.get("impressum") or full_text[:2000]
    profession = _detect_regulated_profession(impressum_text)
    profile.is_regulated_profession = bool(profession)
    profile.regulated_profession_type = profession

    profile.business_type, profile.confidence = _classify_business_type(
        full_text,
        # A document only counts as present when it actually carries text.
        has_agb=bool(texts.get("agb")),
        has_widerruf=bool(texts.get("widerruf")),
        has_online_shop=profile.has_online_shop,
        has_editorial_content=profile.has_editorial_content,
    )

    # ODR (online dispute resolution) notice: required for B2C with an
    # online shop (EU Regulation 524/2013).
    profile.needs_odr = (
        profile.business_type == "b2c" and profile.has_online_shop
    )

    profile.industry = _classify_industry(full_text, profession)

    # OEM configurator pattern: manufacturer sites that only configure and
    # forward to authorised dealers do not conclude a direct sales contract.
    profile.no_direct_sales = _detect_no_direct_sales(full_text)
    return profile


# Lawyer keywords that frequently appear merely as a contact person's title.
_LAWYER_TITLE_KEYWORDS = ("rechtsanwalt", "rechtsanwaeltin", "rechtsanwältin")

# Industry implied by a regulated-profession type. Only used when no industry
# keyword matched at all. Covers every value of _REGULATED_PROFESSIONS.
_PROFESSION_INDUSTRY = {
    "anwalt": "legal",
    "notar": "legal",
    "arzt": "healthcare",
    "apotheker": "healthcare",
    "steuerberater": "finance",
    "architekt": "craft",
}


def _detect_services(full_text: str) -> list[str]:
    """Return display names of third-party services mentioned in *full_text*."""
    try:
        from compliance.services.service_detector import detect_services_in_text
        return [s["name"] for s in detect_services_in_text(full_text)]
    except Exception:
        # Deliberate best effort: if the full detector is unavailable or
        # fails, fall back to the simple built-in keyword table.
        return [
            label
            for pattern, label in _TRACKING_SERVICES.items()
            if pattern in full_text
        ]


def _detect_regulated_profession(impressum_text: str) -> str:
    """Return the regulated-profession type found in *impressum_text*, or ""."""
    # The first 500 characters are treated as the company description area.
    head = impressum_text[:500]
    for keyword, prof_type in _REGULATED_PROFESSIONS.items():
        if keyword not in impressum_text:
            continue
        # Extra guard: a lawyer title must appear near the company
        # description, not just somewhere as a contact person's name.
        if keyword in _LAWYER_TITLE_KEYWORDS and keyword not in head:
            continue
        return prof_type
    return ""


def _classify_business_type(
    full_text: str,
    *,
    has_agb: bool,
    has_widerruf: bool,
    has_online_shop: bool,
    has_editorial_content: bool,
) -> tuple[str, float]:
    """Return ``(business_type, confidence)`` from keyword scores."""
    # Insertion order doubles as tie-break order for max() below.
    scores = {
        "b2c": _count_hits(full_text, _B2C_KEYWORDS),
        "b2b": _count_hits(full_text, _B2B_KEYWORDS),
        "b2g": _count_hits(full_text, _B2G_KEYWORDS),
        "nonprofit": _count_hits(full_text, _NONPROFIT_KEYWORDS),
    }
    # Missing documents as signal.
    if not has_agb:
        scores["b2c"] -= 1  # No terms and conditions -> less likely B2C
    if not has_widerruf:
        scores["b2c"] -= 1  # No withdrawal policy -> less likely a B2C shop
    if has_online_shop:
        scores["b2c"] += 3  # Strong B2C signal

    best = max(scores, key=scores.__getitem__)
    best_val = scores[best]
    if best_val >= 2:
        # total >= best_val >= 2, so the division is always defined.
        total = sum(max(0, v) for v in scores.values())
        return best, round(best_val / total, 2)

    # Weak evidence: prefer B2C when the text mentions consumer rights,
    # editorial content or customer-direction signals — even without
    # checkout keywords. Only fall back to B2B if a discriminative B2B
    # marker fired and there is no consumer hint.
    consumer_hint = has_editorial_content or any(
        marker in full_text for marker in ("verbraucher", "widerruf", "kunde")
    )
    if consumer_hint:
        return "b2c", 0.4
    if scores["b2b"] >= 1:
        return "b2b", 0.4
    return "unknown", 0.2


def _classify_industry(full_text: str, regulated_profession_type: str) -> str:
    """Return the detected industry id, or "unknown" without a clear winner."""
    industry_scores: dict[str, int] = {}
    for industry, keywords in _INDUSTRY_KEYWORDS.items():
        hits = _count_hits(full_text, keywords)
        if hits >= 1:
            industry_scores[industry] = hits

    # Suppress finance/insurance false positives caused by §34d/§34c GewO
    # disclosures (Versicherungsvermittler, Berufshaftpflicht, etc.) — these
    # are mandatory disclosures for many companies without them being actual
    # financial services providers.
    if (
        industry_scores.get("finance")
        and _count_hits(full_text, _VERMITTLER_CONTEXT_TERMS) >= 2
    ):
        # Blank out the disclosure terms first: "finanzanlagenvermittler"
        # itself contains "finanz" and "anlage" and would otherwise keep the
        # finance score alive on boilerplate alone.
        residual_text = full_text
        for term in _VERMITTLER_CONTEXT_TERMS:
            residual_text = residual_text.replace(term, " ")
        non_insurance_finance = _count_hits(
            residual_text, ["bank", "finanz", "kredit", "anlage"],
        )
        if non_insurance_finance == 0:
            industry_scores.pop("finance", None)
        else:
            industry_scores["finance"] = non_insurance_finance

    if not industry_scores:
        # No keyword evidence: derive the industry from the regulated
        # profession, if any ("" maps to "unknown").
        return _PROFESSION_INDUSTRY.get(regulated_profession_type, "unknown")

    # Require a clear winner — if the top score is 1 and there are ties,
    # prefer "unknown" over guessing.
    top = max(industry_scores.values())
    winners = [k for k, v in industry_scores.items() if v == top]
    if top >= 2 or len(winners) == 1:
        return winners[0]
    return "unknown"
# Indikatoren: Site verweist primaer auf Vertragshaendler/Niederlassungen
# statt einen eigenen Checkout-Vertragsabschluss zu bieten.
_NO_DIRECT_SALES_POSITIVE = [
"vertragshaendler", "vertragshändler", "vertragspartner",
"vertragswerkstatt", "haendlersuche", "händlersuche",
"niederlassung", "vertretung", "autorisierter haendler",
"autorisierter händler", "ihr haendler vor ort",
"ihr händler vor ort", "haendler in ihrer naehe",
"händler in ihrer nähe", "probefahrt vereinbaren",
"anfrage an haendler", "anfrage an händler",
"konfigurator", "fahrzeug konfigurieren",
"ihre individuelle anfrage",
# OEM-Markennamen — sind Hersteller-Marken die ueblicherweise via
# Haendler vertreiben.
"bmw vertriebs", "audi vertriebs", "mercedes-benz vertriebs",
"volkswagen vertriebs", "porsche zentrum",
]
# Indikatoren GEGEN no_direct_sales: echte Online-Shop-Funktionen.
_DIRECT_SALES_NEGATIVE = [
"in den warenkorb", "warenkorb hinzu", "zur kasse",
"jetzt kaufen", "kostenpflichtig bestellen",
"zahlungspflichtig bestellen", "sofort-kauf",
"online bestellen", "lieferadresse", "rechnungsadresse",
]
def _detect_no_direct_sales(full_text: str) -> bool:
"""Heuristik: erkennt OEM-Konfigurator-Sites die nicht direkt verkaufen."""
text = full_text.lower()
pos = sum(1 for k in _NO_DIRECT_SALES_POSITIVE if k in text)
neg = sum(1 for k in _DIRECT_SALES_NEGATIVE if k in text)
# Mindestens 3 Haendler-Indikatoren UND weniger Shop-Indikatoren als
# Haendler-Indikatoren. Vermeidet false-positive fuer Shops die
# zusaetzlich "Haendlersuche" als Filiale-Finder anbieten.
return pos >= 3 and pos > neg