"""
Business Profiler — detect business model from document texts.

Pure keyword-based detection (deterministic, no LLM). Analyzes DSE, Impressum,
AGB, Widerruf etc. together to build a profile that drives context-aware
compliance checks.

Example:
    profile = await detect_business_profile({"dse": "...", "impressum": "..."})
    profile.business_type  # "b2c"
    profile.has_online_shop  # True
"""

from __future__ import annotations

import re
from dataclasses import dataclass, field


@dataclass
class BusinessProfile:
    """Result of the keyword-based business-model detection."""

    business_type: str = "unknown"  # b2b, b2c, b2g, nonprofit, unknown
    industry: str = "unknown"  # it_services, retail, healthcare, legal, craft, public, unknown
    has_online_shop: bool = False
    has_editorial_content: bool = False
    is_regulated_profession: bool = False
    regulated_profession_type: str = ""  # arzt, anwalt, steuerberater, architekt, ""
    needs_odr: bool = False  # Online-Streitbeilegung (online dispute resolution)
    detected_services: list[str] = field(default_factory=list)
    confidence: float = 0.0
    # When True: the site itself does NOT conclude a direct-purchase contract
    # (typical for OEM configurator sites such as BMW/Audi/Mercedes — the
    # contract is concluded via the authorised dealer, not the manufacturer's
    # website).
    # Consequence: AGB/Widerruf/Nutzungsbedingungen are NOT MANDATORY on the
    # website; they are handed out by the dealer instead.
    no_direct_sales: bool = False


# ── Keyword lists ────────────────────────────────────────────────────

_B2C_KEYWORDS = [
    "verbraucher", "warenkorb", "bestellung", "lieferung", "widerruf",
    "shop", "kaufpreis", "rueckgabe", "rückgabe", "endkunde",
    "kaeufer", "käufer", "privatkunde", "zahlungspflichtig bestellen",
]

_B2B_KEYWORDS = [
    # Discriminative — these don't appear in B2C consumer texts
    "geschaeftskunden", "geschäftskunden", "firmenkunde", "b2b",
    "industriekunden",
    "ausschliesslich gewerblich", "ausschließlich gewerblich",
    "ausschliesslich unternehmer", "ausschließlich unternehmer",
    "kein verbrauchergeschaeft", "kein verbrauchergeschäft",
    # Note: "unternehmen", "beratung", "consulting", "dienstleistung"
    # were removed — they match in any company text and bias toward B2B.
]

_B2G_KEYWORDS = [
    "koerperschaft des oeffentlichen rechts", "körperschaft des öffentlichen rechts",
    "gemeinde", "stadtverwaltung", "landesbehoerde", "landesbehörde",
    "kommunal", "buergerservice", "bürgerservice", "rathaus",
    "landesamt", "bundesamt",
    "oeffentliche verwaltung", "öffentliche verwaltung",
    "oeffentlicher dienst", "öffentlicher dienst",
]

_NONPROFIT_KEYWORDS = [
    "gemeinnuetzig", "gemeinnützig", "verein", "stiftung", "e.v.",
    "spende", "ehrenamtlich", "satzung",
]

# Insertion order matters: the first keyword found in the Impressum wins.
_REGULATED_PROFESSIONS = {
    # Lawyer — only specific terms, not "anwalt" alone
    # (it would otherwise match "Redaktionsanwalt", "Justiziar" etc.)
    "rechtsanwalt": "anwalt",
    "rechtsanwaeltin": "anwalt",
    "rechtsanwältin": "anwalt",
    "kanzlei": "anwalt",
    "rechtsanwaltskammer": "anwalt",
    "zugelassener anwalt": "anwalt",
    # Physician — "praxis" removed (it matches "in der Praxis")
    "arztpraxis": "arzt",
    "zahnarzt": "arzt",
    "facharzt": "arzt",
    "aerztekammer": "arzt",
    "ärztekammer": "arzt",
    "kassenärztlich": "arzt",
    "kassenaerztlich": "arzt",
    # Tax advisor
    "steuerberater": "steuerberater",
    "steuerberaterin": "steuerberater",
    "steuerberaterkammer": "steuerberater",
    # Architect
    "architekt": "architekt",
    "architektin": "architekt",
    "architektenkammer": "architekt",
    # Notary
    "notar": "notar",
    "notariat": "notar",
    # Pharmacist
    "apotheke": "apotheker",
    "apotheker": "apotheker",
}

# Lawyer titles that frequently appear merely as a contact person's title;
# they only count when found in the leading part of the Impressum.
_LAWYER_TITLE_KEYWORDS = ("rechtsanwalt", "rechtsanwaeltin", "rechtsanwältin")

# Industry used when the keyword vote is inconclusive but the Impressum
# identifies a regulated profession.
_PROFESSION_INDUSTRY = {
    "anwalt": "legal",
    "arzt": "healthcare",
    "steuerberater": "finance",
    "architekt": "craft",
    "notar": "legal",
    "apotheker": "healthcare",
}

_ONLINE_SHOP_KEYWORDS = [
    "warenkorb", "checkout", "bestellung", "lieferung", "versand",
    "paypal", "kreditkarte", "klarna", "sofortueberweisung", "sofortüberweisung",
    "zahlungsarten", "versandkosten", "lieferzeit", "retour", "paketdienst",
]

_EDITORIAL_KEYWORDS = [
    "blog", "ratgeber", "news", "redaktion", "artikel", "magazin",
    "beitrag", "kommentar", "podcast", "newsletter", "autor",
]

# Insertion order matters: on a tie with a top score >= 2 the first industry
# listed here wins.
_INDUSTRY_KEYWORDS = {
    # "software/cloud/hosting" are often mentioned in privacy texts of any
    # vendor (cloud hosting for newsletters, SaaS tools etc.) without making
    # the company an IT-services vendor itself. Keep the list deliberately
    # narrow: only patterns that strongly suggest IT/SaaS as the core business.
    "it_services": ["saas-anbieter", "software-as-a-service", "ihr saas",
                    "ihre cloud", "hosting-provider", "api-anbieter",
                    "developer-portal"],
    "retail": ["shop", "warenkorb", "versand", "lieferung", "einzelhandel"],
    "healthcare": ["arzt", "praxis", "patient", "gesundheit", "therapie", "klinik"],
    "legal": ["kanzlei", "rechtsanwalt", "mandant", "anwalt"],
    "craft": ["handwerk", "meister", "werkstatt", "montage", "gewerk"],
    "public": ["kommune", "stadtverwaltung", "buergerservice", "bürgerservice", "rathaus"],
    "finance": ["bank", "versicherung", "finanz", "kredit", "anlage"],
    "education": ["schule", "bildung", "unterricht", "lehrplan", "schueler", "schüler"],
    "consulting": ["beratung", "consulting", "schulung", "seminar", "gutachten",
                   "audit", "arbeitssicherheit", "brandschutz", "sicherheitstechnik",
                   "zertifizierung"],
    "manufacturing": ["fertigung", "produktion", "maschinenbau", "anlagenbau",
                      "zulieferer", "werkzeugbau", "spritzguss", "cnc",
                      "industrietechnik"],
    "automotive": ["fahrzeug", "kraftfahrzeug", "kfz", "automobil", "neuwagen",
                   "gebrauchtwagen", "fahrzeugempfehlung", "modellreihe",
                   "modellpalette", "antriebs", "motor", "reifen",
                   "elektroauto", "verbrenner", "hybridfahrzeug", "leasing",
                   "werkstatt", "wartung und reparatur", "probefahrt",
                   "bmw", "mercedes", "audi", "volkswagen", "porsche", "opel"],
    "media": ["redaktion", "verlag", "medien", "journalismus", "presse"],
}

# Finance keywords other than "versicherung"; used to decide whether any
# finance evidence remains once §34d/§34c disclosure boilerplate is ignored.
_NON_INSURANCE_FINANCE_KEYWORDS = ["bank", "finanz", "kredit", "anlage"]

# Terms that indicate "versicherung" / "bank" is only mentioned as a
# §34d/§34c GewO disclosure (Versicherungsvermittler / Finanzanlagenvermittler)
# rather than the core business. Used to suppress false finance matches.
_VERMITTLER_CONTEXT_TERMS = [
    "versicherungsvermittler", "berufshaftpflichtversicherung",
    "vermittlerregister", "§34d", "§ 34 d", "§34c", "§ 34 c",
    "finanzanlagenvermittler", "ihk muenchen", "ihk münchen",
]

_TRACKING_SERVICES = {
    "google analytics": "Google Analytics",
    "google tag manager": "Google Tag Manager",
    "matomo": "Matomo",
    "facebook pixel": "Facebook Pixel",
    "meta pixel": "Meta Pixel",
    "hotjar": "Hotjar",
    "hubspot": "HubSpot",
    "mailchimp": "Mailchimp",
    "linkedin insight": "LinkedIn Insight",
    "google ads": "Google Ads",
    "google adsense": "Google AdSense",
    "google maps": "Google Maps",
    "youtube": "YouTube",
    "vimeo": "Vimeo",
    "cloudflare": "Cloudflare",
    "sentry": "Sentry",
    "intercom": "Intercom",
    "zendesk": "Zendesk",
    "stripe": "Stripe",
    "paypal": "PayPal",
}


# ── Detection logic ──────────────────────────────────────────────────

def _count_hits(text: str, keywords: list[str]) -> int:
    """Return how many distinct *keywords* occur in *text*.

    Matching is plain substring containment, so each keyword counts at most
    once and also matches inside longer words (e.g. "versand" matches
    "versandkosten").
    """
    return sum(1 for kw in keywords if kw in text)


def _normalize(text: str) -> str:
    """Lower-case *text* and strip soft hyphens (U+00AD)."""
    return text.lower().replace("\xad", "")


def _mask_terms(text: str, terms: list[str]) -> str:
    """Return *text* with every occurrence of each term replaced by a space.

    A space (rather than the empty string) is used so that removing a term
    cannot fuse its neighbours into a new keyword.
    """
    for term in terms:
        text = text.replace(term, " ")
    return text


def _detect_services(full_text: str) -> list[str]:
    """Return the names of third-party services mentioned in *full_text*.

    Uses the project's service detector when it can be imported and runs
    cleanly; otherwise falls back to the local ``_TRACKING_SERVICES`` table.
    """
    try:
        from compliance.services.service_detector import detect_services_in_text

        return [s["name"] for s in detect_services_in_text(full_text)]
    except Exception:
        # Deliberate best-effort: the detector is optional, and profiling must
        # not fail because of it. Fall back to the simple keyword list.
        return [
            label
            for pattern, label in _TRACKING_SERVICES.items()
            if pattern in full_text
        ]


def _detect_regulated_profession(impressum_text: str) -> str:
    """Return the regulated-profession type found in *impressum_text*, or "".

    Keywords are tried in ``_REGULATED_PROFESSIONS`` order; the first one
    accepted wins.
    """
    for keyword, prof_type in _REGULATED_PROFESSIONS.items():
        if keyword not in impressum_text:
            continue
        # Extra guard: a bare lawyer title must appear in the first 500 chars
        # (company description area), not just as a contact person's title
        # further down.
        if keyword in _LAWYER_TITLE_KEYWORDS and keyword not in impressum_text[:500]:
            continue
        return prof_type
    return ""


def _classify_business_type(
    full_text: str,
    documents: dict[str, str],
    has_online_shop: bool,
    has_editorial_content: bool,
) -> tuple[str, float]:
    """Return ``(business_type, confidence)`` from keyword scores.

    Args:
        full_text: Normalized concatenation of all document texts.
        documents: The original doc_type -> text mapping; only the presence of
            the "agb" and "widerruf" keys is used here.
        has_online_shop: Whether online-shop signals were detected.
        has_editorial_content: Whether editorial signals were detected.
    """
    b2c_score = _count_hits(full_text, _B2C_KEYWORDS)
    b2b_score = _count_hits(full_text, _B2B_KEYWORDS)
    b2g_score = _count_hits(full_text, _B2G_KEYWORDS)
    nonprofit_score = _count_hits(full_text, _NONPROFIT_KEYWORDS)

    # P17-C: B2B service-provider cluster (P14) as a boost — a company that
    # offers CE certification / compliance consulting / auditing / trainings
    # is usually B2B even when the strict B2B keywords do not fire.
    b2b_service_boost = _count_hits(full_text, _B2B_SERVICE_POSITIVE)
    if b2b_service_boost >= 2:
        b2b_score += min(3, b2b_service_boost - 1)

    # Missing documents as a signal.
    if "agb" not in documents:
        b2c_score -= 1  # No AGB → less likely B2C
    if "widerruf" not in documents:
        b2c_score -= 1  # No Widerruf → less likely B2C shop
    if has_online_shop:
        b2c_score += 3  # Strong B2C signal

    scores = {
        "b2c": b2c_score,
        "b2b": b2b_score,
        "b2g": b2g_score,
        "nonprofit": nonprofit_score,
    }
    # On equal scores the first key in dict order wins.
    best = max(scores, key=scores.get)  # type: ignore[arg-type]
    best_val = scores[best]

    if best_val >= 2:
        total = sum(max(0, v) for v in scores.values())
        confidence = round(best_val / total, 2) if total > 0 else 0.5
        return best, confidence

    # Fallback: prefer B2C when the text mentions consumer rights, editorial
    # content, or consumer-direction signals — even without checkout keywords.
    # Only fall back to B2B if discriminative B2B markers fired (the keyword
    # list is already filtered to genuinely B2B-only terms).
    consumer_hint = (
        "verbraucher" in full_text
        or "widerruf" in full_text
        or "kunde" in full_text
        or has_editorial_content
    )
    if b2b_score >= 1 and not consumer_hint:
        return "b2b", 0.4
    if consumer_hint:
        return "b2c", 0.4
    return "unknown", 0.2


def _detect_industry(full_text: str, regulated_profession_type: str) -> str:
    """Return the most likely industry, or "unknown".

    Args:
        full_text: Normalized concatenation of all document texts.
        regulated_profession_type: Profession type detected from the Impressum
            ("" if none); used when the keyword vote is inconclusive.
    """
    industry_scores: dict[str, int] = {}
    for industry, keywords in _INDUSTRY_KEYWORDS.items():
        hits = _count_hits(full_text, keywords)
        if hits >= 1:
            industry_scores[industry] = hits

    # Suppress finance/insurance false positives caused by §34d/§34c GewO
    # disclosures (Versicherungsvermittler, Berufshaftpflicht, etc.) — these
    # are mandatory disclosures for many companies (e.g. BMW AG) that are not
    # actual financial services providers.
    if industry_scores.get("finance"):
        vermittler_hits = _count_hits(full_text, _VERMITTLER_CONTEXT_TERMS)
        if vermittler_hits >= 2:
            # Count finance evidence OUTSIDE the disclosure boilerplate. The
            # boilerplate terms must be masked first: "finanzanlagenvermittler"
            # itself contains "finanz" and "anlage" and would otherwise always
            # keep the finance score alive.
            residual_text = _mask_terms(full_text, _VERMITTLER_CONTEXT_TERMS)
            non_insurance_finance = _count_hits(
                residual_text, _NON_INSURANCE_FINANCE_KEYWORDS
            )
            if non_insurance_finance == 0:
                industry_scores.pop("finance", None)
            else:
                industry_scores["finance"] = non_insurance_finance

    # Require a clear winner — if the top score is 1 and there are ties, do
    # not guess from keywords.
    if industry_scores:
        top = max(industry_scores.values())
        winners = [k for k, v in industry_scores.items() if v == top]
        if top >= 2 or len(winners) == 1:
            return winners[0]

    # Keyword vote empty or inconclusive: fall back to the regulated
    # profession identified from the Impressum, if any.
    if regulated_profession_type:
        return _PROFESSION_INDUSTRY.get(regulated_profession_type, "unknown")
    return "unknown"


async def detect_business_profile(documents: dict[str, str]) -> BusinessProfile:
    """Analyze all document texts together to detect business model.

    Args:
        documents: dict mapping doc_type -> text
            (e.g. {"dse": "...", "impressum": "..."})

    Returns:
        A populated ``BusinessProfile``; a default (all-unknown) profile when
        *documents* is empty.
    """
    profile = BusinessProfile()
    if not documents:
        return profile

    # Merge all texts for keyword search.
    full_text = _normalize("\n".join(documents.values()))

    # ── Tracking services (use full service detector) ──────────
    profile.detected_services = _detect_services(full_text)

    # ── Online shop ──────────────────────────────────────────────
    profile.has_online_shop = _count_hits(full_text, _ONLINE_SHOP_KEYWORDS) >= 3

    # ── Editorial content ────────────────────────────────────────
    profile.has_editorial_content = _count_hits(full_text, _EDITORIAL_KEYWORDS) >= 2

    # ── Regulated profession ─────────────────────────────────────
    # Only check the Impressum text (not the full text) — keywords like
    # "rechtsanwalt" appear as contact persons in privacy texts (e.g. Spiegel's
    # "Rechtsanwalt Kruse"), which doesn't mean the company IS a law firm.
    # Fallback when there is no Impressum: the first 2000 chars of everything.
    impressum_text = _normalize(documents.get("impressum", "")) or full_text[:2000]
    profile.regulated_profession_type = _detect_regulated_profession(impressum_text)
    profile.is_regulated_profession = bool(profile.regulated_profession_type)

    # ── Business type ────────────────────────────────────────────
    profile.business_type, profile.confidence = _classify_business_type(
        full_text,
        documents,
        profile.has_online_shop,
        profile.has_editorial_content,
    )

    # ── ODR (Online-Streitbeilegung) ─────────────────────────────
    # Required for B2C with online shop (EU Regulation 524/2013)
    profile.needs_odr = profile.business_type == "b2c" and profile.has_online_shop

    # ── Industry ─────────────────────────────────────────────────
    profile.industry = _detect_industry(full_text, profile.regulated_profession_type)

    # ── no_direct_sales (OEM configurator pattern) ───────────────
    # Manufacturer sites that only configure and forward to authorised dealers
    # (BMW/Audi/Mercedes/VW/Porsche) do NOT conclude a direct purchase
    # contract. AGB/Widerruf/Nutzungsbedingungen are not mandatory there —
    # they are handed out by the dealer.
    profile.no_direct_sales = _detect_no_direct_sales(full_text)

    return profile


# P14: three clusters, each of which can trigger no_direct_sales=True on its own.

# Cluster A: OEM configurator pattern (car manufacturers with a dealer network)
_OEM_POSITIVE = [
    "vertragshaendler", "vertragshändler", "vertragspartner",
    "vertragswerkstatt", "haendlersuche", "händlersuche",
    "niederlassung", "vertretung",
    "autorisierter haendler", "autorisierter händler",
    "ihr haendler vor ort", "ihr händler vor ort",
    "haendler in ihrer naehe", "händler in ihrer nähe",
    "probefahrt vereinbaren",
    "anfrage an haendler", "anfrage an händler",
    "konfigurator", "fahrzeug konfigurieren",
    "ihre individuelle anfrage",
    "bmw vertriebs", "audi vertriebs", "mercedes-benz vertriebs",
    "volkswagen vertriebs", "porsche zentrum",
    # OEM brand names in mandatory texts (privacy policy mentions the manufacturer)
    "bmw ag", "audi ag", "mercedes-benz ag", "volkswagen ag",
    "porsche ag", "opel automobile gmbh",
]

# Cluster B: B2B service providers (consulting / compliance / training / CE)
_B2B_SERVICE_POSITIVE = [
    "ce-zertifizierung", "ce zertifizierung",
    "ce-konformitaet", "ce-konformität",
    "ce-kennzeichnung", "ce kennzeichnung",
    "compliance-beratung", "compliance beratung",
    "arbeitssicherheit", "product compliance",
    "produktsicherheit", "produkthaftung",
    "auditierung", "auditor", "auditierungen",
    "schulungen", "workshops", "akademie",
    "beratungsleistungen", "consultingleistungen", "consulting services",
    "managementsystem",
    "datenschutzbeauftragter (extern)", "externer datenschutzbeauftragter",
    "datenschutz-audit", "tisax",
    "iso 27001", "iso 9001", "iso 14001", "iso 45001",
    "gefaehrdungsbeurteilung", "gefährdungsbeurteilung",
    "betriebsbeauftragter",
    "fachkraft fuer arbeitssicherheit", "fachkraft für arbeitssicherheit",
]

# Cluster C: NGO / association / public administration
_NONPROFIT_PUBLIC_POSITIVE = [
    "spendenkonto", "vereinsregister",
    "gemeinnuetzig", "gemeinnützig", "ehrenamtlich",
    "foerderverein", "förderverein", "stiftung",
    "buergeramt", "bürgeramt", "landratsamt", "kommunalverwaltung",
]

# Backwards-compat
_NO_DIRECT_SALES_POSITIVE = (
    _OEM_POSITIVE + _B2B_SERVICE_POSITIVE + _NONPROFIT_PUBLIC_POSITIVE
)

# Indicators AGAINST no_direct_sales: real online-shop functionality.
_DIRECT_SALES_NEGATIVE = [
    "in den warenkorb", "warenkorb hinzu", "zur kasse", "jetzt kaufen",
    "kostenpflichtig bestellen", "zahlungspflichtig bestellen",
    "sofort-kauf", "online bestellen",
    "lieferadresse", "rechnungsadresse", "versandkosten",
    "lieferzeit", "lieferbedingungen", "checkout",
    "stueckpreis", "stückpreis",
]


def _detect_no_direct_sales(full_text: str) -> bool:
    """Heuristic: True when the site has no direct sales to B2C customers.

    Applies to three clusters (at least 2 hits within a cluster each):
      A) OEM configurator (car manufacturers)
      B) B2B service providers (consulting/compliance/training)
      C) NGO / public administration

    Negative signals (real shop functionality) count against each cluster:
    a cluster only triggers when its hits exceed the negative hits.
    """
    text = full_text.lower()
    neg = _count_hits(text, _DIRECT_SALES_NEGATIVE)
    # Each cluster stands on its own: >= 2 hits and more hits than negatives.
    return any(
        hits >= 2 and hits > neg
        for hits in (
            _count_hits(text, _OEM_POSITIVE),
            _count_hits(text, _B2B_SERVICE_POSITIVE),
            _count_hits(text, _NONPROFIT_PUBLIC_POSITIVE),
        )
    )