"""Business Profiler — detect the business model from document texts.

Pure keyword-based detection (deterministic, no LLM). Analyzes DSE,
Impressum, AGB, Widerruf etc. together to build a profile that drives
context-aware compliance checks.

Example:
    profile = await detect_business_profile({"dse": "...", "impressum": "..."})
    profile.business_type    # "b2c"
    profile.has_online_shop  # True
"""

from __future__ import annotations

import re
from dataclasses import dataclass, field


@dataclass
class BusinessProfile:
    """Result of the keyword-based business-model detection."""

    # One of: b2b, b2c, b2g, nonprofit, unknown
    business_type: str = "unknown"
    # A key of _INDUSTRY_KEYWORDS (it_services, retail, healthcare, ...) or "unknown"
    industry: str = "unknown"
    has_online_shop: bool = False
    has_editorial_content: bool = False
    is_regulated_profession: bool = False
    # A value of _REGULATED_PROFESSIONS (anwalt, arzt, steuerberater,
    # architekt, notar, apotheker) or "" when none was detected
    regulated_profession_type: str = ""
    # Online dispute resolution (Online-Streitbeilegung)
    needs_odr: bool = False
    detected_services: list[str] = field(default_factory=list)
    confidence: float = 0.0
    # When True, the site itself does NOT conclude a direct purchase contract
    # (typical for OEM configurator sites such as BMW/Audi/Mercedes, where the
    # contract is concluded with the authorized dealer, not on the
    # manufacturer's website). Consequence: AGB / Widerruf /
    # Nutzungsbedingungen are NOT mandatory on the website; they are handed
    # out by the dealer instead.
    no_direct_sales: bool = False


# ── Keyword lists ────────────────────────────────────────────────────

_B2C_KEYWORDS = [
    "verbraucher",
    "warenkorb",
    "bestellung",
    "lieferung",
    "widerruf",
    "shop",
    "kaufpreis",
    "rueckgabe",
    "rückgabe",
    "endkunde",
    "kaeufer",
    "käufer",
    "privatkunde",
    "zahlungspflichtig bestellen",
]

_B2B_KEYWORDS = [
    # Discriminative — these don't appear in B2C consumer texts
    "geschaeftskunden",
    "geschäftskunden",
    "firmenkunde",
    "b2b",
    "industriekunden",
    "ausschliesslich gewerblich",
    "ausschließlich gewerblich",
    "ausschliesslich unternehmer",
    "ausschließlich unternehmer",
    "kein verbrauchergeschaeft",
    "kein verbrauchergeschäft",
    # Note: "unternehmen", "beratung", "consulting", "dienstleistung"
    # were removed — they match in any company text and bias toward B2B.
]

_B2G_KEYWORDS = [
    "koerperschaft des oeffentlichen rechts",
    "körperschaft des öffentlichen rechts",
    "gemeinde",
    "stadtverwaltung",
    "landesbehoerde",
    "landesbehörde",
    "kommunal",
    "buergerservice",
    "bürgerservice",
    "rathaus",
    "landesamt",
    "bundesamt",
    "oeffentliche verwaltung",
    "öffentliche verwaltung",
    "oeffentlicher dienst",
    "öffentlicher dienst",
]

_NONPROFIT_KEYWORDS = [
    "gemeinnuetzig",
    "gemeinnützig",
    "verein",
    "stiftung",
    "e.v.",
    "spende",
    "ehrenamtlich",
    "satzung",
]

# Keyword -> profession type. Insertion order matters: the detection loop
# stops at the first keyword that matches.
_REGULATED_PROFESSIONS = {
    # Lawyer — only specific terms, not "anwalt" on its own
    # (it would otherwise match "Redaktionsanwalt", "Justiziar" etc.)
    "rechtsanwalt": "anwalt",
    "rechtsanwaeltin": "anwalt",
    "rechtsanwältin": "anwalt",
    "kanzlei": "anwalt",
    "rechtsanwaltskammer": "anwalt",
    "zugelassener anwalt": "anwalt",
    # Physician — "praxis" removed (it matches "in der Praxis")
    "arztpraxis": "arzt",
    "zahnarzt": "arzt",
    "facharzt": "arzt",
    "aerztekammer": "arzt",
    "ärztekammer": "arzt",
    "kassenärztlich": "arzt",
    "kassenaerztlich": "arzt",
    # Tax advisor
    "steuerberater": "steuerberater",
    "steuerberaterin": "steuerberater",
    "steuerberaterkammer": "steuerberater",
    # Architect
    "architekt": "architekt",
    "architektin": "architekt",
    "architektenkammer": "architekt",
    # Notary
    "notar": "notar",
    "notariat": "notar",
    # Pharmacist
    "apotheke": "apotheker",
    "apotheker": "apotheker",
}

_ONLINE_SHOP_KEYWORDS = [
    "warenkorb",
    "checkout",
    "bestellung",
    "lieferung",
    "versand",
    "paypal",
    "kreditkarte",
    "klarna",
    "sofortueberweisung",
    "sofortüberweisung",
    "zahlungsarten",
    "versandkosten",
    "lieferzeit",
    "retour",
    "paketdienst",
]

_EDITORIAL_KEYWORDS = [
    "blog",
    "ratgeber",
    "news",
    "redaktion",
    "artikel",
    "magazin",
    "beitrag",
    "kommentar",
    "podcast",
    "newsletter",
    "autor",
]

# Industry -> keywords. Insertion order matters: it breaks ties between
# industries that reach the same score.
_INDUSTRY_KEYWORDS = {
    # "software/cloud/hosting" are often mentioned in privacy texts of any
    # vendor (cloud hosting for newsletters, SaaS tools etc.) without making
    # the company an IT-services vendor itself. Keep the list deliberately
    # narrow: only patterns that strongly suggest IT/SaaS as the core business.
    "it_services": [
        "saas-anbieter", "software-as-a-service", "ihr saas", "ihre cloud",
        "hosting-provider", "api-anbieter", "developer-portal",
    ],
    "retail": ["shop", "warenkorb", "versand", "lieferung", "einzelhandel"],
    "healthcare": [
        "arzt", "praxis", "patient", "gesundheit", "therapie", "klinik",
    ],
    "legal": ["kanzlei", "rechtsanwalt", "mandant", "anwalt"],
    "craft": ["handwerk", "meister", "werkstatt", "montage", "gewerk"],
    "public": [
        "kommune", "stadtverwaltung", "buergerservice", "bürgerservice",
        "rathaus",
    ],
    "finance": ["bank", "versicherung", "finanz", "kredit", "anlage"],
    "education": [
        "schule", "bildung", "unterricht", "lehrplan", "schueler", "schüler",
    ],
    "consulting": [
        "beratung", "consulting", "schulung", "seminar", "gutachten", "audit",
        "arbeitssicherheit", "brandschutz", "sicherheitstechnik",
        "zertifizierung",
    ],
    "manufacturing": [
        "fertigung", "produktion", "maschinenbau", "anlagenbau", "zulieferer",
        "werkzeugbau", "spritzguss", "cnc", "industrietechnik",
    ],
    "automotive": [
        "fahrzeug", "kraftfahrzeug", "kfz", "automobil", "neuwagen",
        "gebrauchtwagen", "fahrzeugempfehlung", "modellreihe", "modellpalette",
        "antriebs", "motor", "reifen", "elektroauto", "verbrenner",
        "hybridfahrzeug", "leasing", "werkstatt", "wartung und reparatur",
        "probefahrt", "bmw", "mercedes", "audi", "volkswagen", "porsche",
        "opel",
    ],
    "media": ["redaktion", "verlag", "medien", "journalismus", "presse"],
}

# Terms that indicate "versicherung" / "bank" is only mentioned as a
# §34d/§34c GewO disclosure (Versicherungsvermittler / Finanzanlagenvermittler)
# rather than the core business. Used to suppress false finance matches.
_VERMITTLER_CONTEXT_TERMS = [
    "versicherungsvermittler",
    "berufshaftpflichtversicherung",
    "vermittlerregister",
    "§34d",
    "§ 34 d",
    "§34c",
    "§ 34 c",
    "finanzanlagenvermittler",
    "ihk muenchen",
    "ihk münchen",
]

# Lower-cased search pattern -> display name. Only used when the full
# service detector is unavailable.
_TRACKING_SERVICES = {
    "google analytics": "Google Analytics",
    "google tag manager": "Google Tag Manager",
    "matomo": "Matomo",
    "facebook pixel": "Facebook Pixel",
    "meta pixel": "Meta Pixel",
    "hotjar": "Hotjar",
    "hubspot": "HubSpot",
    "mailchimp": "Mailchimp",
    "linkedin insight": "LinkedIn Insight",
    "google ads": "Google Ads",
    "google adsense": "Google AdSense",
    "google maps": "Google Maps",
    "youtube": "YouTube",
    "vimeo": "Vimeo",
    "cloudflare": "Cloudflare",
    "sentry": "Sentry",
    "intercom": "Intercom",
    "zendesk": "Zendesk",
    "stripe": "Stripe",
    "paypal": "PayPal",
}

# Finance keywords that are not explained by an insurance disclosure.
_NON_INSURANCE_FINANCE_KEYWORDS = ["bank", "finanz", "kredit", "anlage"]

# Job titles that frequently appear as a mere contact person; they only count
# when they occur in the first 500 characters of the Impressum.
_LAWYER_TITLE_KEYWORDS = ("rechtsanwalt", "rechtsanwaeltin", "rechtsanwältin")

# Industry used when no industry keyword matched but a regulated profession
# was detected. Covers every profession type in _REGULATED_PROFESSIONS.
_PROFESSION_INDUSTRY = {
    "anwalt": "legal",
    "arzt": "healthcare",
    "steuerberater": "finance",
    "architekt": "craft",
    "notar": "legal",
    "apotheker": "healthcare",
}


# ── Detection logic ──────────────────────────────────────────────────

def _count_hits(text: str, keywords: list[str]) -> int:
    """Return how many of *keywords* occur as a substring of *text*.

    Each keyword counts at most once, no matter how often it occurs.
    """
    return sum(1 for kw in keywords if kw in text)


def _without_terms(text: str, terms: list[str]) -> str:
    """Return *text* with every occurrence of each term replaced by a space."""
    # Longest first, so a term is removed before any shorter term it contains.
    for term in sorted(terms, key=len, reverse=True):
        text = text.replace(term, " ")
    return text


def _fallback_services(full_text: str) -> list[str]:
    """Detect tracking services using the built-in pattern table.

    Patterns must match as whole words: plain substring matching reported
    "Google Ads" for "google adsense" and "Intercom" for "intercompany".
    """
    return [
        label
        for pattern, label in _TRACKING_SERVICES.items()
        if re.search(rf"(?<!\w){re.escape(pattern)}(?!\w)", full_text)
    ]


def _detect_services(full_text: str) -> list[str]:
    """Return the names of third-party services mentioned in *full_text*."""
    try:
        from compliance.services.service_detector import detect_services_in_text

        return [s["name"] for s in detect_services_in_text(full_text)]
    except Exception:
        # Deliberate best-effort: if the detector cannot be imported or fails,
        # fall back to the simple pattern table instead of aborting profiling.
        return _fallback_services(full_text)


def _detect_regulated_profession(impressum_text: str) -> str:
    """Return the profession type of the first matching keyword, or ""."""
    for keyword, prof_type in _REGULATED_PROFESSIONS.items():
        if keyword not in impressum_text:
            continue
        # Extra guard: a lawyer title must appear in the company description
        # area (first 500 chars), not just as a contact person's title.
        if keyword in _LAWYER_TITLE_KEYWORDS and keyword not in impressum_text[:500]:
            continue
        return prof_type
    return ""


def _classify_business_type(
    full_text: str,
    documents: dict[str, str],
    has_online_shop: bool,
    has_editorial_content: bool,
) -> tuple[str, float]:
    """Return ``(business_type, confidence)`` from keyword scores."""
    # Consumer signals are searched in a text from which the B2B markers were
    # removed. Otherwise "kunde" matches inside "geschäftskunden" /
    # "firmenkunde" and "verbraucher" inside "kein verbrauchergeschäft", which
    # turned every B2B marker into a consumer signal as well.
    consumer_text = _without_terms(full_text, _B2B_KEYWORDS)

    b2c_score = _count_hits(consumer_text, _B2C_KEYWORDS)
    b2b_score = _count_hits(full_text, _B2B_KEYWORDS)
    b2g_score = _count_hits(full_text, _B2G_KEYWORDS)
    nonprofit_score = _count_hits(full_text, _NONPROFIT_KEYWORDS)

    # Missing documents as signal
    if "agb" not in documents:
        b2c_score -= 1  # No AGB -> less likely B2C
    if "widerruf" not in documents:
        b2c_score -= 1  # No Widerruf -> less likely B2C shop
    if has_online_shop:
        b2c_score += 3  # Strong B2C signal

    scores = {
        "b2c": b2c_score,
        "b2b": b2b_score,
        "b2g": b2g_score,
        "nonprofit": nonprofit_score,
    }
    # On ties max() returns the first key in dict order (b2c first).
    best = max(scores, key=lambda name: scores[name])
    best_val = scores[best]

    if best_val >= 2:
        # Confidence is the winner's share of all positive scores.
        total = sum(max(0, v) for v in scores.values())
        confidence = round(best_val / total, 2) if total > 0 else 0.5
        return best, confidence

    # Fallback: prefer B2C when the text mentions consumer rights, editorial
    # content, or consumer-direction signals — even without checkout keywords.
    # Only fall back to B2B if discriminative B2B markers fired and no
    # consumer signal exists outside of those markers.
    consumer_hint = (
        "verbraucher" in consumer_text
        or "widerruf" in consumer_text
        or "kunde" in consumer_text
        or has_editorial_content
    )
    if b2b_score >= 1 and not consumer_hint:
        return "b2b", 0.4
    if consumer_hint:
        return "b2c", 0.4
    return "unknown", 0.2


def _finance_hits_outside_disclosures(full_text: str) -> int:
    """Count non-insurance finance keywords outside §34d/§34c disclosure terms.

    The disclosure terms themselves contain finance keywords
    ("finanzanlagenvermittler" contains "finanz" and "anlage"), so they are
    removed before counting; otherwise the boilerplate keeps the finance
    score alive.
    """
    residual = _without_terms(full_text, _VERMITTLER_CONTEXT_TERMS)
    return _count_hits(residual, _NON_INSURANCE_FINANCE_KEYWORDS)


def _detect_industry(full_text: str, regulated_profession_type: str) -> str:
    """Return the best-matching industry key, or "unknown"."""
    industry_scores: dict[str, int] = {}
    for industry, keywords in _INDUSTRY_KEYWORDS.items():
        hits = _count_hits(full_text, keywords)
        if hits >= 1:
            industry_scores[industry] = hits

    # Suppress finance/insurance false positives caused by §34d/§34c GewO
    # disclosures (Versicherungsvermittler, Berufshaftpflicht, etc.) — these
    # are mandatory disclosures for many companies (e.g. BMW AG) that are not
    # actual financial services providers.
    if industry_scores.get("finance"):
        if _count_hits(full_text, _VERMITTLER_CONTEXT_TERMS) >= 2:
            remaining = _finance_hits_outside_disclosures(full_text)
            if remaining == 0:
                del industry_scores["finance"]
            else:
                industry_scores["finance"] = remaining

    if industry_scores:
        # Require a clear winner — if the top score is 1 and there are ties,
        # prefer "unknown" over guessing.
        top = max(industry_scores.values())
        winners = [name for name, hits in industry_scores.items() if hits == top]
        if top >= 2 or len(winners) == 1:
            return winners[0]
        return "unknown"

    # No industry keyword matched: derive the industry from the profession.
    return _PROFESSION_INDUSTRY.get(regulated_profession_type, "unknown")


async def detect_business_profile(documents: dict[str, str]) -> BusinessProfile:
    """Analyze all document texts together to detect the business model.

    The coroutine awaits nothing itself; it is ``async`` so callers can
    ``await`` it.

    Args:
        documents: dict mapping doc_type -> text
            (e.g. {"dse": "...", "impressum": "..."}). Empty or ``None``
            texts are ignored.

    Returns:
        A populated ``BusinessProfile``; a default profile if *documents*
        is empty.
    """
    profile = BusinessProfile()
    if not documents:
        return profile

    # Merge all texts for keyword search; soft hyphens would split keywords.
    full_text = "\n".join(text for text in documents.values() if text)
    full_text = full_text.lower().replace("\xad", "")

    # ── Tracking services (full service detector, with fallback) ─
    profile.detected_services = _detect_services(full_text)

    # ── Online shop ──────────────────────────────────────────────
    profile.has_online_shop = _count_hits(full_text, _ONLINE_SHOP_KEYWORDS) >= 3

    # ── Editorial content ────────────────────────────────────────
    profile.has_editorial_content = (
        _count_hits(full_text, _EDITORIAL_KEYWORDS) >= 2
    )

    # ── Regulated profession ─────────────────────────────────────
    # Only check the Impressum text (not the full text) — keywords like
    # "rechtsanwalt" appear as contact persons in privacy texts, but that
    # doesn't mean the company IS a law firm.
    impressum_text = (documents.get("impressum") or "").lower().replace("\xad", "")
    if not impressum_text:
        impressum_text = full_text[:2000]  # Fallback: first 2000 chars
    profession = _detect_regulated_profession(impressum_text)
    if profession:
        profile.is_regulated_profession = True
        profile.regulated_profession_type = profession

    # ── Business type ────────────────────────────────────────────
    profile.business_type, profile.confidence = _classify_business_type(
        full_text,
        documents,
        profile.has_online_shop,
        profile.has_editorial_content,
    )

    # ── ODR (Online-Streitbeilegung) ─────────────────────────────
    # Required for B2C with online shop (EU Regulation 524/2013).
    # NOTE(review): Regulation 524/2013 has reportedly been repealed and the
    # EU ODR platform discontinued in July 2025 — confirm whether this flag
    # should still drive a check.
    profile.needs_odr = (
        profile.business_type == "b2c" and profile.has_online_shop
    )

    # ── Industry ─────────────────────────────────────────────────
    profile.industry = _detect_industry(
        full_text, profile.regulated_profession_type
    )

    # ── no_direct_sales (OEM configurator pattern) ───────────────
    # Manufacturer sites that only configure and forward to authorized
    # dealers (BMW/Audi/Mercedes/VW/Porsche) do NOT conclude a direct purchase
    # contract. AGB/Widerruf/Nutzungsbedingungen are not mandatory there —
    # they are handed out by the dealer.
    profile.no_direct_sales = _detect_no_direct_sales(full_text)

    return profile


# Indicators: the site primarily refers to authorized dealers / branches
# instead of offering its own checkout.
_NO_DIRECT_SALES_POSITIVE = [
    "vertragshaendler", "vertragshändler", "vertragspartner",
    "vertragswerkstatt", "haendlersuche", "händlersuche",
    "niederlassung", "vertretung", "autorisierter haendler",
    "autorisierter händler", "ihr haendler vor ort", "ihr händler vor ort",
    "haendler in ihrer naehe", "händler in ihrer nähe",
    "probefahrt vereinbaren", "anfrage an haendler", "anfrage an händler",
    "konfigurator", "fahrzeug konfigurieren", "ihre individuelle anfrage",
    # OEM brand names — manufacturer brands that usually sell via dealers.
    "bmw vertriebs", "audi vertriebs", "mercedes-benz vertriebs",
    "volkswagen vertriebs", "porsche zentrum",
]

# Indicators AGAINST no_direct_sales: real online-shop functionality.
_DIRECT_SALES_NEGATIVE = [
    "in den warenkorb", "warenkorb hinzu", "zur kasse", "jetzt kaufen",
    "kostenpflichtig bestellen", "zahlungspflichtig bestellen",
    "sofort-kauf", "online bestellen", "lieferadresse", "rechnungsadresse",
]

# "Vertretungsberechtigt(er) ..." / "vertretungsbefugt" name a company's legal
# representative in the Impressum. The wording says nothing about dealer
# representation, yet it contains the dealer indicator "vertretung".
_LEGAL_REPRESENTATION_RE = re.compile(r"vertretungs(?:berecht|befug)\w*")


def _detect_no_direct_sales(full_text: str) -> bool:
    """Heuristic: detect OEM configurator sites that do not sell directly.

    Args:
        full_text: merged text of all documents (lower-cased again here, so
            mixed-case input is accepted).

    Returns:
        True when at least 3 distinct dealer indicators occur and they
        outnumber the distinct online-shop indicators.
    """
    text = full_text.lower()
    # Legal-representative wording must not count as a dealer indicator;
    # otherwise almost every Impressum contributes a "vertretung" hit.
    dealer_text = _LEGAL_REPRESENTATION_RE.sub(" ", text)
    pos = sum(1 for k in _NO_DIRECT_SALES_POSITIVE if k in dealer_text)
    neg = sum(1 for k in _DIRECT_SALES_NEGATIVE if k in text)
    # At least 3 dealer indicators AND fewer shop indicators than dealer
    # indicators. Avoids false positives for shops that additionally offer a
    # "Haendlersuche" as a store finder.
    return pos >= 3 and pos > neg