"""
Business Profiler — detect business model from document texts.

Pure keyword-based detection (deterministic, no LLM). Analyzes DSE,
Impressum, AGB, Widerruf etc. together to build a profile that drives
context-aware compliance checks.

All keyword tables in this module are lowercase and are matched as plain
substrings against lowercased document text.

Example:
    profile = await detect_business_profile({"dse": "...", "impressum": "..."})
    profile.business_type  # "b2c"
    profile.has_online_shop  # True
"""

from __future__ import annotations

import re
from dataclasses import dataclass, field


@dataclass
class BusinessProfile:
    """Result of the keyword-based business-model detection.

    A freshly constructed instance represents "nothing detected": unknown
    business type and industry, all flags False, confidence 0.0.
    """

    business_type: str = "unknown"  # b2b, b2c, b2g, nonprofit, unknown
    # One of the _INDUSTRY_KEYWORDS keys (it_services, retail, healthcare,
    # legal, craft, public, finance, education, consulting, manufacturing,
    # automotive, media) or "unknown".
    industry: str = "unknown"
    has_online_shop: bool = False
    has_editorial_content: bool = False
    is_regulated_profession: bool = False
    # One of the _REGULATED_PROFESSIONS values (anwalt, arzt, steuerberater,
    # architekt, notar, apotheker) or "" when no regulated profession matched.
    regulated_profession_type: str = ""
    needs_odr: bool = False  # online dispute resolution (Online-Streitbeilegung)
    detected_services: list[str] = field(default_factory=list)
    confidence: float = 0.0


# ── Keyword lists ────────────────────────────────────────────────────

# Consumer-facing (B2C) signals.
_B2C_KEYWORDS = [
    "verbraucher", "warenkorb", "bestellung", "lieferung", "widerruf",
    "shop", "kaufpreis", "rueckgabe", "rückgabe", "endkunde",
    "kaeufer", "käufer", "privatkunde", "zahlungspflichtig bestellen",
]

_B2B_KEYWORDS = [
    # Discriminative — these don't appear in B2C consumer texts
    "geschaeftskunden", "geschäftskunden", "firmenkunde", "b2b",
    "industriekunden", "ausschliesslich gewerblich", "ausschließlich gewerblich",
    "ausschliesslich unternehmer", "ausschließlich unternehmer",
    "kein verbrauchergeschaeft", "kein verbrauchergeschäft",
    # Note: "unternehmen", "beratung", "consulting", "dienstleistung"
    # were removed — they match in any company text and bias toward B2B.
]

# Public-sector (B2G) signals.
_B2G_KEYWORDS = [
    "koerperschaft des oeffentlichen rechts", "körperschaft des öffentlichen rechts",
    "gemeinde", "stadtverwaltung", "landesbehoerde", "landesbehörde",
    "kommunal", "buergerservice", "bürgerservice", "rathaus",
    "landesamt", "bundesamt", "oeffentliche verwaltung", "öffentliche verwaltung",
    "oeffentlicher dienst", "öffentlicher dienst",
]

# Non-profit signals.
_NONPROFIT_KEYWORDS = [
    "gemeinnuetzig", "gemeinnützig", "verein", "stiftung", "e.v.",
    "spende", "ehrenamtlich", "satzung",
]

# Keyword -> profession type. Insertion order matters: the detection loop
# stops at the first keyword that matches.
_REGULATED_PROFESSIONS = {
    # Lawyer — only specific terms, not "anwalt" on its own
    # (it would otherwise match Redaktionsanwalt, Justiziar etc.)
    "rechtsanwalt": "anwalt",
    "rechtsanwaeltin": "anwalt",
    "rechtsanwältin": "anwalt",
    "kanzlei": "anwalt",
    "rechtsanwaltskammer": "anwalt",
    "zugelassener anwalt": "anwalt",
    # Physician — "praxis" removed (it matches "in der Praxis")
    "arztpraxis": "arzt",
    "zahnarzt": "arzt",
    "facharzt": "arzt",
    "aerztekammer": "arzt",
    "ärztekammer": "arzt",
    "kassenärztlich": "arzt",
    "kassenaerztlich": "arzt",
    # Tax advisor
    "steuerberater": "steuerberater",
    "steuerberaterin": "steuerberater",
    "steuerberaterkammer": "steuerberater",
    # Architect
    "architekt": "architekt",
    "architektin": "architekt",
    "architektenkammer": "architekt",
    # Notary
    "notar": "notar",
    "notariat": "notar",
    # Pharmacist
    "apotheke": "apotheker",
    "apotheker": "apotheker",
}

# Checkout / payment / shipping vocabulary used to detect an online shop.
_ONLINE_SHOP_KEYWORDS = [
    "warenkorb", "checkout", "bestellung", "lieferung", "versand",
    "paypal", "kreditkarte", "klarna", "sofortueberweisung", "sofortüberweisung",
    "zahlungsarten", "versandkosten", "lieferzeit", "retour", "paketdienst",
]

# Vocabulary used to detect editorial content.
_EDITORIAL_KEYWORDS = [
    "blog", "ratgeber", "news", "redaktion", "artikel", "magazin",
    "beitrag", "kommentar", "podcast", "newsletter", "autor",
]

# Industry -> keywords. Insertion order matters: when several industries tie
# on the top score, the one listed first here is chosen.
_INDUSTRY_KEYWORDS = {
    # "software/cloud/hosting" are often mentioned in privacy texts of any
    # vendor (cloud hosting for newsletters, SaaS tools etc.) without making
    # the company an IT-services vendor itself. Keep the list deliberately
    # narrow: only patterns that strongly suggest IT/SaaS as the core business.
    "it_services": ["saas-anbieter", "software-as-a-service", "ihr saas",
                    "ihre cloud", "hosting-provider", "api-anbieter",
                    "developer-portal"],
    "retail": ["shop", "warenkorb", "versand", "lieferung", "einzelhandel"],
    "healthcare": ["arzt", "praxis", "patient", "gesundheit", "therapie", "klinik"],
    "legal": ["kanzlei", "rechtsanwalt", "mandant", "anwalt"],
    "craft": ["handwerk", "meister", "werkstatt", "montage", "gewerk"],
    "public": ["kommune", "stadtverwaltung", "buergerservice", "bürgerservice", "rathaus"],
    "finance": ["bank", "versicherung", "finanz", "kredit", "anlage"],
    "education": ["schule", "bildung", "unterricht", "lehrplan", "schueler", "schüler"],
    "consulting": ["beratung", "consulting", "schulung", "seminar", "gutachten",
                   "audit", "arbeitssicherheit", "brandschutz",
                   "sicherheitstechnik", "zertifizierung"],
    "manufacturing": ["fertigung", "produktion", "maschinenbau", "anlagenbau",
                      "zulieferer", "werkzeugbau", "spritzguss", "cnc",
                      "industrietechnik"],
    "automotive": ["fahrzeug", "kraftfahrzeug", "kfz", "automobil",
                   "neuwagen", "gebrauchtwagen", "fahrzeugempfehlung",
                   "modellreihe", "modellpalette", "antriebs", "motor",
                   "reifen", "elektroauto", "verbrenner", "hybridfahrzeug",
                   "leasing", "werkstatt", "wartung und reparatur",
                   "probefahrt", "bmw", "mercedes", "audi", "volkswagen",
                   "porsche", "opel"],
    "media": ["redaktion", "verlag", "medien", "journalismus", "presse"],
}

# Terms that indicate "versicherung" / "bank" is only mentioned as a
# §34d/§34c GewO disclosure (Versicherungsvermittler / Finanzanlagenvermittler)
# rather than the core business. Used to suppress false finance matches.
_VERMITTLER_CONTEXT_TERMS = [
    "versicherungsvermittler", "berufshaftpflichtversicherung",
    "vermittlerregister", "§34d", "§ 34 d", "§34c", "§ 34 c",
    "finanzanlagenvermittler", "ihk muenchen", "ihk münchen",
]

# Lowercase search pattern -> display label. Only consulted as a fallback
# when the dedicated service detector cannot be imported or raises.
_TRACKING_SERVICES = {
    "google analytics": "Google Analytics",
    "google tag manager": "Google Tag Manager",
    "matomo": "Matomo",
    "facebook pixel": "Facebook Pixel",
    "meta pixel": "Meta Pixel",
    "hotjar": "Hotjar",
    "hubspot": "HubSpot",
    "mailchimp": "Mailchimp",
    "linkedin insight": "LinkedIn Insight",
    "google ads": "Google Ads",
    "google adsense": "Google AdSense",
    "google maps": "Google Maps",
    "youtube": "YouTube",
    "vimeo": "Vimeo",
    "cloudflare": "Cloudflare",
    "sentry": "Sentry",
    "intercom": "Intercom",
    "zendesk": "Zendesk",
    "stripe": "Stripe",
    "paypal": "PayPal",
}


# ── Detection logic ──────────────────────────────────────────────────

def _count_hits(text: str, keywords: list[str]) -> int:
    """Return how many of *keywords* occur in *text* as substrings.

    Each keyword counts at most once, no matter how often it appears.
    Matching is case-sensitive, so callers pass lowercased text.
    """
    return sum(1 for kw in keywords if kw in text)


def _finance_hits_outside_vermittler_context(text: str) -> int:
    """Count non-insurance finance keywords found outside the §34 boilerplate.

    The context terms are blanked out first. Otherwise a term such as
    "finanzanlagenvermittler" would itself match "finanz" and "anlage" and
    the boilerplate could never be recognised as the only finance signal.
    """
    residual = text
    for term in _VERMITTLER_CONTEXT_TERMS:
        # Replace with a space so neighbouring words cannot fuse into a
        # new keyword.
        residual = residual.replace(term, " ")
    return _count_hits(residual, ["bank", "finanz", "kredit", "anlage"])


async def detect_business_profile(documents: dict[str, str]) -> BusinessProfile:
    """Analyze all document texts together to detect business model.

    Args:
        documents: dict mapping doc_type -> text
            (e.g. {"dse": "...", "impressum": "..."}). A value of None is
            treated like an empty text. The presence of the keys "agb" and
            "widerruf" is itself used as a B2C signal.

    Returns:
        A BusinessProfile. For empty or missing ``documents`` the default
        (all-unknown) profile is returned.
    """
    profile = BusinessProfile()
    if not documents:
        return profile

    # Merge all texts for keyword search; tolerate None values instead of
    # crashing in str.join.
    full_text = "\n".join(text or "" for text in documents.values()).lower()
    full_text = full_text.replace("\xad", "")  # strip soft hyphens

    # ── Tracking services (use full service detector) ──────────
    try:
        from compliance.services.service_detector import detect_services_in_text
        detected = detect_services_in_text(full_text)
        services = [s["name"] for s in detected]
    except Exception:
        # Deliberate best effort: if the detector is unavailable or fails,
        # fall back to the simple keyword list.
        services = [
            label
            for pattern, label in _TRACKING_SERVICES.items()
            if pattern in full_text
        ]
    profile.detected_services = services

    # ── Online shop ──────────────────────────────────────────────
    shop_hits = _count_hits(full_text, _ONLINE_SHOP_KEYWORDS)
    profile.has_online_shop = shop_hits >= 3

    # ── Editorial content ────────────────────────────────────────
    editorial_hits = _count_hits(full_text, _EDITORIAL_KEYWORDS)
    profile.has_editorial_content = editorial_hits >= 2

    # ── Regulated profession ─────────────────────────────────────
    # Only check impressum text (not full text) — keywords like "rechtsanwalt"
    # appear as contact persons in DSI texts (e.g. Spiegel's "Rechtsanwalt Kruse")
    # but that doesn't mean the company IS a law firm.
    impressum_text = (documents.get("impressum") or "").lower().replace("\xad", "")
    if not impressum_text:
        impressum_text = full_text[:2000]  # Fallback: first 2000 chars

    for keyword, prof_type in _REGULATED_PROFESSIONS.items():
        if keyword not in impressum_text:
            continue
        # Extra guard: "rechtsanwalt" must appear near the company
        # description (first 500 chars), not just as a contact person name.
        if (
            keyword in ("rechtsanwalt", "rechtsanwaeltin", "rechtsanwältin")
            and keyword not in impressum_text[:500]
        ):
            continue
        profile.is_regulated_profession = True
        profile.regulated_profession_type = prof_type
        break

    # ── Business type ────────────────────────────────────────────
    b2c_score = _count_hits(full_text, _B2C_KEYWORDS)
    b2b_score = _count_hits(full_text, _B2B_KEYWORDS)
    b2g_score = _count_hits(full_text, _B2G_KEYWORDS)
    nonprofit_score = _count_hits(full_text, _NONPROFIT_KEYWORDS)

    # Missing documents as signal
    if "agb" not in documents:
        b2c_score -= 1  # No AGB → less likely B2C
    if "widerruf" not in documents:
        b2c_score -= 1  # No Widerruf → less likely B2C shop
    if profile.has_online_shop:
        b2c_score += 3  # Strong B2C signal

    scores = {
        "b2c": b2c_score,
        "b2b": b2b_score,
        "b2g": b2g_score,
        "nonprofit": nonprofit_score,
    }
    # max() returns the first maximal key, so ties resolve in the dict's
    # insertion order (b2c, b2b, b2g, nonprofit).
    best = max(scores, key=lambda name: scores[name])
    best_val = scores[best]

    if best_val >= 2:
        profile.business_type = best
        # Negative scores are clamped so they cannot inflate the ratio.
        # total >= best_val >= 2 here, so the division is always safe.
        total = sum(max(0, v) for v in scores.values())
        profile.confidence = round(best_val / total, 2)
    else:
        # Fallback: prefer B2C when the text mentions Verbraucherrechte,
        # editorial content, or consumer-direction signals — even without
        # checkout keywords. Only fall back to B2B if discriminative B2B
        # markers fired (which the keyword list already filtered to
        # genuinely B2B-only terms).
        consumer_hint = (
            "verbraucher" in full_text
            or "widerruf" in full_text
            or "kunde" in full_text
            or profile.has_editorial_content
        )
        if b2b_score >= 1 and not consumer_hint:
            profile.business_type = "b2b"
            profile.confidence = 0.4
        elif consumer_hint:
            profile.business_type = "b2c"
            profile.confidence = 0.4
        else:
            profile.business_type = "unknown"
            profile.confidence = 0.2

    # ── ODR (Online-Streitbeilegung) ─────────────────────────────
    # Required for B2C with online shop (EU Regulation 524/2013)
    profile.needs_odr = (
        profile.business_type == "b2c" and profile.has_online_shop
    )

    # ── Industry ─────────────────────────────────────────────────
    industry_scores: dict[str, int] = {}
    for industry, keywords in _INDUSTRY_KEYWORDS.items():
        hits = _count_hits(full_text, keywords)
        if hits >= 1:
            industry_scores[industry] = hits

    # Suppress finance/insurance false positives caused by §34d/§34c GewO
    # disclosures (Versicherungsvermittler, Berufshaftpflicht, etc.) — these
    # are mandatory disclosures for many companies (e.g. BMW AG) without
    # them being actual financial services providers.
    if industry_scores.get("finance"):
        vermittler_hits = _count_hits(full_text, _VERMITTLER_CONTEXT_TERMS)
        if vermittler_hits >= 2:
            # Count only finance evidence that lies outside the boilerplate;
            # drop finance if there is none, otherwise shrink the score.
            non_insurance_finance = _finance_hits_outside_vermittler_context(
                full_text
            )
            if non_insurance_finance == 0:
                industry_scores.pop("finance", None)
            else:
                industry_scores["finance"] = non_insurance_finance

    # Require a clear winner — if top score is 1 and there are ties, prefer
    # "unknown" over guessing.
    if industry_scores:
        top = max(industry_scores.values())
        winners = [k for k, v in industry_scores.items() if v == top]
        if top >= 2 or len(winners) == 1:
            profile.industry = winners[0]
        else:
            profile.industry = "unknown"
    elif profile.is_regulated_profession:
        # No industry keyword fired at all: derive the industry from the
        # regulated profession where a mapping exists.
        prof_map = {
            "anwalt": "legal",
            "arzt": "healthcare",
            "steuerberater": "finance",
            "architekt": "craft",
        }
        profile.industry = prof_map.get(profile.regulated_profession_type, "unknown")

    return profile