# breakpilot-compliance/backend-compliance/compliance/services/business_profiler.py

"""
Business Profiler — detect business model from document texts.

Pure keyword-based detection (deterministic, no LLM). Analyzes
DSE, Impressum, AGB, Widerruf etc. together to build a profile
that drives context-aware compliance checks.

Example:
    profile = await detect_business_profile({"dse": "...", "impressum": "..."})
    profile.business_type  # "b2c"
    profile.has_online_shop  # True
"""

from __future__ import annotations

import re
from dataclasses import dataclass, field


@dataclass
class BusinessProfile:
    """Result of the keyword-based business-model detection.

    A fresh instance carries neutral defaults ("unknown", False, empty);
    ``detect_business_profile`` fills the fields from the document texts.
    """

    # One of: b2c, b2b, b2g, nonprofit, unknown
    business_type: str = "unknown"
    # A key of _INDUSTRY_KEYWORDS (it_services, retail, healthcare, legal,
    # craft, public, finance, education, consulting, manufacturing,
    # automotive, media) or "unknown"
    industry: str = "unknown"
    # True when enough shop/checkout keywords were found
    has_online_shop: bool = False
    # True when enough blog/news/editorial keywords were found
    has_editorial_content: bool = False
    # True when a regulated-profession keyword matched
    is_regulated_profession: bool = False
    # A value of _REGULATED_PROFESSIONS (anwalt, arzt, steuerberater,
    # architekt, notar, apotheker) or "" when none matched
    regulated_profession_type: str = ""
    # Online-Streitbeilegung (online dispute resolution); set for B2C
    # profiles that also have an online shop
    needs_odr: bool = False
    # Display names of third-party services found in the texts
    detected_services: list[str] = field(default_factory=list)
    # Confidence of business_type; 0.0 until detection has run
    confidence: float = 0.0


# ── Keyword lists ────────────────────────────────────────────────────

# Consumer-facing (B2C) markers. Each entry that occurs as a substring of the
# lower-cased merged document text adds one point to the B2C score. Umlaut
# words are listed in both their "ae/ue" transliteration and native spelling.
_B2C_KEYWORDS = [
    "verbraucher", "warenkorb", "bestellung", "lieferung", "widerruf",
    "shop", "kaufpreis", "rueckgabe", "rückgabe", "endkunde", "kaeufer",
    "käufer", "privatkunde", "zahlungspflichtig bestellen",
]

# Business-customer (B2B) markers. Each entry found as a substring of the
# lower-cased merged text adds one point to the B2B score.
_B2B_KEYWORDS = [
    # Discriminative — these don't appear in B2C consumer texts
    "geschaeftskunden", "geschäftskunden", "firmenkunde", "b2b",
    "industriekunden", "ausschliesslich gewerblich", "ausschließlich gewerblich",
    "ausschliesslich unternehmer", "ausschließlich unternehmer",
    "kein verbrauchergeschaeft", "kein verbrauchergeschäft",
    # Note: "unternehmen", "beratung", "consulting", "dienstleistung"
    # were removed — they match in any company text and bias toward B2B.
]

# Public-sector (B2G) markers. Each entry found as a substring of the
# lower-cased merged text adds one point to the B2G score.
_B2G_KEYWORDS = [
    "koerperschaft des oeffentlichen rechts", "körperschaft des öffentlichen rechts",
    "gemeinde", "stadtverwaltung", "landesbehoerde", "landesbehörde",
    "kommunal", "buergerservice", "bürgerservice", "rathaus",
    "landesamt", "bundesamt", "oeffentliche verwaltung", "öffentliche verwaltung",
    "oeffentlicher dienst", "öffentlicher dienst",
]

# Non-profit markers. Each entry found as a substring of the lower-cased
# merged text adds one point to the nonprofit score.
# NOTE(review): matching is by substring, so "verein" also matches inside
# words such as "vereinbarung" — confirm this is acceptable.
_NONPROFIT_KEYWORDS = [
    "gemeinnuetzig", "gemeinnützig", "verein", "stiftung", "e.v.",
    "spende", "ehrenamtlich", "satzung",
]

# Maps a lower-case keyword to the regulated-profession type it indicates.
# The detection loop walks this dict in insertion order and stops at the
# first keyword that matches, so earlier entries take precedence.
_REGULATED_PROFESSIONS = {
    # Lawyer — only specific terms, not the bare word "anwalt"
    # (which would otherwise also match "Redaktionsanwalt", "Justiziar", etc.)
    "rechtsanwalt": "anwalt",
    "rechtsanwaeltin": "anwalt",
    "rechtsanwältin": "anwalt",
    "kanzlei": "anwalt",
    "rechtsanwaltskammer": "anwalt",
    "zugelassener anwalt": "anwalt",
    # Physician — "praxis" was removed (it matches the phrase "in der Praxis")
    "arztpraxis": "arzt",
    "zahnarzt": "arzt",
    "facharzt": "arzt",
    "aerztekammer": "arzt",
    "ärztekammer": "arzt",
    "kassenärztlich": "arzt",
    "kassenaerztlich": "arzt",
    # Tax advisor
    "steuerberater": "steuerberater",
    "steuerberaterin": "steuerberater",
    "steuerberaterkammer": "steuerberater",
    # Architect
    "architekt": "architekt",
    "architektin": "architekt",
    "architektenkammer": "architekt",
    # Notary
    "notar": "notar",
    "notariat": "notar",
    # Pharmacist
    "apotheke": "apotheker",
    "apotheker": "apotheker",
}

# Checkout / shipping / payment markers. The profile is flagged as having an
# online shop when at least three distinct entries occur in the merged text.
_ONLINE_SHOP_KEYWORDS = [
    "warenkorb", "checkout", "bestellung", "lieferung", "versand",
    "paypal", "kreditkarte", "klarna", "sofortueberweisung",
    "sofortüberweisung", "zahlungsarten", "versandkosten",
    "lieferzeit", "retour", "paketdienst",
]

# Editorial-content markers. The profile is flagged as having editorial
# content when at least two distinct entries occur in the merged text.
# NOTE(review): matching is by substring and "news" is contained in
# "newsletter", so a text that only mentions "newsletter" already scores two
# hits and meets the threshold — confirm this is intended.
_EDITORIAL_KEYWORDS = [
    "blog", "ratgeber", "news", "redaktion", "artikel", "magazin",
    "beitrag", "kommentar", "podcast", "newsletter", "autor",
]

# Industry label -> keywords that vote for it. An industry's score is the
# number of its keywords that occur in the merged text. When several
# industries tie on a top score of two or more, the one listed first here
# wins, so the order of entries is significant.
# NOTE(review): matching is by substring, so short entries also match inside
# longer words (e.g. "api" inside "therapie", "bank" inside "datenbank").
_INDUSTRY_KEYWORDS = {
    "it_services": ["software", "saas", "cloud", "hosting", "api", "plattform"],
    "retail": ["shop", "warenkorb", "versand", "lieferung", "einzelhandel"],
    "healthcare": ["arzt", "praxis", "patient", "gesundheit", "therapie", "klinik"],
    "legal": ["kanzlei", "rechtsanwalt", "mandant", "anwalt"],
    "craft": ["handwerk", "meister", "werkstatt", "montage", "gewerk"],
    "public": ["kommune", "stadtverwaltung", "buergerservice", "bürgerservice", "rathaus"],
    "finance": ["bank", "versicherung", "finanz", "kredit", "anlage"],
    "education": ["schule", "bildung", "unterricht", "lehrplan", "schueler", "schüler"],
    "consulting": ["beratung", "consulting", "schulung", "seminar", "gutachten", "audit",
                    "arbeitssicherheit", "brandschutz", "sicherheitstechnik", "zertifizierung"],
    "manufacturing": ["fertigung", "produktion", "maschinenbau", "anlagenbau", "zulieferer",
                       "werkzeugbau", "spritzguss", "cnc", "industrietechnik"],
    "automotive": ["fahrzeug", "kraftfahrzeug", "kfz", "automobil", "neuwagen",
                    "gebrauchtwagen", "konfigurator", "modellreihe", "modellpalette"],
    "media": ["redaktion", "verlag", "medien", "journalismus", "presse"],
}

# Terms indicating that "versicherung" / "bank" is only mentioned as part of
# a §34d/§34c GewO disclosure (Versicherungsvermittler, i.e. insurance
# intermediary / Finanzanlagenvermittler, i.e. financial-investment
# intermediary) rather than describing the core business. When at least two
# of these occur in the text, the finance industry score is reduced or
# dropped to suppress false finance matches.
_VERMITTLER_CONTEXT_TERMS = [
    "versicherungsvermittler", "berufshaftpflichtversicherung",
    "vermittlerregister", "§34d", "§ 34 d", "§34c", "§ 34 c",
    "finanzanlagenvermittler", "ihk muenchen", "ihk münchen",
]

# Fallback table: lower-case search pattern -> display name of the service.
# Only consulted when the dedicated service detector cannot be imported or
# raises; every pattern found as a substring of the merged text contributes
# its display name to the detected services.
_TRACKING_SERVICES = {
    "google analytics": "Google Analytics",
    "google tag manager": "Google Tag Manager",
    "matomo": "Matomo",
    "facebook pixel": "Facebook Pixel",
    "meta pixel": "Meta Pixel",
    "hotjar": "Hotjar",
    "hubspot": "HubSpot",
    "mailchimp": "Mailchimp",
    "linkedin insight": "LinkedIn Insight",
    "google ads": "Google Ads",
    "google adsense": "Google AdSense",
    "google maps": "Google Maps",
    "youtube": "YouTube",
    "vimeo": "Vimeo",
    "cloudflare": "Cloudflare",
    "sentry": "Sentry",
    "intercom": "Intercom",
    "zendesk": "Zendesk",
    "stripe": "Stripe",
    "paypal": "PayPal",
}


# ── Detection logic ──────────────────────────────────────────────────

def _count_hits(text: str, keywords: list[str]) -> int:
    return sum(1 for kw in keywords if kw in text)


def _detect_services(full_text: str) -> list[str]:
    """Return display names of third-party services mentioned in *full_text*."""
    try:
        from compliance.services.service_detector import detect_services_in_text
        return [svc["name"] for svc in detect_services_in_text(full_text)]
    except Exception:
        # Deliberate best-effort: if the full detector is unavailable or
        # fails, fall back to the simple built-in keyword table.
        return [
            label
            for pattern, label in _TRACKING_SERVICES.items()
            if pattern in full_text
        ]


def _detect_regulated_profession(impressum_text: str) -> str:
    """Return the regulated-profession type found in *impressum_text*, or ``""``."""
    for keyword, prof_type in _REGULATED_PROFESSIONS.items():
        if keyword not in impressum_text:
            continue
        # Extra guard: a lawyer title must appear in the first 500 chars
        # (company description area), not merely further down as the title
        # of a contact person.
        if (
            keyword in ("rechtsanwalt", "rechtsanwaeltin", "rechtsanwältin")
            and keyword not in impressum_text[:500]
        ):
            continue
        return prof_type
    return ""


def _classify_business_type(
    full_text: str,
    documents: dict[str, str],
    has_online_shop: bool,
    has_editorial_content: bool,
) -> tuple[str, float]:
    """Return ``(business_type, confidence)`` from keyword scores and signals."""
    b2c_score = _count_hits(full_text, _B2C_KEYWORDS)
    # Missing document types are a (weak) signal against B2C. Only the
    # presence of the key is checked, not whether its text is non-empty.
    if "agb" not in documents:
        b2c_score -= 1  # No AGB → less likely B2C
    if "widerruf" not in documents:
        b2c_score -= 1  # No Widerruf → less likely B2C shop
    if has_online_shop:
        b2c_score += 3  # Strong B2C signal

    scores = {
        "b2c": b2c_score,
        "b2b": _count_hits(full_text, _B2B_KEYWORDS),
        "b2g": _count_hits(full_text, _B2G_KEYWORDS),
        "nonprofit": _count_hits(full_text, _NONPROFIT_KEYWORDS),
    }
    # On ties, max() keeps the first key in insertion order (b2c first).
    best = max(scores, key=scores.__getitem__)
    best_val = scores[best]

    if best_val >= 2:
        # Negative scores are clamped so they cannot inflate the ratio;
        # total >= best_val >= 2, so the division is always safe.
        total = sum(max(0, value) for value in scores.values())
        return best, round(best_val / total, 2)

    # Weak evidence: prefer B2C when the text mentions consumer rights,
    # editorial content, or other consumer-direction signals — even without
    # checkout keywords. Fall back to B2B only if a discriminative B2B
    # marker fired and no consumer hint is present.
    consumer_hint = (
        "verbraucher" in full_text
        or "widerruf" in full_text
        or "kunde" in full_text
        or has_editorial_content
    )
    if consumer_hint:
        return "b2c", 0.4
    if scores["b2b"] >= 1:
        return "b2b", 0.4
    return "unknown", 0.2


def _detect_industry(full_text: str, regulated_profession_type: str) -> str:
    """Return the industry label for *full_text*, or ``"unknown"``."""
    industry_scores: dict[str, int] = {}
    for industry, keywords in _INDUSTRY_KEYWORDS.items():
        hits = _count_hits(full_text, keywords)
        if hits >= 1:
            industry_scores[industry] = hits

    # Suppress finance/insurance false positives caused by §34d/§34c GewO
    # disclosures (Versicherungsvermittler, Berufshaftpflicht, etc.) — these
    # are mandatory disclosures for many companies (e.g. BMW AG) that are not
    # actual financial services providers.
    if (
        industry_scores.get("finance")
        and _count_hits(full_text, _VERMITTLER_CONTEXT_TERMS) >= 2
    ):
        # Blank out the disclosure terms before re-counting: otherwise
        # "finanzanlagenvermittler" itself would count as "finanz" and
        # "anlage" evidence. Longest terms go first so that a shorter term
        # cannot split a longer one and leave a finance fragment behind.
        residual_text = full_text
        for term in sorted(_VERMITTLER_CONTEXT_TERMS, key=len, reverse=True):
            residual_text = residual_text.replace(term, " ")
        non_insurance_finance = _count_hits(
            residual_text, ["bank", "finanz", "kredit", "anlage"],
        )
        if non_insurance_finance == 0:
            industry_scores.pop("finance", None)
        else:
            industry_scores["finance"] = non_insurance_finance

    if industry_scores:
        # Require a clear winner — if the top score is 1 and there are ties,
        # prefer "unknown" over guessing.
        top = max(industry_scores.values())
        winners = [name for name, score in industry_scores.items() if score == top]
        if top >= 2 or len(winners) == 1:
            return winners[0]
        return "unknown"

    # No industry keyword matched at all: derive the industry from the
    # regulated profession, if one was detected ("" maps to "unknown").
    prof_map = {
        "anwalt": "legal",
        "notar": "legal",
        "arzt": "healthcare",
        "apotheker": "healthcare",
        "steuerberater": "finance",
        "architekt": "craft",
    }
    return prof_map.get(regulated_profession_type, "unknown")


async def detect_business_profile(documents: dict[str, str]) -> BusinessProfile:
    """Analyze all document texts together to detect the business model.

    Detection is purely keyword-based and deterministic. The coroutine does
    not await anything itself; it is ``async`` for its callers' sake.

    Args:
        documents: dict mapping doc_type -> text
            (e.g. {"dse": "...", "impressum": "..."}). A value of ``None``
            is treated as empty text.

    Returns:
        A populated ``BusinessProfile``; a default (all "unknown") profile
        when *documents* is empty.
    """
    profile = BusinessProfile()
    if not documents:
        return profile

    # Merge all texts for keyword search; lower-case and strip soft hyphens
    # so hyphenated words still match their keywords.
    full_text = "\n".join(text or "" for text in documents.values())
    full_text = full_text.lower().replace("\xad", "")

    # ── Tracking services ────────────────────────────────────────
    profile.detected_services = _detect_services(full_text)

    # ── Online shop / editorial content ──────────────────────────
    profile.has_online_shop = _count_hits(full_text, _ONLINE_SHOP_KEYWORDS) >= 3
    profile.has_editorial_content = _count_hits(full_text, _EDITORIAL_KEYWORDS) >= 2

    # ── Regulated profession ─────────────────────────────────────
    # Only check the impressum text (not the full text) — keywords like
    # "rechtsanwalt" appear as contact persons in privacy texts, which does
    # not mean the company IS a law firm.
    impressum_text = (documents.get("impressum") or "").lower().replace("\xad", "")
    if not impressum_text:
        impressum_text = full_text[:2000]  # Fallback: first 2000 chars
    profile.regulated_profession_type = _detect_regulated_profession(impressum_text)
    profile.is_regulated_profession = bool(profile.regulated_profession_type)

    # ── Business type ────────────────────────────────────────────
    profile.business_type, profile.confidence = _classify_business_type(
        full_text,
        documents,
        profile.has_online_shop,
        profile.has_editorial_content,
    )

    # ── ODR (Online-Streitbeilegung) ─────────────────────────────
    # Required for B2C with online shop (EU Regulation 524/2013)
    profile.needs_odr = (
        profile.business_type == "b2c" and profile.has_online_shop
    )

    # ── Industry ─────────────────────────────────────────────────
    profile.industry = _detect_industry(
        full_text, profile.regulated_profession_type,
    )

    return profile