Files
breakpilot-compliance/backend-compliance/compliance/services/business_profiler.py
T
Benjamin Admin c702260ec1
Build + Deploy / build-backend-compliance (push) Successful in 23s
Build + Deploy / build-ai-sdk (push) Successful in 13s
Build + Deploy / build-admin-compliance (push) Successful in 13s
Build + Deploy / build-developer-portal (push) Successful in 14s
Build + Deploy / build-tts (push) Successful in 15s
Build + Deploy / build-document-crawler (push) Successful in 13s
CI / secret-scan (push) Has been skipped
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go (push) Successful in 39s
Build + Deploy / build-dsms-gateway (push) Successful in 15s
Build + Deploy / build-dsms-node (push) Successful in 14s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / loc-budget (push) Failing after 15s
CI / nodejs-build (push) Successful in 2m26s
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / test-python-backend (push) Successful in 39s
CI / test-python-document-crawler (push) Successful in 25s
CI / test-python-dsms-gateway (push) Successful in 22s
CI / validate-canonical-controls (push) Successful in 15s
Build + Deploy / trigger-orca (push) Successful in 2m28s
fix: 5 regex bugs + text extraction scroll + GT update
Root cause: Spiegel DSI text was truncated (lazy-loading) — the
rights/DSB/complaints sections at the bottom were never extracted.

Fixes:
1. Text extraction: scroll to bottom before innerText (dsi_discovery.py)
2. V.i.S.d.P.: add "verantwortlicher i.s.v." + "§18 Abs. N MStV" pattern
3. USt-IdNr: add "umsatzsteuer-id" + "DE 212 442 423" (with spaces)
4. Profiler: remove generic "anwalt"/"praxis" (false positive on Spiegel
   "Redaktionsanwalt"), keep only "rechtsanwalt", "kanzlei" etc.
5. Section splitter: auto_fill_from_dsi() fills empty Cookie/Social-Media
   rows from sections found in the DSI text

Ground Truth 06-spiegel.md fully rewritten with verified data from
live website — 3 L1 False Negatives identified (DSB, Beschwerderecht,
Betroffenenrechte all present on website but not in extracted text).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-13 01:20:55 +02:00

248 lines
9.8 KiB
Python

"""
Business Profiler — detect business model from document texts.

Pure keyword-based detection (deterministic, no LLM). Analyzes
DSE (privacy policy), Impressum (legal notice), AGB (terms and
conditions), Widerruf (withdrawal policy) etc. together to build a
profile that drives context-aware compliance checks.

Example:
    profile = await detect_business_profile({"dse": "...", "impressum": "..."})
    profile.business_type    # "b2c"
    profile.has_online_shop  # True
"""
from __future__ import annotations
import re
from dataclasses import dataclass, field
@dataclass
class BusinessProfile:
    """Result of the keyword-based business-model detection.

    Every field has a neutral default, so a freshly constructed instance
    represents "nothing detected".

    Attributes:
        business_type: One of "b2b", "b2c", "b2g", "nonprofit", "unknown".
        industry: Coarse industry label such as "it_services", "retail",
            "healthcare", "legal", "craft", "public", or "unknown".
        has_online_shop: Whether the texts look like an online shop.
        has_editorial_content: Whether the texts mention editorial content.
        is_regulated_profession: Whether a regulated profession was detected.
        regulated_profession_type: Detected profession, e.g. "arzt", "anwalt",
            "steuerberater", "architekt"; empty string when none.
        needs_odr: Whether online dispute resolution (Online-Streitbeilegung)
            information is required.
        detected_services: Display names of third-party services found.
        confidence: Confidence score attached to ``business_type``.
    """

    # Classification
    business_type: str = "unknown"
    industry: str = "unknown"

    # Feature flags
    has_online_shop: bool = False
    has_editorial_content: bool = False
    is_regulated_profession: bool = False
    regulated_profession_type: str = ""
    needs_odr: bool = False

    # Each instance gets its own list (default_factory), never a shared one.
    detected_services: list[str] = field(default_factory=list)
    confidence: float = 0.0
# ── Keyword lists ────────────────────────────────────────────────────
# All keywords are lowercase because the merged document text is lowercased
# before matching. Words with umlauts are listed in both the umlaut spelling
# and the ASCII transliteration (ae/oe/ue).

# Consumer-facing (B2C) signals.
_B2C_KEYWORDS = [
    "verbraucher", "warenkorb", "bestellung", "lieferung", "widerruf",
    "shop", "kaufpreis", "rueckgabe", "rückgabe", "endkunde", "kaeufer",
    "käufer", "privatkunde", "zahlungspflichtig bestellen",
]
# Business-customer (B2B) signals.
_B2B_KEYWORDS = [
    "unternehmen", "geschaeftskunden", "geschäftskunden", "gewerblich",
    "auftraggeber", "auftragnehmer", "geschaeftspartner",
    "geschäftspartner", "firmenkunde", "b2b", "industriekunden",
    "beratung", "consulting", "dienstleistung", "engineering",
]
# Public-sector (B2G) signals.
_B2G_KEYWORDS = [
    "koerperschaft des oeffentlichen rechts", "körperschaft des öffentlichen rechts",
    "gemeinde", "stadtverwaltung", "landesbehoerde", "landesbehörde",
    "kommunal", "buergerservice", "bürgerservice", "rathaus",
    "landesamt", "bundesamt", "oeffentliche verwaltung", "öffentliche verwaltung",
    "oeffentlicher dienst", "öffentlicher dienst",
]
# Non-profit signals.
_NONPROFIT_KEYWORDS = [
    "gemeinnuetzig", "gemeinnützig", "verein", "stiftung", "e.v.",
    "spende", "ehrenamtlich", "satzung",
]
# Keyword -> regulated profession type. Detection iterates in insertion order
# and stops at the first key found in the text, so earlier entries win.
_REGULATED_PROFESSIONS = {
    # Lawyer — only specific terms, not bare "anwalt"
    # (it would otherwise match "Redaktionsanwalt", "Justiziar" etc.)
    "rechtsanwalt": "anwalt",
    "rechtsanwaeltin": "anwalt",
    "rechtsanwältin": "anwalt",
    "kanzlei": "anwalt",
    "rechtsanwaltskammer": "anwalt",
    "zugelassener anwalt": "anwalt",
    # Physician — "praxis" removed (it matches "in der Praxis")
    "arztpraxis": "arzt",
    "zahnarzt": "arzt",
    "facharzt": "arzt",
    "aerztekammer": "arzt",
    "ärztekammer": "arzt",
    "kassenärztlich": "arzt",
    "kassenaerztlich": "arzt",
    # Tax advisor
    "steuerberater": "steuerberater",
    "steuerberaterin": "steuerberater",
    "steuerberaterkammer": "steuerberater",
    # Architect
    "architekt": "architekt",
    "architektin": "architekt",
    "architektenkammer": "architekt",
    # Notary
    "notar": "notar",
    "notariat": "notar",
    # Pharmacist
    "apotheke": "apotheker",
    "apotheker": "apotheker",
}
# Online-shop signals (cart, checkout, payment and shipping vocabulary).
_ONLINE_SHOP_KEYWORDS = [
    "warenkorb", "checkout", "bestellung", "lieferung", "versand",
    "paypal", "kreditkarte", "klarna", "sofortueberweisung",
    "sofortüberweisung", "zahlungsarten", "versandkosten",
    "lieferzeit", "retour", "paketdienst",
]
# Editorial-content signals.
_EDITORIAL_KEYWORDS = [
    "blog", "ratgeber", "news", "redaktion", "artikel", "magazin",
    "beitrag", "kommentar", "podcast", "newsletter", "autor",
]
# Industry label -> keywords; the industry with the most keyword hits is chosen.
_INDUSTRY_KEYWORDS = {
    "it_services": ["software", "saas", "cloud", "hosting", "api", "plattform"],
    "retail": ["shop", "warenkorb", "versand", "lieferung", "einzelhandel"],
    "healthcare": ["arzt", "praxis", "patient", "gesundheit", "therapie", "klinik"],
    "legal": ["kanzlei", "rechtsanwalt", "mandant", "anwalt"],
    "craft": ["handwerk", "meister", "werkstatt", "montage", "gewerk"],
    "public": ["kommune", "stadtverwaltung", "buergerservice", "bürgerservice", "rathaus"],
    "finance": ["bank", "versicherung", "finanz", "kredit", "anlage"],
    "education": ["schule", "bildung", "unterricht", "lehrplan", "schueler", "schüler"],
    "consulting": ["beratung", "consulting", "schulung", "seminar", "gutachten", "audit",
                   "arbeitssicherheit", "brandschutz", "sicherheitstechnik", "zertifizierung"],
    "manufacturing": ["fertigung", "produktion", "maschinenbau", "anlagenbau", "zulieferer",
                      "werkzeugbau", "spritzguss", "cnc", "industrietechnik"],
    "media": ["redaktion", "verlag", "medien", "journalismus", "presse"],
}
# Lowercase search pattern -> display name of a third-party service.
_TRACKING_SERVICES = {
    "google analytics": "Google Analytics",
    "google tag manager": "Google Tag Manager",
    "matomo": "Matomo",
    "facebook pixel": "Facebook Pixel",
    "meta pixel": "Meta Pixel",
    "hotjar": "Hotjar",
    "hubspot": "HubSpot",
    "mailchimp": "Mailchimp",
    "linkedin insight": "LinkedIn Insight",
    "google ads": "Google Ads",
    "google adsense": "Google AdSense",
    "google maps": "Google Maps",
    "youtube": "YouTube",
    "vimeo": "Vimeo",
    "cloudflare": "Cloudflare",
    "sentry": "Sentry",
    "intercom": "Intercom",
    "zendesk": "Zendesk",
    "stripe": "Stripe",
    "paypal": "PayPal",
}
# ── Detection logic ──────────────────────────────────────────────────
def _count_hits(text: str, keywords: list[str]) -> int:
    """Count the entries of ``keywords`` that occur as a substring of ``text``.

    Each list entry contributes at most 1, no matter how often it appears
    in the text.
    """
    matched = 0
    for keyword in keywords:
        if keyword in text:
            matched += 1
    return matched
# Soft hyphens are dropped and non-breaking spaces become plain spaces, so
# that hyphenated words and multi-word keywords ("google analytics") still
# match text extracted from HTML.
_TEXT_CLEANUP = str.maketrans({"\xad": None, "\xa0": " "})

# Legal-form markers for the B2B fallback. The long forms are distinctive
# enough for substring search; the two-letter forms must stand alone as a
# word, otherwise "ag" fires on "vertrag"/"tag" and "ug" on "bezug", while
# "Muster AG." or an "AG" at the end of a line would be missed.
_COMPANY_FORM_RE = re.compile(r"gmbh|ohg|gbr|\b(?:ag|kg|ug)\b")

# Industry used for a regulated profession when no industry keyword matched.
_PROFESSION_INDUSTRY = {
    "anwalt": "legal",
    "arzt": "healthcare",
    "steuerberater": "finance",
    "architekt": "craft",
    "notar": "legal",
    "apotheker": "healthcare",
}


def _merge_texts(documents: dict[str, str]) -> str:
    """Join all document texts into one lowercase, cleaned-up search string."""
    # Skip None/empty texts instead of letting str.join raise on them.
    parts = [text for text in documents.values() if text]
    return "\n".join(parts).lower().translate(_TEXT_CLEANUP)


def _detect_labels(text: str, patterns: dict[str, str]) -> list[str]:
    """Return the labels of all patterns found in ``text``, in mapping order.

    Patterns are tried longest first and every match is masked out, so a
    pattern that is a prefix of a longer one ("google ads" vs.
    "google adsense") is only reported when it also occurs on its own.
    """
    found: set[str] = set()
    remaining = text
    for pattern in sorted(patterns, key=len, reverse=True):
        if pattern in remaining:
            found.add(pattern)
            # Mask with a newline: no pattern contains one, so masking can
            # never create a new match.
            remaining = remaining.replace(pattern, "\n")
    return [label for pattern, label in patterns.items() if pattern in found]


def _mentions_company_form(text: str) -> bool:
    """Tell whether lowercase ``text`` names a company legal form (GmbH, AG, ...)."""
    return _COMPANY_FORM_RE.search(text) is not None


async def detect_business_profile(documents: dict[str, str]) -> BusinessProfile:
    """Analyze all document texts together to detect the business model.

    The function performs keyword matching only and awaits nothing; it is
    declared ``async`` and therefore has to be awaited by callers.

    Args:
        documents: dict mapping doc_type -> text
            (e.g. {"dse": "...", "impressum": "..."}). Entries whose text is
            None or empty are ignored for keyword matching.

    Returns:
        A BusinessProfile; all defaults when ``documents`` is empty.
    """
    profile = BusinessProfile()
    if not documents:
        return profile

    # Merge all texts for keyword search
    full_text = _merge_texts(documents)

    # ── Tracking services ────────────────────────────────────────
    profile.detected_services.extend(_detect_labels(full_text, _TRACKING_SERVICES))

    # ── Online shop ──────────────────────────────────────────────
    profile.has_online_shop = _count_hits(full_text, _ONLINE_SHOP_KEYWORDS) >= 3

    # ── Editorial content ────────────────────────────────────────
    profile.has_editorial_content = _count_hits(full_text, _EDITORIAL_KEYWORDS) >= 2

    # ── Regulated profession ─────────────────────────────────────
    # First keyword (in table order) found in the text decides the type.
    for keyword, prof_type in _REGULATED_PROFESSIONS.items():
        if keyword in full_text:
            profile.is_regulated_profession = True
            profile.regulated_profession_type = prof_type
            break

    # ── Business type ────────────────────────────────────────────
    b2c_score = _count_hits(full_text, _B2C_KEYWORDS)
    # Missing documents as signal
    if "agb" not in documents:
        b2c_score -= 1  # No AGB → less likely B2C
    if "widerruf" not in documents:
        b2c_score -= 1  # No Widerruf → less likely B2C shop
    if profile.has_online_shop:
        b2c_score += 3  # Strong B2C signal

    # Insertion order doubles as tie-break: max() keeps the first maximum.
    scores = {
        "b2c": b2c_score,
        "b2b": _count_hits(full_text, _B2B_KEYWORDS),
        "b2g": _count_hits(full_text, _B2G_KEYWORDS),
        "nonprofit": _count_hits(full_text, _NONPROFIT_KEYWORDS),
    }
    best = max(scores, key=scores.__getitem__)
    best_val = scores[best]

    if best_val >= 2:
        profile.business_type = best
        # Negative scores do not dilute the share; total >= best_val >= 2,
        # so the division is always defined.
        total = sum(max(0, value) for value in scores.values())
        profile.confidence = round(best_val / total, 2)
    elif _mentions_company_form(full_text) and b2c_score <= 0:
        # Fallback: GmbH/AG without B2C signals → assume B2B
        profile.business_type = "b2b"
        profile.confidence = 0.4
    else:
        profile.business_type = "unknown"
        profile.confidence = 0.2

    # ── ODR (Online-Streitbeilegung) ─────────────────────────────
    # Required for B2C with online shop (EU Regulation 524/2013)
    profile.needs_odr = profile.business_type == "b2c" and profile.has_online_shop

    # ── Industry ─────────────────────────────────────────────────
    industry_scores: dict[str, int] = {}
    for industry, keywords in _INDUSTRY_KEYWORDS.items():
        hits = _count_hits(full_text, keywords)
        if hits >= 1:
            industry_scores[industry] = hits

    if industry_scores:
        profile.industry = max(industry_scores, key=industry_scores.__getitem__)
    elif profile.is_regulated_profession:
        profile.industry = _PROFESSION_INDUSTRY.get(
            profile.regulated_profession_type, "unknown"
        )

    return profile