feat: Unified Compliance-Check — 8 document types in one form
New 3-tab structure: Website-Scan, Compliance-Check, Banner-Check.

Compliance-Check Tab (replaces Dokumenten-Pruefung + Impressum-Check):
- 8 document rows: DSI, Impressum, Social Media, Cookie, AGB, Nutzungsbedingungen, Widerruf, DSB-Kontakt
- Each row: URL input + "Text laden" + file upload + manual text
- "Text laden" extracts via consent-tester, shows in editable textarea
- User verifies/corrects text before checking
- Empty fields = "not present" → own finding

Business Profiler (business_profiler.py):
- Detects B2B/B2C/B2G from all documents together
- Recognizes regulated professions, online shops, editorial content
- Context-aware: INFO checks become PASS/FAIL based on profile

Backend: /compliance-check + /extract-text endpoints
Frontend: ComplianceCheckTab.tsx + DocumentRow.tsx
API proxies: compliance-check/route.ts + extract-text/route.ts

Also: Impressum regex fixes (Telefon, AG, Geschaeftsfuehrung) and INFO severity for context-dependent checks.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,223 @@
|
||||
"""
|
||||
Business Profiler — detect business model from document texts.
|
||||
|
||||
Pure keyword-based detection (deterministic, no LLM). Analyzes
|
||||
DSE, Impressum, AGB, Widerruf etc. together to build a profile
|
||||
that drives context-aware compliance checks.
|
||||
|
||||
Example:
|
||||
profile = await detect_business_profile({"dse": "...", "impressum": "..."})
|
||||
profile.business_type # "b2c"
|
||||
profile.has_online_shop # True
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
@dataclass
class BusinessProfile:
    """Outcome of the keyword-based business-model detection.

    Every field defaults to its "nothing detected" value, so a bare
    ``BusinessProfile()`` represents an empty profile.
    """

    # One of: b2b, b2c, b2g, nonprofit, unknown
    business_type: str = "unknown"

    # One of: it_services, retail, healthcare, legal, craft, public, unknown
    industry: str = "unknown"

    has_online_shop: bool = False
    has_editorial_content: bool = False

    is_regulated_profession: bool = False
    # One of: arzt, anwalt, steuerberater, architekt, "" (empty = none)
    regulated_profession_type: str = ""

    # Online-Streitbeilegung (online dispute resolution)
    needs_odr: bool = False

    # A fresh list per instance; never shared between profiles.
    detected_services: list[str] = field(default_factory=list)

    confidence: float = 0.0
|
||||
|
||||
|
||||
# ── Keyword lists ────────────────────────────────────────────────────
|
||||
|
||||
_B2C_KEYWORDS = [
|
||||
"verbraucher", "warenkorb", "bestellung", "lieferung", "widerruf",
|
||||
"shop", "kaufpreis", "rueckgabe", "rückgabe", "endkunde", "kaeufer",
|
||||
"käufer", "privatkunde", "zahlungspflichtig bestellen",
|
||||
]
|
||||
|
||||
_B2B_KEYWORDS = [
|
||||
"unternehmen", "geschaeftskunden", "geschäftskunden", "gewerblich",
|
||||
"auftrag", "auftraggeber", "auftragnehmer", "geschaeftspartner",
|
||||
"geschäftspartner", "firmenkunde", "b2b",
|
||||
]
|
||||
|
||||
_B2G_KEYWORDS = [
|
||||
"behoerde", "behörde", "koerperschaft", "körperschaft", "oeffentlich",
|
||||
"öffentlich", "gemeinde", "amt", "stadtverwaltung", "landesbehoerde",
|
||||
"landesbehörde", "kommunal",
|
||||
]
|
||||
|
||||
_NONPROFIT_KEYWORDS = [
|
||||
"gemeinnuetzig", "gemeinnützig", "verein", "stiftung", "e.v.",
|
||||
"spende", "ehrenamtlich", "satzung",
|
||||
]
|
||||
|
||||
_REGULATED_PROFESSIONS = {
|
||||
"rechtsanwalt": "anwalt",
|
||||
"anwalt": "anwalt",
|
||||
"anwaeltin": "anwalt",
|
||||
"anwältin": "anwalt",
|
||||
"kanzlei": "anwalt",
|
||||
"rechtsanwaltskammer": "anwalt",
|
||||
"arzt": "arzt",
|
||||
"ärztin": "arzt",
|
||||
"aerztin": "arzt",
|
||||
"praxis": "arzt",
|
||||
"aerztekammer": "arzt",
|
||||
"ärztekammer": "arzt",
|
||||
"steuerberater": "steuerberater",
|
||||
"steuerberaterin": "steuerberater",
|
||||
"steuerberaterkammer": "steuerberater",
|
||||
"architekt": "architekt",
|
||||
"architektin": "architekt",
|
||||
"architektenkammer": "architekt",
|
||||
"notar": "notar",
|
||||
"notariat": "notar",
|
||||
"apotheke": "apotheker",
|
||||
"apotheker": "apotheker",
|
||||
}
|
||||
|
||||
_ONLINE_SHOP_KEYWORDS = [
|
||||
"warenkorb", "checkout", "bestellung", "lieferung", "versand",
|
||||
"paypal", "kreditkarte", "klarna", "sofortueberweisung",
|
||||
"sofortüberweisung", "zahlungsarten", "versandkosten",
|
||||
"lieferzeit", "retour", "paketdienst",
|
||||
]
|
||||
|
||||
_EDITORIAL_KEYWORDS = [
|
||||
"blog", "ratgeber", "news", "redaktion", "artikel", "magazin",
|
||||
"beitrag", "kommentar", "podcast", "newsletter", "autor",
|
||||
]
|
||||
|
||||
_INDUSTRY_KEYWORDS = {
|
||||
"it_services": ["software", "saas", "cloud", "hosting", "server", "api", "app"],
|
||||
"retail": ["shop", "warenkorb", "versand", "lieferung", "einzelhandel"],
|
||||
"healthcare": ["arzt", "praxis", "patient", "gesundheit", "therapie", "klinik"],
|
||||
"legal": ["kanzlei", "rechtsanwalt", "mandant", "anwalt"],
|
||||
"craft": ["handwerk", "meister", "werkstatt", "montage", "gewerk"],
|
||||
"public": ["behoerde", "behörde", "kommune", "verwaltung", "buerger", "bürger"],
|
||||
"finance": ["bank", "versicherung", "finanz", "kredit", "anlage"],
|
||||
"education": ["schule", "bildung", "unterricht", "lehrplan", "schueler", "schüler"],
|
||||
}
|
||||
|
||||
_TRACKING_SERVICES = {
|
||||
"google analytics": "Google Analytics",
|
||||
"google tag manager": "Google Tag Manager",
|
||||
"matomo": "Matomo",
|
||||
"facebook pixel": "Facebook Pixel",
|
||||
"meta pixel": "Meta Pixel",
|
||||
"hotjar": "Hotjar",
|
||||
"hubspot": "HubSpot",
|
||||
"mailchimp": "Mailchimp",
|
||||
"linkedin insight": "LinkedIn Insight",
|
||||
"google ads": "Google Ads",
|
||||
"google adsense": "Google AdSense",
|
||||
"google maps": "Google Maps",
|
||||
"youtube": "YouTube",
|
||||
"vimeo": "Vimeo",
|
||||
"cloudflare": "Cloudflare",
|
||||
"sentry": "Sentry",
|
||||
"intercom": "Intercom",
|
||||
"zendesk": "Zendesk",
|
||||
"stripe": "Stripe",
|
||||
"paypal": "PayPal",
|
||||
}
|
||||
|
||||
|
||||
# ── Detection logic ──────────────────────────────────────────────────
|
||||
|
||||
def _count_hits(text: str, keywords: list[str]) -> int:
|
||||
return sum(1 for kw in keywords if kw in text)
|
||||
|
||||
|
||||
def _has_text(value: object) -> bool:
    """Return True when *value* is a string with non-whitespace content."""
    return isinstance(value, str) and bool(value.strip())


def _detect_tracking_services(text: str) -> list[str]:
    """Return the labels of all services from ``_TRACKING_SERVICES`` named in *text*.

    A pattern that is itself contained in a longer pattern (e.g.
    "google ads" inside "google adsense") is only reported when it also
    occurs outside of that longer pattern, so mentioning AdSense alone does
    not additionally report Google Ads.
    """
    found: list[str] = []
    for pattern, label in _TRACKING_SERVICES.items():
        searchable = text
        # Mask every longer pattern that embeds this one before testing.
        for other in _TRACKING_SERVICES:
            if other != pattern and pattern in other:
                searchable = searchable.replace(other, " ")
        if pattern in searchable:
            found.append(label)
    return found


async def detect_business_profile(documents: dict[str, str]) -> BusinessProfile:
    """Analyze all document texts together to detect the business model.

    Detection is purely keyword based: all texts are merged, lowercased and
    searched for the entries of the module-level keyword tables.

    Args:
        documents: dict mapping doc_type -> text
            (e.g. {"dse": "...", "impressum": "..."}). Entries whose text is
            empty, whitespace-only or not a string are treated as if the
            document had not been supplied.

    Returns:
        A populated ``BusinessProfile``. For an empty ``documents`` mapping
        the default profile (type "unknown", confidence 0.0) is returned.
    """
    profile = BusinessProfile()
    if not documents:
        return profile

    # Only documents that actually carry text count as "present"; an empty
    # form field must not look like a supplied document.
    present = {
        doc_type: text
        for doc_type, text in documents.items()
        if _has_text(text)
    }

    # Merge all texts for keyword search
    full_text = "\n".join(present.values()).lower()
    full_text = full_text.replace("\xad", "")  # strip soft hyphens

    # ── Tracking services ────────────────────────────────────────
    profile.detected_services.extend(_detect_tracking_services(full_text))

    # ── Online shop ──────────────────────────────────────────────
    shop_hits = _count_hits(full_text, _ONLINE_SHOP_KEYWORDS)
    profile.has_online_shop = shop_hits >= 3

    # ── Editorial content ────────────────────────────────────────
    editorial_hits = _count_hits(full_text, _EDITORIAL_KEYWORDS)
    profile.has_editorial_content = editorial_hits >= 2

    # ── Regulated profession ─────────────────────────────────────
    # First keyword (in table order) found in the text decides the type.
    for keyword, prof_type in _REGULATED_PROFESSIONS.items():
        if keyword in full_text:
            profile.is_regulated_profession = True
            profile.regulated_profession_type = prof_type
            break

    # ── Business type ────────────────────────────────────────────
    b2c_score = _count_hits(full_text, _B2C_KEYWORDS)
    b2b_score = _count_hits(full_text, _B2B_KEYWORDS)
    b2g_score = _count_hits(full_text, _B2G_KEYWORDS)
    nonprofit_score = _count_hits(full_text, _NONPROFIT_KEYWORDS)

    # Missing documents as signal
    if "agb" not in present:
        b2c_score -= 1  # No AGB → less likely B2C
    if "widerruf" not in present:
        b2c_score -= 1  # No Widerruf → less likely B2C shop
    if profile.has_online_shop:
        b2c_score += 3  # Strong B2C signal

    scores = {
        "b2c": b2c_score,
        "b2b": b2b_score,
        "b2g": b2g_score,
        "nonprofit": nonprofit_score,
    }
    # On a tie the first entry in dict order wins.
    best = max(scores, key=scores.get)  # type: ignore[arg-type]
    best_val = scores[best]

    if best_val >= 2:
        profile.business_type = best
        # Negative scores are clamped so they cannot inflate the ratio.
        # total >= best_val >= 2 here, so the division is always safe.
        total = sum(max(0, v) for v in scores.values())
        profile.confidence = round(best_val / total, 2)
    else:
        profile.business_type = "unknown"
        profile.confidence = 0.2

    # ── ODR (Online-Streitbeilegung) ─────────────────────────────
    # Required for B2C with online shop (EU Regulation 524/2013)
    profile.needs_odr = (
        profile.business_type == "b2c" and profile.has_online_shop
    )

    # ── Industry ─────────────────────────────────────────────────
    # An industry needs at least two keyword hits to be considered.
    industry_scores: dict[str, int] = {}
    for industry, keywords in _INDUSTRY_KEYWORDS.items():
        hits = _count_hits(full_text, keywords)
        if hits >= 2:
            industry_scores[industry] = hits

    if industry_scores:
        profile.industry = max(industry_scores, key=industry_scores.get)  # type: ignore[arg-type]
    elif profile.is_regulated_profession:
        # Fallback: derive the industry from the regulated profession.
        # Covers every profession type _REGULATED_PROFESSIONS can produce.
        prof_map = {
            "anwalt": "legal",
            "arzt": "healthcare",
            "steuerberater": "finance",
            "architekt": "craft",
            "notar": "legal",
            "apotheker": "healthcare",
        }
        profile.industry = prof_map.get(profile.regulated_profession_type, "unknown")

    return profile
|
||||
Reference in New Issue
Block a user