feat: Unified Compliance-Check — 8 document types in one form
New 3-tab structure: Website-Scan, Compliance-Check, Banner-Check.

Compliance-Check tab (replaces Dokumenten-Pruefung + Impressum-Check):
- 8 document rows: DSI, Impressum, Social Media, Cookie, AGB, Nutzungsbedingungen, Widerruf, DSB-Kontakt
- Each row: URL input + "Text laden" + file upload + manual text
- "Text laden" extracts via consent-tester and shows the result in an editable textarea
- The user verifies/corrects the text before checking
- Empty fields = "not present" and produce their own finding

Business Profiler (business_profiler.py):
- Detects B2B/B2C/B2G from all documents together
- Recognizes regulated professions, online shops, editorial content
- Context-aware: INFO checks become PASS/FAIL based on the profile

Backend: /compliance-check + /extract-text endpoints
Frontend: ComplianceCheckTab.tsx + DocumentRow.tsx
API proxies: compliance-check/route.ts + extract-text/route.ts

Also: Impressum regex fixes (Telefon, AG, Geschaeftsfuehrung) and INFO severity for context-dependent checks.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,439 @@
|
||||
"""
|
||||
Unified Compliance Check Routes — check all documents in one request.
|
||||
|
||||
POST /compliance/agent/extract-text — extract text from a URL
|
||||
POST /compliance/agent/compliance-check — unified check for all documents
|
||||
GET /compliance/agent/compliance-check/{check_id} — poll status
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import uuid as _uuid
|
||||
from dataclasses import asdict
|
||||
from datetime import datetime, timezone
|
||||
|
||||
import httpx
|
||||
from fastapi import APIRouter
|
||||
from pydantic import BaseModel
|
||||
|
||||
from compliance.services.smtp_sender import send_email
|
||||
|
||||
logger = logging.getLogger(__name__)

router = APIRouter(prefix="/compliance/agent", tags=["agent"])

# Internal consent-tester service that performs page fetch / text extraction.
CONSENT_TESTER_URL = "http://bp-compliance-consent-tester:8094"

# In-memory job store (same pattern as doc-check)
# NOTE(review): entries are never evicted, so memory grows with every
# check until the process restarts — confirm this is acceptable.
_compliance_check_jobs: dict[str, dict] = {}
|
||||
|
||||
|
||||
# ── Models ───────────────────────────────────────────────────────────
|
||||
|
||||
class ExtractTextRequest(BaseModel):
    """Request body for POST /extract-text."""

    url: str  # page URL to extract readable text from
|
||||
|
||||
|
||||
class DocumentInput(BaseModel):
    """One document to check, identified by type, supplied as URL and/or text."""

    doc_type: str  # dse, agb, impressum, cookie, widerruf, avv, loeschkonzept, etc.
    url: str = ""  # fetched via consent-tester only when no text is supplied
    text: str = ""  # text has priority over URL
|
||||
|
||||
|
||||
class ComplianceCheckRequest(BaseModel):
    """Request body for POST /compliance-check."""

    documents: list[DocumentInput]
    use_agent: bool = False  # can also be forced via COMPLIANCE_USE_AGENT env var
    recipient: str = "dsb@breakpilot.local"  # where the HTML report email is sent
|
||||
|
||||
|
||||
class ComplianceCheckStartResponse(BaseModel):
    """Returned immediately after a check was started; poll with check_id."""

    check_id: str
    status: str = "running"
|
||||
|
||||
|
||||
class ComplianceCheckStatusResponse(BaseModel):
    """Polling response for GET /compliance-check/{check_id}."""

    check_id: str
    status: str  # "running" | "completed" | "failed"
    progress: str = ""  # human-readable progress message (German)
    result: dict | None = None  # final result payload once completed
    error: str = ""  # truncated error message when failed
|
||||
|
||||
|
||||
# ── Extract text endpoint ────────────────────────────────────────────
|
||||
|
||||
@router.post("/extract-text")
async def extract_text(req: ExtractTextRequest):
    """Extract the readable text of the document at ``req.url``.

    Delegates to the consent-tester's ``/dsi-discovery`` endpoint and
    returns a flat dict with ``text``, ``word_count``, ``title`` and
    ``error``. Never raises: every failure mode is reported through the
    ``error`` field with empty text.
    """

    def _payload(text: str = "", word_count: int = 0, title: str = "", error: str = "") -> dict:
        # Single place that shapes the response contract of this endpoint.
        return {"text": text, "word_count": word_count, "title": title, "error": error}

    try:
        async with httpx.AsyncClient(timeout=90.0) as client:
            resp = await client.post(
                f"{CONSENT_TESTER_URL}/dsi-discovery",
                json={"url": req.url, "max_documents": 1},
            )
            if resp.status_code != 200:
                return _payload(error=f"HTTP {resp.status_code} von Consent-Tester")

            documents = resp.json().get("documents", [])
            if not documents:
                return _payload(error="Kein Text extrahierbar")

            first = documents[0]
            body = first.get("full_text", "") or first.get("text_preview", "") or first.get("text", "")
            heading = first.get("title", "") or first.get("doc_type", "")
            words = first.get("word_count", 0) or len(body.split())
            return _payload(text=body, word_count=words, title=heading)

    except Exception as e:  # network errors, timeouts, malformed JSON
        logger.warning("extract-text failed for %s: %s", req.url, e)
        return _payload(error=str(e)[:200])
|
||||
|
||||
|
||||
# ── Unified compliance check ────────────────────────────────────────
|
||||
|
||||
@router.post("/compliance-check")
async def start_compliance_check(req: ComplianceCheckRequest):
    """Start an async compliance check for all submitted documents.

    Returns immediately with a ``check_id``; progress and the final
    result are polled via ``GET /compliance-check/{check_id}``.
    """
    # NOTE: only 8 hex chars of a UUID4 — collision risk is tiny for an
    # in-memory store but not zero.
    check_id = str(_uuid.uuid4())[:8]
    _compliance_check_jobs[check_id] = {
        "status": "running",
        "progress": "Pruefung gestartet...",
        "result": None,
        "error": "",
    }
    # Fix: keep a strong reference to the task. The event loop holds only
    # weak references to tasks, so a fire-and-forget create_task() result
    # may be garbage-collected before the job finishes.
    task = asyncio.create_task(_run_compliance_check(check_id, req))
    _compliance_check_jobs[check_id]["task"] = task
    return ComplianceCheckStartResponse(check_id=check_id, status="running")
|
||||
|
||||
|
||||
@router.get("/compliance-check/{check_id}")
async def get_compliance_check_status(check_id: str):
    """Poll the status of a running or finished compliance check."""
    try:
        job = _compliance_check_jobs[check_id]
    except KeyError:
        # Unknown id: plain dict instead of the status model.
        return {"check_id": check_id, "status": "not_found"}
    return ComplianceCheckStatusResponse(
        check_id=check_id,
        status=job["status"],
        progress=job.get("progress", ""),
        result=job.get("result"),
        error=job.get("error", ""),
    )
|
||||
|
||||
|
||||
async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
    """Background task: check all documents with business-profile context.

    Pipeline:
      1. resolve document texts (fetch via consent-tester when only a URL is given)
      2. detect the business profile from all texts together
      3. run per-document checks, filtered by the detected profile
      4. build the HTML report with the profile summary prepended
      5. email the report
      6. store the JSON result in the in-memory job store

    Any exception marks the job as "failed" with a truncated message.
    """
    try:
        # Function-local imports; only the names used in THIS function are
        # imported (the individual checkers are imported in _check_single).
        from compliance.services.business_profiler import detect_business_profile
        from .agent_doc_check_routes import DocCheckResult
        from .agent_doc_check_report import build_html_report

        # Step 1: Resolve texts (fetch from URL if needed)
        _update(check_id, "Texte werden geladen...")
        doc_texts: dict[str, str] = {}
        doc_entries: list[dict] = []

        for i, doc in enumerate(req.documents):
            _update(check_id, f"Dokument {i+1}/{len(req.documents)}: {doc.doc_type}...")
            text = doc.text  # pasted text has priority over the URL
            if not text and doc.url:
                text = await _fetch_text(doc.url)
            if text:
                doc_texts[doc.doc_type] = text
            # Entries are kept even for empty texts so that missing
            # documents get their own "not present" result in step 3.
            doc_entries.append({
                "doc_type": doc.doc_type,
                "url": doc.url,
                "text": text,
                "word_count": len(text.split()) if text else 0,
            })

        # Step 2: Detect business profile
        _update(check_id, "Geschaeftsmodell wird erkannt...")
        profile = await detect_business_profile(doc_texts)
        profile_dict = asdict(profile)

        # Step 3: Check each document
        results: list[DocCheckResult] = []
        total_findings = 0
        use_agent_flag = req.use_agent or os.getenv(
            "COMPLIANCE_USE_AGENT", "false"
        ).lower() == "true"

        for i, entry in enumerate(doc_entries):
            text = entry["text"]
            doc_type = entry["doc_type"]
            label = _doc_type_label(doc_type)
            url = entry["url"]

            _update(check_id, f"Pruefe {label} ({i+1}/{len(doc_entries)})...")

            # Below 50 characters there is nothing meaningful to check.
            if not text or len(text) < 50:
                results.append(DocCheckResult(
                    label=label, url=url, doc_type=doc_type,
                    error="Kein Text vorhanden oder zu kurz",
                ))
                continue

            result = await _check_single(
                text, doc_type, label, url,
                entry["word_count"], use_agent_flag,
            )

            # Apply profile context filter (skips checks that cannot
            # apply to the detected business model)
            result = _apply_profile_filter(result, profile, doc_type)

            results.append(result)
            total_findings += result.findings_count

        # Step 4: Build report
        _update(check_id, "Report wird erstellt...")
        report_html = build_html_report(results, None)

        # Prepend profile summary to report
        profile_html = _build_profile_html(profile)
        full_html = profile_html + report_html

        # Step 5: Send email
        # NOTE(review): send_email appears to be synchronous and would
        # block the event loop during the SMTP exchange — consider
        # asyncio.to_thread(); confirm against smtp_sender.
        doc_count = len([r for r in results if not r.error])
        email_result = send_email(
            recipient=req.recipient,
            subject=f"[COMPLIANCE-CHECK] {doc_count} Dokumente geprueft",
            body_html=full_html,
        )

        # Step 6: Store result
        response = {
            "results": [_result_to_dict(r) for r in results],
            "business_profile": profile_dict,
            "total_documents": len(results),
            "total_findings": total_findings,
            "email_status": email_result.get("status", "failed"),
            "checked_at": datetime.now(timezone.utc).isoformat(),
        }

        _compliance_check_jobs[check_id]["status"] = "completed"
        _compliance_check_jobs[check_id]["result"] = response
        _compliance_check_jobs[check_id]["progress"] = "Fertig"

    except Exception as e:
        logger.error("Compliance check %s failed: %s", check_id, e, exc_info=True)
        _compliance_check_jobs[check_id]["status"] = "failed"
        _compliance_check_jobs[check_id]["error"] = str(e)[:500]
|
||||
|
||||
|
||||
def _update(check_id: str, msg: str):
    # Update the human-readable progress string of a running job.
    # Assumes the job entry exists (would raise KeyError otherwise).
    _compliance_check_jobs[check_id]["progress"] = msg
|
||||
|
||||
|
||||
async def _fetch_text(url: str) -> str:
    """Fetch document text for *url* via the consent-tester service.

    Returns "" on any failure (non-200 response, empty result set,
    or a raised exception, which is logged).
    """
    try:
        async with httpx.AsyncClient(timeout=90.0) as client:
            resp = await client.post(
                f"{CONSENT_TESTER_URL}/dsi-discovery",
                json={"url": url, "max_documents": 1},
            )
            if resp.status_code != 200:
                return ""
            documents = resp.json().get("documents", [])
            if not documents:
                return ""
            first = documents[0]
            return first.get("full_text", "") or first.get("text_preview", "") or ""
    except Exception as e:
        logger.warning("Text fetch failed for %s: %s", url, e)
        return ""
|
||||
|
||||
|
||||
async def _check_single(
    text: str, doc_type: str, label: str, url: str,
    word_count: int, use_agent: bool,
):
    """Run regex + MC checks on a single document.

    Three best-effort stages:
      1. regex checklist (check_document_completeness) — yields the base
         CheckItems plus completeness/correctness percentages
      2. Master Control checks (check_document_with_controls) — appended
         as further CheckItems; correctness is recomputed over the
         active level-2 checks
      3. LLM verification — failed checks that carry a hint may be
         overturned; correctness is recomputed once more

    Stage 2/3 failures are logged and skipped. Returns a DocCheckResult.
    """
    # Function-local imports, as elsewhere in this module.
    from compliance.services.doc_checks.runner import check_document_completeness
    from compliance.services.rag_document_checker import check_document_with_controls
    from .agent_doc_check_routes import CheckItem, DocCheckResult

    # Regex checklist
    findings = check_document_completeness(text, doc_type, label, url)

    all_checks: list[CheckItem] = []
    completeness = 0
    correctness = 0

    # "SCORE" findings carry the aggregated checklist: unpack it into
    # CheckItem objects and take the precomputed percentages.
    for f in findings:
        if "SCORE" in f.get("code", ""):
            for c in f.get("all_checks", []):
                all_checks.append(CheckItem(
                    id=c["id"], label=c["label"], passed=c["passed"],
                    severity=c["severity"], matched_text=c.get("matched_text", ""),
                    level=c.get("level", 1), parent=c.get("parent"),
                    skipped=c.get("skipped", False), hint=c.get("hint", ""),
                ))
            completeness = f.get("completeness_pct", 0)
            correctness = f.get("correctness_pct", 0)

    # Master Control checks
    try:
        mc_results = await check_document_with_controls(
            text, doc_type, label, max_controls=0, use_agent=use_agent,
        )
        if mc_results:
            for mc in mc_results:
                all_checks.append(CheckItem(**mc))
            # Recompute correctness over all active level-2 checks.
            l2 = [c for c in all_checks if c.level == 2 and not c.skipped]
            l2_passed = sum(1 for c in l2 if c.passed)
            correctness = round(l2_passed / len(l2) * 100) if l2 else 0
    except Exception as e:
        logger.warning("MC check skipped for %s: %s", label, e)

    # LLM verification of regex fails (only failed checks with a hint)
    failed = [c for c in all_checks if not c.passed and not c.skipped and c.hint]
    if failed:
        try:
            from compliance.services.doc_checks.llm_verify import verify_failed_checks
            overturns = await verify_failed_checks(
                text,
                [{"id": c.id, "label": c.label, "hint": c.hint} for c in failed],
                label,
            )
            for c in all_checks:
                if c.id in overturns and overturns[c.id]["overturned"]:
                    c.passed = True
                    c.matched_text = f"[LLM] {overturns[c.id]['evidence']}"
            # Overturned checks shift the percentage — recompute.
            l2_active = [c for c in all_checks if c.level == 2 and not c.skipped]
            l2_passed = sum(1 for c in l2_active if c.passed)
            if l2_active:
                correctness = round(l2_passed / len(l2_active) * 100)
        except Exception as e:
            logger.warning("LLM verification skipped: %s", e)

    # SCORE findings are score carriers, not actual findings.
    non_score = [f for f in findings if "SCORE" not in f.get("code", "")]
    return DocCheckResult(
        label=label, url=url, doc_type=doc_type,
        word_count=word_count or len(text.split()),
        completeness_pct=completeness, correctness_pct=correctness,
        checks=all_checks, findings_count=len(non_score),
    )
|
||||
|
||||
|
||||
def _apply_profile_filter(result, profile, doc_type: str):
    """Adjust checks in *result* based on the detected business profile.

    Marks checks as skipped when they cannot apply to the detected
    business model — e.g. the ODR/OS-platform link only matters for B2C
    online shops. Mutates the check items in place and returns *result*.

    Fix: removed an unused ``CheckItem`` import.
    """
    for check in result.checks:
        cid = check.id.lower()

        # ODR/OS-Link only relevant for B2C online shops
        if "odr" in cid or "os-link" in cid or "streitbeilegung" in check.label.lower():
            if not profile.needs_odr:
                check.skipped = True
                check.hint = "Nicht relevant (kein B2C Online-Shop)"

        # Widerruf only relevant for B2C ("unknown" is kept, to be safe)
        if doc_type == "widerruf" and profile.business_type not in ("b2c", "unknown"):
            if check.severity == "INFO":
                check.skipped = True

        # Kammer/Berufsordnung info only required for regulated professions
        if "kammer" in cid or "berufsordnung" in check.label.lower():
            if not profile.is_regulated_profession:
                check.skipped = True
                check.hint = "Nicht relevant (kein regulierter Beruf)"

    return result
|
||||
|
||||
|
||||
# ── Helpers ──────────────────────────────────────────────────────────
|
||||
|
||||
# Display labels (German, ASCII-transcribed) for the supported doc types.
_DOC_TYPE_LABELS = {
    "dse": "Datenschutzerklaerung",
    "datenschutz": "Datenschutzerklaerung",
    "privacy": "Datenschutzerklaerung",
    "impressum": "Impressum",
    "agb": "AGB",
    "widerruf": "Widerrufsbelehrung",
    "cookie": "Cookie-Richtlinie",
    "avv": "Auftragsverarbeitung",
    "loeschkonzept": "Loeschkonzept",
    "dsfa": "Datenschutz-Folgenabschaetzung",
    "social_media": "Social Media Datenschutz",
}


def _doc_type_label(doc_type: str) -> str:
    """Return the display label for *doc_type*; unknown types fall back to uppercase."""
    try:
        return _DOC_TYPE_LABELS[doc_type]
    except KeyError:
        return doc_type.upper()
|
||||
|
||||
|
||||
def _result_to_dict(r) -> dict:
    """Convert a DocCheckResult into a JSON-serializable dict."""
    serialized_checks = [
        {
            "id": item.id,
            "label": item.label,
            "passed": item.passed,
            "severity": item.severity,
            "matched_text": item.matched_text,
            "level": item.level,
            "parent": item.parent,
            "skipped": item.skipped,
            "hint": item.hint,
        }
        for item in r.checks
    ]
    return {
        "label": r.label,
        "url": r.url,
        "doc_type": r.doc_type,
        "word_count": r.word_count,
        "completeness_pct": r.completeness_pct,
        "correctness_pct": r.correctness_pct,
        "checks": serialized_checks,
        "findings_count": r.findings_count,
        "error": r.error,
    }
|
||||
|
||||
|
||||
def _build_profile_html(profile) -> str:
    """Build a small HTML block summarizing the detected business profile."""

    def row(name: str, value: str) -> str:
        # One <tr> of the summary table; inline CSS matches the report style.
        return (
            f'<tr><td style="padding:2px 12px 2px 0;color:#6b7280">{name}:</td>'
            f'<td>{value}</td></tr>'
        )

    services = ", ".join(profile.detected_services[:10]) or "keine erkannt"

    traits = []
    if profile.has_online_shop:
        traits.append("Online-Shop")
    if profile.has_editorial_content:
        traits.append("Redaktionelle Inhalte")
    if profile.is_regulated_profession:
        traits.append(f"Regulierter Beruf ({profile.regulated_profession_type})")
    if profile.needs_odr:
        traits.append("ODR-pflichtig")

    type_cell = (
        f'<strong>{profile.business_type.upper()}</strong>'
        f' ({profile.industry})'
    )

    return (
        '<div style="font-family:-apple-system,BlinkMacSystemFont,sans-serif;'
        'max-width:700px;margin:0 auto 16px;padding:12px 16px;'
        'background:#f0f9ff;border:1px solid #bae6fd;border-radius:8px">'
        '<h3 style="margin:0 0 8px;font-size:14px;color:#0369a1">'
        'Erkanntes Geschaeftsmodell</h3>'
        '<table style="font-size:13px;color:#374151">'
        + row("Typ", type_cell)
        + row("Merkmale", ", ".join(traits) or "keine")
        + row("Dienste", services)
        + row("Konfidenz", f"{int(profile.confidence * 100)}%")
        + '</table></div>'
    )
|
||||
@@ -0,0 +1,223 @@
|
||||
"""
|
||||
Business Profiler — detect business model from document texts.
|
||||
|
||||
Pure keyword-based detection (deterministic, no LLM). Analyzes
|
||||
DSE, Impressum, AGB, Widerruf etc. together to build a profile
|
||||
that drives context-aware compliance checks.
|
||||
|
||||
Example:
|
||||
profile = await detect_business_profile({"dse": "...", "impressum": "..."})
|
||||
profile.business_type # "b2c"
|
||||
profile.has_online_shop # True
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
@dataclass
class BusinessProfile:
    """Detected business model, aggregated over all submitted documents."""

    business_type: str = "unknown"  # b2b, b2c, b2g, nonprofit, unknown
    industry: str = "unknown"  # it_services, retail, healthcare, legal, craft, public, unknown
    has_online_shop: bool = False
    has_editorial_content: bool = False
    is_regulated_profession: bool = False
    regulated_profession_type: str = ""  # arzt, anwalt, steuerberater, architekt, ""
    needs_odr: bool = False  # Online-Streitbeilegung
    detected_services: list[str] = field(default_factory=list)
    confidence: float = 0.0


# ── Keyword lists ────────────────────────────────────────────────────
# Matching is plain lowercase substring search, so keywords appear both
# with umlauts and in ASCII transcription where relevant.

_B2C_KEYWORDS = [
    "verbraucher", "warenkorb", "bestellung", "lieferung", "widerruf",
    "shop", "kaufpreis", "rueckgabe", "rückgabe", "endkunde", "kaeufer",
    "käufer", "privatkunde", "zahlungspflichtig bestellen",
]

_B2B_KEYWORDS = [
    "unternehmen", "geschaeftskunden", "geschäftskunden", "gewerblich",
    "auftrag", "auftraggeber", "auftragnehmer", "geschaeftspartner",
    "geschäftspartner", "firmenkunde", "b2b",
]

_B2G_KEYWORDS = [
    "behoerde", "behörde", "koerperschaft", "körperschaft", "oeffentlich",
    "öffentlich", "gemeinde", "amt", "stadtverwaltung", "landesbehoerde",
    "landesbehörde", "kommunal",
]

_NONPROFIT_KEYWORDS = [
    "gemeinnuetzig", "gemeinnützig", "verein", "stiftung", "e.v.",
    "spende", "ehrenamtlich", "satzung",
]

# keyword -> canonical profession type (first match wins)
_REGULATED_PROFESSIONS = {
    "rechtsanwalt": "anwalt",
    "anwalt": "anwalt",
    "anwaeltin": "anwalt",
    "anwältin": "anwalt",
    "kanzlei": "anwalt",
    "rechtsanwaltskammer": "anwalt",
    "arzt": "arzt",
    "ärztin": "arzt",
    "aerztin": "arzt",
    "praxis": "arzt",
    "aerztekammer": "arzt",
    "ärztekammer": "arzt",
    "steuerberater": "steuerberater",
    "steuerberaterin": "steuerberater",
    "steuerberaterkammer": "steuerberater",
    "architekt": "architekt",
    "architektin": "architekt",
    "architektenkammer": "architekt",
    "notar": "notar",
    "notariat": "notar",
    "apotheke": "apotheker",
    "apotheker": "apotheker",
}

_ONLINE_SHOP_KEYWORDS = [
    "warenkorb", "checkout", "bestellung", "lieferung", "versand",
    "paypal", "kreditkarte", "klarna", "sofortueberweisung",
    "sofortüberweisung", "zahlungsarten", "versandkosten",
    "lieferzeit", "retour", "paketdienst",
]

_EDITORIAL_KEYWORDS = [
    "blog", "ratgeber", "news", "redaktion", "artikel", "magazin",
    "beitrag", "kommentar", "podcast", "newsletter", "autor",
]

_INDUSTRY_KEYWORDS = {
    "it_services": ["software", "saas", "cloud", "hosting", "server", "api", "app"],
    "retail": ["shop", "warenkorb", "versand", "lieferung", "einzelhandel"],
    "healthcare": ["arzt", "praxis", "patient", "gesundheit", "therapie", "klinik"],
    "legal": ["kanzlei", "rechtsanwalt", "mandant", "anwalt"],
    "craft": ["handwerk", "meister", "werkstatt", "montage", "gewerk"],
    "public": ["behoerde", "behörde", "kommune", "verwaltung", "buerger", "bürger"],
    "finance": ["bank", "versicherung", "finanz", "kredit", "anlage"],
    "education": ["schule", "bildung", "unterricht", "lehrplan", "schueler", "schüler"],
}

# lowercase search pattern -> display label
_TRACKING_SERVICES = {
    "google analytics": "Google Analytics",
    "google tag manager": "Google Tag Manager",
    "matomo": "Matomo",
    "facebook pixel": "Facebook Pixel",
    "meta pixel": "Meta Pixel",
    "hotjar": "Hotjar",
    "hubspot": "HubSpot",
    "mailchimp": "Mailchimp",
    "linkedin insight": "LinkedIn Insight",
    "google ads": "Google Ads",
    "google adsense": "Google AdSense",
    "google maps": "Google Maps",
    "youtube": "YouTube",
    "vimeo": "Vimeo",
    "cloudflare": "Cloudflare",
    "sentry": "Sentry",
    "intercom": "Intercom",
    "zendesk": "Zendesk",
    "stripe": "Stripe",
    "paypal": "PayPal",
}


# ── Detection logic ──────────────────────────────────────────────────

def _count_hits(text: str, keywords: list[str]) -> int:
    """Count how many of *keywords* occur in *text* (each counted once)."""
    return sum(1 for kw in keywords if kw in text)


async def detect_business_profile(documents: dict[str, str]) -> BusinessProfile:
    """Analyze all document texts together to detect the business model.

    Pure keyword-based and deterministic (no LLM, no I/O); the coroutine
    signature is kept for interface compatibility with the caller.

    Args:
        documents: dict mapping doc_type -> text
                   (e.g. {"dse": "...", "impressum": "..."})

    Returns:
        A populated BusinessProfile; defaults when *documents* is empty.
    """
    profile = BusinessProfile()
    if not documents:
        return profile

    # Merge all texts for keyword search
    full_text = "\n".join(documents.values()).lower()
    full_text = full_text.replace("\xad", "")  # strip soft hyphens

    # ── Tracking services ────────────────────────────────────────
    for pattern, label in _TRACKING_SERVICES.items():
        if pattern in full_text:
            profile.detected_services.append(label)

    # ── Online shop ──────────────────────────────────────────────
    shop_hits = _count_hits(full_text, _ONLINE_SHOP_KEYWORDS)
    profile.has_online_shop = shop_hits >= 3

    # ── Editorial content ────────────────────────────────────────
    editorial_hits = _count_hits(full_text, _EDITORIAL_KEYWORDS)
    profile.has_editorial_content = editorial_hits >= 2

    # ── Regulated profession (first matching keyword wins) ───────
    for keyword, prof_type in _REGULATED_PROFESSIONS.items():
        if keyword in full_text:
            profile.is_regulated_profession = True
            profile.regulated_profession_type = prof_type
            break

    # ── Business type ────────────────────────────────────────────
    b2c_score = _count_hits(full_text, _B2C_KEYWORDS)
    b2b_score = _count_hits(full_text, _B2B_KEYWORDS)
    b2g_score = _count_hits(full_text, _B2G_KEYWORDS)
    nonprofit_score = _count_hits(full_text, _NONPROFIT_KEYWORDS)

    # Missing documents as signal
    has_agb = "agb" in documents
    has_widerruf = "widerruf" in documents
    if not has_agb:
        b2c_score -= 1  # No AGB → less likely B2C
    if not has_widerruf:
        b2c_score -= 1  # No Widerruf → less likely B2C shop
    if profile.has_online_shop:
        b2c_score += 3  # Strong B2C signal

    scores = {
        "b2c": b2c_score,
        "b2b": b2b_score,
        "b2g": b2g_score,
        "nonprofit": nonprofit_score,
    }
    best = max(scores, key=scores.get)  # type: ignore[arg-type]
    best_val = scores[best]

    if best_val >= 2:
        profile.business_type = best
        # Confidence = winning share of all non-negative scores.
        total = sum(max(0, v) for v in scores.values())
        profile.confidence = round(best_val / total, 2) if total > 0 else 0.5
    else:
        profile.business_type = "unknown"
        profile.confidence = 0.2

    # ── ODR (Online-Streitbeilegung) ─────────────────────────────
    # Required for B2C with online shop (EU Regulation 524/2013)
    profile.needs_odr = (
        profile.business_type == "b2c" and profile.has_online_shop
    )

    # ── Industry ─────────────────────────────────────────────────
    industry_scores: dict[str, int] = {}
    for industry, keywords in _INDUSTRY_KEYWORDS.items():
        hits = _count_hits(full_text, keywords)
        if hits >= 2:
            industry_scores[industry] = hits

    if industry_scores:
        profile.industry = max(industry_scores, key=industry_scores.get)  # type: ignore[arg-type]
    elif profile.is_regulated_profession:
        # Fix: map ALL profession types _REGULATED_PROFESSIONS can yield;
        # "notar" and "apotheker" previously fell through to "unknown".
        prof_map = {"anwalt": "legal", "arzt": "healthcare",
                    "steuerberater": "finance", "architekt": "craft",
                    "notar": "legal", "apotheker": "healthcare"}
        profile.industry = prof_map.get(profile.regulated_profession_type, "unknown")

    return profile
|
||||
@@ -48,6 +48,8 @@ from compliance.api.agent_scan_routes import router as agent_scan_router
|
||||
from compliance.api.agent_history_routes import router as agent_history_router
|
||||
from compliance.api.agent_recurring_routes import router as agent_recurring_router
|
||||
from compliance.api.agent_compare_routes import router as agent_compare_router
|
||||
from compliance.api.agent_doc_check_routes import router as agent_doc_check_router
|
||||
from compliance.api.agent_compliance_check_routes import router as agent_compliance_check_router
|
||||
|
||||
# Middleware
|
||||
from middleware import (
|
||||
@@ -150,6 +152,8 @@ app.include_router(agent_scan_router, prefix="/api")
|
||||
app.include_router(agent_history_router, prefix="/api")
|
||||
app.include_router(agent_recurring_router, prefix="/api")
|
||||
app.include_router(agent_compare_router, prefix="/api")
|
||||
app.include_router(agent_doc_check_router, prefix="/api")
|
||||
app.include_router(agent_compliance_check_router, prefix="/api")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user