feat(compliance-check): profile extraction + scenario classification

- New profile_extractor.py: extracts Company Profile fields (name, legal form, address, DPO, USt-IdNr) and Compliance Scope hints (Art. 9 data, third country, profiling) from document texts
- Scenario per document, based on completeness: regenerate (<30%), fix (30% to <95%), import (>=95%); documents with an error are marked skip
- Widerruf for B2B: no longer skipped; instead all checks are downgraded to INFO, and failed checks get a "not needed for B2B" hint
- Move _build_profile_html to report builder module (a thin delegating wrapper is kept)
- DocCheckResult gets scenario field

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-12 17:34:33 +02:00
parent be9cfdc2d4
commit 7be34552bb
4 changed files with 318 additions and 49 deletions
@@ -268,15 +268,29 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
                        l2p = sum(1 for c in l2 if c.passed)
                        r.correctness_pct = round(l2p / len(l2) * 100) if l2 else 0

-        # Step 4: Build report
+        # Step 4: Extract profile hints from documents
+        _update(check_id, "Profil wird aus Dokumenten extrahiert...")
+        from compliance.services.profile_extractor import extract_profile_from_documents
+        extracted_profile = extract_profile_from_documents(doc_texts, profile_dict)
+
+        # Step 4b: Determine scenario per document
+        for r in results:
+            if r.error:
+                r.scenario = "skip"
+            elif r.completeness_pct < 30:
+                r.scenario = "regenerate"
+            elif r.completeness_pct < 95:
+                r.scenario = "fix"
+            else:
+                r.scenario = "import"
+
+        # Step 5: Build report
        _update(check_id, "Report wird erstellt...")
        report_html = build_html_report(results, None)
-
-        # Prepend profile summary to report
        profile_html = _build_profile_html(profile)
        full_html = profile_html + report_html

-        # Step 5: Send email
+        # Step 6: Send email
        doc_count = len([r for r in results if not r.error])
        email_result = send_email(
            recipient=req.recipient,
@@ -284,10 +298,11 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
            body_html=full_html,
        )

-        # Step 6: Store result
+        # Step 7: Store result
        response = {
            "results": [_result_to_dict(r) for r in results],
            "business_profile": profile_dict,
+            "extracted_profile": extracted_profile,
            "banner_result": {
                "detected": banner_result.get("banner_detected", False) if banner_result else False,
                "provider": banner_result.get("banner_provider", "") if banner_result else "",
@@ -406,16 +421,9 @@ async def _check_single(


 def _get_skip_types(profile) -> dict[str, str]:
-    """Return doc_types to skip entirely based on business profile.
-
-    Returns dict mapping doc_type -> skip reason.
-    """
-    skip: dict[str, str] = {}
-    if profile.business_type in ("b2b", "b2g"):
-        skip["widerruf"] = "Uebersprungen: Widerrufsbelehrung nur fuer B2C relevant"
-    if profile.business_type in ("b2b", "b2g") and not profile.has_online_shop:
-        skip["nutzungsbedingungen"] = "Uebersprungen: Nutzungsbedingungen bei B2B ohne Shop selten relevant"
-    return skip
+    """Doc_types to skip entirely. Currently empty — we check everything
+    and flag irrelevant items as INFO instead of skipping."""
+    return {}


 def _apply_profile_filter(result, profile, doc_type: str):
@@ -434,10 +442,16 @@ def _apply_profile_filter(result, profile, doc_type: str):
                check.skipped = True
                check.hint = "Nicht relevant (kein B2C Online-Shop)"

-        # Widerruf only relevant for B2C
+        # Widerruf: Flag entire document as unnecessary for B2B
        if doc_type == "widerruf" and profile.business_type not in ("b2c", "unknown"):
-            if check.severity == "INFO":
-                check.skipped = True
+            check.severity = "INFO"
+            if not check.passed:
+                check.hint = (
+                    "Als B2B-Unternehmen benoetigen Sie keine Widerrufsbelehrung "
+                    "(§355 BGB gilt nur fuer Verbrauchervertraege). "
+                    "Empfehlung: Entfernen Sie die Widerrufsbelehrung von "
+                    "Ihrer Website, da sie Verwirrung stiften kann."
+                )

        # Regulated profession: check for Kammer info
        if "kammer" in cid or "berufsordnung" in check.label.lower():
@@ -479,41 +493,13 @@ def _result_to_dict(r) -> dict:
        "correctness_pct": r.correctness_pct,
        "checks": [{f: getattr(c, f) for f in fields} for c in r.checks],
        "findings_count": r.findings_count, "error": r.error,
+        "scenario": getattr(r, "scenario", ""),
    }


 def _build_profile_html(profile) -> str:
-    """Build a small HTML block summarizing the detected business profile."""
-    service_tags = ", ".join(profile.detected_services[:10]) or "keine erkannt"
-    flags = []
-    if profile.has_online_shop:
-        flags.append("Online-Shop")
-    if profile.has_editorial_content:
-        flags.append("Redaktionelle Inhalte")
-    if profile.is_regulated_profession:
-        flags.append(f"Regulierter Beruf ({profile.regulated_profession_type})")
-    if profile.needs_odr:
-        flags.append("ODR-pflichtig")
-    flags_str = ", ".join(flags) or "keine"
-
-    return (
-        '<div style="font-family:-apple-system,BlinkMacSystemFont,sans-serif;'
-        'max-width:700px;margin:0 auto 16px;padding:12px 16px;'
-        'background:#f0f9ff;border:1px solid #bae6fd;border-radius:8px">'
-        '<h3 style="margin:0 0 8px;font-size:14px;color:#0369a1">'
-        'Erkanntes Geschaeftsmodell</h3>'
-        '<table style="font-size:13px;color:#374151">'
-        f'<tr><td style="padding:2px 12px 2px 0;color:#6b7280">Typ:</td>'
-        f'<td><strong>{profile.business_type.upper()}</strong>'
-        f' ({profile.industry})</td></tr>'
-        f'<tr><td style="padding:2px 12px 2px 0;color:#6b7280">Merkmale:</td>'
-        f'<td>{flags_str}</td></tr>'
-        f'<tr><td style="padding:2px 12px 2px 0;color:#6b7280">Dienste:</td>'
-        f'<td>{service_tags}</td></tr>'
-        f'<tr><td style="padding:2px 12px 2px 0;color:#6b7280">Konfidenz:</td>'
-        f'<td>{int(profile.confidence * 100)}%</td></tr>'
-        '</table></div>'
-    )
+    from .agent_doc_check_report import build_profile_html
+    return build_profile_html(profile)


 # Cross-check extracted to compliance.services.banner_cookie_cross_check
@@ -173,3 +173,37 @@ def _render_cookie_banner(html: list[str], cookie_result: dict) -> None:
    else:
        html.append('<br><span style="color:#22c55e">Keine Verstoesse erkannt.</span>')
    html.append('</div>')
+
+
+def build_profile_html(profile) -> str:
+    """Build a small HTML block summarizing the detected business profile."""
+    service_tags = ", ".join(profile.detected_services[:10]) or "keine erkannt"
+    flags = []
+    if profile.has_online_shop:
+        flags.append("Online-Shop")
+    if profile.has_editorial_content:
+        flags.append("Redaktionelle Inhalte")
+    if profile.is_regulated_profession:
+        flags.append(f"Regulierter Beruf ({profile.regulated_profession_type})")
+    if profile.needs_odr:
+        flags.append("ODR-pflichtig")
+    flags_str = ", ".join(flags) or "keine"
+
+    return (
+        '<div style="font-family:-apple-system,BlinkMacSystemFont,sans-serif;'
+        'max-width:700px;margin:0 auto 16px;padding:12px 16px;'
+        'background:#f0f9ff;border:1px solid #bae6fd;border-radius:8px">'
+        '<h3 style="margin:0 0 8px;font-size:14px;color:#0369a1">'
+        'Erkanntes Geschaeftsmodell</h3>'
+        '<table style="font-size:13px;color:#374151">'
+        f'<tr><td style="padding:2px 12px 2px 0;color:#6b7280">Typ:</td>'
+        f'<td><strong>{profile.business_type.upper()}</strong>'
+        f' ({profile.industry})</td></tr>'
+        f'<tr><td style="padding:2px 12px 2px 0;color:#6b7280">Merkmale:</td>'
+        f'<td>{flags_str}</td></tr>'
+        f'<tr><td style="padding:2px 12px 2px 0;color:#6b7280">Dienste:</td>'
+        f'<td>{service_tags}</td></tr>'
+        f'<tr><td style="padding:2px 12px 2px 0;color:#6b7280">Konfidenz:</td>'
+        f'<td>{int(profile.confidence * 100)}%</td></tr>'
+        '</table></div>'
+    )
@@ -65,6 +65,7 @@ class DocCheckResult(BaseModel):
    checks: list[CheckItem] = []
    findings_count: int = 0
    error: str = ""
+    scenario: str = ""  # regenerate | fix | import | skip


 class DocCheckResponse(BaseModel):