From 313982c6f1913f30fd39233300a86f89a4f4a76f Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Tue, 19 May 2026 12:22:05 +0200 Subject: [PATCH] =?UTF-8?q?feat(profile+report):=20P17=20=E2=80=94=204=20P?= =?UTF-8?q?olish-Items?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A) Cookie-Policy-Architecture-Block Fallback auf DSE-Text, wenn cookie via P15 deduped wurde. Erkennt jetzt auch single-doc Sites (Safetykon-Pattern). B) Konkrete-Aufgaben-Liste: Per-Doc-Cap (3) entfernt + globaler Cap 10→20. Safetykon zeigt jetzt 7 statt 4 Aufgaben. C) business_type-Klassifizierer: B2B-Service-Cluster aus P14 als Boost. Bei 2+ Service-Indikatoren (CE-Zertifizierung/Compliance/Auditierung) wird b2b_score angehoben. Safetykon: "B2C consulting" → "B2B (consulting)". D) Vendor-Extract Fallback auf DSE-Text, wenn cookie deduped + keine CMP-Payloads. LLM extrahiert dann Vendors aus dem DSE-Text. Safetykon: 0 → 1 Vendor (Google Analytics aus dem DSE-Text erkannt). Smoke-Test Safetykon: alle 4 Polish-Items wirken, keine Regression. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../api/agent_compliance_check_routes.py | 22 +++++++++++++++++++ .../compliance/api/agent_doc_check_report.py | 4 ++-- .../compliance/services/business_profiler.py | 7 ++++++ 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/backend-compliance/compliance/api/agent_compliance_check_routes.py b/backend-compliance/compliance/api/agent_compliance_check_routes.py index 18d8bffc..6ece4bbb 100644 --- a/backend-compliance/compliance/api/agent_compliance_check_routes.py +++ b/backend-compliance/compliance/api/agent_compliance_check_routes.py @@ -536,6 +536,15 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest): cookie_payloads.extend(e["cmp_payloads"]) if e.get("text"): cookie_text = e["text"] + # P17-D: Fallback wenn cookie via P15 deduped wurde — nutze DSE-Text + # sofern Cookie-Begriffe drin sind, damit LLM-Vendor-Extract trotzdem + # greifen kann. + if not cookie_text and not cookie_payloads: + dse_t = doc_texts.get("dse", "") + if dse_t and any(w in dse_t.lower() for w in + ("cookie", "tracking", "google analytics", "consent")): + cookie_text = dse_t + logger.info("P17-D: vendor-extract Fallback auf DSE (Cookie deduped)") # Site-owner derived from the submitted URLs — drives the # INTERNAL/GROUP_COMPANY classification of vendor records. owner_name = _company_name_from_url(doc_entries) or "" @@ -681,6 +690,19 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest): cookie_doc_url = e.get("url", "") cookie_cmp_payloads = e.get("cmp_payloads") or [] break + # P17-A: Fallback wenn Cookie-Doc via P15 deduped wurde — nutze + # den DSE-Text wenn er Cookie-Schluesselwoerter enthaelt. 
+ if not cookie_doc_text: + dse_text = doc_texts.get("dse", "") + if dse_text and any(w in dse_text.lower() for w in + ("cookie", "tracking", "google analytics", + "consent")): + cookie_doc_text = dse_text + dse_entry = next((e for e in doc_entries + if e.get("doc_type") == "dse"), {}) + cookie_doc_url = dse_entry.get("url", "") + cookie_cmp_payloads = dse_entry.get("cmp_payloads") or [] + logger.info("P17-A: cookie-arch fallback auf DSE (Cookie-Doc deduped)") if cookie_doc_text: arch = detect_architecture( doc_url=cookie_doc_url, diff --git a/backend-compliance/compliance/api/agent_doc_check_report.py b/backend-compliance/compliance/api/agent_doc_check_report.py index fa1aefab..77c0cb97 100644 --- a/backend-compliance/compliance/api/agent_doc_check_report.py +++ b/backend-compliance/compliance/api/agent_doc_check_report.py @@ -182,7 +182,7 @@ def build_management_summary(results: list[DocCheckResult]) -> str: if c.level == 1 and not c.passed and not c.skipped and c.severity != "INFO" ] - for c in failed_checks[:3]: # Max 3 per document + for c in failed_checks: # P17-B: kein Per-Doc-Cap action = _check_to_action(r.label, c.label, c.hint) if action: actions.append(action) @@ -193,7 +193,7 @@ def build_management_summary(results: list[DocCheckResult]) -> str: 'Konkrete Aufgaben:' '
    ' ) - for a in actions[:10]: # Max 10 actions + for a in actions[:20]: # P17-B: 10 -> 20 html.append(f'
  1. {a}
  2. ') html.append('
') diff --git a/backend-compliance/compliance/services/business_profiler.py b/backend-compliance/compliance/services/business_profiler.py index 8cfdc464..4a289ac1 100644 --- a/backend-compliance/compliance/services/business_profiler.py +++ b/backend-compliance/compliance/services/business_profiler.py @@ -237,6 +237,13 @@ async def detect_business_profile(documents: dict[str, str]) -> BusinessProfile: b2g_score = _count_hits(full_text, _B2G_KEYWORDS) nonprofit_score = _count_hits(full_text, _NONPROFIT_KEYWORDS) + # P17-C: B2B-Dienstleister-Cluster (P14) als Boost — wenn ein Unternehmen + # CE-Zertifizierung / Compliance-Beratung / Auditierung / Schulungen anbietet, + # ist es i.d.R. B2B auch wenn die strikten B2B-Keywords nicht greifen. + b2b_service_boost = _count_hits(full_text, _B2B_SERVICE_POSITIVE) + if b2b_service_boost >= 2: + b2b_score += min(3, b2b_service_boost - 1) + # Missing documents as signal has_agb = "agb" in documents has_widerruf = "widerruf" in documents