diff --git a/admin-compliance/app/sdk/agent/_components/ChecklistView.tsx b/admin-compliance/app/sdk/agent/_components/ChecklistView.tsx index 98f41fd1..17ff82c8 100644 --- a/admin-compliance/app/sdk/agent/_components/ChecklistView.tsx +++ b/admin-compliance/app/sdk/agent/_components/ChecklistView.tsx @@ -31,6 +31,7 @@ const SCENARIO_LABELS: Record = { @@ -102,6 +103,7 @@ export function ChecklistView({ results }: { results: DocResult[] }) { regenerate: results.filter(r => r.scenario === 'regenerate').length, fix: results.filter(r => r.scenario === 'fix').length, import: results.filter(r => r.scenario === 'import').length, + missing: results.filter(r => r.scenario === 'missing').length, } return ( @@ -114,6 +116,7 @@ export function ChecklistView({ results }: { results: DocResult[] }) { {scenarioCounts.import > 0 && {scenarioCounts.import} konform} {scenarioCounts.fix > 0 && {scenarioCounts.fix} Korrekturen} {scenarioCounts.regenerate > 0 && {scenarioCounts.regenerate} Neugenerierung} + {scenarioCounts.missing > 0 && {scenarioCounts.missing} fehlt} @@ -164,7 +167,11 @@ export function ChecklistView({ results }: { results: DocResult[] }) {
- {r.error ? ( + {r.error && r.error.startsWith("Nicht eingereicht") ? ( + + Nicht eingereicht + + ) : r.error ? ( Fehler ) : (
diff --git a/backend-compliance/compliance/api/agent_compliance_check_routes.py b/backend-compliance/compliance/api/agent_compliance_check_routes.py index d2f6a019..ee99d960 100644 --- a/backend-compliance/compliance/api/agent_compliance_check_routes.py +++ b/backend-compliance/compliance/api/agent_compliance_check_routes.py @@ -176,7 +176,7 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest): if url_key in url_text_cache: text = url_text_cache[url_key] else: - text = await _fetch_text(doc.url) + text = await _fetch_text(doc.url, doc_type=doc.doc_type) if text: url_text_cache[url_key] = text if text: @@ -334,6 +334,11 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest): else: r.scenario = "import" + # Step 4c: Always render all 8 canonical doc types, even when the + # user left a row blank. Missing types get a placeholder so the + # email + frontend make absent documents immediately visible. + results = _pad_results_with_missing(results) + # Step 5: Build report with management summary (95-98%) _update(check_id, "Report wird erstellt...", 96) from .agent_doc_check_report import ( @@ -402,23 +407,31 @@ def _update(check_id: str, msg: str, pct: int | None = None): job["progress_pct"] = max(0, min(100, int(pct))) -async def _fetch_text(url: str) -> str: +async def _fetch_text(url: str, doc_type: str = "") -> str: """Fetch text from URL via consent-tester, with HTTP fallback. 1. Try consent-tester (Playwright) — handles JS-heavy SPAs 2. Fallback: direct HTTP fetch + HTML strip — fast, works for SSR pages + + doc_type controls how aggressively we follow sub-links — cookie/dse + pages prefer self-extract only (CMP capture is authoritative); legal/ + imprint pages need to follow sub-pages (Versicherungsvermittler etc). """ # 1. Consent-tester (Playwright-based, full JS rendering). - # max_documents=1: for a *specific* user-entered URL (cookie, impressum, - # privacy) we only want the self-extracted text of THAT page. Following - # sub-links was triggering 4x networkidle timeouts (~240s) and made the - # backend httpx call time out, dropping us to the raw HTTP fallback - # which returned site navigation as garbage text. + # max_documents depends on doc_type: + # - cookie/dse/social_media: self-extract (often + CMP capture) is + # authoritative, sub-pages dilute the policy text. max=1. + # - impressum/agb/widerruf/nutzungsbedingungen/dsb: BMW & similar + # enterprise sites split this across 3-4 short sub-pages + # (Versicherungsvermittler, Aufsicht, Berufsrecht). max=3 follows + # them. The 15s networkidle bail (dsi_helpers) keeps timing safe. + short_extract_types = {"cookie", "dse", "datenschutz", "privacy", "social_media"} + max_docs = 1 if (doc_type or "") in short_extract_types else 3 try: async with httpx.AsyncClient(timeout=120.0) as client: resp = await client.post( f"{CONSENT_TESTER_URL}/dsi-discovery", - json={"url": url, "max_documents": 1}, + json={"url": url, "max_documents": max_docs}, timeout=120.0, ) if resp.status_code == 200: @@ -531,6 +544,50 @@ async def _check_single( ) +def _pad_results_with_missing(results: list) -> list: + """Ensure every canonical doc_type has an entry in the results list. + + Doc_types the user did not submit get a placeholder DocCheckResult + with a 'Nicht eingereicht' marker so the email + frontend make + absent documents visible at a glance. + + Preserves the canonical ordering from _ALL_DOC_TYPES so the report + layout is stable. + """ + from .agent_doc_check_routes import DocCheckResult + + by_type: dict[str, object] = {} + for r in results: + # Map alias types (datenschutz/privacy → dse) to the canonical key + canon = "dse" if r.doc_type in ("datenschutz", "privacy") else r.doc_type + by_type[canon] = r + + ordered: list = [] + for dt in _ALL_DOC_TYPES: + if dt in by_type: + ordered.append(by_type[dt]) + continue + ordered.append(DocCheckResult( + label=_doc_type_label(dt), + url="", + doc_type=dt, + word_count=0, + completeness_pct=0, + correctness_pct=0, + checks=[], + findings_count=0, + error="Nicht eingereicht — Quelle nicht angegeben", + scenario="missing", + )) + + # Append any results not in _ALL_DOC_TYPES (e.g. avv, dsfa) at the end + extras = [r for r in results + if (r.doc_type if r.doc_type not in ("datenschutz", "privacy") else "dse") + not in _ALL_DOC_TYPES] + ordered.extend(extras) + return ordered + + _COMPOUND_TLDS = { "co.uk", "co.jp", "co.nz", "co.kr", "co.za", "co.in", "com.au", "com.br", "com.mx", "com.tr", "com.sg", @@ -603,9 +660,21 @@ def _apply_profile_filter(result, profile, doc_type: str): for check in result.checks: cid = check.id.lower() - # ODR/OS-Link only relevant for B2C online shops + # ODR/OS-Link: relevant ONLY for B2C online shops. The check's + # default hint is written for B2B (it explains why it's not + # relevant) — for B2C we must replace it with action-oriented + # guidance, otherwise the report contradicts itself. if "odr" in cid or "os-link" in cid or "streitbeilegung" in check.label.lower(): - if not profile.needs_odr: + if profile.needs_odr: + if not check.passed: + check.hint = ( + "Als B2C-Anbieter muessen Sie nach Art. 14 EU-VO 524/2013 " + "auf die OS-Plattform (https://ec.europa.eu/consumers/odr) " + "verlinken — klickbarer Link, nicht nur Text. Zusaetzlich " + "§36 VSBG: angeben, ob Sie an Verbraucher-" + "Streitbeilegungsverfahren teilnehmen (oder nicht)." + ) + else: check.skipped = True check.hint = "Nicht relevant (kein B2C Online-Shop)" @@ -643,8 +712,19 @@ _DOC_TYPE_LABELS = { "loeschkonzept": "Loeschkonzept", "dsfa": "Datenschutz-Folgenabschaetzung", "social_media": "Social Media Datenschutz", + "nutzungsbedingungen": "Nutzungsbedingungen", + "dsb": "DSB-Kontakt", } +# Canonical 8 doc types in the same order as the frontend ComplianceCheckTab. +# The route pads `results` to always contain an entry for each — even if +# the user did not submit a URL — so the email + frontend always show +# the complete checklist (missing rows marked as 'Nicht eingereicht'). +_ALL_DOC_TYPES = [ + "dse", "impressum", "social_media", "cookie", + "agb", "nutzungsbedingungen", "widerruf", "dsb", +] + def _doc_type_label(doc_type: str) -> str: return _DOC_TYPE_LABELS.get(doc_type, doc_type.upper()) diff --git a/backend-compliance/compliance/api/agent_doc_check_report.py b/backend-compliance/compliance/api/agent_doc_check_report.py index c41efe29..739ec8b9 100644 --- a/backend-compliance/compliance/api/agent_doc_check_report.py +++ b/backend-compliance/compliance/api/agent_doc_check_report.py @@ -184,7 +184,10 @@ def _render_document(html: list[str], r: DocCheckResult) -> None: cpct = r.correctness_pct bar_color = "green" if pct >= 80 else "yellow" if pct >= 50 else "red" status_label = "OK" if pct == 100 else "LUECKENHAFT" if pct >= 50 else "MANGELHAFT" - if r.error: + is_missing = bool(r.error) and r.error.startswith("Nicht eingereicht") + if is_missing: + status_label = "NICHT EINGEREICHT" + elif r.error: status_label = "FEHLER" l1_checks = [c for c in r.checks if c.level == 1] @@ -216,7 +219,16 @@ def _render_document(html: list[str], r: DocCheckResult) -> None: html.append('
') # Body - if r.error: + if is_missing: + html.append( + '
' + 'Keine URL oder Text fuer dieses Dokument angegeben. ' + 'Tragen Sie die Quelle im Compliance-Check Formular nach, ' + 'um diese Pflichtangabe zu pruefen.' + '
' + ) + elif r.error: html.append(f'
{r.error}
') else: html.append('
') diff --git a/backend-compliance/compliance/services/business_profiler.py b/backend-compliance/compliance/services/business_profiler.py index 6b8ac6d3..2f511ec1 100644 --- a/backend-compliance/compliance/services/business_profiler.py +++ b/backend-compliance/compliance/services/business_profiler.py @@ -107,7 +107,13 @@ _EDITORIAL_KEYWORDS = [ ] _INDUSTRY_KEYWORDS = { - "it_services": ["software", "saas", "cloud", "hosting", "api", "plattform"], + # "software/cloud/hosting" are often mentioned in privacy texts of any + # vendor (Cloud-Hosting fuer Newsletter, SaaS-Tools etc.) without making + # the company an IT-services vendor itself. Keep the list deliberately + # narrow: only patterns that strongly suggest IT/SaaS as the core business. + "it_services": ["saas-anbieter", "software-as-a-service", + "ihr saas", "ihre cloud", "hosting-provider", + "api-anbieter", "developer-portal"], "retail": ["shop", "warenkorb", "versand", "lieferung", "einzelhandel"], "healthcare": ["arzt", "praxis", "patient", "gesundheit", "therapie", "klinik"], "legal": ["kanzlei", "rechtsanwalt", "mandant", "anwalt"], @@ -120,7 +126,11 @@ _INDUSTRY_KEYWORDS = { "manufacturing": ["fertigung", "produktion", "maschinenbau", "anlagenbau", "zulieferer", "werkzeugbau", "spritzguss", "cnc", "industrietechnik"], "automotive": ["fahrzeug", "kraftfahrzeug", "kfz", "automobil", "neuwagen", - "gebrauchtwagen", "konfigurator", "modellreihe", "modellpalette"], + "gebrauchtwagen", "fahrzeugempfehlung", "modellreihe", + "modellpalette", "antriebs", "motor", "reifen", "elektroauto", + "verbrenner", "hybridfahrzeug", "leasing", "werkstatt", + "wartung und reparatur", "probefahrt", "bmw", "mercedes", + "audi", "volkswagen", "porsche", "opel"], "media": ["redaktion", "verlag", "medien", "journalismus", "presse"], } diff --git a/backend-compliance/compliance/services/doc_checks/impressum_checks.py b/backend-compliance/compliance/services/doc_checks/impressum_checks.py index 1d80d1fa..1009fb9a 100644 --- a/backend-compliance/compliance/services/doc_checks/impressum_checks.py +++ b/backend-compliance/compliance/services/doc_checks/impressum_checks.py @@ -111,9 +111,16 @@ IMPRESSUM_CHECKLIST = [ "label": "Registergericht benannt (Amtsgericht X)", "level": 2, "parent": "register", "patterns": [ - r"(?:amtsgericht|registergericht)\s+[a-zA-Z\u00c0-\u017e]\w+", + # "Amtsgericht " or "Registergericht " + # Allow colon/dot/dash between keyword and city (BMW writes + # "registergericht: m\u00fcnchen hrb 42243"). + r"(?:amtsgericht|registergericht)[\s:\.\-,]+[a-zA-Z\u00c0-\u017e]\w+", + # "AG " short form r"\bag\s+[a-zA-Z\u00c0-\u017e]\w+", + # "Handelsregister AG/Amtsgericht " r"(?:handelsregister|register)\s+(?:ag|amtsgericht)\s+\w+", + # "Sitz und Registergericht: M\u00fcnchen" \u2014 BMW pattern + r"sitz\s+und\s+registergericht[\s:\.\-,]+[a-zA-Z\u00c0-\u017e]\w+", ], "severity": "LOW", "hint": "Registergericht benennen (z.B. 'Amtsgericht Freiburg' oder 'AG Freiburg'). Beides ist korrekt.",