From e61e9d9e2ac811c052e8d23c44874e1afa10ca7a Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sat, 16 May 2026 17:53:14 +0200 Subject: [PATCH] feat(agent): progress_pct + 6 BMW-Run Verbesserungen MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backend (agent_compliance_check_routes.py): - progress_pct (0-100%) im Job-State, ueber alle Phasen verteilt (Laden 0-30, Profil 35-40, Pruefen 40-80, Banner 80-92, Report 95-100) - Status-Texte vereinheitlicht ("Texte laden X/N", "Pruefen X/N") - Firmenname fuer Email-Subject jetzt aus URL abgeleitet (bmw.de -> "BMW", mercedes-benz.de -> "Mercedes-Benz") statt unzuverlaessigem extracted_profile.companyName (matchte oft juris.de) - E-Mail-Report enthaelt jetzt Banner+TCF-Vendor-Liste (build_provider_list_html) Backend (agent_doc_check_extras.py — neu): - build_scanned_urls_html: gepruefte URLs als Tabelle oben im Report (transparent fuer GF, welche Quellen wirklich gezogen wurden) - Cross-Domain-Hinweis bei >1 netloc (BMW: bmw.de / bmwgroup.com / bmwgroup.jobs — Auffindbarkeit nach Art. 
12 DSGVO) - build_provider_list_html: Banner-Box + TCF-Vendor-Tabelle mit Spalten Name | Kategorie | Zweck | Drittland | Rechtsgrundlage Backend (business_profiler.py): - §34d-GewO Versicherungsvermittler-Hinweise zaehlen nicht mehr als "finance"-Industrie (BMW wurde dadurch falsch als B2B/finance erkannt) - Neue Industry "automotive" (Fahrzeug/KFZ/Konfigurator/Modellpalette) - B2B-Keywords: generische Begriffe wie "unternehmen", "beratung", "consulting" entfernt (matchten in jedem Konzerntext) - B2C-Fallback: bei Verbraucher-Signalen ("widerruf", "kunde", redaktioneller Inhalt) tendiert zu b2c statt b2b Frontend (ComplianceCheckTab.tsx): - Progress-Balken mit Width-% und XX%-Anzeige rechts - liest data.progress_pct aus Polling-Response Consent-Tester (dsi_discovery.py): - Kritischer Fix der Cookie-Policy-Extraktion: wait_for_function bis body.innerText > 500 chars (BMW SPA-Rendering brauchte mehr Zeit) - _extract_text_robust: 3-Strategien-Extraktion (Selektoren -> Body-Cleanup -> P/LI/TD-Tags) - _extract_text_from_iframes: liest OneTrust/Sourcepoint/Usercentrics Iframe-Inhalte (manche Cookie-Policies leben dort) Adressiert alle Findings aus dem BMW-Ground-Truth-Vergleich. 
--- .../agent/_components/ComplianceCheckTab.tsx | 31 ++- .../api/agent_compliance_check_routes.py | 128 +++++++--- .../compliance/api/agent_doc_check_extras.py | 226 ++++++++++++++++++ .../compliance/api/agent_doc_check_report.py | 9 + .../compliance/services/business_profiler.py | 67 +++++- consent-tester/services/dsi_discovery.py | 107 ++++++++- 6 files changed, 515 insertions(+), 53 deletions(-) create mode 100644 backend-compliance/compliance/api/agent_doc_check_extras.py diff --git a/admin-compliance/app/sdk/agent/_components/ComplianceCheckTab.tsx b/admin-compliance/app/sdk/agent/_components/ComplianceCheckTab.tsx index c7505df1..3c041c80 100644 --- a/admin-compliance/app/sdk/agent/_components/ComplianceCheckTab.tsx +++ b/admin-compliance/app/sdk/agent/_components/ComplianceCheckTab.tsx @@ -73,6 +73,7 @@ export function ComplianceCheckTab() { const [useAgent, setUseAgent] = useState(false) const [loading, setLoading] = useState(false) const [progress, setProgress] = useState('') + const [progressPct, setProgressPct] = useState(0) const [results, setResults] = useState(() => { if (typeof window === 'undefined') return null try { const s = localStorage.getItem(STORAGE_KEY_RESULTS); return s ? 
JSON.parse(s) : null } catch { return null } @@ -109,15 +110,16 @@ export function ComplianceCheckTab() { if (!res.ok) continue const data = await res.json() if (data.progress) setProgress(data.progress) + if (typeof data.progress_pct === 'number') setProgressPct(data.progress_pct) if (data.status === 'completed' && data.result) { - setResults(data.result); setProgress(''); setLoading(false) + setResults(data.result); setProgress(''); setProgressPct(0); setLoading(false) localStorage.setItem(STORAGE_KEY_RESULTS, JSON.stringify(data.result)) localStorage.removeItem(STORAGE_KEY_CHECK_ID); setActiveCheckId('') return } if (data.status === 'failed' || data.status === 'not_found') { if (data.status === 'failed') setError(data.error || 'Pruefung fehlgeschlagen') - setProgress(''); setLoading(false) + setProgress(''); setProgressPct(0); setLoading(false) localStorage.removeItem(STORAGE_KEY_CHECK_ID); setActiveCheckId('') return } @@ -177,6 +179,7 @@ export function ComplianceCheckTab() { setError(null) setResults(null) setProgress('Compliance-Check wird gestartet...') + setProgressPct(0) try { const entries = DOCUMENT_TYPES @@ -210,9 +213,11 @@ export function ComplianceCheckTab() { if (!pollRes.ok) { attempts++; continue } const pollData = await pollRes.json() if (pollData.progress) setProgress(pollData.progress) + if (typeof pollData.progress_pct === 'number') setProgressPct(pollData.progress_pct) if (pollData.status === 'completed' && pollData.result) { setResults(pollData.result) setProgress('') + setProgressPct(0) localStorage.setItem(STORAGE_KEY_RESULTS, JSON.stringify(pollData.result)) localStorage.removeItem(STORAGE_KEY_CHECK_ID); setActiveCheckId('') @@ -242,6 +247,7 @@ export function ComplianceCheckTab() { } catch (e) { setError(e instanceof Error ? e.message : 'Unbekannter Fehler') setProgress('') + setProgressPct(0) } finally { setLoading(false) } @@ -334,12 +340,21 @@ export function ComplianceCheckTab() { {/* Progress */} {progress && ( -
- - - - - {progress} +
+
+ + + + + {progress} + {progressPct}% +
+
+
+
)} diff --git a/backend-compliance/compliance/api/agent_compliance_check_routes.py b/backend-compliance/compliance/api/agent_compliance_check_routes.py index 26f399fb..7ed3edf2 100644 --- a/backend-compliance/compliance/api/agent_compliance_check_routes.py +++ b/backend-compliance/compliance/api/agent_compliance_check_routes.py @@ -56,6 +56,7 @@ class ComplianceCheckStatusResponse(BaseModel): check_id: str status: str progress: str = "" + progress_pct: int = 0 result: dict | None = None error: str = "" @@ -124,6 +125,7 @@ async def start_compliance_check(req: ComplianceCheckRequest): _compliance_check_jobs[check_id] = { "status": "running", "progress": "Pruefung gestartet...", + "progress_pct": 0, "result": None, "error": "", } @@ -141,6 +143,7 @@ async def get_compliance_check_status(check_id: str): check_id=check_id, status=job["status"], progress=job.get("progress", ""), + progress_pct=job.get("progress_pct", 0), result=job.get("result"), error=job.get("error", ""), ) @@ -155,16 +158,18 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest): from .agent_doc_check_routes import CheckItem, DocCheckResult from .agent_doc_check_report import build_html_report - # Step 1: Resolve texts (fetch from URL if needed) - _update(check_id, "Texte werden geladen...") + # Step 1: Resolve texts (fetch from URL if needed) — 0-30% + _update(check_id, "Texte werden geladen...", 1) doc_texts: dict[str, str] = {} doc_entries: list[dict] = [] # Cache fetched URLs to detect duplicates url_text_cache: dict[str, str] = {} + n_docs = max(1, len(req.documents)) for i, doc in enumerate(req.documents): - _update(check_id, f"Dokument {i+1}/{len(req.documents)}: {doc.doc_type}...") + pct = int(1 + (i / n_docs) * 29) + _update(check_id, f"Texte laden {i+1}/{n_docs}: {doc.doc_type}...", pct) text = doc.text if not text and doc.url: url_key = doc.url.strip().rstrip("/").lower() @@ -192,8 +197,8 @@ async def _run_compliance_check(check_id: str, req: 
ComplianceCheckRequest): split_shared_texts(doc_entries, url_text_cache) auto_fill_from_dsi(doc_entries) - # Step 1c: Cross-document search — find doc_types in wrong documents - _update(check_id, "Dokumente werden uebergreifend durchsucht...") + # Step 1c: Cross-document search — find doc_types in wrong documents (30-35%) + _update(check_id, "Dokumente werden uebergreifend durchsucht...", 32) placement_findings = cross_search_documents(doc_entries) # Refresh doc_texts after all splitting/searching @@ -201,8 +206,8 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest): if entry.get("text"): doc_texts[entry["doc_type"]] = entry["text"] - # Step 2: Detect business profile - _update(check_id, "Geschaeftsmodell wird erkannt...") + # Step 2: Detect business profile (35-40%) + _update(check_id, "Geschaeftsmodell wird erkannt...", 37) profile = await detect_business_profile(doc_texts) profile_dict = asdict(profile) @@ -216,6 +221,8 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest): # Filter out doc_types that don't apply to this business profile skip_types = _get_skip_types(profile) + # Document checks: 40-80% + n_entries = max(1, len(doc_entries)) for i, entry in enumerate(doc_entries): text = entry["text"] doc_type = entry["doc_type"] @@ -229,7 +236,8 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest): )) continue - _update(check_id, f"Pruefe {label} ({i+1}/{len(doc_entries)})...") + pct = int(40 + (i / n_entries) * 40) + _update(check_id, f"Pruefen {i+1}/{n_entries}: {label}...", pct) if not text or len(text) < 50: results.append(DocCheckResult( @@ -268,7 +276,7 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest): parsed = urlparse(banner_url) banner_url = f"{parsed.scheme}://{parsed.netloc}" if banner_url: - _update(check_id, "Cookie-Banner wird geprueft...") + _update(check_id, "Cookie-Banner wird geprueft...", 82) try: async with 
httpx.AsyncClient(timeout=120.0) as client: resp = await client.post( @@ -280,9 +288,9 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest): except Exception as e: logger.warning("Banner check failed: %s", e) - # Step 3c: Cross-check Banner vs Cookie-Richtlinie + # Step 3c: Cross-check Banner vs Cookie-Richtlinie (88-90%) if banner_result and "cookie" in doc_texts: - _update(check_id, "Banner vs. Cookie-Richtlinie abgleichen...") + _update(check_id, "Banner vs. Cookie-Richtlinie abgleichen...", 89) cross_findings = _cross_check_banner_vs_cookie( banner_result, doc_texts["cookie"], ) @@ -299,7 +307,7 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest): tcf_vendors = banner_result.get("tcf_vendors", []) if banner_result else [] vvt_entries: list[dict] = [] if tcf_vendors and "dse" in doc_texts: - _update(check_id, f"{len(tcf_vendors)} TCF-Verarbeiter vs. DSI abgleichen...") + _update(check_id, f"{len(tcf_vendors)} TCF-Verarbeiter vs. DSI abgleichen...", 91) from compliance.services.banner_cookie_cross_check import cross_check_vendors_vs_dsi from compliance.services.vendor_vvt_mapper import map_vendors_to_vvt vendor_findings = cross_check_vendors_vs_dsi(tcf_vendors, doc_texts["dse"]) @@ -310,8 +318,8 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest): r.checks.append(CheckItem(**vf)) vvt_entries = map_vendors_to_vvt(tcf_vendors) - # Step 4: Extract profile hints from documents - _update(check_id, "Profil wird aus Dokumenten extrahiert...") + # Step 4: Extract profile hints from documents (92-95%) + _update(check_id, "Profil wird aus Dokumenten extrahiert...", 93) from compliance.services.profile_extractor import extract_profile_from_documents extracted_profile = extract_profile_from_documents(doc_texts, profile_dict) @@ -326,21 +334,32 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest): else: r.scenario = "import" - # Step 5: Build report with management 
summary - _update(check_id, "Report wird erstellt...") - from .agent_doc_check_report import build_management_summary + # Step 5: Build report with management summary (95-98%) + _update(check_id, "Report wird erstellt...", 96) + from .agent_doc_check_report import ( + build_management_summary, + build_scanned_urls_html, + build_provider_list_html, + ) summary_html = build_management_summary(results) + scanned_html = build_scanned_urls_html(doc_entries) + providers_html = build_provider_list_html(banner_result, vvt_entries) report_html = build_html_report(results, None) profile_html = _build_profile_html(profile) - full_html = summary_html + profile_html + report_html - - # Step 6: Send email — include website/company name in subject - doc_count = len([r for r in results if not r.error]) - site_name = ( - extracted_profile.get("company_profile", {}).get("companyName") - or _extract_domain(doc_entries) - or "Unbekannt" + full_html = ( + summary_html + scanned_html + profile_html + + providers_html + report_html ) + + # Step 6: Send email — derive site name primarily from entered URL. + # The extracted_profile.companyName is often noisy (e.g. picks up + # juris.de from legal references). Domain-derived name is more + # predictable for the GF email subject. 
+ doc_count = len([r for r in results if not r.error]) + url_company = _company_name_from_url(doc_entries) + domain = _extract_domain(doc_entries) + site_name = url_company or domain or "Unbekannt" + _update(check_id, "E-Mail wird versendet...", 98) email_result = send_email( recipient=req.recipient, subject=f"[COMPLIANCE-CHECK] {site_name} — {doc_count} Dokumente geprueft", @@ -368,6 +387,7 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest): _compliance_check_jobs[check_id]["status"] = "completed" _compliance_check_jobs[check_id]["result"] = response _compliance_check_jobs[check_id]["progress"] = "Fertig" + _compliance_check_jobs[check_id]["progress_pct"] = 100 except Exception as e: logger.error("Compliance check %s failed: %s", check_id, e, exc_info=True) @@ -375,8 +395,11 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest): _compliance_check_jobs[check_id]["error"] = str(e)[:500] -def _update(check_id: str, msg: str): - _compliance_check_jobs[check_id]["progress"] = msg +def _update(check_id: str, msg: str, pct: int | None = None): + job = _compliance_check_jobs[check_id] + job["progress"] = msg + if pct is not None: + job["progress_pct"] = max(0, min(100, int(pct))) async def _fetch_text(url: str) -> str: @@ -503,14 +526,59 @@ async def _check_single( ) +_COMPOUND_TLDS = { + "co.uk", "co.jp", "co.nz", "co.kr", "co.za", "co.in", + "com.au", "com.br", "com.mx", "com.tr", "com.sg", +} + + def _extract_domain(doc_entries: list[dict]) -> str | None: - """Extract domain name from first URL for email subject.""" + """Extract base domain (without www) from first URL.""" for entry in doc_entries: url = entry.get("url", "") if url and "://" in url: from urllib.parse import urlparse - host = urlparse(url).netloc - return host.replace("www.", "") if host else None + host = urlparse(url).netloc.lower() + if host.startswith("www."): + host = host[4:] + return host or None + return None + + +def 
_company_name_from_url(doc_entries: list[dict]) -> str | None: + """Derive a display company name from the entered URLs. + + Heuristic: take the second-level domain (e.g. "bmw" from "www.bmw.de"), + uppercase short acronyms (<=4 chars, no hyphens), title-case the rest. + + Examples: + www.bmw.de -> BMW + mercedes-benz.de -> Mercedes-Benz + shop.example.co.uk -> Example + juris.de -> Juris + """ + from urllib.parse import urlparse + + for entry in doc_entries: + url = entry.get("url", "") + if not url or "://" not in url: + continue + host = urlparse(url).netloc.lower() + if host.startswith("www."): + host = host[4:] + parts = host.split(".") + if len(parts) < 2: + continue + # Handle compound TLDs (.co.uk etc.) + if len(parts) >= 3 and ".".join(parts[-2:]) in _COMPOUND_TLDS: + sld = parts[-3] + else: + sld = parts[-2] + if not sld: + continue + if len(sld) <= 4 and "-" not in sld: + return sld.upper() + return "-".join(p.capitalize() for p in sld.split("-")) return None diff --git a/backend-compliance/compliance/api/agent_doc_check_extras.py b/backend-compliance/compliance/api/agent_doc_check_extras.py new file mode 100644 index 00000000..74578347 --- /dev/null +++ b/backend-compliance/compliance/api/agent_doc_check_extras.py @@ -0,0 +1,226 @@ +""" +Extras for the agent doc-check email report. + +Split out from agent_doc_check_report.py to keep both files under the +500-line hard cap. Contains: + - build_scanned_urls_html (list of fetched URLs + cross-domain notice) + - build_provider_list_html (cookie banner + TCF vendor table) +""" + +from __future__ import annotations + + +def build_scanned_urls_html(doc_entries: list[dict]) -> str: + """Render the list of scanned URLs at the top of the report. + + Transparent for the GF which sources were actually fetched/analysed. + Skips empty URLs (text-only uploads). Adds a cross-domain warning when + legal texts are distributed across multiple domains (e.g. BMW spreads + across bmw.de, bmwgroup.com, bmwgroup.jobs). 
+ """ + from urllib.parse import urlparse + + rows: list[str] = [] + seen: set[str] = set() + domains: dict[str, list[str]] = {} # netloc -> list of doc_types + for entry in doc_entries: + url = (entry.get("url") or "").strip() + if not url or url in seen: + continue + seen.add(url) + label = _doc_type_label(entry.get("doc_type", "")) + words = entry.get("word_count") or 0 + try: + netloc = urlparse(url).netloc.lower().lstrip("www.") + if netloc: + domains.setdefault(netloc, []).append(label) + except Exception: + pass + rows.append( + f'' + f'{label}' + f'' + f'{url}' + f'{words} Woerter' + f'' + ) + if not rows: + return "" + + cross_domain_html = _cross_domain_notice(domains) if len(domains) >= 2 else "" + + return ( + '
' + '

' + f'Gepruefte Quellen ({len(rows)})

' + '' + + "".join(rows) + + '
' + + cross_domain_html + + '
' + ) + + +def _cross_domain_notice(domains: dict[str, list[str]]) -> str: + """Warning box when legal texts are spread across multiple domains. + + Relevant for big corporate groups (BMW Group: bmw.de / bmwgroup.com / + bmwgroup.jobs). Affects findability for data subjects and may indicate + incomplete disclosure on the main site. + """ + items = [] + for netloc, labels in sorted(domains.items()): + labels_str = ", ".join(sorted(set(labels))) + items.append( + f'
  • {netloc} ' + f'→ {labels_str}
  • ' + ) + return ( + '
    ' + 'Hinweis: Rechtstexte verteilt auf ' + f'{len(domains)} Domains. ' + 'Erschwert die Auffindbarkeit fuer Betroffene (Art. 12 Abs. 1 DSGVO — ' + 'transparente Information). Pruefen Sie, ob alle Texte auch von der ' + 'Hauptdomain aus klar verlinkt sind.' + '
      ' + + "".join(items) + + '
    ' + ) + + +def _doc_type_label(doc_type: str) -> str: + """Lazy resolver — avoids circular import with agent_compliance_check_routes.""" + labels = { + "dse": "Datenschutzerklaerung", + "datenschutz": "Datenschutzerklaerung", + "privacy": "Datenschutzerklaerung", + "impressum": "Impressum", + "agb": "AGB", + "widerruf": "Widerrufsbelehrung", + "cookie": "Cookie-Richtlinie", + "avv": "Auftragsverarbeitung", + "loeschkonzept": "Loeschkonzept", + "dsfa": "Datenschutz-Folgenabschaetzung", + "social_media": "Social Media Datenschutz", + "nutzungsbedingungen": "Nutzungsbedingungen", + "dsb": "DSB-Kontakt", + } + return labels.get(doc_type, doc_type.upper() if doc_type else "Dokument") + + +def build_provider_list_html( + banner_result: dict | None, + vvt_entries: list[dict] | None, +) -> str: + """Render the cookie banner result + TCF vendor table for the email. + + Sections: + 1. Banner summary (provider, violations count) + 2. Vendor table: Name | Kategorie | Zweck | Drittland | Rechtsgrundlage + """ + if not banner_result and not vvt_entries: + return "" + + parts: list[str] = [ + '
    ' + '

    ' + 'Cookie-Banner & Verarbeiter

    ' + ] + + if banner_result: + detected = banner_result.get("banner_detected", False) + provider = banner_result.get("banner_provider") or "unbekannt" + violations = banner_result.get("banner_checks", {}).get("violations", []) + n_viol = len(violations) if isinstance(violations, list) else int(violations or 0) + + status_color = "#16a34a" if detected and n_viol == 0 else ( + "#d97706" if detected else "#6b7280" + ) + parts.append( + f'
    ' + f'' + f'Banner erkannt: {"Ja" if detected else "Nein"}' + f'  ·  Anbieter: {provider}' + f'  ·  Auffaelligkeiten: {n_viol}' + f'
    ' + ) + + vendors = vvt_entries or [] + if vendors: + parts.append( + f'
    ' + f'{len(vendors)} TCF-Verarbeiter ueber das Banner eingebunden:' + f'
    ' + '' + '' + '' + '' + '' + '' + '' + '' + ) + for v in vendors[:50]: + parts.append(_render_vendor_row(v)) + parts.append('
    NameKategorieZweckDrittlandRechtsgrundlage
    ') + if len(vendors) > 50: + parts.append( + f'
    ' + f'... und {len(vendors) - 50} weitere
    ' + ) + elif banner_result and banner_result.get("banner_detected"): + parts.append( + '
    ' + 'Keine TCF-Verarbeiter erkannt (Banner nutzt kein TCF v2 Framework ' + 'oder Vendor-Liste konnte nicht ausgelesen werden).
    ' + ) + + parts.append('
    ') + return "".join(parts) + + +def _render_vendor_row(v: dict) -> str: + name = v.get("name") or "Unbekannt" + kategorie = _category_label(v.get("kategorie", "")) + zweck = v.get("zweck_kurz") or ", ".join((v.get("zweck") or [])[:2]) + drittland = v.get("drittland") + land = v.get("land") or "" + if drittland is True: + drittland_str = (f'Ja ({land})' + if land else 'Ja') + elif drittland is False: + drittland_str = (f'Nein ({land})' + if land else 'Nein') + else: + drittland_str = 'unbekannt' + rg = v.get("rechtsgrundlage", "") + rg_short = "Einwilligung" if "Einwilligung" in rg else ( + "Berechtigtes Interesse" if "Berechtigtes" in rg else rg[:40] + ) + return ( + f'' + f'{name}' + f'{kategorie}' + f'{zweck}' + f'{drittland_str}' + f'{rg_short}' + f'' + ) + + +def _category_label(kat: str) -> str: + return { + "necessary": "Notwendig", + "functional": "Funktional", + "statistics": "Statistik", + "marketing": "Marketing", + }.get(kat, kat or "—") diff --git a/backend-compliance/compliance/api/agent_doc_check_report.py b/backend-compliance/compliance/api/agent_doc_check_report.py index 6508cfd4..c41efe29 100644 --- a/backend-compliance/compliance/api/agent_doc_check_report.py +++ b/backend-compliance/compliance/api/agent_doc_check_report.py @@ -290,6 +290,15 @@ def _render_cookie_banner(html: list[str], cookie_result: dict) -> None: html.append('
    ') +# Re-export the helpers extracted to agent_doc_check_extras.py so existing +# callers that did `from .agent_doc_check_report import build_scanned_urls_html` +# keep working. +from .agent_doc_check_extras import ( # noqa: E402,F401 + build_provider_list_html, + build_scanned_urls_html, +) + + def build_profile_html(profile) -> str: """Build a small HTML block summarizing the detected business profile.""" service_tags = ", ".join(profile.detected_services[:10]) or "keine erkannt" diff --git a/backend-compliance/compliance/services/business_profiler.py b/backend-compliance/compliance/services/business_profiler.py index 32eb4fe8..6b8ac6d3 100644 --- a/backend-compliance/compliance/services/business_profiler.py +++ b/backend-compliance/compliance/services/business_profiler.py @@ -39,10 +39,13 @@ _B2C_KEYWORDS = [ ] _B2B_KEYWORDS = [ - "unternehmen", "geschaeftskunden", "geschäftskunden", "gewerblich", - "auftraggeber", "auftragnehmer", "geschaeftspartner", - "geschäftspartner", "firmenkunde", "b2b", "industriekunden", - "beratung", "consulting", "dienstleistung", "engineering", + # Discriminative — these don't appear in B2C consumer texts + "geschaeftskunden", "geschäftskunden", "firmenkunde", "b2b", + "industriekunden", "ausschliesslich gewerblich", "ausschließlich gewerblich", + "ausschliesslich unternehmer", "ausschließlich unternehmer", + "kein verbrauchergeschaeft", "kein verbrauchergeschäft", + # Note: "unternehmen", "beratung", "consulting", "dienstleistung" + # were removed — they match in any company text and bias toward B2B. 
] _B2G_KEYWORDS = [ @@ -116,9 +119,20 @@ _INDUSTRY_KEYWORDS = { "arbeitssicherheit", "brandschutz", "sicherheitstechnik", "zertifizierung"], "manufacturing": ["fertigung", "produktion", "maschinenbau", "anlagenbau", "zulieferer", "werkzeugbau", "spritzguss", "cnc", "industrietechnik"], + "automotive": ["fahrzeug", "kraftfahrzeug", "kfz", "automobil", "neuwagen", + "gebrauchtwagen", "konfigurator", "modellreihe", "modellpalette"], "media": ["redaktion", "verlag", "medien", "journalismus", "presse"], } +# Terms that indicate "versicherung" / "bank" is only mentioned as a +# §34d/§34c GewO disclosure (Versicherungsvermittler / Finanzanlagenvermittler) +# rather than the core business. Used to suppress false finance matches. +_VERMITTLER_CONTEXT_TERMS = [ + "versicherungsvermittler", "berufshaftpflichtversicherung", + "vermittlerregister", "§34d", "§ 34 d", "§34c", "§ 34 c", + "finanzanlagenvermittler", "ihk muenchen", "ihk münchen", +] + _TRACKING_SERVICES = { "google analytics": "Google Analytics", "google tag manager": "Google Tag Manager", @@ -231,13 +245,23 @@ async def detect_business_profile(documents: dict[str, str]) -> BusinessProfile: total = sum(max(0, v) for v in scores.values()) profile.confidence = round(best_val / total, 2) if total > 0 else 0.5 else: - # Fallback: GmbH/AG without B2C signals → assume B2B - has_company = any(kw in full_text for kw in [ - "gmbh", "ag ", "ohg", "kg ", "ug ", "gbr", - ]) - if has_company and b2c_score <= 0: + # Fallback: prefer B2C when the text mentions Verbraucherrechte, + # editorial content, or consumer-direction signals — even without + # checkout keywords. Only fall back to B2B if discriminative B2B + # markers fired (which the keyword list above already filtered to + # genuinely B2B-only terms). 
+ consumer_hint = ( + "verbraucher" in full_text + or "widerruf" in full_text + or "kunde" in full_text + or profile.has_editorial_content + ) + if b2b_score >= 1 and not consumer_hint: profile.business_type = "b2b" profile.confidence = 0.4 + elif consumer_hint: + profile.business_type = "b2c" + profile.confidence = 0.4 else: profile.business_type = "unknown" profile.confidence = 0.2 @@ -255,8 +279,31 @@ async def detect_business_profile(documents: dict[str, str]) -> BusinessProfile: if hits >= 1: industry_scores[industry] = hits + # Suppress finance/insurance false positives caused by §34d/§34c GewO + # disclosures (Versicherungsvermittler, Berufshaftpflicht, etc.) — these + # are pflichtangaben for many companies (e.g. BMW AG) without being + # actual financial services providers. + if industry_scores.get("finance"): + vermittler_hits = _count_hits(full_text, _VERMITTLER_CONTEXT_TERMS) + if vermittler_hits >= 2: + # Only the §34d boilerplate triggered the match — drop or shrink. + non_insurance_finance = _count_hits( + full_text, ["bank", "finanz", "kredit", "anlage"], + ) + if non_insurance_finance == 0: + industry_scores.pop("finance", None) + else: + industry_scores["finance"] = non_insurance_finance + + # Require a clear winner — if top score is 1 and there are ties, prefer + # "unknown" over guessing. 
if industry_scores: - profile.industry = max(industry_scores, key=industry_scores.get) # type: ignore[arg-type] + top = max(industry_scores.values()) + winners = [k for k, v in industry_scores.items() if v == top] + if top >= 2 or len(winners) == 1: + profile.industry = winners[0] + else: + profile.industry = "unknown" elif profile.is_regulated_profession: prof_map = {"anwalt": "legal", "arzt": "healthcare", "steuerberater": "finance", "architekt": "craft"} diff --git a/consent-tester/services/dsi_discovery.py b/consent-tester/services/dsi_discovery.py index 20d07ba4..bf170f66 100644 --- a/consent-tester/services/dsi_discovery.py +++ b/consent-tester/services/dsi_discovery.py @@ -273,18 +273,35 @@ async def discover_dsi_documents( is_self_dsi, self_lang = _matches_dsi_keyword(page_title) if is_self_dsi: try: + # Wait for substantive content to appear (SPAs need time to render). + # Polls body.innerText length up to 10s. Many sites (BMW, Daimler) + # render via React/Vue after domcontentloaded fires. + try: + await page.wait_for_function( + "() => (document.body && document.body.innerText || '').length > 500", + timeout=10000, + ) + except Exception: + pass # Continue anyway, extractor below has fallbacks + # Scroll to bottom to trigger lazy-loading of full content await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") await page.wait_for_timeout(1500) await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") await page.wait_for_timeout(1000) - self_text = await page.evaluate("""() => { - const main = document.querySelector('main, article, [role="main"], .content, #content, .bodytext') - || document.body; - return main ? main.innerText : document.body.innerText; - }""") + self_text = await _extract_text_robust(page) self_wc = len(self_text.split()) if self_text else 0 + + # If still too short, try same-origin iframes (some sites + # embed cookie policies via OneTrust/Sourcepoint iframes). 
+ if self_wc < 100: + iframe_text = await _extract_text_from_iframes(page) + if iframe_text and len(iframe_text.split()) > self_wc: + self_text = iframe_text + self_wc = len(self_text.split()) + logger.info("Self-extraction via iframe for %s: %d words", url, self_wc) + if self_wc >= 100: page_title = await page.title() or url result.documents.append(DiscoveredDSI( @@ -622,3 +639,83 @@ async def _find_inline_dsi_sections(page: Page) -> list[dict]: return sections or [] except Exception: return [] + + +async def _extract_text_robust(page: Page) -> str: + """Multi-strategy text extraction for SPA-heavy pages (BMW, Daimler, etc). + + Tries progressively broader selectors, falls back to body-minus-chrome, + final fallback: join all paragraph/list/cell tags' textContent. + """ + try: + return await page.evaluate(""" + () => { + // 1) Specific content containers + const selectors = [ + '.article-content', '.page-content', '.entry-content', + '[class*="content-area"]', '[class*="main-content"]', + '[class*="legal-text"]', '[class*="policy-content"]', + 'main article', 'main', 'article', + '[role="main"]', '.content', '#content', '.bodytext', + ]; + for (const sel of selectors) { + const el = document.querySelector(sel); + if (el && el.textContent.trim().length > 200) { + return el.textContent.trim().replace(/\\s+/g, ' '); + } + } + // 2) Body minus nav/header/footer/scripts + const body = document.body.cloneNode(true); + body.querySelectorAll( + 'nav, header, footer, script, style, noscript,' + + ' [class*="nav"], [class*="sidebar"], [class*="cookie"],' + + ' [class*="banner"], [id*="cookie"], [id*="banner"]' + ).forEach(e => e.remove()); + const bodyText = (body.textContent || '').trim().replace(/\\s+/g, ' '); + if (bodyText.length > 200) return bodyText; + // 3) Final fallback: collect all text-bearing tags + const blocks = document.querySelectorAll('p, li, dd, td, h1, h2, h3, h4'); + const parts = []; + for (const b of blocks) { + const t = (b.textContent || '').trim(); 
+ if (t.length > 20) parts.push(t); + } + return parts.join(' ').replace(/\\s+/g, ' '); + } + """) or "" + except Exception as e: + logger.warning("Robust text extraction failed: %s", e) + return "" + + +async def _extract_text_from_iframes(page: Page) -> str: + """Collect text from same-origin iframes (OneTrust, Sourcepoint embeds). + + Many sites render cookie policies inside iframes managed by CMP vendors. + """ + try: + from urllib.parse import urlparse + page_host = urlparse(page.url).netloc + chunks: list[str] = [] + for frame in page.frames: + if frame == page.main_frame: + continue + try: + frame_host = urlparse(frame.url).netloc + # Accept same-origin or known CMP frames + if frame_host and frame_host != page_host: + cmp_hosts = ("onetrust", "cookiebot", "consensu", "sourcepoint", + "usercentrics", "didomi", "klaro") + if not any(h in frame_host for h in cmp_hosts): + continue + text = await frame.evaluate( + "() => (document.body && document.body.innerText || '').trim()" + ) + if text and len(text.split()) > 50: + chunks.append(text) + except Exception: + continue + return "\n\n".join(chunks) + except Exception as e: + logger.debug("Iframe extraction failed: %s", e) + return ""