feat(agent): progress_pct + 6 BMW-Run Verbesserungen

Backend (agent_compliance_check_routes.py): - progress_pct (0-100%) im Job-State, ueber alle Phasen verteilt (Laden 0-30, Profil 35-40, Pruefen 40-80, Banner 80-92, Report 95-100) - Status-Texte vereinheitlicht ("Texte laden X/N", "Pruefen X/N") - Firmenname fuer Email-Subject jetzt aus URL abgeleitet (bmw.de -> "BMW", mercedes-benz.de -> "Mercedes-Benz") statt unzuverlaessigem extracted_profile.companyName (matchte oft juris.de) - E-Mail-Report enthaelt jetzt Banner+TCF-Vendor-Liste (build_provider_list_html) Backend (agent_doc_check_extras.py — neu): - build_scanned_urls_html: gepruefte URLs als Tabelle oben im Report (transparent fuer GF, welche Quellen wirklich gezogen wurden) - Cross-Domain-Hinweis bei >1 netloc (BMW: bmw.de / bmwgroup.com / bmwgroup.jobs — Auffindbarkeit nach Art. 12 DSGVO) - build_provider_list_html: Banner-Box + TCF-Vendor-Tabelle mit Spalten Name | Kategorie | Zweck | Drittland | Rechtsgrundlage Backend (business_profiler.py): - §34d-GewO Versicherungsvermittler-Hinweise zaehlen nicht mehr als "finance"-Industrie (BMW wurde dadurch falsch als B2B/finance erkannt) - Neue Industry "automotive" (Fahrzeug/KFZ/Konfigurator/Modellpalette) - B2B-Keywords: generische Begriffe wie "unternehmen", "beratung", "consulting" entfernt (matchten in jedem Konzerntext) - B2C-Fallback: bei Verbraucher-Signalen ("widerruf", "kunde", redaktioneller Inhalt) tendiert auf b2c statt b2b Frontend (ComplianceCheckTab.tsx): - Progress-Balken mit Width-% und XX%-Anzeige rechts - liest data.progress_pct aus Polling-Response Consent-Tester (dsi_discovery.py): - Cookie-Policy-Extraktion kritisch fixt: wait_for_function bis body.innerText > 500 chars (BMW SPA-Rendering brauchte mehr Zeit) - _extract_text_robust: 3-Strategien-Extraktion (Selektoren -> Body- Cleanup -> P/LI/TD-Tags) - _extract_text_from_iframes: liest OneTrust/Sourcepoint/Usercentrics Iframe-Inhalte (manche Cookie-Policies leben dort) Adressiert alle Findings aus dem BMW-Ground-Truth-Vergleich.
2026-05-16 17:53:14 +02:00
parent 4d1e0a7f8e
commit e61e9d9e2a
6 changed files with 515 additions and 53 deletions
@@ -273,18 +273,35 @@ async def discover_dsi_documents(
            is_self_dsi, self_lang = _matches_dsi_keyword(page_title)
        if is_self_dsi:
            try:
+                # Wait for substantive content to appear (SPAs need time to render).
+                # Polls body.innerText length up to 10s. Many sites (BMW, Daimler)
+                # render via React/Vue after domcontentloaded fires.
+                try:
+                    await page.wait_for_function(
+                        "() => (document.body && document.body.innerText || '').length > 500",
+                        timeout=10000,
+                    )
+                except Exception:
+                    pass  # Continue anyway, extractor below has fallbacks
+
                # Scroll to bottom to trigger lazy-loading of full content
                await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                await page.wait_for_timeout(1500)
                await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                await page.wait_for_timeout(1000)

-                self_text = await page.evaluate("""() => {
-                    const main = document.querySelector('main, article, [role="main"], .content, #content, .bodytext')
-                        || document.body;
-                    return main ? main.innerText : document.body.innerText;
-                }""")
+                self_text = await _extract_text_robust(page)
                self_wc = len(self_text.split()) if self_text else 0
+
+                # If still too short, try iframes on the same host or on a
+                # known CMP host (OneTrust/Sourcepoint embed policies there).
+                if self_wc < 100:
+                    iframe_text = await _extract_text_from_iframes(page)
+                    if iframe_text and len(iframe_text.split()) > self_wc:
+                        self_text = iframe_text
+                        self_wc = len(self_text.split())
+                        logger.info("Self-extraction via iframe for %s: %d words", url, self_wc)
+
                if self_wc >= 100:
                    page_title = await page.title() or url
                    result.documents.append(DiscoveredDSI(
@@ -622,3 +639,83 @@ async def _find_inline_dsi_sections(page: Page) -> list[dict]:
        return sections or []
    except Exception:
        return []
+
+
+async def _extract_text_robust(page: Page) -> str:
+    """Extract the main readable text of a page using three fallbacks.
+
+    A single script is evaluated in the page. It tries, in order:
+
+    1. A list of content-container selectors, most specific first. The
+       first match whose text is longer than 200 characters wins.
+    2. A detached copy of ``<body>`` with nav/header/footer, script/style
+       and cookie/banner-like elements removed.
+    3. The joined text of all ``p, li, dd, td, h1-h4`` elements whose text
+       is longer than 20 characters.
+
+    Args:
+        page: The page to read. It is only evaluated, never navigated.
+
+    Returns:
+        The extracted text with whitespace runs collapsed to single
+        spaces, or ``""`` when nothing was found or evaluation failed.
+    """
+    try:
+        return await page.evaluate("""
+            () => {
+                // A document without a body has nothing to extract.
+                if (!document.body) return '';
+                const collapse = (s) => (s || '').trim().replace(/\\s+/g, ' ');
+                // textContent also returns the source of inline
+                // <script>/<style>/<noscript> nodes, which would inflate
+                // the caller's word count. Strip them from a detached
+                // copy so the live DOM is left untouched.
+                const readableText = (el) => {
+                    const copy = el.cloneNode(true);
+                    copy.querySelectorAll('script, style, noscript')
+                        .forEach(e => e.remove());
+                    return collapse(copy.textContent);
+                };
+                // 1) Specific content containers, most specific first
+                const selectors = [
+                    '.article-content', '.page-content', '.entry-content',
+                    '[class*="content-area"]', '[class*="main-content"]',
+                    '[class*="legal-text"]', '[class*="policy-content"]',
+                    'main article', 'main', 'article',
+                    '[role="main"]', '.content', '#content', '.bodytext',
+                ];
+                for (const sel of selectors) {
+                    const el = document.querySelector(sel);
+                    if (!el) continue;
+                    // Threshold is applied to the collapsed text, the same
+                    // way strategy 2 applies it.
+                    const text = readableText(el);
+                    if (text.length > 200) return text;
+                }
+                // 2) Body minus nav/header/footer/scripts
+                // NOTE(review): the [class*="cookie"] / [id*="cookie"]
+                // filters may also remove the policy container itself on
+                // cookie-policy pages - confirm against real pages.
+                const body = document.body.cloneNode(true);
+                body.querySelectorAll(
+                    'nav, header, footer, script, style, noscript,' +
+                    ' [class*="nav"], [class*="sidebar"], [class*="cookie"],' +
+                    ' [class*="banner"], [id*="cookie"], [id*="banner"]'
+                ).forEach(e => e.remove());
+                const bodyText = collapse(body.textContent);
+                if (bodyText.length > 200) return bodyText;
+                // 3) Final fallback: collect all text-bearing tags
+                const blocks = document.querySelectorAll('p, li, dd, td, h1, h2, h3, h4');
+                const parts = [];
+                for (const b of blocks) {
+                    const t = (b.textContent || '').trim();
+                    // Skip very short fragments.
+                    if (t.length > 20) parts.push(t);
+                }
+                return collapse(parts.join(' '));
+            }
+        """) or ""
+    except Exception as e:
+        # Best-effort: the caller treats an empty string as "no text found".
+        logger.warning("Robust text extraction failed: %s", e)
+        return ""
+
+
+async def _extract_text_from_iframes(page: Page) -> str:
+    """Collect text from iframes on the page's host or on a known CMP host.
+
+    Each non-main frame is read when one of the following holds:
+
+    * its URL has no host,
+    * its host equals the page's host, or
+    * its host contains one of the CMP vendor markers listed below.
+
+    Frames on any other host are skipped. A frame's text is kept only if
+    it has more than 50 whitespace-separated words.
+
+    Args:
+        page: The page whose frames are inspected.
+
+    Returns:
+        The kept frame texts joined by a blank line, or ``""`` when no
+        frame qualified or the extraction failed.
+    """
+    # Substrings that mark a frame host as belonging to a consent-management
+    # vendor. Built once instead of once per frame.
+    cmp_hosts = ("onetrust", "cookiebot", "consensu", "sourcepoint",
+                 "usercentrics", "didomi", "klaro")
+    try:
+        from urllib.parse import urlparse
+        # Host names are case-insensitive, so compare them lower-cased.
+        page_host = urlparse(page.url).netloc.lower()
+        chunks: list[str] = []
+        for frame in page.frames:
+            if frame == page.main_frame:
+                continue
+            try:
+                frame_host = urlparse(frame.url).netloc.lower()
+                # Accept same-host frames, frames without a host, and
+                # known CMP frames. Skip every other foreign host.
+                is_foreign = bool(frame_host) and frame_host != page_host
+                if is_foreign and not any(h in frame_host for h in cmp_hosts):
+                    continue
+                text = await frame.evaluate(
+                    "() => (document.body && document.body.innerText || '').trim()"
+                )
+                if text and len(text.split()) > 50:
+                    chunks.append(text)
+            except Exception:
+                # A single unreadable frame must not abort the others.
+                continue
+        return "\n\n".join(chunks)
+    except Exception as e:
+        logger.debug("Iframe extraction failed: %s", e)
+        return ""