feat(agent): progress_pct + 6 BMW-Run Verbesserungen
Backend (agent_compliance_check_routes.py):
- progress_pct (0-100%) im Job-State, ueber alle Phasen verteilt
(Laden 0-30, Profil 35-40, Pruefen 40-80, Banner 80-92, Report 95-100)
- Status-Texte vereinheitlicht ("Texte laden X/N", "Pruefen X/N")
- Firmenname fuer Email-Subject jetzt aus URL abgeleitet
(bmw.de -> "BMW", mercedes-benz.de -> "Mercedes-Benz") statt
unzuverlaessigem extracted_profile.companyName (matchte oft juris.de)
- E-Mail-Report enthaelt jetzt Banner+TCF-Vendor-Liste (build_provider_list_html)
Backend (agent_doc_check_extras.py — neu):
- build_scanned_urls_html: gepruefte URLs als Tabelle oben im Report
(transparent fuer GF, welche Quellen wirklich gezogen wurden)
- Cross-Domain-Hinweis bei >1 netloc (BMW: bmw.de / bmwgroup.com /
bmwgroup.jobs — Auffindbarkeit nach Art. 12 DSGVO)
- build_provider_list_html: Banner-Box + TCF-Vendor-Tabelle mit Spalten
Name | Kategorie | Zweck | Drittland | Rechtsgrundlage
Backend (business_profiler.py):
- §34d-GewO Versicherungsvermittler-Hinweise zaehlen nicht mehr als
"finance"-Industrie (BMW wurde dadurch falsch als B2B/finance erkannt)
- Neue Industry "automotive" (Fahrzeug/KFZ/Konfigurator/Modellpalette)
- B2B-Keywords: generische Begriffe wie "unternehmen", "beratung",
"consulting" entfernt (matchten in jedem Konzerntext)
- B2C-Fallback: bei Verbraucher-Signalen ("widerruf", "kunde",
redaktioneller Inhalt) tendiert die Einstufung zu b2c statt b2b
Frontend (ComplianceCheckTab.tsx):
- Progress-Balken mit Width-% und XX%-Anzeige rechts
- liest data.progress_pct aus Polling-Response
Consent-Tester (dsi_discovery.py):
- Kritischer Fix der Cookie-Policy-Extraktion: wait_for_function wartet, bis
body.innerText > 500 Zeichen lang ist (BMW SPA-Rendering brauchte mehr Zeit)
- _extract_text_robust: 3-Strategien-Extraktion (Selektoren -> Body-
Cleanup -> P/LI/TD-Tags)
- _extract_text_from_iframes: liest OneTrust/Sourcepoint/Usercentrics
Iframe-Inhalte (manche Cookie-Policies leben dort)
Adressiert alle Findings aus dem BMW-Ground-Truth-Vergleich.
This commit is contained in:
@@ -273,18 +273,35 @@ async def discover_dsi_documents(
|
||||
is_self_dsi, self_lang = _matches_dsi_keyword(page_title)
|
||||
if is_self_dsi:
|
||||
try:
|
||||
# Wait for substantive content to appear (SPAs need time to render).
|
||||
# Polls body.innerText length up to 10s. Many sites (BMW, Daimler)
|
||||
# render via React/Vue after domcontentloaded fires.
|
||||
try:
|
||||
await page.wait_for_function(
|
||||
"() => (document.body && document.body.innerText || '').length > 500",
|
||||
timeout=10000,
|
||||
)
|
||||
except Exception:
|
||||
pass # Continue anyway, extractor below has fallbacks
|
||||
|
||||
# Scroll to bottom to trigger lazy-loading of full content
|
||||
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||
await page.wait_for_timeout(1500)
|
||||
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||
await page.wait_for_timeout(1000)
|
||||
|
||||
self_text = await page.evaluate("""() => {
|
||||
const main = document.querySelector('main, article, [role="main"], .content, #content, .bodytext')
|
||||
|| document.body;
|
||||
return main ? main.innerText : document.body.innerText;
|
||||
}""")
|
||||
self_text = await _extract_text_robust(page)
|
||||
self_wc = len(self_text.split()) if self_text else 0
|
||||
|
||||
# If still too short, try same-origin iframes (some sites
|
||||
# embed cookie policies via OneTrust/Sourcepoint iframes).
|
||||
if self_wc < 100:
|
||||
iframe_text = await _extract_text_from_iframes(page)
|
||||
if iframe_text and len(iframe_text.split()) > self_wc:
|
||||
self_text = iframe_text
|
||||
self_wc = len(self_text.split())
|
||||
logger.info("Self-extraction via iframe for %s: %d words", url, self_wc)
|
||||
|
||||
if self_wc >= 100:
|
||||
page_title = await page.title() or url
|
||||
result.documents.append(DiscoveredDSI(
|
||||
@@ -622,3 +639,83 @@ async def _find_inline_dsi_sections(page: Page) -> list[dict]:
|
||||
return sections or []
|
||||
except Exception:
|
||||
return []
|
||||
|
||||
|
||||
async def _extract_text_robust(page: Page) -> str:
    """Extract the main readable text of ``page`` with three fallbacks.

    All DOM work runs inside the browser through ``page.evaluate``:

    1. Return the text of the first known content container (``main``,
       ``article``, ``.content`` ...) holding more than 200 characters.
    2. Otherwise return the text of ``<body>`` without navigation, header,
       footer, sidebar and cookie/banner elements, if that exceeds 200
       characters.
    3. Otherwise join the text of every ``p/li/dd/td/h1-h4`` element whose
       text is longer than 20 characters.

    Strategies 1 and 2 read ``textContent`` from a detached copy of the
    element. ``textContent`` also returns text that is currently hidden via
    CSS, but it concatenates neighbouring elements without any separator and
    includes the source of inline ``<script>``/``<style>`` elements. The copy
    is therefore stripped of script/style/noscript and every block-level
    element is padded with spaces before reading, so that
    ``<h2>Cookies</h2><p>Wir ...`` yields ``Cookies Wir ...`` instead of
    ``CookiesWir ...``.

    Args:
        page: The page to read; only ``page.evaluate`` is used.

    Returns:
        The extracted text with all whitespace runs collapsed to a single
        space, or ``""`` if nothing was found or the evaluation failed
        (failures are logged as warnings).
    """
    try:
        text = await page.evaluate("""
            () => {
                // Elements whose text is code, not content.
                const STRIP = 'script, style, noscript';
                // Page chrome removed for the whole-body fallback.
                const CHROME =
                    'nav, header, footer,' +
                    ' [class*="nav"], [class*="sidebar"], [class*="cookie"],' +
                    ' [class*="banner"], [id*="cookie"], [id*="banner"]';
                // Elements that start a new line when rendered.
                const BLOCKS =
                    'address, article, aside, blockquote, br, dd, div, dl, dt,' +
                    ' figcaption, footer, form, h1, h2, h3, h4, h5, h6, header,' +
                    ' hr, li, main, nav, ol, p, pre, section, table, td, th, tr, ul';

                const squash = (s) => (s || '').trim().replace(/\\s+/g, ' ');

                // Text of a detached copy of `root`: drop unwanted elements,
                // then pad block boundaries so adjacent blocks do not merge
                // into one word.
                const readable = (root, extraStrip) => {
                    const copy = root.cloneNode(true);
                    const drop = extraStrip ? STRIP + ', ' + extraStrip : STRIP;
                    copy.querySelectorAll(drop).forEach(e => e.remove());
                    copy.querySelectorAll(BLOCKS).forEach(e => {
                        e.insertAdjacentText('beforebegin', ' ');
                        e.insertAdjacentText('afterend', ' ');
                    });
                    return squash(copy.textContent);
                };

                // 1) Specific content containers
                const selectors = [
                    '.article-content', '.page-content', '.entry-content',
                    '[class*="content-area"]', '[class*="main-content"]',
                    '[class*="legal-text"]', '[class*="policy-content"]',
                    'main article', 'main', 'article',
                    '[role="main"]', '.content', '#content', '.bodytext',
                ];
                for (const sel of selectors) {
                    const el = document.querySelector(sel);
                    // Cheap probe first; only promising candidates are cloned.
                    if (!el || (el.textContent || '').trim().length <= 200) continue;
                    const text = readable(el);
                    // Re-check: the container may have held mostly script text.
                    if (text.length > 200) return text;
                }

                // 2) Body minus nav/header/footer/scripts
                if (document.body) {
                    const bodyText = readable(document.body, CHROME);
                    if (bodyText.length > 200) return bodyText;
                }

                // 3) Final fallback: collect all text-bearing tags
                const parts = [];
                for (const b of document.querySelectorAll('p, li, dd, td, h1, h2, h3, h4')) {
                    const t = (b.textContent || '').trim();
                    if (t.length > 20) parts.push(t);
                }
                return squash(parts.join(' '));
            }
        """)
    except Exception as e:
        # Best effort: the caller treats an empty result as "nothing found".
        logger.warning("Robust text extraction failed: %s", e)
        return ""
    return text or ""
|
||||
|
||||
|
||||
async def _extract_text_from_iframes(page: Page) -> str:
    """Collect body text from the child frames of ``page``.

    A frame is read when its host equals the page's host, when it has no host
    at all, or when its host contains the name of a known consent-management
    platform (OneTrust, Cookiebot, Sourcepoint, Usercentrics, ...). The main
    frame and all other cross-origin frames are skipped. Only frames whose
    ``document.body.innerText`` has more than 50 words are kept.

    Args:
        page: The page whose ``frames`` are inspected.

    Returns:
        The kept frame texts in frame order, joined by blank lines, or ``""``
        if no frame qualified or the frame list could not be read.
    """
    from urllib.parse import urlparse

    # Substrings identifying CMP vendor hosts; matched anywhere in the netloc.
    cmp_hosts = (
        "onetrust", "cookiebot", "consensu", "sourcepoint",
        "usercentrics", "didomi", "klaro",
    )
    try:
        page_host = urlparse(page.url).netloc
        chunks: list[str] = []
        for frame in page.frames:
            if frame == page.main_frame:
                continue
            try:
                frame_host = urlparse(frame.url).netloc
                is_foreign = bool(frame_host) and frame_host != page_host
                if is_foreign and not any(h in frame_host for h in cmp_hosts):
                    continue
                text = await frame.evaluate(
                    "() => (document.body && document.body.innerText || '').trim()"
                )
            except Exception as e:
                # One unreadable frame must not discard what the other frames
                # yielded, so note it and move on.
                logger.debug("Skipping unreadable iframe: %s", e)
                continue
            if text and len(text.split()) > 50:
                chunks.append(text)
        return "\n\n".join(chunks)
    except Exception as e:
        logger.debug("Iframe extraction failed: %s", e)
        return ""
|
||||
|
||||
Reference in New Issue
Block a user