diff --git a/backend-compliance/compliance/api/agent_scan_routes.py b/backend-compliance/compliance/api/agent_scan_routes.py index 3b0d80f..1e66273 100644 --- a/backend-compliance/compliance/api/agent_scan_routes.py +++ b/backend-compliance/compliance/api/agent_scan_routes.py @@ -274,28 +274,26 @@ async def _execute_scan(req: ScanRequest, scan_id: str = "") -> ScanResponse: logger.warning("DSI discovery failed: %s %s", type(e).__name__, e) _progress(f"Schritt 3/7: Datenschutzerklaerung analysieren... ({len(discovered_docs)} Dokumente gefunden)") - # Step 2: Fetch privacy policy text - # Priority: 1) Playwright HTMLs, 2) DSI Discovery full_text, 3) httpx fallback + # Step 2: Fetch privacy policy text — combine all DSI texts for best coverage dse_text = "" + # Start with Playwright HTML if available for page_url, html in playwright_htmls.items(): if re.search(r"datenschutz|privacy|dsgvo", page_url, re.IGNORECASE): clean = re.sub(r"<(script|style)[^>]*>.*?", "", html, flags=re.DOTALL | re.IGNORECASE) clean = re.sub(r"<[^>]+>", " ", clean) clean = re.sub(r"\s+", " ", clean).strip() - dse_text = clean[:8000] + dse_text = clean[:30000] break - # Fallback: use DSI discovery texts (combined from all DSE documents found) - if not dse_text and discovered_docs: - try: - dsi_data_local = dsi_resp.json() if 'dsi_resp' in dir() else {} - for doc in dsi_data_local.get("documents", []): - if doc.get("doc_type", "") in ("dse", "privacy", "datenschutz") or \ - "datenschutz" in doc.get("title", "").lower(): - ft = doc.get("full_text", "") - if ft and len(ft) > len(dse_text): - dse_text = ft[:8000] - except Exception: - pass + # Enrich: append DSI discovery texts (they contain the actual document content) + try: + if 'dsi_resp' in dir() or 'dsi_data' in dir(): + dsi_data_for_text = dsi_data if 'dsi_data' in dir() else {} + for doc in dsi_data_for_text.get("documents", []): + ft = doc.get("full_text", "") + if ft and len(ft) > 500: + dse_text = (dse_text + " " + ft)[:50000] + except Exception: + pass if not dse_text: dse_text = await fetch_dse_text(req.url, scan.pages_scanned) diff --git a/consent-tester/main.py b/consent-tester/main.py index f06cd2f..64fc3ab 100644 --- a/consent-tester/main.py +++ b/consent-tester/main.py @@ -312,7 +312,7 @@ async def dsi_discovery(req: DSIDiscoveryRequest): doc_type=d.doc_type, word_count=d.word_count, text_preview=d.text[:500] if d.text else "", - full_text=d.text[:10000] if d.text else "", + full_text=d.text[:50000] if d.text else "", ) for d in result.documents ],