From a349111a01cbc24ea253d2e5add25385c6714c13 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Tue, 5 May 2026 16:03:56 +0200 Subject: [PATCH] =?UTF-8?q?fix:=20Raise=20full=5Ftext=20limit=2010K?= =?UTF-8?q?=E2=86=9250K=20+=20combine=20all=20DSI=20texts=20for=20checks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two fixes: 1. consent-tester: full_text truncation raised from 10,000 to 50,000 chars (IHK Internetangebot has ~50K chars, Beschwerderecht was after 10K cutoff) 2. Backend: dse_text now combines Playwright HTML + ALL DSI discovery texts for mandatory content checking. Previously only used first 8K chars from one source, missing Verantwortlicher/DSB that were in DSI documents. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../compliance/api/agent_scan_routes.py | 28 +++++++++---------- consent-tester/main.py | 2 +- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/backend-compliance/compliance/api/agent_scan_routes.py b/backend-compliance/compliance/api/agent_scan_routes.py index 3b0d80f..1e66273 100644 --- a/backend-compliance/compliance/api/agent_scan_routes.py +++ b/backend-compliance/compliance/api/agent_scan_routes.py @@ -274,28 +274,26 @@ async def _execute_scan(req: ScanRequest, scan_id: str = "") -> ScanResponse: logger.warning("DSI discovery failed: %s %s", type(e).__name__, e) _progress(f"Schritt 3/7: Datenschutzerklaerung analysieren... ({len(discovered_docs)} Dokumente gefunden)") - # Step 2: Fetch privacy policy text - # Priority: 1) Playwright HTMLs, 2) DSI Discovery full_text, 3) httpx fallback + # Step 2: Fetch privacy policy text — combine all DSI texts for best coverage dse_text = "" + # Start with Playwright HTML if available for page_url, html in playwright_htmls.items(): if re.search(r"datenschutz|privacy|dsgvo", page_url, re.IGNORECASE): clean = re.sub(r"<(script|style)[^>]*>.*?", "", html, flags=re.DOTALL | re.IGNORECASE) clean = re.sub(r"<[^>]+>", " ", clean) clean = re.sub(r"\s+", " ", clean).strip() - dse_text = clean[:8000] + dse_text = clean[:30000] break - # Fallback: use DSI discovery texts (combined from all DSE documents found) - if not dse_text and discovered_docs: - try: - dsi_data_local = dsi_resp.json() if 'dsi_resp' in dir() else {} - for doc in dsi_data_local.get("documents", []): - if doc.get("doc_type", "") in ("dse", "privacy", "datenschutz") or \ - "datenschutz" in doc.get("title", "").lower(): - ft = doc.get("full_text", "") - if ft and len(ft) > len(dse_text): - dse_text = ft[:8000] - except Exception: - pass + # Enrich: append DSI discovery texts (they contain the actual document content) + try: + if 'dsi_resp' in dir() or 'dsi_data' in dir(): + dsi_data_for_text = dsi_data if 'dsi_data' in dir() else {} + for doc in dsi_data_for_text.get("documents", []): + ft = doc.get("full_text", "") + if ft and len(ft) > 500: + dse_text = (dse_text + " " + ft)[:50000] + except Exception: + pass if not dse_text: dse_text = await fetch_dse_text(req.url, scan.pages_scanned) diff --git a/consent-tester/main.py b/consent-tester/main.py index f06cd2f..64fc3ab 100644 --- a/consent-tester/main.py +++ b/consent-tester/main.py @@ -312,7 +312,7 @@ async def dsi_discovery(req: DSIDiscoveryRequest): doc_type=d.doc_type, word_count=d.word_count, text_preview=d.text[:500] if d.text else "", - full_text=d.text[:10000] if d.text else "", + full_text=d.text[:50000] if d.text else "", ) for d in result.documents ],