From a349111a01cbc24ea253d2e5add25385c6714c13 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBook-Pro.local>
Date: Tue, 5 May 2026 16:03:56 +0200
Subject: [PATCH] =?UTF-8?q?fix:=20Raise=20full=5Ftext=20limit=2010K?=
 =?UTF-8?q?=E2=86=9250K=20+=20combine=20all=20DSI=20texts=20for=20checks?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two fixes:
1. consent-tester: full_text truncation raised from 10,000 to 50,000 chars
   (the IHK Internetangebot document has ~50K chars, and its Beschwerderecht
   section fell after the old 10K cutoff, so it was never checked)
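2. Backend: dse_text now combines the Playwright HTML text (first 30K chars)
   with the full_text of every DSI discovery document longer than 500 chars,
   capped at 50K chars in total, for mandatory content checking. Previously
   only the first 8K chars of a single source were used, so the
   Verantwortlicher/DSB details contained in DSI documents were missed.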

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../compliance/api/agent_scan_routes.py       | 28 +++++++++----------
 consent-tester/main.py                        |  2 +-
 2 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/backend-compliance/compliance/api/agent_scan_routes.py b/backend-compliance/compliance/api/agent_scan_routes.py
index 3b0d80f..1e66273 100644
--- a/backend-compliance/compliance/api/agent_scan_routes.py
+++ b/backend-compliance/compliance/api/agent_scan_routes.py
@@ -274,28 +274,26 @@ async def _execute_scan(req: ScanRequest, scan_id: str = "") -> ScanResponse:
         logger.warning("DSI discovery failed: %s %s", type(e).__name__, e)
 
     _progress(f"Schritt 3/7: Datenschutzerklaerung analysieren... ({len(discovered_docs)} Dokumente gefunden)")
-    # Step 2: Fetch privacy policy text
-    # Priority: 1) Playwright HTMLs, 2) DSI Discovery full_text, 3) httpx fallback
+    # Step 2: Fetch privacy policy text — combine all DSI texts for best coverage
     dse_text = ""
+    # Start with Playwright HTML if available
     for page_url, html in playwright_htmls.items():
         if re.search(r"datenschutz|privacy|dsgvo", page_url, re.IGNORECASE):
             clean = re.sub(r"<(script|style)[^>]*>.*?</\1>", "", html, flags=re.DOTALL | re.IGNORECASE)
             clean = re.sub(r"<[^>]+>", " ", clean)
             clean = re.sub(r"\s+", " ", clean).strip()
-            dse_text = clean[:8000]
+            dse_text = clean[:30000]
             break
-    # Fallback: use DSI discovery texts (combined from all DSE documents found)
-    if not dse_text and discovered_docs:
-        try:
-            dsi_data_local = dsi_resp.json() if 'dsi_resp' in dir() else {}
-            for doc in dsi_data_local.get("documents", []):
-                if doc.get("doc_type", "") in ("dse", "privacy", "datenschutz") or \
-                   "datenschutz" in doc.get("title", "").lower():
-                    ft = doc.get("full_text", "")
-                    if ft and len(ft) > len(dse_text):
-                        dse_text = ft[:8000]
-        except Exception:
-            pass
+    # Enrich: append DSI discovery texts (they contain the actual document content)
+    try:
+        if 'dsi_resp' in dir() or 'dsi_data' in dir():
+            dsi_data_for_text = dsi_data if 'dsi_data' in dir() else {}
+            for doc in dsi_data_for_text.get("documents", []):
+                ft = doc.get("full_text", "")
+                if ft and len(ft) > 500:
+                    dse_text = (dse_text + " " + ft)[:50000]
+    except Exception:
+        pass
     if not dse_text:
         dse_text = await fetch_dse_text(req.url, scan.pages_scanned)
 
diff --git a/consent-tester/main.py b/consent-tester/main.py
index f06cd2f..64fc3ab 100644
--- a/consent-tester/main.py
+++ b/consent-tester/main.py
@@ -312,7 +312,7 @@ async def dsi_discovery(req: DSIDiscoveryRequest):
                 doc_type=d.doc_type,
                 word_count=d.word_count,
                 text_preview=d.text[:500] if d.text else "",
-                full_text=d.text[:10000] if d.text else "",
+                full_text=d.text[:50000] if d.text else "",
             )
             for d in result.documents
         ],