From a14e5ad97d87c4156b030abe7ea5b8c13468face Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBook-Pro.local>
Date: Sun, 10 May 2026 10:24:37 +0200
Subject: [PATCH] fix: Non-DSE doc checks prefer self-extracted text from
 actual URL
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When checking impressum/agb/widerruf, the DSI discovery would follow
links away from the page and return the wrong document (e.g.
/impressum → finds link to /datenschutz → returns datenschutz text).

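Now: for non-DSE doc_types, prefer the html_full_page document
(self-extracted from the actual URL the user provided) over linked
pages found by the crawler. The discovery request now asks for up to
5 documents instead of 1, and if no html_full_page document is
returned, the check falls back to the first discovered document as
before.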

Fixes safetykon.de/impressum returning datenschutz text.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../compliance/api/agent_doc_check_routes.py      | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/backend-compliance/compliance/api/agent_doc_check_routes.py b/backend-compliance/compliance/api/agent_doc_check_routes.py
index d5c23df..2adb83d 100644
--- a/backend-compliance/compliance/api/agent_doc_check_routes.py
+++ b/backend-compliance/compliance/api/agent_doc_check_routes.py
@@ -242,7 +242,7 @@ async def _check_single_document(entry: DocCheckEntry) -> list[DocCheckResult]:
         async with httpx.AsyncClient(timeout=90.0) as client:
             resp = await client.post(
                 f"{CONSENT_TESTER_URL}/dsi-discovery",
-                json={"url": entry.url, "max_documents": 1},
+                json={"url": entry.url, "max_documents": 5},
             )
             if resp.status_code != 200:
                 return [DocCheckResult(
@@ -253,9 +253,20 @@ async def _check_single_document(entry: DocCheckEntry) -> list[DocCheckResult]:
             data = resp.json()
             docs = data.get("documents", [])
 
+            # For non-DSE doc types (impressum, agb, widerruf), prefer the
+            # self-extracted document (html_full_page) which is the text of
+            # the URL the user provided — not a linked document found by
+            # the discovery crawler.
             doc_text = ""
             word_count = 0
-            if docs:
+            if entry.doc_type not in ("dse", "datenschutz", "privacy"):
+                # Prefer html_full_page (self-extracted from the actual URL)
+                for d in docs:
+                    if d.get("doc_type") == "html_full_page":
+                        doc_text = d.get("full_text", "") or d.get("text", "")
+                        word_count = d.get("word_count", 0)
+                        break
+            if not doc_text and docs:
                 doc_text = docs[0].get("full_text", "") or docs[0].get("text_preview", "")
                 word_count = docs[0].get("word_count", 0)