From a14e5ad97d87c4156b030abe7ea5b8c13468face Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sun, 10 May 2026 10:24:37 +0200 Subject: [PATCH] fix: Non-DSE doc checks prefer self-extracted text from actual URL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When checking impressum/agb/widerruf, the DSI discovery would follow links away from the page and return the wrong document (e.g. /impressum → finds link to /datenschutz → returns datenschutz text). Now: for non-DSE doc_types, prefer the html_full_page document (self-extracted from the actual URL the user provided) over linked pages found by the crawler. To that end the discovery request now asks for max_documents=5 instead of 1, and the returned documents are searched for the html_full_page entry; if none is found (or it has no text), the first returned document is used as before. Fixes safetykon.de/impressum returning datenschutz text. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../compliance/api/agent_doc_check_routes.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/backend-compliance/compliance/api/agent_doc_check_routes.py b/backend-compliance/compliance/api/agent_doc_check_routes.py index d5c23df..2adb83d 100644 --- a/backend-compliance/compliance/api/agent_doc_check_routes.py +++ b/backend-compliance/compliance/api/agent_doc_check_routes.py @@ -242,7 +242,7 @@ async def _check_single_document(entry: DocCheckEntry) -> list[DocCheckResult]: async with httpx.AsyncClient(timeout=90.0) as client: resp = await client.post( f"{CONSENT_TESTER_URL}/dsi-discovery", - json={"url": entry.url, "max_documents": 1}, + json={"url": entry.url, "max_documents": 5}, ) if resp.status_code != 200: return [DocCheckResult( @@ -253,9 +253,20 @@ async def _check_single_document(entry: DocCheckEntry) -> list[DocCheckResult]: data = resp.json() docs = data.get("documents", []) + # For non-DSE doc types (impressum, agb, widerruf), prefer the + # self-extracted document (html_full_page) which is the text of + # the URL the user provided — not a linked document found by + # the discovery crawler. 
doc_text = "" word_count = 0 - if docs: + if entry.doc_type not in ("dse", "datenschutz", "privacy"): + # Prefer html_full_page (self-extracted from the actual URL) + for d in docs: + if d.get("doc_type") == "html_full_page": + doc_text = d.get("full_text", "") or d.get("text", "") + word_count = d.get("word_count", 0) + break + if not doc_text and docs: doc_text = docs[0].get("full_text", "") or docs[0].get("text_preview", "") word_count = docs[0].get("word_count", 0)