fix: Non-DSE doc checks prefer self-extracted text from actual URL

When checking impressum/agb/widerruf, the DSI discovery would follow
links away from the page and return the wrong document (e.g.
/impressum → finds link to /datenschutz → returns datenschutz text).

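Now: for non-DSE doc_types, prefer the html_full_page document
(self-extracted from the actual URL the user provided) over linked
pages found by the crawler. If the response contains no html_full_page
document with text, fall back to the first discovered document as
before. The discovery request now asks for max_documents=5 instead
of 1, so more than one candidate document is available to choose from.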

Fixes safetykon.de/impressum returning datenschutz text.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-10 10:24:37 +02:00
parent df463dbce7
commit a14e5ad97d
@@ -242,7 +242,7 @@ async def _check_single_document(entry: DocCheckEntry) -> list[DocCheckResult]:
async with httpx.AsyncClient(timeout=90.0) as client: async with httpx.AsyncClient(timeout=90.0) as client:
resp = await client.post( resp = await client.post(
f"{CONSENT_TESTER_URL}/dsi-discovery", f"{CONSENT_TESTER_URL}/dsi-discovery",
json={"url": entry.url, "max_documents": 1}, json={"url": entry.url, "max_documents": 5},
) )
if resp.status_code != 200: if resp.status_code != 200:
return [DocCheckResult( return [DocCheckResult(
@@ -253,9 +253,20 @@ async def _check_single_document(entry: DocCheckEntry) -> list[DocCheckResult]:
data = resp.json() data = resp.json()
docs = data.get("documents", []) docs = data.get("documents", [])
# For non-DSE doc types (impressum, agb, widerruf), prefer the
# self-extracted document (html_full_page) which is the text of
# the URL the user provided — not a linked document found by
# the discovery crawler.
doc_text = "" doc_text = ""
word_count = 0 word_count = 0
if docs: if entry.doc_type not in ("dse", "datenschutz", "privacy"):
# Prefer html_full_page (self-extracted from the actual URL)
for d in docs:
if d.get("doc_type") == "html_full_page":
doc_text = d.get("full_text", "") or d.get("text", "")
word_count = d.get("word_count", 0)
break
if not doc_text and docs:
doc_text = docs[0].get("full_text", "") or docs[0].get("text_preview", "") doc_text = docs[0].get("full_text", "") or docs[0].get("text_preview", "")
word_count = docs[0].get("word_count", 0) word_count = docs[0].get("word_count", 0)