fix: Raise full_text limit 10K→50K + combine all DSI texts for checks
Two fixes:

1. consent-tester: the full_text truncation limit was raised from 10,000 to 50,000 chars. (The IHK Internetangebot document has ~50K chars, and its Beschwerderecht section came after the 10K cutoff.)
2. Backend: dse_text now combines the Playwright HTML with ALL DSI discovery texts for mandatory content checking. Previously it used only the first 8K chars from a single source, so the Verantwortlicher/DSB details contained in the DSI documents were missed.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -274,26 +274,24 @@ async def _execute_scan(req: ScanRequest, scan_id: str = "") -> ScanResponse:
|
|||||||
logger.warning("DSI discovery failed: %s %s", type(e).__name__, e)
|
logger.warning("DSI discovery failed: %s %s", type(e).__name__, e)
|
||||||
|
|
||||||
_progress(f"Schritt 3/7: Datenschutzerklaerung analysieren... ({len(discovered_docs)} Dokumente gefunden)")
|
_progress(f"Schritt 3/7: Datenschutzerklaerung analysieren... ({len(discovered_docs)} Dokumente gefunden)")
|
||||||
# Step 2: Fetch privacy policy text
|
# Step 2: Fetch privacy policy text — combine all DSI texts for best coverage
|
||||||
# Priority: 1) Playwright HTMLs, 2) DSI Discovery full_text, 3) httpx fallback
|
|
||||||
dse_text = ""
|
dse_text = ""
|
||||||
|
# Start with Playwright HTML if available
|
||||||
for page_url, html in playwright_htmls.items():
|
for page_url, html in playwright_htmls.items():
|
||||||
if re.search(r"datenschutz|privacy|dsgvo", page_url, re.IGNORECASE):
|
if re.search(r"datenschutz|privacy|dsgvo", page_url, re.IGNORECASE):
|
||||||
clean = re.sub(r"<(script|style)[^>]*>.*?</\1>", "", html, flags=re.DOTALL | re.IGNORECASE)
|
clean = re.sub(r"<(script|style)[^>]*>.*?</\1>", "", html, flags=re.DOTALL | re.IGNORECASE)
|
||||||
clean = re.sub(r"<[^>]+>", " ", clean)
|
clean = re.sub(r"<[^>]+>", " ", clean)
|
||||||
clean = re.sub(r"\s+", " ", clean).strip()
|
clean = re.sub(r"\s+", " ", clean).strip()
|
||||||
dse_text = clean[:8000]
|
dse_text = clean[:30000]
|
||||||
break
|
break
|
||||||
# Fallback: use DSI discovery texts (combined from all DSE documents found)
|
# Enrich: append DSI discovery texts (they contain the actual document content)
|
||||||
if not dse_text and discovered_docs:
|
|
||||||
try:
|
try:
|
||||||
dsi_data_local = dsi_resp.json() if 'dsi_resp' in dir() else {}
|
if 'dsi_resp' in dir() or 'dsi_data' in dir():
|
||||||
for doc in dsi_data_local.get("documents", []):
|
dsi_data_for_text = dsi_data if 'dsi_data' in dir() else {}
|
||||||
if doc.get("doc_type", "") in ("dse", "privacy", "datenschutz") or \
|
for doc in dsi_data_for_text.get("documents", []):
|
||||||
"datenschutz" in doc.get("title", "").lower():
|
|
||||||
ft = doc.get("full_text", "")
|
ft = doc.get("full_text", "")
|
||||||
if ft and len(ft) > len(dse_text):
|
if ft and len(ft) > 500:
|
||||||
dse_text = ft[:8000]
|
dse_text = (dse_text + " " + ft)[:50000]
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
if not dse_text:
|
if not dse_text:
|
||||||
|
|||||||
@@ -312,7 +312,7 @@ async def dsi_discovery(req: DSIDiscoveryRequest):
|
|||||||
doc_type=d.doc_type,
|
doc_type=d.doc_type,
|
||||||
word_count=d.word_count,
|
word_count=d.word_count,
|
||||||
text_preview=d.text[:500] if d.text else "",
|
text_preview=d.text[:500] if d.text else "",
|
||||||
full_text=d.text[:10000] if d.text else "",
|
full_text=d.text[:50000] if d.text else "",
|
||||||
)
|
)
|
||||||
for d in result.documents
|
for d in result.documents
|
||||||
],
|
],
|
||||||
|
|||||||
Reference in New Issue
Block a user