fix: Scan quality — raise page limit, use full DSI text for checks
Bug 1: max_pages was hardcoded to 15 in backend call — raised to 50
Bug 2: DSI documents checked against text_preview (500 chars) — now uses
full_text (10,000 chars) for GDPR Art. 13 mandatory-field checks
Bug 3: DSE text not found when Playwright misses DSE page — now falls
back to DSI Discovery full_text as second source
Bug 4: Backend timeout 120s too short for 50 pages — raised to 300s
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -114,10 +114,10 @@ async def scan_website_endpoint(req: ScanRequest):
|
|||||||
# Step 1: Scan website — try Playwright first (JS-rendered), fallback to httpx
|
# Step 1: Scan website — try Playwright first (JS-rendered), fallback to httpx
|
||||||
playwright_htmls: dict[str, str] = {}
|
playwright_htmls: dict[str, str] = {}
|
||||||
try:
|
try:
|
||||||
async with httpx.AsyncClient(timeout=120.0) as pw_client:
|
async with httpx.AsyncClient(timeout=300.0) as pw_client:
|
||||||
pw_resp = await pw_client.post(
|
pw_resp = await pw_client.post(
|
||||||
"http://bp-compliance-consent-tester:8094/website-scan",
|
"http://bp-compliance-consent-tester:8094/website-scan",
|
||||||
json={"url": req.url, "max_pages": 15, "click_nav": True},
|
json={"url": req.url, "max_pages": 50, "click_nav": True},
|
||||||
)
|
)
|
||||||
if pw_resp.status_code == 200:
|
if pw_resp.status_code == 200:
|
||||||
pw_data = pw_resp.json()
|
pw_data = pw_resp.json()
|
||||||
@@ -172,8 +172,9 @@ async def scan_website_endpoint(req: ScanRequest):
|
|||||||
)
|
)
|
||||||
for doc in dsi_data.get("documents", []):
|
for doc in dsi_data.get("documents", []):
|
||||||
doc_type = classify_document_type(doc["title"], doc["url"])
|
doc_type = classify_document_type(doc["title"], doc["url"])
|
||||||
|
doc_text = doc.get("full_text", "") or doc.get("text_preview", "")
|
||||||
doc_findings = check_document_completeness(
|
doc_findings = check_document_completeness(
|
||||||
doc.get("text_preview", ""), doc_type, doc["title"], doc["url"],
|
doc_text, doc_type, doc["title"], doc["url"],
|
||||||
)
|
)
|
||||||
# Count completeness
|
# Count completeness
|
||||||
score_finding = next((f for f in doc_findings if "SCORE" in f.get("code", "")), None)
|
score_finding = next((f for f in doc_findings if "SCORE" in f.get("code", "")), None)
|
||||||
@@ -199,16 +200,28 @@ async def scan_website_endpoint(req: ScanRequest):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning("DSI discovery failed: %s", e)
|
logger.warning("DSI discovery failed: %s", e)
|
||||||
|
|
||||||
# Step 2: Fetch privacy policy text (from Playwright HTMLs or httpx)
|
# Step 2: Fetch privacy policy text
|
||||||
|
# Priority: 1) Playwright HTMLs, 2) DSI Discovery full_text, 3) httpx fallback
|
||||||
dse_text = ""
|
dse_text = ""
|
||||||
for page_url, html in playwright_htmls.items():
|
for page_url, html in playwright_htmls.items():
|
||||||
if re.search(r"datenschutz|privacy|dsgvo", page_url, re.IGNORECASE):
|
if re.search(r"datenschutz|privacy|dsgvo", page_url, re.IGNORECASE):
|
||||||
import re as _re
|
clean = re.sub(r"<(script|style)[^>]*>.*?</\1>", "", html, flags=re.DOTALL | re.IGNORECASE)
|
||||||
clean = _re.sub(r"<(script|style)[^>]*>.*?</\1>", "", html, flags=_re.DOTALL | _re.IGNORECASE)
|
clean = re.sub(r"<[^>]+>", " ", clean)
|
||||||
clean = _re.sub(r"<[^>]+>", " ", clean)
|
clean = re.sub(r"\s+", " ", clean).strip()
|
||||||
clean = _re.sub(r"\s+", " ", clean).strip()
|
dse_text = clean[:8000]
|
||||||
dse_text = clean[:4000]
|
|
||||||
break
|
break
|
||||||
|
# Fallback: use DSI discovery texts (combined from all DSE documents found)
|
||||||
|
if not dse_text and discovered_docs:
|
||||||
|
try:
|
||||||
|
dsi_data_local = dsi_resp.json() if 'dsi_resp' in dir() else {}
|
||||||
|
for doc in dsi_data_local.get("documents", []):
|
||||||
|
if doc.get("doc_type", "") in ("dse", "privacy", "datenschutz") or \
|
||||||
|
"datenschutz" in doc.get("title", "").lower():
|
||||||
|
ft = doc.get("full_text", "")
|
||||||
|
if ft and len(ft) > len(dse_text):
|
||||||
|
dse_text = ft[:8000]
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
if not dse_text:
|
if not dse_text:
|
||||||
dse_text = await _fetch_dse_text(req.url, scan.pages_scanned)
|
dse_text = await _fetch_dse_text(req.url, scan.pages_scanned)
|
||||||
|
|
||||||
|
|||||||
@@ -260,6 +260,7 @@ class DSIDocumentInfo(BaseModel):
|
|||||||
doc_type: str = ""
|
doc_type: str = ""
|
||||||
word_count: int = 0
|
word_count: int = 0
|
||||||
text_preview: str = ""
|
text_preview: str = ""
|
||||||
|
full_text: str = ""
|
||||||
|
|
||||||
|
|
||||||
class DSIDiscoveryResponse(BaseModel):
|
class DSIDiscoveryResponse(BaseModel):
|
||||||
@@ -311,6 +312,7 @@ async def dsi_discovery(req: DSIDiscoveryRequest):
|
|||||||
doc_type=d.doc_type,
|
doc_type=d.doc_type,
|
||||||
word_count=d.word_count,
|
word_count=d.word_count,
|
||||||
text_preview=d.text[:500] if d.text else "",
|
text_preview=d.text[:500] if d.text else "",
|
||||||
|
full_text=d.text[:10000] if d.text else "",
|
||||||
)
|
)
|
||||||
for d in result.documents
|
for d in result.documents
|
||||||
],
|
],
|
||||||
|
|||||||
Reference in New Issue
Block a user