feat(crawl): Vollstaendigkeit — Shadow-DOM/versteckte Links + Interaktions-Fixpunkt + Wayback-CDX-Orphans
CI / test-python-backend (push) Successful in 30s
CI / detect-changes (push) Successful in 9s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / build-sha-integrity (push) Failing after 4s
CI / validate-canonical-controls (push) Successful in 12s
CI / loc-budget (push) Successful in 15s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
CI / test-python-backend (push) Successful in 30s
CI / detect-changes (push) Successful in 9s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / build-sha-integrity (push) Failing after 4s
CI / validate-canonical-controls (push) Successful in 12s
CI / loc-budget (push) Successful in 15s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
Damit die Specialist-Agents auf vollstaendigem Website-Content arbeiten:
A — _find_dsi_links durchsucht jetzt rekursiv auch Shadow-DOM (Web-Components wie
Usercentrics/Mercedes); versteckte (display:none) Links werden erfasst + als
Coverage-Metadatum geflaggt.
B — _expand_to_fixpoint klappt Akkordeons/Tabs/Hover-Menues in einer Schleife
auf, bis das DOM stabil ist (statt 1 Pass); erweiterte Selektoren;
Coverage-Telemetrie (Runden, expandierte Elemente, DOM-Wachstum, Shadow-/
versteckte Links) → Response + Backend-Log.
C — legacy_url_cdx.cdx_enumerate listet via Wayback-CDX-API ALLE je
archivierten URLs der Domain → findet Orphan-/Legacy-Seiten, die nie im
Slug-Raster standen (z.B. nicht mehr verlinktes /datenschutz, per Direkt-
URL noch erreichbar). Fliesst durch das bestehende Legacy-URL-Inventar.
Tests: test_legacy_url_cdx.py (6) + consent-tester/tests/test_dsi_discovery.py
(Pure-Helper + Real-Browser-Integration). Alle gruen, LOC-Gate gruen.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,89 @@
|
||||
"""Tests für dsi_discovery — Shadow-DOM/versteckte Link-Erfassung (Feature A)
|
||||
+ Interaktions-Fixpunkt (Feature B)."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
|
||||
import pytest
|
||||
|
||||
from services.dsi_discovery import (
|
||||
_dom_grew,
|
||||
_expand_to_fixpoint,
|
||||
_find_dsi_links,
|
||||
)
|
||||
|
||||
|
||||
# ── Pure: Fixpunkt-Stopbedingung ────────────────────────────────────
|
||||
|
||||
|
||||
def test_dom_grew_threshold():
    """Pin the fixpoint stop condition implemented by ``_dom_grew``."""
    # Each row is (first argument, second argument, expected result);
    # presumably the DOM size before and after an expansion round.
    # A difference of 33 is just above the threshold (33 > 32), a difference
    # of 10 is below it, and no difference at all never counts as growth.
    cases = (
        (100, 200, True),
        (100, 133, True),
        (100, 110, False),
        (100, 100, False),
    )
    for first, second, expected in cases:
        assert _dom_grew(first, second) is expected
|
||||
|
||||
|
||||
# ── Browser-Integration (skip wenn kein chromium) ───────────────────
|
||||
|
||||
|
||||
def _chromium_ok() -> bool:
|
||||
try:
|
||||
from playwright.sync_api import sync_playwright
|
||||
with sync_playwright() as p:
|
||||
b = p.chromium.launch(headless=True, args=["--no-sandbox"])
|
||||
b.close()
|
||||
return True
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
# Probed once when the module is imported; the browser integration test in
# this file is skipped when this is False.
_BROWSER = _chromium_ok()

# Minimal page with the three link kinds under test: a display:none link,
# a link inside a closed <details> element, and a link that a script
# injects into an open shadow root attached to the #host element.
_FIXTURE = """
<html><body>
<a href="https://example.com/datenschutz" style="display:none">Datenschutz</a>
<details><summary>Mehr</summary>
<a href="https://example.com/impressum">Impressum</a>
</details>
<div id="host"></div>
<script>
const sr = document.getElementById('host').attachShadow({mode:'open'});
sr.innerHTML =
'<a href="https://example.com/cookie-richtlinie">Cookies</a>';
</script>
</body></html>
"""
|
||||
|
||||
|
||||
async def _scan_fixture():
    """Load ``_FIXTURE`` in headless Chromium and run both discovery helpers.

    Returns:
        A tuple ``(links, tel, details_open)``: the result of
        ``_find_dsi_links(page, "example.com")``, the result of
        ``_expand_to_fixpoint(page)``, and whether the page's ``<details>``
        element exists and is open after both helpers ran.
    """
    from playwright.async_api import async_playwright

    # Evaluated inside the page: true only if a <details> element exists
    # and its ``open`` property is set.
    details_open_js = (
        "() => !!(document.querySelector('details')"
        " && document.querySelector('details').open)"
    )

    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch(
            headless=True, args=["--no-sandbox"]
        )
        try:
            context = await browser.new_context()
            page = await context.new_page()
            await page.set_content(_FIXTURE)
            # The expansion pass runs before the link scan; keep this order.
            telemetry = await _expand_to_fixpoint(page)
            found_links = await _find_dsi_links(page, "example.com")
            details_open = await page.evaluate(details_open_js)
            return found_links, telemetry, details_open
        finally:
            # Close the browser even when one of the steps above raised.
            await browser.close()
|
||||
|
||||
|
||||
@pytest.mark.skipif(not _BROWSER, reason="chromium nicht installiert")
def test_shadow_and_hidden_links_discovered():
    """Integration check for link discovery (A) and the expansion fixpoint (B).

    Runs ``_scan_fixture`` in a real browser and verifies that the shadow-DOM
    link and the hidden link are both reported and flagged, and that the
    expansion pass ran and opened the closed ``<details>`` element.
    """
    # asyncio.run() creates and tears down its own event loop. The former
    # asyncio.get_event_loop().run_until_complete(...) relied on an implicit
    # current loop in the main thread, which is deprecated, raises on recent
    # Python versions, and also fails once any earlier asyncio.run() call in
    # the same process has unset the loop.
    links, tel, details_open = asyncio.run(_scan_fixture())
    hrefs = [link["href"] for link in links]

    # A: the link inside the shadow root is found and flagged as in_shadow.
    assert any("cookie-richtlinie" in href for href in hrefs), hrefs
    assert any(link.get("in_shadow") for link in links)

    # A: the display:none link is found and flagged as not visible.
    assert any("datenschutz" in href for href in hrefs), hrefs
    assert any(not link["visible"] for link in links)

    # B: the fixpoint loop ran at least once and opened the closed accordion.
    assert tel["rounds"] >= 1
    assert details_open is True
|
||||
Reference in New Issue
Block a user