feat(crawl): Vollstaendigkeit — Shadow-DOM/versteckte Links + Interaktions-Fixpunkt + Wayback-CDX-Orphans
CI / test-python-backend (push) Successful in 30s
CI / detect-changes (push) Successful in 9s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / build-sha-integrity (push) Failing after 4s
CI / validate-canonical-controls (push) Successful in 12s
CI / loc-budget (push) Successful in 15s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
CI / test-python-backend (push) Successful in 30s
CI / detect-changes (push) Successful in 9s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / build-sha-integrity (push) Failing after 4s
CI / validate-canonical-controls (push) Successful in 12s
CI / loc-budget (push) Successful in 15s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
Damit die Specialist-Agents auf vollstaendigem Website-Content arbeiten:
A — _find_dsi_links pierct jetzt Shadow-DOM (Web-Components wie Usercentrics/
Mercedes) rekursiv; versteckte (display:none) Links werden erfasst + als
Coverage-Metadatum geflaggt.
B — _expand_to_fixpoint klappt Akkordeons/Tabs/Hover-Menues in einer Schleife
auf, bis das DOM stabil ist (statt 1 Pass); erweiterte Selektoren;
Coverage-Telemetrie (Runden, expandierte Elemente, DOM-Wachstum, Shadow-/
versteckte Links) → Response + Backend-Log.
C — legacy_url_cdx.cdx_enumerate listet via Wayback-CDX-API ALLE je
archivierten URLs der Domain → findet Orphan-/Legacy-Seiten, die nie im
Slug-Raster standen (z.B. nicht mehr verlinktes /datenschutz, per Direkt-
URL noch erreichbar). Fliesst durch das bestehende Legacy-URL-Inventar.
Tests: test_legacy_url_cdx.py (6) + consent-tester/tests/test_dsi_discovery.py
(Pure-Helper + Real-Browser-Integration). Alle gruen, LOC-Gate gruen.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -54,6 +54,17 @@ async def _fetch_text(url: str, doc_type: str = "") -> tuple[str, list[dict]]:
|
||||
docs = payload.get("documents", [])
|
||||
cmp_payloads = payload.get("cmp_payloads") or []
|
||||
cmp_cookie_text = payload.get("cmp_cookie_text") or ""
|
||||
coverage = payload.get("coverage") or {}
|
||||
if coverage:
|
||||
logger.info(
|
||||
"Crawl-Coverage %s: %d Interaktions-Runden, "
|
||||
"%d Elemente expandiert, %d Shadow-Links, "
|
||||
"%d versteckte Links",
|
||||
url, coverage.get("interaction_rounds", 0),
|
||||
coverage.get("elements_expanded", 0),
|
||||
coverage.get("shadow_links_found", 0),
|
||||
coverage.get("hidden_links_found", 0),
|
||||
)
|
||||
# D — wenn der consent-tester HTML-Tabellen aus dem DOM
|
||||
# extrahiert hat, in die cmp_payloads als "generic_table"
|
||||
# einschleusen damit das Backend sie via cookies_table_parser
|
||||
|
||||
@@ -0,0 +1,89 @@
|
||||
"""Wayback-CDX-Enumeration — listet ALLE je archivierten URLs einer Domain.
|
||||
|
||||
Anders als die per-Slug-Wayback-Pruefung (legacy_url_discovery._wayback_check)
|
||||
holen wir hier die KOMPLETTE History-Liste der Domain ueber die CDX-API. So
|
||||
finden wir Orphan-/Legacy-Seiten, die nie im Slug-Raster standen und heute
|
||||
nicht mehr verlinkt sind, aber per Direkt-URL noch erreichbar — genau der Fall
|
||||
"www.xyz.com/datenschutz existierte mal, wurde nie entfernt".
|
||||
|
||||
Best-effort: jede Exception → leere Liste, blockiert die uebrige Discovery nie.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import httpx
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_CDX_API = "http://web.archive.org/cdx/search/cdx"
|
||||
|
||||
# Nicht-HTML-Assets, die uns fuer Rechts-Content nicht interessieren.
|
||||
_ASSET_SUFFIXES = (
|
||||
".js", ".css", ".png", ".jpg", ".jpeg", ".gif", ".svg", ".ico",
|
||||
".woff", ".woff2", ".ttf", ".eot", ".webp", ".mp4", ".webm",
|
||||
".zip", ".map", ".json", ".xml", ".rss", ".txt", ".csv",
|
||||
)
|
||||
|
||||
|
||||
def _parse_cdx_rows(rows: list) -> list[tuple[str, str]]:
|
||||
"""Parst CDX-JSON zu (url, timestamp)-Paaren.
|
||||
|
||||
CDX-JSON ist ein Array von Arrays; Zeile 0 ist der Header
|
||||
["original","timestamp","statuscode"]. Assets werden gedroppt,
|
||||
Duplikate (per URL ohne Fragment) entfernt.
|
||||
"""
|
||||
if not isinstance(rows, list) or len(rows) < 2:
|
||||
return []
|
||||
seen: set[str] = set()
|
||||
out: list[tuple[str, str]] = []
|
||||
for row in rows[1:]: # Zeile 0 = Header
|
||||
if not isinstance(row, (list, tuple)) or not row:
|
||||
continue
|
||||
url = str(row[0]).strip()
|
||||
if not url:
|
||||
continue
|
||||
path = url.lower().split("?", 1)[0].split("#", 1)[0]
|
||||
if path.endswith(_ASSET_SUFFIXES):
|
||||
continue
|
||||
key = url.split("#", 1)[0]
|
||||
if key in seen:
|
||||
continue
|
||||
seen.add(key)
|
||||
ts = str(row[1]).strip() if len(row) > 1 else ""
|
||||
out.append((url, ts))
|
||||
return out
|
||||
|
||||
|
||||
async def cdx_enumerate(origin: str, limit: int = 2000) -> list[tuple[str, str]]:
    """Return ``(url, wayback_timestamp)`` for every archived HTML URL of a domain.

    The CDX query uses ``collapse=urlkey`` (one row per URL) and
    ``filter=statuscode:200`` (only successfully archived captures). The
    timestamp is handed back so callers can reuse it as the legacy age
    instead of issuing a second Wayback request per URL.

    Best-effort: a non-200 response or any exception during the request
    yields an empty list.
    """
    host = urlparse(origin).netloc
    if not host:
        # Scheme-less input: strip a literal scheme prefix if one is present.
        host = origin.replace("https://", "").replace("http://", "")
    if not host:
        return []

    query = {
        "url": f"{host}*",
        "output": "json",
        "collapse": "urlkey",
        "fl": "original,timestamp,statuscode",
        "filter": "statuscode:200",
        "limit": str(limit),
    }
    try:
        async with httpx.AsyncClient(timeout=15.0) as client:
            response = await client.get(_CDX_API, params=query)
            if response.status_code != 200:
                return []
            payload = response.json() or []
    except Exception as exc:
        logger.info("CDX enumerate failed for %s: %s", host, exc)
        return []
    return _parse_cdx_rows(payload)
|
||||
@@ -29,6 +29,8 @@ from urllib.parse import urljoin, urlparse
|
||||
|
||||
import httpx
|
||||
|
||||
from compliance.services.legacy_url_cdx import cdx_enumerate
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -239,13 +241,24 @@ async def discover_legacy_urls(state: dict) -> dict:
|
||||
return {"candidates": [], "skipped": "no_origin"}
|
||||
|
||||
candidates: set[str] = set()
|
||||
# A.1 Sitemap
|
||||
# A.1 Sitemap + A.3 Slug-Permutations
|
||||
for o in list(origins)[:2]:
|
||||
sitemap_urls = await _fetch_sitemap_urls(o)
|
||||
candidates.update(_filter_legal_urls(sitemap_urls))
|
||||
# A.3 Slug-Permutations
|
||||
candidates.update(_build_slug_candidates(o))
|
||||
|
||||
# A.5 Wayback-CDX: alle je archivierten URLs der Domain → faengt
|
||||
# Orphans, die nie im Slug-Raster standen. (url, cdx_timestamp); der
|
||||
# timestamp dient als Legacy-Alter (kein zweiter Wayback-Call noetig).
|
||||
cdx_pairs: list[tuple[str, str]] = []
|
||||
for o in list(origins)[:2]:
|
||||
cdx_pairs.extend(await cdx_enumerate(o))
|
||||
cdx_legal_urls = set(_filter_legal_urls([u for u, _ in cdx_pairs]))
|
||||
cdx_legal = [
|
||||
(u, ts) for (u, ts) in cdx_pairs
|
||||
if u in cdx_legal_urls and u not in candidates
|
||||
][:100]
|
||||
|
||||
# Cap to avoid explosion
|
||||
cands = list(candidates)[:60]
|
||||
|
||||
@@ -264,12 +277,32 @@ async def discover_legacy_urls(state: dict) -> dict:
|
||||
"age_months": age,
|
||||
"in_footer": in_footer,
|
||||
"recommendation": _recommend(status, age, False, in_footer),
|
||||
"via": "sitemap/slug",
|
||||
}
|
||||
|
||||
results = await asyncio.gather(
|
||||
*[_check(u) for u in cands], return_exceptions=True,
|
||||
# CDX-Kandidaten: nur Liveness pruefen (Archiv-Stand kennen wir schon).
|
||||
async def _check_cdx(url: str, ts: str) -> dict:
|
||||
status, lm = await _probe_alive(url)
|
||||
age = _months_since(ts)
|
||||
in_footer = url.split("#")[0].split("?")[0] in footer_urls
|
||||
return {
|
||||
"url": url,
|
||||
"status": status,
|
||||
"last_modified": lm,
|
||||
"wayback_snapshot": "",
|
||||
"wayback_timestamp": ts,
|
||||
"age_months": age,
|
||||
"in_footer": in_footer,
|
||||
"recommendation": _recommend(status, age, False, in_footer),
|
||||
"via": "wayback-cdx",
|
||||
}
|
||||
|
||||
gathered = await asyncio.gather(
|
||||
*[_check(u) for u in cands],
|
||||
*[_check_cdx(u, ts) for u, ts in cdx_legal],
|
||||
return_exceptions=True,
|
||||
)
|
||||
results = [r for r in results if isinstance(r, dict)]
|
||||
results = [r for r in gathered if isinstance(r, dict)]
|
||||
|
||||
# Filter: only show interesting ones (≥200 reachable + legacy-relevant)
|
||||
interesting: list[dict] = []
|
||||
@@ -297,5 +330,6 @@ async def discover_legacy_urls(state: dict) -> dict:
|
||||
"candidates": interesting,
|
||||
"probed": len(results),
|
||||
"filtered_kept": len(interesting),
|
||||
"cdx_candidates": len(cdx_legal),
|
||||
"origins": list(origins),
|
||||
}
|
||||
|
||||
@@ -0,0 +1,110 @@
|
||||
"""Tests für die Wayback-CDX-Orphan-Enumeration (Feature C)."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
|
||||
from compliance.services.legacy_url_cdx import _parse_cdx_rows, cdx_enumerate
|
||||
|
||||
|
||||
def _run(coro):
|
||||
return asyncio.get_event_loop().run_until_complete(coro)
|
||||
|
||||
|
||||
# ── Pure: _parse_cdx_rows ───────────────────────────────────────────
|
||||
|
||||
|
||||
def test_parse_cdx_rows_drops_assets_and_dedups():
    """Assets are filtered out and repeated URLs collapse to their first row."""
    header = ["original", "timestamp", "statuscode"]
    cdx_rows = [
        header,
        ["http://x.com/datenschutz", "20190101", "200"],
        ["http://x.com/datenschutz", "20200101", "200"],  # duplicate
        ["http://x.com/style.css", "20200101", "200"],  # asset
        ["http://x.com/app.js", "20200101", "200"],  # asset
        ["http://x.com/impressum", "20180101", "200"],
    ]
    pairs = _parse_cdx_rows(cdx_rows)
    assert [url for url, _ts in pairs] == [
        "http://x.com/datenschutz",
        "http://x.com/impressum",
    ]
    # The timestamp of the FIRST row seen for a URL is the one that survives.
    assert pairs[0] == ("http://x.com/datenschutz", "20190101")
|
||||
|
||||
|
||||
def test_parse_cdx_rows_empty_or_header_only():
    """Empty, header-only and non-list payloads all parse to an empty list."""
    degenerate_payloads = ([], [["original", "timestamp"]], "garbage")
    for payload in degenerate_payloads:
        assert _parse_cdx_rows(payload) == []  # type: ignore[arg-type]
|
||||
|
||||
|
||||
# ── cdx_enumerate mit gemocktem httpx ───────────────────────────────
|
||||
|
||||
|
||||
class _FakeResp:
|
||||
def __init__(self, status_code, json_data):
|
||||
self.status_code = status_code
|
||||
self._json = json_data
|
||||
|
||||
def json(self):
|
||||
return self._json
|
||||
|
||||
|
||||
class _FakeClient:
|
||||
def __init__(self, resp):
|
||||
self._resp = resp
|
||||
|
||||
async def __aenter__(self):
|
||||
return self
|
||||
|
||||
async def __aexit__(self, *a):
|
||||
return False
|
||||
|
||||
async def get(self, *a, **kw):
|
||||
return self._resp
|
||||
|
||||
|
||||
def _patch_httpx(monkeypatch, resp):
    """Swap the module's ``httpx.AsyncClient`` for a factory of fake clients.

    Every construction of the client, whatever its arguments, yields a
    ``_FakeClient`` that returns ``resp``.
    """

    def _client_factory(*args, **kwargs):
        return _FakeClient(resp)

    target = "compliance.services.legacy_url_cdx.httpx.AsyncClient"
    monkeypatch.setattr(target, _client_factory)
|
||||
|
||||
|
||||
def test_cdx_enumerate_returns_parsed_pairs(monkeypatch):
    """A 200 response is parsed: the page URL is kept, the image asset is dropped."""
    cdx_json = [
        ["original", "timestamp", "statuscode"],
        ["http://x.com/datenschutz", "20190101120000", "200"],
        ["http://x.com/logo.png", "20200101", "200"],
    ]
    _patch_httpx(monkeypatch, _FakeResp(200, cdx_json))
    pairs = _run(cdx_enumerate("https://x.com"))
    found = [url for url, _ts in pairs]
    assert "http://x.com/datenschutz" in found
    assert "http://x.com/logo.png" not in found  # asset dropped
|
||||
|
||||
|
||||
def test_cdx_enumerate_non_200_returns_empty(monkeypatch):
    """A non-200 status from the CDX endpoint results in an empty list."""
    unavailable = _FakeResp(503, [])
    _patch_httpx(monkeypatch, unavailable)
    outcome = _run(cdx_enumerate("https://x.com"))
    assert outcome == []
|
||||
|
||||
|
||||
def test_cdx_enumerate_no_netloc_returns_empty(monkeypatch):
    """An empty origin has no host to query and yields an empty list."""
    _patch_httpx(monkeypatch, _FakeResp(200, []))
    outcome = _run(cdx_enumerate(""))
    assert outcome == []
|
||||
|
||||
|
||||
# ── Orphan-Pfad: CDX-Fund → Legal-Filter behält Rechts-Seite ────────
|
||||
|
||||
|
||||
def test_cdx_orphan_survives_legal_filter():
    """The actual orphan case: CDX lists /datenschutz (no longer linked),
    the legal filter keeps it, and product pages are filtered out."""
    from compliance.services.legacy_url_discovery import _filter_legal_urls

    cdx_json = [
        ["original", "timestamp", "statuscode"],
        ["http://x.com/datenschutz", "20190101", "200"],
        ["http://x.com/products/widget", "20200101", "200"],
    ]
    archived_urls = [url for url, _ts in _parse_cdx_rows(cdx_json)]
    kept = _filter_legal_urls(archived_urls)
    assert "http://x.com/datenschutz" in kept
    assert "http://x.com/products/widget" not in kept
|
||||
@@ -324,6 +324,7 @@ class DSIDiscoveryResponse(BaseModel):
|
||||
# Raw CMP payloads captured during navigation (ePaaS, OneTrust, etc.).
|
||||
# Backend uses these to build the per-vendor compliance table.
|
||||
cmp_payloads: list[dict] = []
|
||||
coverage: dict = {} # Coverage-Telemetrie (Feature B), s. coverage_dict()
|
||||
|
||||
|
||||
@app.post("/dsi-discovery", response_model=DSIDiscoveryResponse)
|
||||
@@ -376,6 +377,7 @@ async def dsi_discovery(req: DSIDiscoveryRequest):
|
||||
errors=result.errors,
|
||||
scanned_at=datetime.now(timezone.utc).isoformat(),
|
||||
cmp_payloads=result.cmp_payloads,
|
||||
coverage=result.coverage_dict(),
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -181,6 +181,23 @@ class DSIDiscoveryResult:
|
||||
# the authoritative cookie-text so MC checks run on the real policy,
|
||||
# not the homepage navigation that DOM extraction returns.
|
||||
cmp_cookie_text: str = ""
|
||||
# Coverage-Telemetrie (Feature B): macht messbar, wie erschoepfend die
|
||||
# Interaktion war — wir behaupten kein "100%", wir MESSEN es.
|
||||
interaction_rounds: int = 0
|
||||
elements_expanded: int = 0
|
||||
dom_growth_bytes: int = 0
|
||||
shadow_links_found: int = 0
|
||||
hidden_links_found: int = 0
|
||||
|
||||
def coverage_dict(self) -> dict:
    """Return the coverage telemetry counters (feature B) as a plain dict.

    Keys mirror the attribute names and are emitted in a fixed order so the
    result can be mapped straight onto the response.
    """
    metric_names = (
        "interaction_rounds",
        "elements_expanded",
        "dom_growth_bytes",
        "shadow_links_found",
        "hidden_links_found",
    )
    return {metric: getattr(self, metric) for metric in metric_names}
|
||||
|
||||
async def _extract_dom_tables(page) -> list[list[str]]:
|
||||
"""D — extrahiert alle <table>-Elemente aus dem aktuellen DOM als
|
||||
@@ -444,15 +461,24 @@ async def discover_dsi_documents(
|
||||
links = await _find_dsi_links(page, base_domain)
|
||||
logger.info("Found %d DSI links on %s", len(links), url)
|
||||
|
||||
# Step 3: Expand accordions, tabs, dropdowns to find hidden content
|
||||
await _expand_all_interactive(page)
|
||||
await page.wait_for_timeout(1000)
|
||||
# Step 3: Interaktions-Fixpunkt — aufklappen bis das DOM stabil ist
|
||||
# (faengt verschachtelte/lazy Akkordeons, die ein einzelner Pass
|
||||
# verpasst). Telemetrie als messbares Coverage-Signal.
|
||||
_tel = await _expand_to_fixpoint(page)
|
||||
result.interaction_rounds = _tel["rounds"]
|
||||
result.elements_expanded = _tel["elements_expanded"]
|
||||
result.dom_growth_bytes = _tel["dom_growth"]
|
||||
await page.wait_for_timeout(500)
|
||||
|
||||
# Step 3b: Re-scan after expanding (may reveal new links)
|
||||
links_after = await _find_dsi_links(page, base_domain)
|
||||
for link in links_after:
|
||||
if link["href"] not in [l["href"] for l in links]:
|
||||
links.append(link)
|
||||
result.shadow_links_found = sum(
|
||||
1 for l in links_after if l.get("in_shadow"))
|
||||
result.hidden_links_found = sum(
|
||||
1 for l in links_after if not l.get("visible"))
|
||||
|
||||
# Step 4: Check for inline DSI sections (accordion content already visible)
|
||||
inline_sections = await _find_inline_dsi_sections(page)
|
||||
@@ -524,7 +550,7 @@ async def discover_dsi_documents(
|
||||
continue
|
||||
|
||||
await try_dismiss_consent_banner(page)
|
||||
await _expand_all_interactive(page)
|
||||
await _expand_to_fixpoint(page)
|
||||
await page.wait_for_timeout(500)
|
||||
|
||||
# Extract text — try specific content areas, fall back to full body
|
||||
@@ -595,7 +621,7 @@ async def discover_dsi_documents(
|
||||
# Navigate back for next link
|
||||
await goto_resilient(page, url, timeout=45000)
|
||||
await page.wait_for_timeout(500)
|
||||
await _expand_all_interactive(page)
|
||||
await _expand_to_fixpoint(page)
|
||||
|
||||
except Exception as e:
|
||||
result.errors.append(f"Failed to load {href}: {str(e)[:80]}")
|
||||
@@ -674,25 +700,48 @@ def _deduplicate_documents(docs: list[DiscoveredDSI]) -> list[DiscoveredDSI]:
|
||||
return unique
|
||||
|
||||
async def _find_dsi_links(page: Page, base_domain: str) -> list[dict]:
|
||||
"""Find all links whose text or href matches DSI keywords."""
|
||||
"""Find all links whose text or href matches DSI keywords.
|
||||
|
||||
Pierct Shadow-DOM (Web-Components wie Usercentrics/Mercedes) rekursiv —
|
||||
sonst werden Rechts-Links in Shadow-Trees uebersehen. Versteckte Links
|
||||
(display:none) kommen ueber querySelectorAll ohnehin mit; das
|
||||
visible-Flag bleibt als Coverage-Metadatum erhalten.
|
||||
"""
|
||||
try:
|
||||
all_links = await page.evaluate("""
|
||||
() => [...document.querySelectorAll('a[href]')].map(a => ({
|
||||
href: a.href,
|
||||
text: (a.textContent || '').trim().substring(0, 200),
|
||||
ariaLabel: a.getAttribute('aria-label') || '',
|
||||
title: a.getAttribute('title') || '',
|
||||
visible: a.getBoundingClientRect().width > 0,
|
||||
}))
|
||||
() => {
|
||||
const out = [];
|
||||
const collect = (root) => {
|
||||
if (!root || !root.querySelectorAll) return;
|
||||
root.querySelectorAll('a[href]').forEach(a => out.push({
|
||||
href: a.href,
|
||||
text: (a.textContent || '').trim().substring(0, 200),
|
||||
ariaLabel: a.getAttribute('aria-label') || '',
|
||||
title: a.getAttribute('title') || '',
|
||||
visible: a.getBoundingClientRect().width > 0,
|
||||
inShadow: root !== document,
|
||||
}));
|
||||
root.querySelectorAll('*').forEach(el => {
|
||||
if (el.shadowRoot) collect(el.shadowRoot);
|
||||
});
|
||||
};
|
||||
collect(document);
|
||||
return out;
|
||||
}
|
||||
""")
|
||||
dsi_links = []
|
||||
for link in (all_links or []):
|
||||
search_text = f"{link['text']} {link['ariaLabel']} {link['title']}".lower()
|
||||
search_text = (
|
||||
f"{link['text']} {link['ariaLabel']} {link['title']}".lower()
|
||||
)
|
||||
href = link["href"]
|
||||
href_lower = href.lower()
|
||||
|
||||
# Match by link text or href
|
||||
is_match = any(kw in search_text or kw in href_lower for kw in ALL_DSI_KEYWORDS)
|
||||
is_match = any(
|
||||
kw in search_text or kw in href_lower
|
||||
for kw in ALL_DSI_KEYWORDS
|
||||
)
|
||||
if not is_match:
|
||||
continue
|
||||
|
||||
@@ -702,6 +751,7 @@ async def _find_dsi_links(page: Page, base_domain: str) -> list[dict]:
|
||||
"href": href,
|
||||
"text": link["text"],
|
||||
"visible": link["visible"],
|
||||
"in_shadow": link.get("inShadow", False),
|
||||
})
|
||||
|
||||
return dsi_links
|
||||
@@ -709,47 +759,108 @@ async def _find_dsi_links(page: Page, base_domain: str) -> list[dict]:
|
||||
logger.warning("DSI link scan failed: %s", e)
|
||||
return []
|
||||
|
||||
async def _expand_all_interactive(page: Page) -> None:
|
||||
async def _expand_all_interactive(page: Page) -> int:
|
||||
"""Expand all accordions, tabs, details, dropdowns on the page.
|
||||
|
||||
IMPORTANT: Only expand CLOSED elements. Never click elements that
|
||||
are already expanded (aria-expanded="true") — that would close them.
|
||||
BMW, for example, has accordions open by default.
|
||||
|
||||
Returns the number of elements acted on (drives the fixpoint loop +
|
||||
coverage telemetry).
|
||||
"""
|
||||
try:
|
||||
await page.evaluate("""() => {
|
||||
// 1. Open all <details> that are closed
|
||||
document.querySelectorAll('details:not([open])').forEach(d => d.open = true);
|
||||
return await page.evaluate("""() => {
|
||||
let n = 0;
|
||||
const click = (el) => { try { el.click(); n++; } catch {} };
|
||||
|
||||
// 2. Click buttons that are explicitly CLOSED (aria-expanded="false")
|
||||
document.querySelectorAll('button[aria-expanded="false"]').forEach(b => {
|
||||
try { b.click(); } catch {}
|
||||
// 1. Open all <details> that are closed
|
||||
document.querySelectorAll('details:not([open])').forEach(d => {
|
||||
d.open = true; n++;
|
||||
});
|
||||
|
||||
// 2. Anything explicitly CLOSED (aria-expanded="false") — not
|
||||
// only <button>; many accordions use div/a/span roles.
|
||||
document.querySelectorAll('[aria-expanded="false"]').forEach(click);
|
||||
|
||||
// 3. Bootstrap/jQuery collapse triggers (only closed ones)
|
||||
document.querySelectorAll('[data-toggle="collapse"].collapsed').forEach(e => {
|
||||
try { e.click(); } catch {}
|
||||
});
|
||||
document.querySelectorAll('[data-bs-toggle="collapse"].collapsed').forEach(e => {
|
||||
try { e.click(); } catch {}
|
||||
});
|
||||
document.querySelectorAll(
|
||||
'[data-toggle="collapse"].collapsed, '
|
||||
+ '[data-bs-toggle="collapse"].collapsed').forEach(click);
|
||||
|
||||
// 4. "Show more" / "Mehr anzeigen" buttons
|
||||
document.querySelectorAll('button,a').forEach(b => {
|
||||
document.querySelectorAll('button,a,[role="button"]').forEach(b => {
|
||||
const t = (b.textContent || '').trim();
|
||||
if (/^(mehr|more|weiterlesen|read more|show more|anzeigen|alle anzeigen)/i.test(t))
|
||||
try { b.click(); } catch {}
|
||||
if (/^(mehr|more|weiterlesen|read more|show more|anzeigen|alle anzeigen|mehr erfahren|mehr infos?)/i.test(t))
|
||||
click(b);
|
||||
});
|
||||
|
||||
// 5. Tabs — click each to make content visible, then go back
|
||||
// (don't click, just make tab panels visible)
|
||||
// 5. Tab panels — make hidden content visible
|
||||
document.querySelectorAll('[role="tabpanel"][hidden]').forEach(p => {
|
||||
p.removeAttribute('hidden');
|
||||
p.style.display = '';
|
||||
p.removeAttribute('hidden'); p.style.display = ''; n++;
|
||||
});
|
||||
|
||||
// 6. <summary> + accordion headers that are explicitly closed
|
||||
// (aria-expanded="false" only — never toggle open ones).
|
||||
document.querySelectorAll(
|
||||
'summary, [class*="accordion" i] [class*="header" i], '
|
||||
+ '[class*="accordion" i] [class*="toggle" i]').forEach(el => {
|
||||
if (el.getAttribute('aria-expanded') === 'false') click(el);
|
||||
});
|
||||
|
||||
// 7. Hover-reveal menus/dropdowns (JS-driven). Non-destructive
|
||||
// mouseover; CSS-:hover menus are already in the DOM.
|
||||
document.querySelectorAll(
|
||||
'[class*="menu" i], [class*="dropdown" i], nav li').forEach(el => {
|
||||
try { el.dispatchEvent(new MouseEvent('mouseover', {bubbles: true})); } catch {}
|
||||
});
|
||||
|
||||
return n;
|
||||
}""")
|
||||
except Exception:
|
||||
pass
|
||||
return 0
|
||||
|
||||
|
||||
async def _dom_size(page: Page) -> int:
|
||||
"""Body-innerHTML-Laenge als billiger DOM-Wachstums-Indikator."""
|
||||
try:
|
||||
return await page.evaluate(
|
||||
"() => document.body ? document.body.innerHTML.length : 0")
|
||||
except Exception:
|
||||
return 0
|
||||
|
||||
|
||||
def _dom_grew(prev_size: int, new_size: int, threshold: int = 32) -> bool:
|
||||
"""Pure: ist das DOM seit der letzten Runde nennenswert gewachsen?
|
||||
(Toleranz gegen Mikro-Jitter durch Timestamps o.ae.)."""
|
||||
return new_size > prev_size + threshold
|
||||
|
||||
|
||||
async def _expand_to_fixpoint(page: Page, max_rounds: int = 6) -> dict:
    """Call ``_expand_all_interactive`` repeatedly until the DOM stops growing.

    The loop ends at the first round after which ``_dom_grew`` reports no
    growth (fixpoint) or once ``max_rounds`` rounds have run. This catches
    nested or lazily rendered accordions that a single pass would miss.

    Returns:
        Coverage telemetry with the keys ``rounds``, ``elements_expanded``
        and ``dom_growth`` (final minus initial DOM size, never negative).
    """
    baseline = await _dom_size(page)
    last_seen = baseline
    completed = 0
    expanded = 0
    while completed < max_rounds:
        expanded += await _expand_all_interactive(page)
        # Short pause so the page can react before it is measured again.
        await page.wait_for_timeout(250)
        current = await _dom_size(page)
        completed += 1
        stable = not _dom_grew(last_seen, current)
        last_seen = current
        if stable:
            break
    return {
        "rounds": completed,
        "elements_expanded": expanded,
        "dom_growth": max(0, last_seen - baseline),
    }
|
||||
|
||||
async def _find_inline_dsi_sections(page: Page) -> list[dict]:
|
||||
"""Find DSI content already visible on the page (e.g. expanded accordions).
|
||||
|
||||
@@ -0,0 +1,89 @@
|
||||
"""Tests für dsi_discovery — Shadow-DOM/versteckte Link-Erfassung (Feature A)
|
||||
+ Interaktions-Fixpunkt (Feature B)."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
|
||||
import pytest
|
||||
|
||||
from services.dsi_discovery import (
|
||||
_dom_grew,
|
||||
_expand_to_fixpoint,
|
||||
_find_dsi_links,
|
||||
)
|
||||
|
||||
|
||||
# ── Pure: Fixpunkt-Stopbedingung ────────────────────────────────────
|
||||
|
||||
|
||||
def test_dom_grew_threshold():
    """Growth is reported only when it exceeds the default threshold of 32."""
    expectations = [
        ((100, 200), True),
        ((100, 133), True),  # 33 > 32 (threshold)
        ((100, 110), False),  # below the threshold
        ((100, 100), False),
    ]
    for (before, after), expected in expectations:
        assert _dom_grew(before, after) is expected
|
||||
|
||||
|
||||
# ── Browser-Integration (skip wenn kein chromium) ───────────────────
|
||||
|
||||
|
||||
def _chromium_ok() -> bool:
    """Report whether a headless Chromium can be launched through Playwright.

    Any failure (Playwright not importable, browser not launchable) is
    treated as "not available" and reported as False.
    """
    try:
        from playwright.sync_api import sync_playwright

        with sync_playwright() as pw:
            browser = pw.chromium.launch(headless=True, args=["--no-sandbox"])
            browser.close()
    except Exception:
        return False
    return True
|
||||
|
||||
|
||||
_BROWSER = _chromium_ok()
|
||||
|
||||
_FIXTURE = """
|
||||
<html><body>
|
||||
<a href="https://example.com/datenschutz" style="display:none">Datenschutz</a>
|
||||
<details><summary>Mehr</summary>
|
||||
<a href="https://example.com/impressum">Impressum</a>
|
||||
</details>
|
||||
<div id="host"></div>
|
||||
<script>
|
||||
const sr = document.getElementById('host').attachShadow({mode:'open'});
|
||||
sr.innerHTML =
|
||||
'<a href="https://example.com/cookie-richtlinie">Cookies</a>';
|
||||
</script>
|
||||
</body></html>
|
||||
"""
|
||||
|
||||
|
||||
async def _scan_fixture():
    """Load the HTML fixture in headless Chromium, expand it, and scan for links.

    Returns:
        A tuple ``(links, telemetry, details_open)``: the result of
        ``_find_dsi_links``, the dict from ``_expand_to_fixpoint``, and
        whether the fixture's ``<details>`` element is open afterwards.
    """
    from playwright.async_api import async_playwright

    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True, args=["--no-sandbox"])
        try:
            context = await browser.new_context()
            page = await context.new_page()
            await page.set_content(_FIXTURE)
            telemetry = await _expand_to_fixpoint(page)
            found_links = await _find_dsi_links(page, "example.com")
            is_open = await page.evaluate(
                "() => !!(document.querySelector('details')"
                " && document.querySelector('details').open)")
            return found_links, telemetry, is_open
        finally:
            # Always release the browser, also when a step above fails.
            await browser.close()
|
||||
|
||||
|
||||
@pytest.mark.skipif(not _BROWSER, reason="chromium nicht installiert")
def test_shadow_and_hidden_links_discovered():
    """Real-browser check for shadow-DOM/hidden link capture and the fixpoint loop.

    Runs the fixture scan via ``asyncio.run`` (fetching the event loop
    without a running one is deprecated) and asserts on all three features.
    """
    links, tel, details_open = asyncio.run(_scan_fixture())
    hrefs = [link["href"] for link in links]
    # A: shadow-DOM link found and flagged
    assert any("cookie-richtlinie" in h for h in hrefs), hrefs
    assert any(link.get("in_shadow") for link in links)
    # A: hidden (display:none) link found and flagged as not visible
    assert any("datenschutz" in h for h in hrefs), hrefs
    assert any(not link["visible"] for link in links)
    # B: the fixpoint loop ran and opened the closed accordion
    assert tel["rounds"] >= 1
    assert details_open is True
|
||||
Reference in New Issue
Block a user