feat(crawl): Vollstaendigkeit — Shadow-DOM/versteckte Links + Interaktions-Fixpunkt + Wayback-CDX-Orphans
CI / test-python-backend (push) Successful in 30s
CI / detect-changes (push) Successful in 9s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / build-sha-integrity (push) Failing after 4s
CI / validate-canonical-controls (push) Successful in 12s
CI / loc-budget (push) Successful in 15s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped

Damit die Specialist-Agents auf vollstaendigem Website-Content arbeiten:

A — _find_dsi_links pierct jetzt Shadow-DOM (Web-Components wie Usercentrics/
    Mercedes) rekursiv; versteckte (display:none) Links werden erfasst + als
    Coverage-Metadatum geflaggt.
B — _expand_to_fixpoint klappt Akkordeons/Tabs/Hover-Menues in einer Schleife
    auf, bis das DOM stabil ist (statt 1 Pass); erweiterte Selektoren;
    Coverage-Telemetrie (Runden, expandierte Elemente, DOM-Wachstum, Shadow-/
    versteckte Links) → Response + Backend-Log.
C — legacy_url_cdx.cdx_enumerate listet via Wayback-CDX-API ALLE je
    archivierten URLs der Domain → findet Orphan-/Legacy-Seiten, die nie im
    Slug-Raster standen (z.B. nicht mehr verlinktes /datenschutz, per Direkt-
    URL noch erreichbar). Fliesst durch das bestehende Legacy-URL-Inventar.

Tests: test_legacy_url_cdx.py (6) + consent-tester/tests/test_dsi_discovery.py
(Pure-Helper + Real-Browser-Integration). Alle gruen, LOC-Gate gruen.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-06-09 12:33:34 +02:00
parent b1357915ae
commit 08c08fcba2
7 changed files with 487 additions and 41 deletions
@@ -54,6 +54,17 @@ async def _fetch_text(url: str, doc_type: str = "") -> tuple[str, list[dict]]:
docs = payload.get("documents", []) docs = payload.get("documents", [])
cmp_payloads = payload.get("cmp_payloads") or [] cmp_payloads = payload.get("cmp_payloads") or []
cmp_cookie_text = payload.get("cmp_cookie_text") or "" cmp_cookie_text = payload.get("cmp_cookie_text") or ""
coverage = payload.get("coverage") or {}
if coverage:
logger.info(
"Crawl-Coverage %s: %d Interaktions-Runden, "
"%d Elemente expandiert, %d Shadow-Links, "
"%d versteckte Links",
url, coverage.get("interaction_rounds", 0),
coverage.get("elements_expanded", 0),
coverage.get("shadow_links_found", 0),
coverage.get("hidden_links_found", 0),
)
# D — wenn der consent-tester HTML-Tabellen aus dem DOM # D — wenn der consent-tester HTML-Tabellen aus dem DOM
# extrahiert hat, in die cmp_payloads als "generic_table" # extrahiert hat, in die cmp_payloads als "generic_table"
# einschleusen damit das Backend sie via cookies_table_parser # einschleusen damit das Backend sie via cookies_table_parser
@@ -0,0 +1,89 @@
"""Wayback-CDX-Enumeration — listet ALLE je archivierten URLs einer Domain.
Anders als die per-Slug-Wayback-Pruefung (legacy_url_discovery._wayback_check)
holen wir hier die KOMPLETTE History-Liste der Domain ueber die CDX-API. So
finden wir Orphan-/Legacy-Seiten, die nie im Slug-Raster standen und heute
nicht mehr verlinkt sind, aber per Direkt-URL noch erreichbar — genau der Fall
"www.xyz.com/datenschutz existierte mal, wurde nie entfernt".
Best-effort: jede Exception → leere Liste, blockiert die uebrige Discovery nie.
"""
from __future__ import annotations
import logging
from urllib.parse import urlparse
import httpx
logger = logging.getLogger(__name__)

# Wayback CDX search endpoint. Queried over HTTPS so the enumeration request
# and the returned URL list are not sent in clear text and cannot be altered
# in transit.
_CDX_API = "https://web.archive.org/cdx/search/cdx"
# Non-HTML assets that are irrelevant for legal content.
_ASSET_SUFFIXES = (
    ".js", ".css", ".png", ".jpg", ".jpeg", ".gif", ".svg", ".ico",
    ".woff", ".woff2", ".ttf", ".eot", ".webp", ".mp4", ".webm",
    ".zip", ".map", ".json", ".xml", ".rss", ".txt", ".csv",
)


def _is_asset_url(url: str) -> bool:
    """Return True when the URL's *path* ends in one of ``_ASSET_SUFFIXES``."""
    try:
        # Only the path component counts: matching against the whole URL would
        # also drop root pages of hosts whose name ends in a listed suffix
        # (e.g. "https://example.zip").
        path = urlparse(url).path
    except ValueError:
        # urlparse rejects some malformed URLs; fall back to plain string
        # cutting so a single bad row never aborts the whole parse.
        path = url.split("?", 1)[0].split("#", 1)[0]
    return path.lower().endswith(_ASSET_SUFFIXES)


def _parse_cdx_rows(rows: list) -> list[tuple[str, str]]:
    """Parse CDX JSON into (url, timestamp) pairs.

    CDX JSON is an array of arrays whose row 0 is the header
    ["original", "timestamp", "statuscode"]. Asset URLs are dropped and
    duplicates (compared by URL without fragment) are removed; the first row
    seen for a URL wins, so its timestamp is the one that is kept.

    Args:
        rows: Decoded JSON payload. Anything that is not a list with at least
            a header and one data row yields an empty result.

    Returns:
        (url, timestamp) pairs in input order. The timestamp is an empty
        string when the row carries no timestamp cell.
    """
    if not isinstance(rows, list) or len(rows) < 2:
        return []
    seen: set[str] = set()
    out: list[tuple[str, str]] = []
    for row in rows[1:]:  # row 0 is the header
        if not isinstance(row, (list, tuple)) or not row:
            continue
        if row[0] is None:
            # JSON null would otherwise become the literal URL "None".
            continue
        url = str(row[0]).strip()
        if not url or _is_asset_url(url):
            continue
        key = url.split("#", 1)[0]
        if key in seen:
            continue
        seen.add(key)
        has_ts = len(row) > 1 and row[1] is not None
        ts = str(row[1]).strip() if has_ts else ""
        out.append((url, ts))
    return out
async def cdx_enumerate(origin: str, limit: int = 2000) -> list[tuple[str, str]]:
    """Return (url, wayback_timestamp) pairs for a domain's archived URLs.

    Queries the Wayback CDX API with a ``<host>*`` prefix match.
    ``collapse=urlkey`` yields one row per URL and ``filter=statuscode:200``
    keeps only successfully archived captures. The timestamp is reused later
    as the legacy age, which saves a second Wayback call per URL.

    Args:
        origin: Site origin, with or without a scheme. Only the host part is
            used, so ``x.com/foo`` and ``https://x.com/foo`` both enumerate
            the whole of ``x.com``.
        limit: Value passed as the CDX ``limit`` parameter.

    Returns:
        The pairs produced by ``_parse_cdx_rows``. An empty list when no host
        can be derived, on a non-200 response, or on any error: this is a
        best-effort lookup that must never block the rest of discovery.
    """
    target = (origin or "").strip()
    try:
        # urlparse only fills ``netloc`` when the input has a scheme or starts
        # with "//"; prefix bare hosts so a trailing path is not mistaken for
        # part of the host.
        has_authority_marker = "://" in target or target.startswith("//")
        netloc = urlparse(target if has_authority_marker else f"//{target}").netloc
    except ValueError:
        # Malformed origin (e.g. a broken IPv6 literal): nothing to query.
        return []
    if not netloc:
        return []
    params = {
        "url": f"{netloc}*",
        "output": "json",
        "collapse": "urlkey",
        "fl": "original,timestamp,statuscode",
        "filter": "statuscode:200",
        "limit": str(limit),
    }
    try:
        # Follow redirects so a redirected endpoint does not silently look
        # like "no results" through the non-200 branch below.
        async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as c:
            r = await c.get(_CDX_API, params=params)
            if r.status_code != 200:
                return []
            rows = r.json() or []
    except Exception as e:
        logger.info("CDX enumerate failed for %s: %s", netloc, e)
        return []
    return _parse_cdx_rows(rows)
@@ -29,6 +29,8 @@ from urllib.parse import urljoin, urlparse
import httpx import httpx
from compliance.services.legacy_url_cdx import cdx_enumerate
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -239,13 +241,24 @@ async def discover_legacy_urls(state: dict) -> dict:
return {"candidates": [], "skipped": "no_origin"} return {"candidates": [], "skipped": "no_origin"}
candidates: set[str] = set() candidates: set[str] = set()
# A.1 Sitemap # A.1 Sitemap + A.3 Slug-Permutations
for o in list(origins)[:2]: for o in list(origins)[:2]:
sitemap_urls = await _fetch_sitemap_urls(o) sitemap_urls = await _fetch_sitemap_urls(o)
candidates.update(_filter_legal_urls(sitemap_urls)) candidates.update(_filter_legal_urls(sitemap_urls))
# A.3 Slug-Permutations
candidates.update(_build_slug_candidates(o)) candidates.update(_build_slug_candidates(o))
# A.5 Wayback-CDX: alle je archivierten URLs der Domain → faengt
# Orphans, die nie im Slug-Raster standen. (url, cdx_timestamp); der
# timestamp dient als Legacy-Alter (kein zweiter Wayback-Call noetig).
cdx_pairs: list[tuple[str, str]] = []
for o in list(origins)[:2]:
cdx_pairs.extend(await cdx_enumerate(o))
cdx_legal_urls = set(_filter_legal_urls([u for u, _ in cdx_pairs]))
cdx_legal = [
(u, ts) for (u, ts) in cdx_pairs
if u in cdx_legal_urls and u not in candidates
][:100]
# Cap to avoid explosion # Cap to avoid explosion
cands = list(candidates)[:60] cands = list(candidates)[:60]
@@ -264,12 +277,32 @@ async def discover_legacy_urls(state: dict) -> dict:
"age_months": age, "age_months": age,
"in_footer": in_footer, "in_footer": in_footer,
"recommendation": _recommend(status, age, False, in_footer), "recommendation": _recommend(status, age, False, in_footer),
"via": "sitemap/slug",
} }
results = await asyncio.gather( # CDX-Kandidaten: nur Liveness pruefen (Archiv-Stand kennen wir schon).
*[_check(u) for u in cands], return_exceptions=True, async def _check_cdx(url: str, ts: str) -> dict:
status, lm = await _probe_alive(url)
age = _months_since(ts)
in_footer = url.split("#")[0].split("?")[0] in footer_urls
return {
"url": url,
"status": status,
"last_modified": lm,
"wayback_snapshot": "",
"wayback_timestamp": ts,
"age_months": age,
"in_footer": in_footer,
"recommendation": _recommend(status, age, False, in_footer),
"via": "wayback-cdx",
}
gathered = await asyncio.gather(
*[_check(u) for u in cands],
*[_check_cdx(u, ts) for u, ts in cdx_legal],
return_exceptions=True,
) )
results = [r for r in results if isinstance(r, dict)] results = [r for r in gathered if isinstance(r, dict)]
# Filter: only show interesting ones (≥200 reachable + legacy-relevant) # Filter: only show interesting ones (≥200 reachable + legacy-relevant)
interesting: list[dict] = [] interesting: list[dict] = []
@@ -297,5 +330,6 @@ async def discover_legacy_urls(state: dict) -> dict:
"candidates": interesting, "candidates": interesting,
"probed": len(results), "probed": len(results),
"filtered_kept": len(interesting), "filtered_kept": len(interesting),
"cdx_candidates": len(cdx_legal),
"origins": list(origins), "origins": list(origins),
} }
@@ -0,0 +1,110 @@
"""Tests für die Wayback-CDX-Orphan-Enumeration (Feature C)."""
from __future__ import annotations
import asyncio
from compliance.services.legacy_url_cdx import _parse_cdx_rows, cdx_enumerate
def _run(coro):
    """Run a coroutine to completion and return its result.

    Uses ``asyncio.run`` so every call gets a fresh event loop. Asking
    ``asyncio.get_event_loop()`` for a loop while none is running is
    deprecated and can fail outright once no current loop is set.
    """
    return asyncio.run(coro)
# ── Pure: _parse_cdx_rows ───────────────────────────────────────────
def test_parse_cdx_rows_drops_assets_and_dedups():
    """Assets are filtered out and repeated URLs collapse onto their first row."""
    header = ["original", "timestamp", "statuscode"]
    captures = [
        ["http://x.com/datenschutz", "20190101", "200"],
        ["http://x.com/datenschutz", "20200101", "200"],  # duplicate URL
        ["http://x.com/style.css", "20200101", "200"],  # asset
        ["http://x.com/app.js", "20200101", "200"],  # asset
        ["http://x.com/impressum", "20180101", "200"],
    ]
    parsed = _parse_cdx_rows([header, *captures])
    assert [pair[0] for pair in parsed] == [
        "http://x.com/datenschutz",
        "http://x.com/impressum",
    ]
    # The timestamp of the first row seen for a URL is the one that survives.
    assert parsed[0] == ("http://x.com/datenschutz", "20190101")
def test_parse_cdx_rows_empty_or_header_only():
    """Empty, header-only and non-list payloads all parse to an empty list."""
    degenerate_payloads = ([], [["original", "timestamp"]], "garbage")
    for payload in degenerate_payloads:
        assert _parse_cdx_rows(payload) == []  # type: ignore[arg-type]
# ── cdx_enumerate mit gemocktem httpx ───────────────────────────────
class _FakeResp:
    """Stand-in for an httpx response: a status code plus a canned JSON body."""

    def __init__(self, status_code, json_data):
        self._payload = json_data
        self.status_code = status_code

    def json(self):
        """Return the canned JSON body unchanged."""
        return self._payload
class _FakeClient:
    """Async-context-manager stand-in for ``httpx.AsyncClient``.

    Every ``get`` call returns the response supplied at construction time.
    """

    def __init__(self, resp):
        self._resp = resp

    async def __aenter__(self):
        return self

    async def __aexit__(self, *exc_info):
        # False: never suppress an exception raised inside the ``async with``.
        return False

    async def get(self, *args, **kwargs):
        return self._resp
def _patch_httpx(monkeypatch, resp):
    """Make the CDX module's ``httpx.AsyncClient`` hand out a fake client.

    Whatever arguments the code under test passes to the client constructor
    are accepted and ignored; the fake always answers with ``resp``.
    """

    def _client_factory(*args, **kwargs):
        return _FakeClient(resp)

    target = "compliance.services.legacy_url_cdx.httpx.AsyncClient"
    monkeypatch.setattr(target, _client_factory)
def test_cdx_enumerate_returns_parsed_pairs(monkeypatch):
    """A 200 response is parsed: the page URL is kept, the image is dropped."""
    payload = [
        ["original", "timestamp", "statuscode"],
        ["http://x.com/datenschutz", "20190101120000", "200"],
        ["http://x.com/logo.png", "20200101", "200"],
    ]
    _patch_httpx(monkeypatch, _FakeResp(200, payload))
    pairs = _run(cdx_enumerate("https://x.com"))
    found_urls = [pair[0] for pair in pairs]
    assert "http://x.com/datenschutz" in found_urls
    # The image is an asset and must have been filtered out.
    assert "http://x.com/logo.png" not in found_urls
def test_cdx_enumerate_non_200_returns_empty(monkeypatch):
    """A non-200 response from the CDX endpoint yields an empty result."""
    unavailable = _FakeResp(503, [])
    _patch_httpx(monkeypatch, unavailable)
    pairs = _run(cdx_enumerate("https://x.com"))
    assert pairs == []
def test_cdx_enumerate_no_netloc_returns_empty(monkeypatch):
    """An empty origin yields an empty result."""
    _patch_httpx(monkeypatch, _FakeResp(200, []))
    pairs = _run(cdx_enumerate(""))
    assert pairs == []
# ── Orphan-Pfad: CDX-Fund → Legal-Filter behält Rechts-Seite ────────
def test_cdx_orphan_survives_legal_filter():
    """The orphan case end to end: CDX rows are parsed, then legal-filtered.

    The /datenschutz page must be kept by the legal filter while the product
    page must be dropped.
    """
    from compliance.services.legacy_url_discovery import _filter_legal_urls

    header = ["original", "timestamp", "statuscode"]
    privacy_row = ["http://x.com/datenschutz", "20190101", "200"]
    product_row = ["http://x.com/products/widget", "20200101", "200"]

    parsed = _parse_cdx_rows([header, privacy_row, product_row])
    kept = _filter_legal_urls([pair[0] for pair in parsed])

    assert "http://x.com/datenschutz" in kept
    assert "http://x.com/products/widget" not in kept
+2
View File
@@ -324,6 +324,7 @@ class DSIDiscoveryResponse(BaseModel):
# Raw CMP payloads captured during navigation (ePaaS, OneTrust, etc.). # Raw CMP payloads captured during navigation (ePaaS, OneTrust, etc.).
# Backend uses these to build the per-vendor compliance table. # Backend uses these to build the per-vendor compliance table.
cmp_payloads: list[dict] = [] cmp_payloads: list[dict] = []
coverage: dict = {} # Coverage-Telemetrie (Feature B), s. coverage_dict()
@app.post("/dsi-discovery", response_model=DSIDiscoveryResponse) @app.post("/dsi-discovery", response_model=DSIDiscoveryResponse)
@@ -376,6 +377,7 @@ async def dsi_discovery(req: DSIDiscoveryRequest):
errors=result.errors, errors=result.errors,
scanned_at=datetime.now(timezone.utc).isoformat(), scanned_at=datetime.now(timezone.utc).isoformat(),
cmp_payloads=result.cmp_payloads, cmp_payloads=result.cmp_payloads,
coverage=result.coverage_dict(),
) )
+147 -36
View File
@@ -181,6 +181,23 @@ class DSIDiscoveryResult:
# the authoritative cookie-text so MC checks run on the real policy, # the authoritative cookie-text so MC checks run on the real policy,
# not the homepage navigation that DOM extraction returns. # not the homepage navigation that DOM extraction returns.
cmp_cookie_text: str = "" cmp_cookie_text: str = ""
# Coverage-Telemetrie (Feature B): macht messbar, wie erschoepfend die
# Interaktion war — wir behaupten kein "100%", wir MESSEN es.
interaction_rounds: int = 0
elements_expanded: int = 0
dom_growth_bytes: int = 0
shadow_links_found: int = 0
hidden_links_found: int = 0
def coverage_dict(self) -> dict:
    """Return the coverage telemetry counters (Feature B) as a plain dict.

    Keys match the attribute names and keep a fixed order, so the dict can
    be mapped straight into the response.
    """
    counter_names = (
        "interaction_rounds",
        "elements_expanded",
        "dom_growth_bytes",
        "shadow_links_found",
        "hidden_links_found",
    )
    return {name: getattr(self, name) for name in counter_names}
async def _extract_dom_tables(page) -> list[list[str]]: async def _extract_dom_tables(page) -> list[list[str]]:
"""D — extrahiert alle <table>-Elemente aus dem aktuellen DOM als """D — extrahiert alle <table>-Elemente aus dem aktuellen DOM als
@@ -444,15 +461,24 @@ async def discover_dsi_documents(
links = await _find_dsi_links(page, base_domain) links = await _find_dsi_links(page, base_domain)
logger.info("Found %d DSI links on %s", len(links), url) logger.info("Found %d DSI links on %s", len(links), url)
# Step 3: Expand accordions, tabs, dropdowns to find hidden content # Step 3: Interaktions-Fixpunkt — aufklappen bis das DOM stabil ist
await _expand_all_interactive(page) # (faengt verschachtelte/lazy Akkordeons, die ein einzelner Pass
await page.wait_for_timeout(1000) # verpasst). Telemetrie als messbares Coverage-Signal.
_tel = await _expand_to_fixpoint(page)
result.interaction_rounds = _tel["rounds"]
result.elements_expanded = _tel["elements_expanded"]
result.dom_growth_bytes = _tel["dom_growth"]
await page.wait_for_timeout(500)
# Step 3b: Re-scan after expanding (may reveal new links) # Step 3b: Re-scan after expanding (may reveal new links)
links_after = await _find_dsi_links(page, base_domain) links_after = await _find_dsi_links(page, base_domain)
for link in links_after: for link in links_after:
if link["href"] not in [l["href"] for l in links]: if link["href"] not in [l["href"] for l in links]:
links.append(link) links.append(link)
result.shadow_links_found = sum(
1 for l in links_after if l.get("in_shadow"))
result.hidden_links_found = sum(
1 for l in links_after if not l.get("visible"))
# Step 4: Check for inline DSI sections (accordion content already visible) # Step 4: Check for inline DSI sections (accordion content already visible)
inline_sections = await _find_inline_dsi_sections(page) inline_sections = await _find_inline_dsi_sections(page)
@@ -524,7 +550,7 @@ async def discover_dsi_documents(
continue continue
await try_dismiss_consent_banner(page) await try_dismiss_consent_banner(page)
await _expand_all_interactive(page) await _expand_to_fixpoint(page)
await page.wait_for_timeout(500) await page.wait_for_timeout(500)
# Extract text — try specific content areas, fall back to full body # Extract text — try specific content areas, fall back to full body
@@ -595,7 +621,7 @@ async def discover_dsi_documents(
# Navigate back for next link # Navigate back for next link
await goto_resilient(page, url, timeout=45000) await goto_resilient(page, url, timeout=45000)
await page.wait_for_timeout(500) await page.wait_for_timeout(500)
await _expand_all_interactive(page) await _expand_to_fixpoint(page)
except Exception as e: except Exception as e:
result.errors.append(f"Failed to load {href}: {str(e)[:80]}") result.errors.append(f"Failed to load {href}: {str(e)[:80]}")
@@ -674,25 +700,48 @@ def _deduplicate_documents(docs: list[DiscoveredDSI]) -> list[DiscoveredDSI]:
return unique return unique
async def _find_dsi_links(page: Page, base_domain: str) -> list[dict]: async def _find_dsi_links(page: Page, base_domain: str) -> list[dict]:
"""Find all links whose text or href matches DSI keywords.""" """Find all links whose text or href matches DSI keywords.
Pierct Shadow-DOM (Web-Components wie Usercentrics/Mercedes) rekursiv —
sonst werden Rechts-Links in Shadow-Trees uebersehen. Versteckte Links
(display:none) kommen ueber querySelectorAll ohnehin mit; das
visible-Flag bleibt als Coverage-Metadatum erhalten.
"""
try: try:
all_links = await page.evaluate(""" all_links = await page.evaluate("""
() => [...document.querySelectorAll('a[href]')].map(a => ({ () => {
href: a.href, const out = [];
text: (a.textContent || '').trim().substring(0, 200), const collect = (root) => {
ariaLabel: a.getAttribute('aria-label') || '', if (!root || !root.querySelectorAll) return;
title: a.getAttribute('title') || '', root.querySelectorAll('a[href]').forEach(a => out.push({
visible: a.getBoundingClientRect().width > 0, href: a.href,
})) text: (a.textContent || '').trim().substring(0, 200),
ariaLabel: a.getAttribute('aria-label') || '',
title: a.getAttribute('title') || '',
visible: a.getBoundingClientRect().width > 0,
inShadow: root !== document,
}));
root.querySelectorAll('*').forEach(el => {
if (el.shadowRoot) collect(el.shadowRoot);
});
};
collect(document);
return out;
}
""") """)
dsi_links = [] dsi_links = []
for link in (all_links or []): for link in (all_links or []):
search_text = f"{link['text']} {link['ariaLabel']} {link['title']}".lower() search_text = (
f"{link['text']} {link['ariaLabel']} {link['title']}".lower()
)
href = link["href"] href = link["href"]
href_lower = href.lower() href_lower = href.lower()
# Match by link text or href # Match by link text or href
is_match = any(kw in search_text or kw in href_lower for kw in ALL_DSI_KEYWORDS) is_match = any(
kw in search_text or kw in href_lower
for kw in ALL_DSI_KEYWORDS
)
if not is_match: if not is_match:
continue continue
@@ -702,6 +751,7 @@ async def _find_dsi_links(page: Page, base_domain: str) -> list[dict]:
"href": href, "href": href,
"text": link["text"], "text": link["text"],
"visible": link["visible"], "visible": link["visible"],
"in_shadow": link.get("inShadow", False),
}) })
return dsi_links return dsi_links
@@ -709,47 +759,108 @@ async def _find_dsi_links(page: Page, base_domain: str) -> list[dict]:
logger.warning("DSI link scan failed: %s", e) logger.warning("DSI link scan failed: %s", e)
return [] return []
async def _expand_all_interactive(page: Page) -> None: async def _expand_all_interactive(page: Page) -> int:
"""Expand all accordions, tabs, details, dropdowns on the page. """Expand all accordions, tabs, details, dropdowns on the page.
IMPORTANT: Only expand CLOSED elements. Never click elements that IMPORTANT: Only expand CLOSED elements. Never click elements that
are already expanded (aria-expanded="true") — that would close them. are already expanded (aria-expanded="true") — that would close them.
BMW, for example, has accordions open by default. BMW, for example, has accordions open by default.
Returns the number of elements acted on (drives the fixpoint loop +
coverage telemetry).
""" """
try: try:
await page.evaluate("""() => { return await page.evaluate("""() => {
// 1. Open all <details> that are closed let n = 0;
document.querySelectorAll('details:not([open])').forEach(d => d.open = true); const click = (el) => { try { el.click(); n++; } catch {} };
// 2. Click buttons that are explicitly CLOSED (aria-expanded="false") // 1. Open all <details> that are closed
document.querySelectorAll('button[aria-expanded="false"]').forEach(b => { document.querySelectorAll('details:not([open])').forEach(d => {
try { b.click(); } catch {} d.open = true; n++;
}); });
// 2. Anything explicitly CLOSED (aria-expanded="false") — not
// only <button>; many accordions use div/a/span roles.
document.querySelectorAll('[aria-expanded="false"]').forEach(click);
// 3. Bootstrap/jQuery collapse triggers (only closed ones) // 3. Bootstrap/jQuery collapse triggers (only closed ones)
document.querySelectorAll('[data-toggle="collapse"].collapsed').forEach(e => { document.querySelectorAll(
try { e.click(); } catch {} '[data-toggle="collapse"].collapsed, '
}); + '[data-bs-toggle="collapse"].collapsed').forEach(click);
document.querySelectorAll('[data-bs-toggle="collapse"].collapsed').forEach(e => {
try { e.click(); } catch {}
});
// 4. "Show more" / "Mehr anzeigen" buttons // 4. "Show more" / "Mehr anzeigen" buttons
document.querySelectorAll('button,a').forEach(b => { document.querySelectorAll('button,a,[role="button"]').forEach(b => {
const t = (b.textContent || '').trim(); const t = (b.textContent || '').trim();
if (/^(mehr|more|weiterlesen|read more|show more|anzeigen|alle anzeigen)/i.test(t)) if (/^(mehr|more|weiterlesen|read more|show more|anzeigen|alle anzeigen|mehr erfahren|mehr infos?)/i.test(t))
try { b.click(); } catch {} click(b);
}); });
// 5. Tabs — click each to make content visible, then go back // 5. Tab panels — make hidden content visible
// (don't click, just make tab panels visible)
document.querySelectorAll('[role="tabpanel"][hidden]').forEach(p => { document.querySelectorAll('[role="tabpanel"][hidden]').forEach(p => {
p.removeAttribute('hidden'); p.removeAttribute('hidden'); p.style.display = ''; n++;
p.style.display = '';
}); });
// 6. <summary> + accordion headers that are explicitly closed
// (aria-expanded="false" only — never toggle open ones).
document.querySelectorAll(
'summary, [class*="accordion" i] [class*="header" i], '
+ '[class*="accordion" i] [class*="toggle" i]').forEach(el => {
if (el.getAttribute('aria-expanded') === 'false') click(el);
});
// 7. Hover-reveal menus/dropdowns (JS-driven). Non-destructive
// mouseover; CSS-:hover menus are already in the DOM.
document.querySelectorAll(
'[class*="menu" i], [class*="dropdown" i], nav li').forEach(el => {
try { el.dispatchEvent(new MouseEvent('mouseover', {bubbles: true})); } catch {}
});
return n;
}""") }""")
except Exception: except Exception:
pass return 0
async def _dom_size(page: Page) -> int:
    """Return the length of ``document.body.innerHTML``.

    Serves as a cheap indicator of DOM growth. Yields 0 when the page has no
    body or when the evaluation fails for any reason.
    """
    measure_body = "() => document.body ? document.body.innerHTML.length : 0"
    try:
        size = await page.evaluate(measure_body)
    except Exception:
        return 0
    return size
def _dom_grew(prev_size: int, new_size: int, threshold: int = 32) -> bool:
    """Pure check: did the DOM grow noticeably since the previous round?

    Growth counts only when it exceeds ``threshold``, which tolerates
    micro-jitter such as changing timestamps.
    """
    growth = new_size - prev_size
    return growth > threshold
async def _expand_to_fixpoint(page: Page, max_rounds: int = 6) -> dict:
    """Call ``_expand_all_interactive`` repeatedly until the DOM stops growing.

    Each round expands, waits 250 ms, and re-measures the DOM. The loop ends
    at the first round without noticeable growth (the fixpoint) or after
    ``max_rounds`` rounds. This picks up nested or lazy accordions that a
    single pass would miss.

    Returns:
        Coverage telemetry with the keys ``rounds``, ``elements_expanded``
        and ``dom_growth`` (never negative).
    """
    initial_size = await _dom_size(page)
    baseline = initial_size
    latest_size = initial_size
    expanded_total = 0
    completed_rounds = 0
    for _ in range(max_rounds):
        expanded_total += await _expand_all_interactive(page)
        await page.wait_for_timeout(250)
        latest_size = await _dom_size(page)
        completed_rounds += 1
        if not _dom_grew(baseline, latest_size):
            break
        baseline = latest_size
    growth = latest_size - initial_size
    return {
        "rounds": completed_rounds,
        "elements_expanded": expanded_total,
        "dom_growth": growth if growth > 0 else 0,
    }
async def _find_inline_dsi_sections(page: Page) -> list[dict]: async def _find_inline_dsi_sections(page: Page) -> list[dict]:
"""Find DSI content already visible on the page (e.g. expanded accordions). """Find DSI content already visible on the page (e.g. expanded accordions).
@@ -0,0 +1,89 @@
"""Tests für dsi_discovery — Shadow-DOM/versteckte Link-Erfassung (Feature A)
+ Interaktions-Fixpunkt (Feature B)."""
from __future__ import annotations
import asyncio
import pytest
from services.dsi_discovery import (
_dom_grew,
_expand_to_fixpoint,
_find_dsi_links,
)
# ── Pure: Fixpunkt-Stopbedingung ────────────────────────────────────
def test_dom_grew_threshold():
    """Growth counts only once it exceeds the default threshold of 32."""
    previous = 100
    expectations = [
        (200, True),
        (133, True),  # 33 > 32 (threshold)
        (110, False),  # below the threshold
        (100, False),
    ]
    for new_size, expected in expectations:
        assert _dom_grew(previous, new_size) is expected
# ── Browser-Integration (skip wenn kein chromium) ───────────────────
def _chromium_ok() -> bool:
    """Report whether a headless Chromium can be launched through Playwright.

    Any failure (Playwright missing, browser missing, launch error) counts
    as "not available".
    """
    try:
        from playwright.sync_api import sync_playwright

        with sync_playwright() as pw:
            browser = pw.chromium.launch(headless=True, args=["--no-sandbox"])
            browser.close()
    except Exception:
        return False
    return True
# Probed once at import time; used as the skip condition of the
# browser-backed test in this module.
_BROWSER = _chromium_ok()
# Minimal page with three legal links, one per discovery path:
#   - a display:none link (/datenschutz),
#   - a link inside a closed <details> element (/impressum),
#   - a link inside an open shadow root attached by the inline script
#     (/cookie-richtlinie).
_FIXTURE = """
<html><body>
<a href="https://example.com/datenschutz" style="display:none">Datenschutz</a>
<details><summary>Mehr</summary>
<a href="https://example.com/impressum">Impressum</a>
</details>
<div id="host"></div>
<script>
const sr = document.getElementById('host').attachShadow({mode:'open'});
sr.innerHTML =
'<a href="https://example.com/cookie-richtlinie">Cookies</a>';
</script>
</body></html>
"""
async def _scan_fixture():
    """Load the fixture in headless Chromium, expand it, then scan for links.

    Returns:
        A tuple ``(links, telemetry, details_open)``: the result of
        ``_find_dsi_links``, the result of ``_expand_to_fixpoint``, and
        whether the page's first ``<details>`` element is open afterwards.
    """
    from playwright.async_api import async_playwright

    details_probe = (
        "() => !!(document.querySelector('details')"
        " && document.querySelector('details').open)"
    )
    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True, args=["--no-sandbox"])
        try:
            context = await browser.new_context()
            page = await context.new_page()
            await page.set_content(_FIXTURE)
            telemetry = await _expand_to_fixpoint(page)
            found_links = await _find_dsi_links(page, "example.com")
            is_open = await page.evaluate(details_probe)
            return found_links, telemetry, is_open
        finally:
            # Always shut the browser down, even when a step above fails.
            await browser.close()
@pytest.mark.skipif(not _BROWSER, reason="chromium nicht installiert")
def test_shadow_and_hidden_links_discovered():
    """Shadow-DOM and hidden links are found and the fixpoint loop expands.

    Runs the fixture scan with ``asyncio.run`` so the test gets its own event
    loop. Asking ``asyncio.get_event_loop()`` for a loop while none is
    running is deprecated and can fail once no current loop is set.
    """
    links, tel, details_open = asyncio.run(_scan_fixture())
    hrefs = [link["href"] for link in links]
    # A: the shadow-DOM link is found and flagged
    assert any("cookie-richtlinie" in h for h in hrefs), hrefs
    assert any(link.get("in_shadow") for link in links)
    # A: the hidden (display:none) link is found and flagged as not visible
    assert any("datenschutz" in h for h in hrefs), hrefs
    assert any(not link["visible"] for link in links)
    # B: the fixpoint loop ran and opened the closed accordion
    assert tel["rounds"] >= 1
    assert details_open is True