feat(crawl): Vollstaendigkeit — Shadow-DOM/versteckte Links + Interaktions-Fixpunkt + Wayback-CDX-Orphans
CI / test-python-backend (push) Successful in 30s
CI / detect-changes (push) Successful in 9s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / build-sha-integrity (push) Failing after 4s
CI / validate-canonical-controls (push) Successful in 12s
CI / loc-budget (push) Successful in 15s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped

Damit die Specialist-Agents auf vollstaendigem Website-Content arbeiten:

A — _find_dsi_links durchdringt jetzt das Shadow-DOM (Web-Components wie
    Usercentrics/Mercedes) rekursiv; versteckte (display:none) Links werden
    erfasst + als Coverage-Metadatum geflaggt.
B — _expand_to_fixpoint klappt Akkordeons/Tabs/Hover-Menues in einer Schleife
    auf, bis das DOM stabil ist (statt 1 Pass); erweiterte Selektoren;
    Coverage-Telemetrie (Runden, expandierte Elemente, DOM-Wachstum, Shadow-/
    versteckte Links) → Response + Backend-Log.
C — legacy_url_cdx.cdx_enumerate listet via Wayback-CDX-API ALLE je
    archivierten URLs der Domain → findet Orphan-/Legacy-Seiten, die nie im
    Slug-Raster standen (z.B. nicht mehr verlinktes /datenschutz, per Direkt-
    URL noch erreichbar). Fliesst durch das bestehende Legacy-URL-Inventar.

Tests: test_legacy_url_cdx.py (6) + consent-tester/tests/test_dsi_discovery.py
(Pure-Helper + Real-Browser-Integration). Alle gruen, LOC-Gate gruen.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-06-09 12:33:34 +02:00
parent b1357915ae
commit 08c08fcba2
7 changed files with 487 additions and 41 deletions
+2
View File
@@ -324,6 +324,7 @@ class DSIDiscoveryResponse(BaseModel):
# Raw CMP payloads captured during navigation (ePaaS, OneTrust, etc.).
# Backend uses these to build the per-vendor compliance table.
cmp_payloads: list[dict] = []
coverage: dict = {} # Coverage-Telemetrie (Feature B), s. coverage_dict()
@app.post("/dsi-discovery", response_model=DSIDiscoveryResponse)
@@ -376,6 +377,7 @@ async def dsi_discovery(req: DSIDiscoveryRequest):
errors=result.errors,
scanned_at=datetime.now(timezone.utc).isoformat(),
cmp_payloads=result.cmp_payloads,
coverage=result.coverage_dict(),
)
+147 -36
View File
@@ -181,6 +181,23 @@ class DSIDiscoveryResult:
# the authoritative cookie-text so MC checks run on the real policy,
# not the homepage navigation that DOM extraction returns.
cmp_cookie_text: str = ""
# Coverage-Telemetrie (Feature B): macht messbar, wie erschoepfend die
# Interaktion war — wir behaupten kein "100%", wir MESSEN es.
interaction_rounds: int = 0
elements_expanded: int = 0
dom_growth_bytes: int = 0
shadow_links_found: int = 0
hidden_links_found: int = 0
def coverage_dict(self) -> dict:
    """Return the coverage telemetry (feature B) as a plain dict.

    Each telemetry counter stored on this result is exposed under a key
    of the same name, in a fixed order, so the values can be handed to
    the response mapping unchanged.
    """
    counter_names = (
        "interaction_rounds",
        "elements_expanded",
        "dom_growth_bytes",
        "shadow_links_found",
        "hidden_links_found",
    )
    return {name: getattr(self, name) for name in counter_names}
async def _extract_dom_tables(page) -> list[list[str]]:
"""D — extrahiert alle <table>-Elemente aus dem aktuellen DOM als
@@ -444,15 +461,24 @@ async def discover_dsi_documents(
links = await _find_dsi_links(page, base_domain)
logger.info("Found %d DSI links on %s", len(links), url)
# Step 3: Expand accordions, tabs, dropdowns to find hidden content
await _expand_all_interactive(page)
await page.wait_for_timeout(1000)
# Step 3: Interaktions-Fixpunkt — aufklappen bis das DOM stabil ist
# (faengt verschachtelte/lazy Akkordeons, die ein einzelner Pass
# verpasst). Telemetrie als messbares Coverage-Signal.
_tel = await _expand_to_fixpoint(page)
result.interaction_rounds = _tel["rounds"]
result.elements_expanded = _tel["elements_expanded"]
result.dom_growth_bytes = _tel["dom_growth"]
await page.wait_for_timeout(500)
# Step 3b: Re-scan after expanding (may reveal new links)
links_after = await _find_dsi_links(page, base_domain)
for link in links_after:
if link["href"] not in [l["href"] for l in links]:
links.append(link)
result.shadow_links_found = sum(
1 for l in links_after if l.get("in_shadow"))
result.hidden_links_found = sum(
1 for l in links_after if not l.get("visible"))
# Step 4: Check for inline DSI sections (accordion content already visible)
inline_sections = await _find_inline_dsi_sections(page)
@@ -524,7 +550,7 @@ async def discover_dsi_documents(
continue
await try_dismiss_consent_banner(page)
await _expand_all_interactive(page)
await _expand_to_fixpoint(page)
await page.wait_for_timeout(500)
# Extract text — try specific content areas, fall back to full body
@@ -595,7 +621,7 @@ async def discover_dsi_documents(
# Navigate back for next link
await goto_resilient(page, url, timeout=45000)
await page.wait_for_timeout(500)
await _expand_all_interactive(page)
await _expand_to_fixpoint(page)
except Exception as e:
result.errors.append(f"Failed to load {href}: {str(e)[:80]}")
@@ -674,25 +700,48 @@ def _deduplicate_documents(docs: list[DiscoveredDSI]) -> list[DiscoveredDSI]:
return unique
async def _find_dsi_links(page: Page, base_domain: str) -> list[dict]:
"""Find all links whose text or href matches DSI keywords."""
"""Find all links whose text or href matches DSI keywords.
Pierct Shadow-DOM (Web-Components wie Usercentrics/Mercedes) rekursiv —
sonst werden Rechts-Links in Shadow-Trees uebersehen. Versteckte Links
(display:none) kommen ueber querySelectorAll ohnehin mit; das
visible-Flag bleibt als Coverage-Metadatum erhalten.
"""
try:
all_links = await page.evaluate("""
() => [...document.querySelectorAll('a[href]')].map(a => ({
href: a.href,
text: (a.textContent || '').trim().substring(0, 200),
ariaLabel: a.getAttribute('aria-label') || '',
title: a.getAttribute('title') || '',
visible: a.getBoundingClientRect().width > 0,
}))
() => {
const out = [];
const collect = (root) => {
if (!root || !root.querySelectorAll) return;
root.querySelectorAll('a[href]').forEach(a => out.push({
href: a.href,
text: (a.textContent || '').trim().substring(0, 200),
ariaLabel: a.getAttribute('aria-label') || '',
title: a.getAttribute('title') || '',
visible: a.getBoundingClientRect().width > 0,
inShadow: root !== document,
}));
root.querySelectorAll('*').forEach(el => {
if (el.shadowRoot) collect(el.shadowRoot);
});
};
collect(document);
return out;
}
""")
dsi_links = []
for link in (all_links or []):
search_text = f"{link['text']} {link['ariaLabel']} {link['title']}".lower()
search_text = (
f"{link['text']} {link['ariaLabel']} {link['title']}".lower()
)
href = link["href"]
href_lower = href.lower()
# Match by link text or href
is_match = any(kw in search_text or kw in href_lower for kw in ALL_DSI_KEYWORDS)
is_match = any(
kw in search_text or kw in href_lower
for kw in ALL_DSI_KEYWORDS
)
if not is_match:
continue
@@ -702,6 +751,7 @@ async def _find_dsi_links(page: Page, base_domain: str) -> list[dict]:
"href": href,
"text": link["text"],
"visible": link["visible"],
"in_shadow": link.get("inShadow", False),
})
return dsi_links
@@ -709,47 +759,108 @@ async def _find_dsi_links(page: Page, base_domain: str) -> list[dict]:
logger.warning("DSI link scan failed: %s", e)
return []
async def _expand_all_interactive(page: Page) -> int:
    """Expand all accordions, tabs, details, dropdowns on the page.

    IMPORTANT: Only expand CLOSED elements. Never click elements that
    are already expanded (aria-expanded="true") — that would close them.
    BMW, for example, has accordions open by default.

    For the same reason every element is clicked at most once per call:
    the selector groups below overlap (a closed "Mehr anzeigen" button
    matches both step 2 and step 4; a closed accordion header matches
    step 2 and step 6), and a second click on a toggle would collapse
    what the first click just opened.

    Args:
        page: Page whose current document is expanded in place.

    Returns:
        The number of elements acted on (drives the fixpoint loop +
        coverage telemetry). Hover events from step 7 are not counted.
        Returns 0 if evaluating the in-page script raised.
    """
    try:
        return await page.evaluate("""() => {
            let n = 0;
            // Elements already clicked during this pass; guards against
            // toggling the same control twice via overlapping selectors.
            const clicked = new Set();
            const click = (el) => {
                if (clicked.has(el)) return;
                clicked.add(el);
                try { el.click(); n++; } catch {}
            };

            // 1. Open all <details> that are closed
            document.querySelectorAll('details:not([open])').forEach(d => {
                d.open = true; n++;
            });

            // 2. Anything explicitly CLOSED (aria-expanded="false") — not
            //    only <button>; many accordions use div/a/span roles.
            document.querySelectorAll('[aria-expanded="false"]').forEach(click);

            // 3. Bootstrap/jQuery collapse triggers (only closed ones)
            document.querySelectorAll(
                '[data-toggle="collapse"].collapsed, '
                + '[data-bs-toggle="collapse"].collapsed').forEach(click);

            // 4. "Show more" / "Mehr anzeigen" buttons
            document.querySelectorAll('button,a,[role="button"]').forEach(b => {
                const t = (b.textContent || '').trim();
                if (/^(mehr|more|weiterlesen|read more|show more|anzeigen|alle anzeigen|mehr erfahren|mehr infos?)/i.test(t))
                    click(b);
            });

            // 5. Tab panels — make hidden content visible
            document.querySelectorAll('[role="tabpanel"][hidden]').forEach(p => {
                p.removeAttribute('hidden'); p.style.display = ''; n++;
            });

            // 6. <summary> + accordion headers that are explicitly closed
            //    (aria-expanded="false" only — never toggle open ones).
            document.querySelectorAll(
                'summary, [class*="accordion" i] [class*="header" i], '
                + '[class*="accordion" i] [class*="toggle" i]').forEach(el => {
                if (el.getAttribute('aria-expanded') === 'false') click(el);
            });

            // 7. Hover-reveal menus/dropdowns (JS-driven). Non-destructive
            //    mouseover; CSS-:hover menus are already in the DOM.
            document.querySelectorAll(
                '[class*="menu" i], [class*="dropdown" i], nav li').forEach(el => {
                try { el.dispatchEvent(new MouseEvent('mouseover', {bubbles: true})); } catch {}
            });

            return n;
        }""")
    except Exception as e:
        # Best-effort: a failed expansion must not abort discovery, but
        # leave a trace instead of swallowing the error silently.
        logger.debug("Interactive expansion failed: %s", e)
        return 0
async def _dom_size(page: Page) -> int:
    """Return the length of ``document.body.innerHTML`` as a cheap
    DOM-growth indicator.

    Yields 0 when the document has no body or when evaluating the
    script raises.
    """
    script = "() => document.body ? document.body.innerHTML.length : 0"
    try:
        size = await page.evaluate(script)
    except Exception:
        return 0
    return size
def _dom_grew(prev_size: int, new_size: int, threshold: int = 32) -> bool:
"""Pure: ist das DOM seit der letzten Runde nennenswert gewachsen?
(Toleranz gegen Mikro-Jitter durch Timestamps o.ae.)."""
return new_size > prev_size + threshold
async def _expand_to_fixpoint(page: Page, max_rounds: int = 6) -> dict:
    """Call ``_expand_all_interactive`` repeatedly until the DOM is stable.

    The loop ends once a round no longer grows the DOM (fixpoint) or
    after ``max_rounds`` rounds. This catches nested/lazy accordions that
    a single pass misses.

    Returns:
        Coverage telemetry with the keys ``rounds``,
        ``elements_expanded`` and ``dom_growth`` (never negative).
    """
    initial_size = await _dom_size(page)
    baseline = initial_size
    latest = initial_size
    expanded_total = 0
    rounds = 0

    for rounds in range(1, max_rounds + 1):
        acted_on = await _expand_all_interactive(page)
        expanded_total += acted_on
        # Give handlers a moment to render before measuring the DOM again.
        await page.wait_for_timeout(250)
        latest = await _dom_size(page)
        if not _dom_grew(baseline, latest):
            break
        baseline = latest

    growth = latest - initial_size
    return {
        "rounds": rounds,
        "elements_expanded": expanded_total,
        "dom_growth": growth if growth > 0 else 0,
    }
async def _find_inline_dsi_sections(page: Page) -> list[dict]:
"""Find DSI content already visible on the page (e.g. expanded accordions).
@@ -0,0 +1,89 @@
"""Tests für dsi_discovery — Shadow-DOM/versteckte Link-Erfassung (Feature A)
+ Interaktions-Fixpunkt (Feature B)."""
from __future__ import annotations
import asyncio
import pytest
from services.dsi_discovery import (
_dom_grew,
_expand_to_fixpoint,
_find_dsi_links,
)
# ── Pure: Fixpunkt-Stopbedingung ────────────────────────────────────
def test_dom_grew_threshold():
    """Growth counts only once it exceeds the default threshold of 32."""
    cases = [
        (100, 200, True),
        (100, 133, True),    # 33 > 32 (threshold)
        (100, 110, False),   # below threshold
        (100, 100, False),
    ]
    for prev_size, new_size, expected in cases:
        assert _dom_grew(prev_size, new_size) is expected
# ── Browser-Integration (skip wenn kein chromium) ───────────────────
def _chromium_ok() -> bool:
try:
from playwright.sync_api import sync_playwright
with sync_playwright() as p:
b = p.chromium.launch(headless=True, args=["--no-sandbox"])
b.close()
return True
except Exception:
return False
# Probed once when this module is imported; used to skip the browser
# integration test when Chromium cannot be launched.
_BROWSER = _chromium_ok()

# Minimal page with three link placements:
#   * a display:none link in the light DOM (hidden link),
#   * a link inside a closed <details> element (needs expansion),
#   * a link inside an open shadow root attached by the inline script.
_FIXTURE = """
<html><body>
<a href="https://example.com/datenschutz" style="display:none">Datenschutz</a>
<details><summary>Mehr</summary>
<a href="https://example.com/impressum">Impressum</a>
</details>
<div id="host"></div>
<script>
const sr = document.getElementById('host').attachShadow({mode:'open'});
sr.innerHTML =
'<a href="https://example.com/cookie-richtlinie">Cookies</a>';
</script>
</body></html>
"""
async def _scan_fixture():
    """Load the fixture in headless Chromium and run expansion + link scan.

    Returns:
        A ``(links, telemetry, details_open)`` tuple: the result of
        ``_find_dsi_links``, the dict from ``_expand_to_fixpoint`` and
        whether the fixture's ``<details>`` element ended up open.
    """
    from playwright.async_api import async_playwright

    details_probe = (
        "() => !!(document.querySelector('details')"
        " && document.querySelector('details').open)"
    )
    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True, args=["--no-sandbox"])
        try:
            context = await browser.new_context()
            page = await context.new_page()
            await page.set_content(_FIXTURE)
            telemetry = await _expand_to_fixpoint(page)
            found = await _find_dsi_links(page, "example.com")
            is_open = await page.evaluate(details_probe)
            return found, telemetry, is_open
        finally:
            # Always shut the browser down, even if a step above failed.
            await browser.close()
@pytest.mark.skipif(not _BROWSER, reason="chromium nicht installiert")
def test_shadow_and_hidden_links_discovered():
    """Shadow-DOM and hidden links are found and flagged; details gets opened."""
    # asyncio.run() creates and closes its own event loop. Calling
    # asyncio.get_event_loop() without a running loop is deprecated and
    # fails on newer Python versions.
    links, tel, details_open = asyncio.run(_scan_fixture())
    hrefs = [link["href"] for link in links]

    # A: the shadow-DOM link is found and it is the one flagged in_shadow
    shadow_links = [
        link for link in links if "cookie-richtlinie" in link["href"]]
    assert shadow_links, hrefs
    assert any(link.get("in_shadow") for link in shadow_links)

    # A: the hidden (display:none) link is found and flagged as not visible
    hidden_links = [link for link in links if "datenschutz" in link["href"]]
    assert hidden_links, hrefs
    assert any(not link["visible"] for link in hidden_links)

    # B: the fixpoint loop ran and opened the closed accordion
    assert tel["rounds"] >= 1
    assert details_open is True