"""B13 — Widerrufsbelehrung-Reachability-Check. Erkennt: B2C-Shop ohne öffentlich erreichbare Widerrufsbelehrung. Norm: Art. 246a § 1 Abs. 2 Nr. 1 EGBGB i.V.m. § 312d BGB — Widerrufsbelehrung muss dauerhaft + leicht zugänglich auf der Website verfügbar sein. Footer ohne Widerruf-Link + alle Widerruf-Pfade 404 verletzt das. Signale aus state: - doc_entries: ein entry mit doc_type='widerruf'. Discovery hat es versucht (discovery_attempted=True), aber Text ist leer / unter Mindestlänge → Pfad nicht erreichbar. - DSE / Homepage / sonstige Texte: B2C-Scope-Detection per Keywords. Ein einzelnes Finding mit Schweregrad HIGH bei B2C, MEDIUM bei unklarem Scope, kein Finding bei klarem B2B. """ from __future__ import annotations import logging import re logger = logging.getLogger(__name__) # Min characters that count as 'reachable' widerruf doc. _MIN_TEXT_CHARS = 400 # Strong B2C-Shop signals (Endkunden-Verkauf / E-Commerce). _B2C_STRONG = ( "warenkorb", "in den warenkorb", "zur kasse", "kasse", "bestellung aufgeben", "jetzt kaufen", "preis inkl. mwst", "preis inkl. mehrwertsteuer", "lieferzeit", "versandkosten", "rückgaberecht", "rueckgaberecht", "rücksende", "endkunden", "endverbraucher", "verbraucher i.s.d. § 13 bgb", ) # Weaker B2C signals — only count when paired with a strong one or # when AT LEAST TWO appear together. _B2C_WEAK = ( "shop", "store", "kaufen", "produkt", "ware", "rechnung", "agb", "widerrufsfrist", "widerrufsrecht", "wallbox", "hardware", "abonnement", "tarif buchen", "naturstrom", "ladetarif", # Versicherungs- / Finanz-B2C "reiseversicherung", "versicherung abschließen", "versicherung kaufen", "online abschließen", "online-antrag", "antrag stellen", "police", "vertrag abschließen", "tarifrechner", "beitrag berechnen", "jetzt online", # Telekom / Energie / Mobilfunk B2C "vertrag buchen", "tarif wechseln", "stromtarif", "gastarif", "mobilfunkvertrag", "dsl-tarif", # Reise / Hotel / Mobility B2C "buchen", "reservieren", "buchung", "ticket kaufen", "fahrkarte", "flug buchen", ) # Hard B2B-only signals that override B2C-Verdacht. _B2B_ONLY = ( "ausschließlich an unternehmer", "ausschliesslich an unternehmer", "nur für unternehmen", "b2b only", "kein verkauf an verbraucher", "ausschluss verbraucher", ) def _detect_b2c_scope(state: dict) -> tuple[str, list[str]]: """Return (scope, signals_found). scope ∈ {'b2c_strong', 'b2c_likely', 'b2b_only', 'unknown'} """ haystack_parts: list[str] = [] for e in (state.get("doc_entries") or []): t = (e.get("text") or "").lower() if t: haystack_parts.append(t) home = (state.get("home_text") or "").lower() if home: haystack_parts.append(home) hay = "\n".join(haystack_parts) if not hay: return "unknown", [] b2b_hits = [s for s in _B2B_ONLY if s in hay] if b2b_hits: return "b2b_only", b2b_hits strong_hits = [s for s in _B2C_STRONG if s in hay] if strong_hits: return "b2c_strong", strong_hits weak_hits = [s for s in _B2C_WEAK if s in hay] if len(weak_hits) >= 2: return "b2c_likely", weak_hits[:5] return "unknown", weak_hits[:3] def _footer_has_widerruf_link(state: dict) -> bool: """Best-effort scan for a Widerruf-link in footer / discovered URLs. The discovery phase merges any same-owner widerruf URLs it finds into doc_entries[widerruf].url. If that URL exists AND the discovered text is non-empty, the page is reachable. """ for e in (state.get("doc_entries") or []): if e.get("doc_type") != "widerruf": continue url = (e.get("url") or "").strip() text = (e.get("text") or "").strip() if url and len(text) >= _MIN_TEXT_CHARS: return True # Optional: scan a raw footer-snapshot if the orchestrator stored one. footer = (state.get("footer_html") or "") + " " + \ (state.get("footer_text") or "") if footer and re.search( r"widerruf|cancellation|withdrawal|rückgabe|rueckgabe", footer, re.IGNORECASE, ): return True return False def check_widerrufsbelehrung_reachability(state: dict) -> list[dict]: """Emit a single finding when a B2C-Shop has no reachable Widerruf document and no footer link to one.""" widerruf_entry = next( (e for e in (state.get("doc_entries") or []) if e.get("doc_type") == "widerruf"), None, ) if not widerruf_entry: # No widerruf processing happened at all → don't fabricate. return [] discovery_tried = bool(widerruf_entry.get("discovery_attempted")) text_len = len((widerruf_entry.get("text") or "").strip()) has_url = bool((widerruf_entry.get("url") or "").strip()) if text_len >= _MIN_TEXT_CHARS: # widerruf doc is actually reachable — no finding. return [] if not discovery_tried and not has_url: # User did not submit a widerruf URL and discovery did not run # (e.g. no homepage to crawl). Cannot make a claim. return [] if _footer_has_widerruf_link(state): return [] scope, signals = _detect_b2c_scope(state) if scope == "b2b_only": return [] if scope == "unknown": # Without B2C-Indikatoren remain silent — false positives at # pure agency / B2B sites would erode trust in the report. return [] sev = "HIGH" if scope == "b2c_strong" else "MEDIUM" sev_reason = "missing" if scope == "b2c_strong" else "unverifiable" tried_url = (widerruf_entry.get("url") or widerruf_entry.get("rejected_url") or "").strip() tried_hint = f" (probiert: {tried_url})" if tried_url else "" return [{ "check_id": "WIDERRUF-REACH-001", "severity": sev, "severity_reason": sev_reason, "title": ( "Widerrufsbelehrung nicht öffentlich erreichbar " "trotz B2C-Shop-Merkmalen" ), "norm": ( "Art. 246a § 1 Abs. 2 Nr. 1 EGBGB i.V.m. § 312d BGB" ), "evidence": ( f"Discovery hat Widerruf-Pfade versucht{tried_hint} — keine " f"erreichbare Belehrung gefunden. Footer enthält keinen " f"Widerruf-Link. B2C-Signale: " f"{', '.join(signals[:3]) if signals else 'keine direkten'}." ), "action": ( "Eigenständige Widerrufsbelehrungs-Seite (z.B. " "/widerrufsbelehrung) anlegen UND im Footer dauerhaft " "verlinken. Gesetzliche Musterbelehrung nach Anlage 1 zu " "Art. 246a EGBGB verwenden — eigene Formulierungen sind " "abmahnfähig." ), "b2c_scope": scope, }]