Files
breakpilot-compliance/backend-compliance/compliance/services/widerrufsbelehrung_reachability_check.py
T
Benjamin Admin b9baa8c603 feat(b13): Widerrufsbelehrung-Reachability-Check (GT WIDERRUFSBELEHRUNG-001)
Erkennt B2C-Shop ohne öffentlich erreichbare Widerrufsbelehrung.
Schließt eine der offenen GT-Lücken aus dem Elli-Audit.

Signale:
  - doc_entries[widerruf]: discovery_attempted=True + Text leer
  - kein Footer-Link auf Widerruf/cancellation/rückgabe
  - B2C-Scope: Warenkorb/Kasse/Bestellung/MwSt/Wallbox/Tarif (strong)
    vs Shop/Produkt/Rechnung (weak, ≥2 = likely)
  - B2B-only-Override: "ausschließlich an Unternehmer" etc.

Severity:
  - HIGH bei b2c_strong
  - MEDIUM bei b2c_likely
  - kein Finding bei b2b_only / unknown (False-Positive-Schutz)

Norm: Art. 246a § 1 Abs. 2 Nr. 1 EGBGB i.V.m. § 312d BGB.

Wiring:
  - widerrufsbelehrung_reachability_check.py — Check + Scope-Detection
  - _b13_wiring.py — Render + state-Anschluss
  - _orchestrator.py — run_b13 nach run_b12
  - mail_render_v2/_compose.py — widerruf_reach_html-Block

Tests: 13/13 grün (Scope-Detection 5 + Check-Logik 8).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-07 00:04:41 +02:00

184 lines
6.2 KiB
Python

"""B13 — Widerrufsbelehrung-Reachability-Check.

Detects: a B2C shop without a publicly reachable Widerrufsbelehrung
(statutory withdrawal notice).

Norm: Art. 246a § 1 Abs. 2 Nr. 1 EGBGB i.V.m. § 312d BGB — the
Widerrufsbelehrung must be permanently and easily accessible on the
website. A footer without a Widerruf link plus all Widerruf paths
returning 404 violates that.

Signals read from state:
  - doc_entries: an entry with doc_type='widerruf'. Discovery tried it
    (discovery_attempted=True), but the text is empty / below the
    minimum length → path not reachable.
  - DSE (privacy policy) / homepage / other texts: B2C scope detection
    via keywords.

Emits a single finding with severity HIGH for strong B2C signals and
MEDIUM for likely B2C (two or more weak signals); no finding for clear
B2B or for unknown scope.
"""
from __future__ import annotations
import logging
import re
logger = logging.getLogger(__name__)
# Min characters that count as 'reachable' widerruf doc.
_MIN_TEXT_CHARS = 400
# Strong B2C-Shop signals (Endkunden-Verkauf / E-Commerce).
_B2C_STRONG = (
"warenkorb", "in den warenkorb", "zur kasse", "kasse",
"bestellung aufgeben", "jetzt kaufen", "preis inkl. mwst",
"preis inkl. mehrwertsteuer", "lieferzeit", "versandkosten",
"rückgaberecht", "rueckgaberecht", "rücksende",
"endkunden", "endverbraucher", "verbraucher i.s.d. § 13 bgb",
)
# Weaker B2C signals — only count when paired with a strong one or
# when AT LEAST TWO appear together.
_B2C_WEAK = (
"shop", "store", "kaufen", "produkt", "ware", "rechnung",
"agb", "widerrufsfrist", "widerrufsrecht", "wallbox", "hardware",
"abonnement", "tarif buchen", "naturstrom", "ladetarif",
)
# Hard B2B-only signals that override B2C-Verdacht.
_B2B_ONLY = (
"ausschließlich an unternehmer", "ausschliesslich an unternehmer",
"nur für unternehmen", "b2b only", "kein verkauf an verbraucher",
"ausschluss verbraucher",
)
def _detect_b2c_scope(state: dict) -> tuple[str, list[str]]:
"""Return (scope, signals_found).
scope ∈ {'b2c_strong', 'b2c_likely', 'b2b_only', 'unknown'}
"""
haystack_parts: list[str] = []
for e in (state.get("doc_entries") or []):
t = (e.get("text") or "").lower()
if t:
haystack_parts.append(t)
home = (state.get("home_text") or "").lower()
if home:
haystack_parts.append(home)
hay = "\n".join(haystack_parts)
if not hay:
return "unknown", []
b2b_hits = [s for s in _B2B_ONLY if s in hay]
if b2b_hits:
return "b2b_only", b2b_hits
strong_hits = [s for s in _B2C_STRONG if s in hay]
if strong_hits:
return "b2c_strong", strong_hits
weak_hits = [s for s in _B2C_WEAK if s in hay]
if len(weak_hits) >= 2:
return "b2c_likely", weak_hits[:5]
return "unknown", weak_hits[:3]
def _footer_has_widerruf_link(state: dict) -> bool:
"""Best-effort scan for a Widerruf-link in footer / discovered URLs.
The discovery phase merges any same-owner widerruf URLs it finds
into doc_entries[widerruf].url. If that URL exists AND the
discovered text is non-empty, the page is reachable.
"""
for e in (state.get("doc_entries") or []):
if e.get("doc_type") != "widerruf":
continue
url = (e.get("url") or "").strip()
text = (e.get("text") or "").strip()
if url and len(text) >= _MIN_TEXT_CHARS:
return True
# Optional: scan a raw footer-snapshot if the orchestrator stored one.
footer = (state.get("footer_html") or "") + " " + \
(state.get("footer_text") or "")
if footer and re.search(
r"widerruf|cancellation|withdrawal|rückgabe|rueckgabe",
footer, re.IGNORECASE,
):
return True
return False
def check_widerrufsbelehrung_reachability(state: dict) -> list[dict]:
    """Emit a single finding when a B2C shop has no reachable Widerruf
    document and no footer link to one.

    Only the first ``doc_entries`` item with ``doc_type == 'widerruf'`` is
    inspected. The function stays silent (returns ``[]``) when:

    - there is no widerruf entry at all,
    - its stripped text has at least ``_MIN_TEXT_CHARS`` characters,
    - neither discovery ran nor a URL is present,
    - ``_footer_has_widerruf_link`` finds a page or footer link, or
    - ``_detect_b2c_scope`` reports ``'b2b_only'`` or ``'unknown'``.

    Returns:
        A list with one finding dict (severity ``HIGH`` for
        ``'b2c_strong'``, otherwise ``MEDIUM``), or an empty list.
    """
    widerruf_entry = next(
        (e for e in (state.get("doc_entries") or [])
         if e.get("doc_type") == "widerruf"),
        None,
    )
    if not widerruf_entry:
        # No widerruf processing happened at all → don't fabricate.
        return []

    discovery_tried = bool(widerruf_entry.get("discovery_attempted"))
    url = (widerruf_entry.get("url") or "").strip()
    text_len = len((widerruf_entry.get("text") or "").strip())

    if text_len >= _MIN_TEXT_CHARS:
        # The widerruf doc is actually reachable — no finding.
        return []
    if not discovery_tried and not url:
        # No widerruf URL is present and discovery did not run
        # (e.g. no homepage to crawl). Cannot make a claim.
        return []
    if _footer_has_widerruf_link(state):
        return []

    scope, signals = _detect_b2c_scope(state)
    if scope in ("b2b_only", "unknown"):
        # Without B2C indicators remain silent — false positives at
        # pure agency / B2B sites would erode trust in the report.
        return []

    is_strong = scope == "b2c_strong"

    # Strip each candidate separately so a whitespace-only url does not
    # shadow a usable rejected_url.
    tried_url = url or (widerruf_entry.get("rejected_url") or "").strip()
    tried_hint = f" (probiert: {tried_url})" if tried_url else ""

    # Only claim that discovery probed paths when it actually ran; otherwise
    # the finding is based solely on the URL stored on the entry.
    if discovery_tried:
        attempt = f"Discovery hat Widerruf-Pfade versucht{tried_hint}"
    else:
        attempt = f"Hinterlegte Widerruf-URL geprüft{tried_hint}"

    return [{
        "check_id": "WIDERRUF-REACH-001",
        "severity": "HIGH" if is_strong else "MEDIUM",
        "severity_reason": "missing" if is_strong else "unverifiable",
        "title": (
            "Widerrufsbelehrung nicht öffentlich erreichbar "
            "trotz B2C-Shop-Merkmalen"
        ),
        "norm": (
            "Art. 246a § 1 Abs. 2 Nr. 1 EGBGB i.V.m. § 312d BGB"
        ),
        "evidence": (
            f"{attempt} — keine "
            f"erreichbare Belehrung gefunden. Footer enthält keinen "
            f"Widerruf-Link. B2C-Signale: "
            f"{', '.join(signals[:3]) if signals else 'keine direkten'}."
        ),
        "action": (
            "Eigenständige Widerrufsbelehrungs-Seite (z.B. "
            "/widerrufsbelehrung) anlegen UND im Footer dauerhaft "
            "verlinken. Gesetzliche Musterbelehrung nach Anlage 1 zu "
            "Art. 246a EGBGB verwenden — eigene Formulierungen sind "
            "abmahnfähig."
        ),
        "b2c_scope": scope,
    }]