e8ff75cbfe
5 Backlog-Items aus dem Multi-Site-Briefing in einem Sprint:
1. B13 B2C-Soft-Hints — Versicherungs/Tarif/Buchungs-Marker
_B2C_WEAK erweitert um "Reiseversicherung", "Tarifrechner",
"Online-Antrag", "Flug buchen", "Stromtarif" etc.
Fängt Allianz-Reise-Chatbot (vorher False-Negative).
2. Chatbot-Policy-Discovery (chatbot_policy_discovery.py)
Probt 14 Standard-Slugs (privacypolicychatbot, chatbot-datenschutz,
ai-policy, ki-datenschutz, ...) × 5 Lang-Prefixe auf jeder
submitted Origin. Successful >300-Wort-Findings werden in
doc_texts['dse'] gemerged. Audit-Trail über
doc_entries[dse].chatbot_policy_sources.
Hebt Westfield-iAdvize-Lücke.
3. API-Response-Payload erweitert
phase_f_persist.response um extra_findings, audit_walk und
html_blocks erweitert. B-Wiring-Output (B1, B3-B18) ist nicht
mehr nur im Mail-HTML versteckt — externe Aufrufer sehen jeden
Finding. Schema additiv, legacy clients ignorieren neue Felder.
4. Plausibility-LLM Empty-Response-Fix
Resilienz-Strategie A→B→C→D:
A) format='json' (strict, default)
B) format='' (loose, _try_extract_json mit ```json-fence + prose-
wrap-Unterstützung)
C) Split-Batch-Recursion (vorhanden)
D) Give up, leeres dict (callers behandeln als skipped)
Plus _post_llm() als isolierter LLM-Call-Helper, catched
Network-Errors.
5. Specialist-Agents Phase 2 LLM (MVP) — Impressum-Agent
impressum_agent_llm.py: qwen3:30b-a3b mit § 5 TMG System-Prompt,
business_scope-hints aus profile_dict. Output identisches Schema
wie pattern-agent für ein Merge ohne API-Bruch.
_b18_wiring.py orchestriert beide Agents + deduplet nach
field_id, rendert lila V2-Block mit KB/LLM-Tags pro Finding.
Pattern-first im Dedup (deterministisch + stable).
Tests: 107/107 grün (7 Test-Suites + chatbot-discovery + b18).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
195 lines
6.7 KiB
Python
195 lines
6.7 KiB
Python
"""B13 — Widerrufsbelehrung-Reachability-Check.
|
|
|
|
Erkennt: B2C-Shop ohne öffentlich erreichbare Widerrufsbelehrung.
|
|
|
|
Norm: Art. 246a § 1 Abs. 2 Nr. 1 EGBGB i.V.m. § 312d BGB —
|
|
Widerrufsbelehrung muss dauerhaft + leicht zugänglich auf der Website
|
|
verfügbar sein. Footer ohne Widerruf-Link + alle Widerruf-Pfade 404
|
|
verletzt das.
|
|
|
|
Signale aus state:
|
|
- doc_entries: ein entry mit doc_type='widerruf'. Discovery hat es
|
|
versucht (discovery_attempted=True), aber Text ist leer / unter
|
|
Mindestlänge → Pfad nicht erreichbar.
|
|
- DSE / Homepage / sonstige Texte: B2C-Scope-Detection per Keywords.
|
|
|
|
Ein einzelnes Finding mit Schweregrad HIGH bei starken B2C-Signalen, MEDIUM
bei wahrscheinlichem B2C (nur schwache Signale, mindestens zwei), kein Finding
bei klarem B2B oder wenn kein B2C-Scope erkennbar ist.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import re
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# Minimum number of (stripped) characters a widerruf document text must
# have before it is treated as a reachable page.
_MIN_TEXT_CHARS = 400


# Strong B2C shop signals (consumer sales / e-commerce). Matched as
# lowercase substrings; a single hit is enough for scope 'b2c_strong'.
_B2C_STRONG = (
    "warenkorb",
    "in den warenkorb",
    "zur kasse",
    "kasse",
    "bestellung aufgeben",
    "jetzt kaufen",
    "preis inkl. mwst",
    "preis inkl. mehrwertsteuer",
    "lieferzeit",
    "versandkosten",
    "rückgaberecht",
    "rueckgaberecht",
    "rücksende",
    "endkunden",
    "endverbraucher",
    "verbraucher i.s.d. § 13 bgb",
)


# Weaker B2C signals. They only lead to scope 'b2c_likely' when at least
# two of them are found and no strong signal matched first.
_B2C_WEAK = (
    # Generic shop vocabulary
    "shop",
    "store",
    "kaufen",
    "produkt",
    "ware",
    "rechnung",
    "agb",
    "widerrufsfrist",
    "widerrufsrecht",
    "wallbox",
    "hardware",
    "abonnement",
    "tarif buchen",
    "naturstrom",
    "ladetarif",
    # Insurance / finance B2C
    "reiseversicherung",
    "versicherung abschließen",
    "versicherung kaufen",
    "online abschließen",
    "online-antrag",
    "antrag stellen",
    "police",
    "vertrag abschließen",
    "tarifrechner",
    "beitrag berechnen",
    "jetzt online",
    # Telecom / energy / mobile B2C
    "vertrag buchen",
    "tarif wechseln",
    "stromtarif",
    "gastarif",
    "mobilfunkvertrag",
    "dsl-tarif",
    # Travel / hotel / mobility B2C
    "buchen",
    "reservieren",
    "buchung",
    "ticket kaufen",
    "fahrkarte",
    "flug buchen",
)


# Hard B2B-only statements; any hit overrides every B2C suspicion.
_B2B_ONLY = (
    "ausschließlich an unternehmer",
    "ausschliesslich an unternehmer",
    "nur für unternehmen",
    "b2b only",
    "kein verkauf an verbraucher",
    "ausschluss verbraucher",
)
|
|
|
|
|
|
def _independent_hits(hits: list[str], hay: str) -> list[str]:
    """Drop keyword hits that only occur inside a longer matched keyword.

    A keyword that is a substring of another matched keyword (e.g. "ware"
    inside "hardware") is kept only if it also occurs somewhere in *hay*
    outside of those longer keywords. Order of *hits* is preserved.
    """
    kept: list[str] = []
    for kw in hits:
        longer = [other for other in hits if other != kw and kw in other]
        if longer:
            masked = hay
            for other in longer:
                # NUL is not part of any keyword, so masking cannot create
                # a new accidental match across the replaced span.
                masked = masked.replace(other, "\0")
            if kw not in masked:
                continue
        kept.append(kw)
    return kept


def _detect_b2c_scope(state: dict) -> tuple[str, list[str]]:
    """Classify the business scope of a site from its collected texts.

    The lowercase texts of all ``state["doc_entries"]`` plus
    ``state["home_text"]`` are searched for keyword substrings.

    Returns:
        ``(scope, signals_found)`` where scope is one of

        * ``'b2b_only'``   - any ``_B2B_ONLY`` phrase matched (checked first),
        * ``'b2c_strong'`` - any ``_B2C_STRONG`` keyword matched,
        * ``'b2c_likely'`` - at least two independent ``_B2C_WEAK`` keywords
          matched (at most five are returned),
        * ``'unknown'``    - otherwise, including when there is no text at
          all (at most three weak hits are returned).
    """
    texts = [
        (entry.get("text") or "").lower()
        for entry in (state.get("doc_entries") or [])
    ]
    texts.append((state.get("home_text") or "").lower())
    hay = "\n".join(t for t in texts if t)

    if not hay:
        return "unknown", []

    b2b_hits = [s for s in _B2B_ONLY if s in hay]
    if b2b_hits:
        return "b2b_only", b2b_hits

    strong_hits = [s for s in _B2C_STRONG if s in hay]
    if strong_hits:
        return "b2c_strong", strong_hits

    # Nested keywords ("ware" in "hardware", "kaufen" in "ticket kaufen")
    # must not let a single phrase satisfy the two-signal rule on its own.
    weak_hits = _independent_hits([s for s in _B2C_WEAK if s in hay], hay)
    if len(weak_hits) >= 2:
        return "b2c_likely", weak_hits[:5]
    return "unknown", weak_hits[:3]
|
|
|
|
|
|
def _footer_has_widerruf_link(state: dict) -> bool:
    """Report whether a Widerruf page or footer reference is visible.

    Returns True when either

    * some ``doc_entries`` item with ``doc_type == 'widerruf'`` has a
      non-blank ``url`` and a stripped ``text`` of at least
      ``_MIN_TEXT_CHARS`` characters, or
    * the optional ``footer_html`` / ``footer_text`` snapshot in *state*
      mentions a withdrawal/return keyword (case-insensitive).

    NOTE(review): per the original author's note, the discovery phase is
    expected to store discovered widerruf URLs in the entry's ``url`` -
    not verifiable from this module.
    """
    def _is_reachable(entry: dict) -> bool:
        link = (entry.get("url") or "").strip()
        body = (entry.get("text") or "").strip()
        # Short-circuits on an empty link, exactly like the length check
        # being skipped when no URL is known.
        return bool(link and len(body) >= _MIN_TEXT_CHARS)

    widerruf_docs = (
        entry for entry in (state.get("doc_entries") or [])
        if entry.get("doc_type") == "widerruf"
    )
    if any(_is_reachable(doc) for doc in widerruf_docs):
        return True

    # Fall back to a raw footer snapshot if the orchestrator stored one.
    footer_blob = " ".join(
        (state.get("footer_html") or "", state.get("footer_text") or "")
    )
    keyword_pattern = r"widerruf|cancellation|withdrawal|rückgabe|rueckgabe"
    return re.search(keyword_pattern, footer_blob, re.IGNORECASE) is not None
|
|
|
|
|
|
def check_widerrufsbelehrung_reachability(state: dict) -> list[dict]:
    """Emit at most one finding for a B2C site without reachable Widerruf.

    Only the first ``doc_entries`` item with ``doc_type == 'widerruf'`` is
    inspected. No finding is produced when

    * there is no such entry,
    * its stripped text has at least ``_MIN_TEXT_CHARS`` characters,
    * neither discovery was attempted nor a URL is present,
    * ``_footer_has_widerruf_link`` reports a link, or
    * ``_detect_b2c_scope`` yields ``'b2b_only'`` or ``'unknown'``.

    Otherwise a single finding dict is returned with severity ``HIGH``
    for scope ``'b2c_strong'`` and ``MEDIUM`` for ``'b2c_likely'``.
    """
    widerruf_entry = next(
        (
            entry for entry in (state.get("doc_entries") or [])
            if entry.get("doc_type") == "widerruf"
        ),
        None,
    )
    if not widerruf_entry:
        # No widerruf processing happened at all -> don't fabricate.
        return []

    discovery_tried = bool(widerruf_entry.get("discovery_attempted"))
    text_len = len((widerruf_entry.get("text") or "").strip())
    entry_url = (widerruf_entry.get("url") or "").strip()

    if text_len >= _MIN_TEXT_CHARS:
        # The widerruf document is actually reachable - no finding.
        return []

    if not discovery_tried and not entry_url:
        # No URL was provided and discovery did not run, so no claim
        # about reachability can be made.
        return []

    if _footer_has_widerruf_link(state):
        return []

    scope, signals = _detect_b2c_scope(state)
    if scope in ("b2b_only", "unknown"):
        # Stay silent without B2C indicators: false positives on pure
        # agency / B2B sites would erode trust in the report.
        return []

    is_strong = scope == "b2c_strong"
    sev = "HIGH" if is_strong else "MEDIUM"
    sev_reason = "missing" if is_strong else "unverifiable"

    # Strip before falling back so a whitespace-only 'url' does not hide
    # a usable 'rejected_url' (consistent with the blank-URL check above).
    tried_url = entry_url or (widerruf_entry.get("rejected_url") or "").strip()
    tried_hint = f" (probiert: {tried_url})" if tried_url else ""

    return [{
        "check_id": "WIDERRUF-REACH-001",
        "severity": sev,
        "severity_reason": sev_reason,
        "title": (
            "Widerrufsbelehrung nicht öffentlich erreichbar "
            "trotz B2C-Shop-Merkmalen"
        ),
        "norm": (
            "Art. 246a § 1 Abs. 2 Nr. 1 EGBGB i.V.m. § 312d BGB"
        ),
        "evidence": (
            f"Discovery hat Widerruf-Pfade versucht{tried_hint} — keine "
            f"erreichbare Belehrung gefunden. Footer enthält keinen "
            f"Widerruf-Link. B2C-Signale: "
            f"{', '.join(signals[:3]) if signals else 'keine direkten'}."
        ),
        "action": (
            "Eigenständige Widerrufsbelehrungs-Seite (z.B. "
            "/widerrufsbelehrung) anlegen UND im Footer dauerhaft "
            "verlinken. Gesetzliche Musterbelehrung nach Anlage 1 zu "
            "Art. 246a EGBGB verwenden — eigene Formulierungen sind "
            "abmahnfähig."
        ),
        "b2c_scope": scope,
    }]
|