feat: Backlog 1-5 — soft-hints, chatbot-discovery, API-payload, LLM-Agent

5 Backlog-Items aus dem Multi-Site-Briefing in einem Sprint: 1. B13 B2C-Soft-Hints — Versicherungs/Tarif/Buchungs-Marker _B2C_WEAK erweitert um "Reiseversicherung", "Tarifrechner", "Online-Antrag", "Flug buchen", "Stromtarif" etc. Fängt Allianz-Reise-Chatbot (vorher False-Negative). 2. Chatbot-Policy-Discovery (chatbot_policy_discovery.py) — Probt 14 Standard-Slugs (privacypolicychatbot, chatbot-datenschutz, ai-policy, ki-datenschutz, ...) × 5 Lang-Prefixe auf jeder submitted Origin. Successful >300-Wort-Findings werden in doc_texts['dse'] gemerged. Audit-Trail über doc_entries[dse].chatbot_policy_sources. Behebt die Westfield-iAdvize-Lücke. 3. API-Response-Payload erweitert — phase_f_persist.response um extra_findings, audit_walk und html_blocks erweitert. B-Wiring-Output (B1, B3-B18) ist nicht mehr nur im Mail-HTML versteckt — externe Aufrufer sehen jeden Finding. Schema additiv, legacy clients ignorieren neue Felder. 4. Plausibility-LLM Empty-Response-Fix — Resilienz-Strategie A→B→C→D: A) format='json' (strict, default) B) format='' (loose, _try_extract_json mit ```json-fence + prose-wrap-Unterstützung) C) Split-Batch-Recursion (vorhanden) D) Give up, leeres dict (callers behandeln als skipped) Plus _post_llm() als isolierter LLM-Call-Helper, fängt Network-Errors ab. 5. Specialist-Agents Phase 2 LLM (MVP) — Impressum-Agent impressum_agent_llm.py: qwen3:30b-a3b mit § 5 TMG System-Prompt, business_scope-hints aus profile_dict. Output identisches Schema wie pattern-agent für ein Merge ohne API-Bruch. _b18_wiring.py orchestriert beide Agents + dedupliziert nach field_id, rendert lila V2-Block mit KB/LLM-Tags pro Finding. Pattern-first im Dedup (deterministisch + stable). Tests: 107/107 grün (7 Test-Suites + chatbot-discovery + b18). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-07 18:41:54 +02:00
parent a2cae94526
commit e8ff75cbfe
11 changed files with 832 additions and 34 deletions
@@ -0,0 +1,161 @@
+"""Discover separate chatbot-/AI-policy pages and merge them into the
+main DSE text.
+
+Many sites publish their chatbot data-protection notice on a separate
+URL (e.g. westfield.com/germany/privacypolicychatbot) that the regular
+auto-discovery misses because it doesn't classify as 'dse'. As a
+result, B12/B15 (chatbot-cookie classification, AI-Act legal basis)
+never see the iAdvize/Vertex provider names.
+
+Strategy:
+  1. From the discovered URLs derive the base host.
+  2. Probe a fixed list of well-known chatbot-policy paths.
+  3. For each 2xx-response with > 300 words, merge the text into
+     state['doc_texts']['dse'] with a separator.
+
+Best-effort: a probe failure NEVER aborts the check.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+import re
+from urllib.parse import urlparse
+
+import httpx
+
+logger = logging.getLogger(__name__)
+
+
+# Candidate slugs, ordered from most common to least common.
+_CHATBOT_POLICY_SLUGS = (
+    "privacypolicychatbot",
+    "chatbot-datenschutz", "chatbot/datenschutz",
+    "datenschutz-chatbot", "datenschutz/chatbot",
+    "ai-policy", "ai-datenschutz", "ki-datenschutz",
+    "privacy-chatbot", "privacy-ai",
+    "datenschutz-ki", "datenschutz-assistent",
+    "chatbot-privacy", "ai-privacy",
+)
+
+
+# Language/region path prefixes that get probed; "" is the origin root.
+_LANG_PREFIXES = ("", "/de", "/de_DE", "/en", "/germany")
+
+
+def _build_candidate_urls(base_origin: str) -> list[str]:
+    """Return the de-duplicated candidate URLs for one origin.
+
+    Every language prefix is combined with every policy slug. The
+    prefix loop is the outer one, so all slugs for "" come first, then
+    all slugs for "/de", and so on.
+    """
+    # A dict is used as an insertion-ordered set.
+    candidates: dict[str, None] = {}
+    for prefix in _LANG_PREFIXES:
+        for slug in _CHATBOT_POLICY_SLUGS:
+            # Collapse double slashes, then restore the "//" after the
+            # scheme, which the collapse removed as well.
+            joined = f"{base_origin}{prefix}/{slug}".replace("//", "/")
+            joined = joined.replace("https:/", "https://")
+            joined = joined.replace("http:/", "http://")
+            candidates.setdefault(joined, None)
+    return list(candidates)
+
+
+async def _probe(url: str, timeout_s: float = 4.0) -> tuple[str, str] | None:
+    """Fetch ``url`` and return ``(url, text)`` when it has enough text.
+
+    The response must have a status code below 400 and, once scripts,
+    styles and tags are stripped, at least 300 whitespace-separated
+    words. Any exception raised along the way yields ``None``.
+    """
+    # (pattern, flags) pairs, applied in order, that reduce HTML to text.
+    cleanup_steps = (
+        (r"<script.*?</script>", re.S | re.I),
+        (r"<style.*?</style>", re.S | re.I),
+        (r"<[^>]+>", 0),
+    )
+    try:
+        async with httpx.AsyncClient(
+            timeout=timeout_s, follow_redirects=True,
+        ) as client:
+            response = await client.get(url)
+            if response.status_code >= 400:
+                return None
+            body = response.text
+            for pattern, flags in cleanup_steps:
+                body = re.sub(pattern, " ", body, flags=flags)
+            body = re.sub(r"\s+", " ", body).strip()
+            word_count = len(body.split())
+            return (url, body) if word_count >= 300 else None
+    except Exception:
+        return None
+
+
+def _base_origins(doc_entries: list[dict]) -> list[str]:
+    """Collect the distinct ``scheme://netloc`` origins of the entries.
+
+    Entries without a URL, or whose URL lacks a scheme or a host, are
+    skipped. First-seen order is preserved.
+    """
+    # A dict is used as an insertion-ordered set.
+    origins: dict[str, None] = {}
+    for entry in doc_entries:
+        raw = (entry.get("url") or "").strip()
+        if not raw:
+            continue
+        try:
+            parts = urlparse(raw)
+        except Exception:
+            continue
+        if parts.scheme and parts.netloc:
+            origins.setdefault(f"{parts.scheme}://{parts.netloc}", None)
+    return list(origins)
+
+
+async def enrich_dse_with_chatbot_policies(state: dict) -> dict:
+    """Probe known chatbot-policy paths; merge findings into DSE text.
+
+    Origins are derived from ``state['doc_entries']``; at most the first
+    two are used. Every language-prefix x slug candidate of those
+    origins is probed, and each distinct page text that passes the
+    probe is appended to ``state['doc_texts']['dse']`` behind a
+    separator line naming its URL.
+
+    Side effects (only when at least one page was found):
+      - ``state['doc_texts']['dse']`` is replaced by the merged text.
+      - The first entry with ``doc_type == 'dse'`` gets
+        ``chatbot_policy_sources`` (list of merged URLs) and ``text``
+        (the merged text).
+
+    Returns:
+        ``{"probed": int, "found": list[str], "merged_chars": int}``
+        describing what was probed and merged.
+    """
+    doc_entries = state.get("doc_entries") or []
+    origins = _base_origins(doc_entries)
+    if not origins:
+        return {"probed": 0, "found": [], "merged_chars": 0}
+
+    # Probe the complete prefix x slug matrix. The candidate list is
+    # built prefix-major, so truncating it to 20 entries would drop the
+    # later prefixes entirely (e.g. "/germany/privacypolicychatbot").
+    candidates: list[str] = []
+    for origin in origins[:2]:  # cap origins for safety
+        candidates.extend(_build_candidate_urls(origin))
+
+    if not candidates:
+        return {"probed": 0, "found": [], "merged_chars": 0}
+
+    # Bound the number of in-flight requests so the full candidate list
+    # does not hit the target host in one burst.
+    limiter = asyncio.Semaphore(20)
+
+    async def _bounded_probe(candidate: str):
+        async with limiter:
+            return await _probe(candidate)
+
+    results = await asyncio.gather(
+        *[_bounded_probe(u) for u in candidates],
+        return_exceptions=True,
+    )
+
+    # Keep one hit per distinct text: redirects and catch-all pages can
+    # make several candidate URLs return the very same body.
+    found: list[tuple[str, str]] = []
+    seen_texts: set[str] = set()
+    for r in results:
+        if not (isinstance(r, tuple) and r):
+            continue
+        url, text = r
+        if text in seen_texts:
+            continue
+        seen_texts.add(text)
+        found.append((url, text))
+
+    if not found:
+        return {"probed": len(candidates), "found": [], "merged_chars": 0}
+
+    # Merge into DSE text.
+    doc_texts = state.setdefault("doc_texts", {})
+    pieces: list[str] = [doc_texts.get("dse") or ""]
+    for url, text in found:
+        pieces.append(
+            f"\n\n--- ergänzt aus {url} (chatbot-policy-discovery) ---\n\n"
+        )
+        pieces.append(text)
+    dse_text = "".join(pieces)
+    appended_urls = [url for url, _ in found]
+    appended_chars = sum(len(text) for _, text in found)
+    doc_texts["dse"] = dse_text
+
+    # Also record on the dse-entry (audit trail).
+    for e in doc_entries:
+        if e.get("doc_type") == "dse":
+            e["chatbot_policy_sources"] = appended_urls
+            e["text"] = dse_text
+            break
+
+    logger.info(
+        "chatbot-policy enrichment: %d candidate(s) probed, %d found, "
+        "+%d chars merged into DSE",
+        len(candidates), len(found), appended_chars,
+    )
+    return {
+        "probed": len(candidates),
+        "found": appended_urls,
+        "merged_chars": appended_chars,
+    }
@@ -132,54 +132,102 @@ def _build_user_prompt(items: list[dict], doc_title: str,
    )


+async def _post_llm(body: dict) -> str:
+    """Send one chat request to the LLM and return the reply text.
+
+    Every exception raised by the request or by decoding the reply is
+    logged as a warning and turned into ``""``, so the caller can pick
+    a fallback strategy.
+    """
+    try:
+        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
+            response = await client.post(
+                f"{OLLAMA_URL}/api/chat", json=body,
+            )
+            response.raise_for_status()
+            message = response.json().get("message") or {}
+            return message.get("content", "") or ""
+    except Exception as exc:
+        logger.warning("plausibility LLM call failed: %s", exc)
+        return ""
+
+
+def _try_extract_json(content: str) -> dict | None:
+    """Extract a JSON object from free-form LLM output.
+
+    Handles markdown-fenced and prose-wrapped responses.
+
+    Returns:
+        The decoded object, or ``None`` when the input is empty, nothing
+        decodes, or the decoded value is not a JSON object. Lists,
+        strings and numbers are rejected so that callers can rely on
+        dict methods such as ``.get``.
+    """
+    if not content:
+        return None
+    s = content.strip()
+    # Strip ```json … ``` fences
+    if s.startswith("```"):
+        s = s.strip("`")
+        if s.lower().startswith("json"):
+            s = s[4:]
+        s = s.strip()
+    # Heuristic: cut from first { to last }
+    first = s.find("{")
+    last = s.rfind("}")
+    if first >= 0 and last > first:
+        s = s[first:last + 1]
+    try:
+        data = json.loads(s)
+    except (ValueError, RecursionError):
+        # ValueError covers json.JSONDecodeError; RecursionError guards
+        # against pathologically nested model output.
+        return None
+    return data if isinstance(data, dict) else None
+
+
 async def _ask_llm_batch(items: list[dict], doc_title: str,
                          doc_excerpt: str) -> dict[str, dict]:
-    """Send a batch of up to BATCH_SIZE findings to the LLM."""
-    body = {
+    """Send a batch of up to BATCH_SIZE findings to the LLM.
+
+    Resilience strategy (P125 fix for empty-response bug):
+      A. format='json' (strict) — current default
+      B. If A returns empty: format='' (loose), extract JSON manually
+      C. If B also empty AND batch >2: split batch + recurse
+      D. Else: give up, return {} (callers stamp llm_skipped=true)
+    """
+    user_prompt = _build_user_prompt(items, doc_title, doc_excerpt)
+    base_body = {
        "model": MODEL,
        "messages": [
            {"role": "system", "content": _SYSTEM_PROMPT},
-            {"role": "user", "content": _build_user_prompt(
-                items, doc_title, doc_excerpt,
-            )},
+            {"role": "user", "content": user_prompt},
        ],
-        "format": "json",
        "stream": False,
        "options": {"temperature": 0.0, "seed": 42, "num_predict": 1500},
    }
    out: dict[str, dict] = {}
    input_ids = [it["id"] for it in items]
    try:
-        async with httpx.AsyncClient(timeout=TIMEOUT) as c:
-            r = await c.post(f"{OLLAMA_URL}/api/chat", json=body)
-            r.raise_for_status()
-            content = (r.json().get("message") or {}).get("content", "")
-            if not content:
-                # Single retry with smaller batch — qwen3 sometimes
-                # rejects ≥6-item prompts under format='json'.
-                if len(items) > 2:
-                    half = len(items) // 2
-                    logger.info(
-                        "plausibility empty → retry split %d → %dx2",
-                        len(items), half,
-                    )
-                    first = await _ask_llm_batch(
-                        items[:half], doc_title, doc_excerpt,
-                    )
-                    second = await _ask_llm_batch(
-                        items[half:], doc_title, doc_excerpt,
-                    )
-                    out.update(first)
-                    out.update(second)
-                    return out
-                logger.warning("plausibility LLM returned empty content")
+        # Strategy A: format='json'
+        content = await _post_llm({**base_body, "format": "json"})
+        if not content:
+            # Strategy B: format-free, parse-on-our-side
+            logger.info(
+                "plausibility A→empty, trying B (format-free) batch=%d",
+                len(items),
+            )
+            content = await _post_llm(base_body)
+
+        if not content:
+            # Strategy C: split + recurse
+            if len(items) > 2:
+                half = len(items) // 2
+                logger.info(
+                    "plausibility A+B empty → split %d → %dx2",
+                    len(items), half,
+                )
+                first = await _ask_llm_batch(
+                    items[:half], doc_title, doc_excerpt,
+                )
+                second = await _ask_llm_batch(
+                    items[half:], doc_title, doc_excerpt,
+                )
+                out.update(first)
+                out.update(second)
                return out
-            try:
-                data = json.loads(content)
-            except json.JSONDecodeError as je:
+            # Strategy D: give up
+            logger.warning(
+                "plausibility gave up after A+B for batch=%d", len(items),
+            )
+            return out
+            data = _try_extract_json(content)
+            if data is None:
                logger.warning(
-                    "plausibility LLM JSON parse failed: %s; raw=%s",
-                    je, content[:300],
+                    "plausibility LLM JSON parse failed (after fallback); "
+                    "raw=%s", content[:300],
                )
                return out
            llm_findings = data.get("findings") or []
@@ -58,6 +58,8 @@ def compose_v2(state: dict) -> str:
        state.get("url_slug_drift_html", ""),
        # B17 Audit-Walk-Video (Beweis-Aufzeichnung)
        state.get("audit_walk_html", ""),
+        # B18 Impressum-Specialist-Agent (Pattern + LLM)
+        state.get("impressum_agent_html", ""),
        # Browser-Matrix (Stage 1.c)
        state.get("browser_matrix_html", ""),
        # All legacy build_*_html() wrapped in V2 sections — preserves
@@ -0,0 +1,166 @@
+"""Impressum-Specialist-Agent Phase 2 — LLM-gestützt.
+
+Complements the pattern-match agent (impressum_agent.py) with an LLM
+pass. Both agents emit findings in the same format, so the B-wiring
+can combine / de-duplicate them.
+
+LLM setup:
+  - Model: qwen3:30b-a3b (standard Ollama, see the plausibility check)
+  - System prompt: knowledge base of the § 5 TMG mandatory details
+  - User prompt: Impressum text + business_scope hint
+  - Output: JSON object {"findings": [...]}, each element with
+    {field_id, severity, title, evidence, action}
+
+Phase-2 goal: find gaps that are hard to capture with regexes, e.g.
+  - a "Geschäftsführer" is named, but without first or last name
+  - the supervisory-authority duty is recognised, but for the wrong sector
+  - the authorised representatives of a GmbH with several persons are
+    listed incompletely
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+import re
+
+import httpx
+
+logger = logging.getLogger(__name__)
+
+# Base URL of the Ollama server; overridable via the OLLAMA_URL env var.
+OLLAMA_URL = os.environ.get(
+    "OLLAMA_URL", "http://bp-core-ollama:11434",
+)
+# Model name sent in the chat request body.
+MODEL = os.environ.get("IMPRESSUM_AGENT_MODEL", "qwen3:30b-a3b")
+# Timeout handed to httpx.AsyncClient (presumably seconds, per httpx).
+TIMEOUT = float(os.environ.get("IMPRESSUM_AGENT_TIMEOUT", "60"))
+
+
+# System prompt (German, sent to the model verbatim): asks for a check of
+# an Impressum text against the § 5 TMG mandatory details and for a reply
+# that is a JSON object with a "findings" list. Runtime text — keep the
+# wording and the field names in sync with the response parsing.
+_SYSTEM_PROMPT = """Du bist ein deutscher Datenschutz-Anwalt mit Fokus
+§ 5 TMG / DDG (Anbieterkennzeichnung). Deine Aufgabe: einen Impressum-
+Text auf Vollständigkeit der Pflichtangaben prüfen und Lücken /
+Mängel strukturiert auflisten.
+
+Pflichtangaben nach § 5 TMG (Standard):
+  - Anbieter-Name + Anschrift (juristische Person: Firma + Sitz)
+  - Vertretungsberechtigte (bei juristischen Personen: ALLE Geschäftsführer
+    mit Vor- und Nachname)
+  - E-Mail UND Telefon (Schnelle elektronische Kontaktaufnahme + UNMITTELBAR)
+  - Handelsregister-Eintrag (HRB/HRA + Registergericht)
+  - USt-IdNr. (falls vorhanden — DE\\d{9})
+  - Bei B2C/Onlineshop: Verbraucherschlichtung + OS-Plattform
+  - Bei reglementiertem Beruf: Berufsbezeichnung + Kammer
+  - Bei genehmigungspflichtigen Tätigkeiten: Aufsichtsbehörde
+
+Ausgabe: NUR gültiges JSON mit Feld "findings", jedes Element:
+  {
+    "field_id": "kurzer-id",
+    "severity": "HIGH"|"MEDIUM"|"LOW",
+    "title": "kurze Lücken-Beschreibung",
+    "evidence": "wörtliches Zitat aus dem Impressum, das das Problem belegt",
+    "action": "konkrete Empfehlung"
+  }
+
+Keine Erklärung außerhalb JSON. Keine Prosa. Wenn alles vollständig:
+gib {"findings": []} zurück.
+"""
+
+
+def _user_prompt(impressum_text: str,
+                  business_scope: set[str] | None) -> str:
+    """Assemble the user message for the Impressum LLM call.
+
+    An optional line with the alphabetically sorted business-scope
+    hints comes first, followed by the Impressum text cut to 4000
+    characters and the closing instruction.
+    """
+    parts: list[str] = []
+    if business_scope:
+        hints = ", ".join(sorted(business_scope))
+        parts.append(f"BUSINESS-SCOPE-HINTS: {hints}\n\n")
+    parts.append(f"IMPRESSUM-TEXT:\n{impressum_text[:4000]}\n\n")
+    parts.append("Liste Lücken nach § 5 TMG. Nur JSON.")
+    return "".join(parts)
+
+
+def _parse_response(content: str) -> list[dict]:
+    """Extract the findings list from raw LLM output.
+
+    Accepts a JSON object with a "findings" list or a bare JSON array,
+    either of them optionally wrapped in a ```json fence or in prose.
+
+    Returns:
+        The decoded list, or ``[]`` when no findings list can be
+        decoded. Elements are not validated here.
+    """
+    if not content:
+        return []
+    s = content.strip()
+    if s.startswith("```"):
+        s = s.strip("`")
+        if s.lower().startswith("json"):
+            s = s[4:]
+        s = s.strip()
+
+    # Fallback substrings: the {...} span and the [...] span, ordered by
+    # where they start so that the outermost structure is tried first.
+    # Always slicing the object span first would reduce a one-element
+    # array '[{...}]' to its inner object and lose the finding.
+    spans: list[tuple[int, str]] = []
+    for opener, closer in (("{", "}"), ("[", "]")):
+        first = s.find(opener)
+        last = s.rfind(closer)
+        if first >= 0 and last > first:
+            spans.append((first, s[first:last + 1]))
+    spans.sort(key=lambda span: span[0])
+
+    for candidate in [s, *(chunk for _, chunk in spans)]:
+        try:
+            data = json.loads(candidate)
+        except (ValueError, RecursionError):
+            continue
+        findings = data.get("findings") if isinstance(data, dict) else data
+        if isinstance(findings, list):
+            return findings
+    return []
+
+
+async def evaluate_llm(
+    impressum_text: str,
+    business_scope: set[str] | None = None,
+) -> list[dict]:
+    """Run the LLM-backed Impressum analysis.
+
+    Args:
+        impressum_text: Plain text of the Impressum. Texts with fewer
+            than 100 non-padding characters are not sent to the model.
+        business_scope: Optional hints that are put in front of the
+            user prompt.
+
+    Returns:
+        One normalised finding dict per usable model finding, with the
+        keys ``check_id``, ``agent``, ``field_id``, ``severity``,
+        ``severity_reason``, ``title``, ``norm``, ``evidence`` and
+        ``action``. Returns ``[]`` for too-short input, a failed LLM
+        call, or an unparsable reply.
+    """
+    if not impressum_text or len(impressum_text.strip()) < 100:
+        return []
+    body = {
+        "model": MODEL,
+        "messages": [
+            {"role": "system", "content": _SYSTEM_PROMPT},
+            {"role": "user", "content": _user_prompt(
+                impressum_text, business_scope,
+            )},
+        ],
+        "format": "json",
+        "stream": False,
+        "options": {"temperature": 0.0, "seed": 42, "num_predict": 1200},
+    }
+    try:
+        async with httpx.AsyncClient(timeout=TIMEOUT) as c:
+            r = await c.post(f"{OLLAMA_URL}/api/chat", json=body)
+            r.raise_for_status()
+            content = (r.json().get("message") or {}).get("content", "") or ""
+    except Exception as e:
+        logger.warning("impressum_agent_llm call failed: %s", e)
+        return []
+
+    raw_findings = _parse_response(content)
+    out: list[dict] = []
+    for f in raw_findings:
+        if not isinstance(f, dict):
+            continue
+        # Reduce the model-chosen id to word characters and dashes.
+        fid = re.sub(r"[^\w\-]", "_",
+                     str(f.get("field_id") or "unknown"))[:40]
+        # Model output is untrusted: coerce to str first so that a
+        # numeric or otherwise non-string severity cannot raise here.
+        sev = str(f.get("severity") or "MEDIUM").strip().upper()
+        if sev not in ("HIGH", "MEDIUM", "LOW", "INFO"):
+            sev = "MEDIUM"
+        out.append({
+            "check_id": f"IMPRESSUM-AGENT-LLM-{fid.upper()}",
+            "agent": "impressum_agent_v2_llm",
+            "field_id": fid,
+            "severity": sev,
+            "severity_reason": "missing",
+            "title": str(f.get("title") or "")[:200],
+            "norm": "§ 5 TMG / DDG (LLM-Analyse)",
+            "evidence": str(f.get("evidence") or "")[:300],
+            "action": str(f.get("action") or "")[:400],
+        })
+    if out:
+        logger.info("impressum_agent_llm: %d finding(s)", len(out))
+    return out
@@ -44,6 +44,17 @@ _B2C_WEAK = (
    "shop", "store", "kaufen", "produkt", "ware", "rechnung",
    "agb", "widerrufsfrist", "widerrufsrecht", "wallbox", "hardware",
    "abonnement", "tarif buchen", "naturstrom", "ladetarif",
+    # Versicherungs- / Finanz-B2C
+    "reiseversicherung", "versicherung abschließen",
+    "versicherung kaufen", "online abschließen", "online-antrag",
+    "antrag stellen", "police", "vertrag abschließen",
+    "tarifrechner", "beitrag berechnen", "jetzt online",
+    # Telekom / Energie / Mobilfunk B2C
+    "vertrag buchen", "tarif wechseln", "stromtarif",
+    "gastarif", "mobilfunkvertrag", "dsl-tarif",
+    # Reise / Hotel / Mobility B2C
+    "buchen", "reservieren", "buchung", "ticket kaufen",
+    "fahrkarte", "flug buchen",
 )

 # Hard B2B-only signals that override B2C-Verdacht.