feat: Backlog 1-5 — soft-hints, chatbot-discovery, API-payload, LLM-Agent

5 Backlog-Items aus dem Multi-Site-Briefing in einem Sprint:

1. B13 B2C-Soft-Hints — Versicherungs/Tarif/Buchungs-Marker
   _B2C_WEAK erweitert um "Reiseversicherung", "Tarifrechner",
   "Online-Antrag", "Flug buchen", "Stromtarif" etc.
   Fängt Allianz-Reise-Chatbot (vorher False-Negative).

2. Chatbot-Policy-Discovery (chatbot_policy_discovery.py)
   Probt 14 Standard-Slugs (privacypolicychatbot, chatbot-datenschutz,
   ai-policy, ki-datenschutz, ...) × 5 Lang-Prefixe auf jeder
   submitted Origin. Successful >300-Wort-Findings werden in
   doc_texts['dse'] gemerged. Audit-Trail über
   doc_entries[dse].chatbot_policy_sources.
   Hebt Westfield-iAdvize-Lücke.

3. API-Response-Payload erweitert
   phase_f_persist.response um extra_findings, audit_walk und
   html_blocks erweitert. B-Wiring-Output (B1, B3-B18) ist nicht
   mehr nur im Mail-HTML versteckt — externe Aufrufer sehen jeden
   Finding. Schema additiv, legacy clients ignorieren neue Felder.

4. Plausibility-LLM Empty-Response-Fix
   Resilienz-Strategie A→B→C→D:
   A) format='json' (strict, default)
   B) format='' (loose, _try_extract_json mit ```json-fence + prose-
      wrap-Unterstützung)
   C) Split-Batch-Recursion (vorhanden)
   D) Give up, leeres dict (callers behandeln als skipped)
   Plus _post_llm() als isolierter LLM-Call-Helper, der
   Netzwerk-Fehler abfängt.

5. Specialist-Agents Phase 2 LLM (MVP) — Impressum-Agent
   impressum_agent_llm.py: qwen3:30b-a3b mit § 5 TMG System-Prompt,
   business_scope-hints aus profile_dict. Output identisches Schema
   wie pattern-agent für ein Merge ohne API-Bruch.
   _b18_wiring.py orchestriert beide Agents + dedupliziert nach
   field_id, rendert lila V2-Block mit KB/LLM-Tags pro Finding.
   Pattern-first im Dedup (deterministisch + stable).

Tests: 107/107 grün (7 Test-Suites + chatbot-discovery + b18).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-06-07 18:41:54 +02:00
parent a2cae94526
commit e8ff75cbfe
11 changed files with 832 additions and 34 deletions
@@ -0,0 +1,161 @@
"""Discover separate chatbot-/AI-policy pages and merge them into the
main DSE text.
Many sites publish their chatbot data-protection notice on a separate
URL (e.g. westfield.com/germany/privacypolicychatbot) that the regular
auto-discovery misses because it doesn't classify as 'dse'. As a
result, B12/B15 (chatbot-cookie classification, AI-Act legal basis)
never see the iAdvize/Vertex provider names.
Strategy:
1. From the discovered URLs derive the base host.
2. Probe a fixed list of well-known chatbot-policy paths.
3. For each 2xx-response with > 300 words, merge the text into
state['doc_texts']['dse'] with a separator.
Best-effort: a probe failure NEVER aborts the check.
"""
from __future__ import annotations
import asyncio
import logging
import re
from urllib.parse import urlparse
import httpx
logger = logging.getLogger(__name__)
# Candidate path slugs for separate chatbot/AI policy pages, ordered
# from most common to rarest (per the original author's note). Order
# matters: the enrichment step in this file only probes the first 20
# candidate URLs per origin.
_CHATBOT_POLICY_SLUGS = (
"privacypolicychatbot",
"chatbot-datenschutz", "chatbot/datenschutz",
"datenschutz-chatbot", "datenschutz/chatbot",
"ai-policy", "ai-datenschutz", "ki-datenschutz",
"privacy-chatbot", "privacy-ai",
"datenschutz-ki", "datenschutz-assistent",
"chatbot-privacy", "ai-privacy",
)
# Language/region path prefixes that are tried in front of each slug;
# the empty string means "directly under the site root".
_LANG_PREFIXES = ("", "/de", "/de_DE", "/en", "/germany")
def _build_candidate_urls(base_origin: str) -> list[str]:
"""Build all (lang × slug) combinations for one origin."""
out: list[str] = []
seen: set[str] = set()
for lang in _LANG_PREFIXES:
for slug in _CHATBOT_POLICY_SLUGS:
url = f"{base_origin}{lang}/{slug}".replace("//", "/")
url = url.replace("https:/", "https://").replace("http:/", "http://")
if url not in seen:
seen.add(url)
out.append(url)
return out
async def _probe(url: str, timeout_s: float = 4.0) -> tuple[str, str] | None:
    """Fetch *url* and return ``(url, text)`` if it looks like a policy page.

    Args:
        url: Candidate URL to request (redirects are followed).
        timeout_s: Timeout handed to the HTTP client.

    Returns:
        ``(url, text)`` where *url* is the requested URL and *text* is the
        tag-stripped, entity-decoded, whitespace-normalised body, provided
        the final response is 2xx and the text has at least 300 words.
        ``None`` in every other case, including any request error.
    """
    import html  # local import so this block does not depend on file-level imports

    try:
        async with httpx.AsyncClient(
            timeout=timeout_s, follow_redirects=True,
        ) as client:
            resp = await client.get(url)
            # Only genuine success counts; anything outside 2xx is not a
            # policy page we want to merge.
            if not 200 <= resp.status_code < 300:
                return None
            text = re.sub(r"<script.*?</script>", " ",
                          resp.text, flags=re.S | re.I)
            text = re.sub(r"<style.*?</style>", " ",
                          text, flags=re.S | re.I)
            text = re.sub(r"<[^>]+>", " ", text)
            # Decode entities only after the tags are gone, so "&lt;...&gt;"
            # cannot be mistaken for markup; umlauts written as "&auml;"
            # become real characters for downstream keyword matching.
            text = html.unescape(text)
            text = re.sub(r"\s+", " ", text).strip()
            if len(text.split()) < 300:
                return None
            return url, text
    except Exception as exc:
        # Best-effort boundary: a failed probe must never abort the check,
        # but leave a trace for debugging instead of failing silently.
        logger.debug("chatbot-policy probe failed for %s: %s", url, exc)
        return None
def _base_origins(doc_entries: list[dict]) -> list[str]:
seen: set[str] = set()
out: list[str] = []
for e in doc_entries:
url = (e.get("url") or "").strip()
if not url:
continue
try:
p = urlparse(url)
if not p.scheme or not p.netloc:
continue
origin = f"{p.scheme}://{p.netloc}"
if origin not in seen:
seen.add(origin)
out.append(origin)
except Exception:
continue
return out
async def enrich_dse_with_chatbot_policies(state: dict) -> dict:
    """Probe known chatbot-policy paths and merge hits into the DSE text.

    Reads ``state["doc_entries"]`` to derive up to two origins, probes at
    most 20 candidate URLs per origin concurrently, and appends each
    distinct hit to ``state["doc_texts"]["dse"]`` behind a separator line
    naming the source URL. The first entry with ``doc_type == "dse"``
    additionally receives ``chatbot_policy_sources`` (list of merged URLs)
    and an updated ``text``.

    Bodies that are byte-identical to an earlier hit, or that are already
    contained in the current DSE text, are skipped. Sites that answer
    every unknown path with the same page would otherwise inject that page
    once per candidate, and a second run would append the same policy
    again.

    Args:
        state: Mutable pipeline state; mutated in place only when at least
            one new policy text is merged.

    Returns:
        ``{"probed": int, "found": list[str], "merged_chars": int}``
        describing how many URLs were requested, which ones were merged,
        and how many characters of policy text were added.
    """
    doc_entries = state.get("doc_entries") or []
    if not doc_entries:
        return {"probed": 0, "found": [], "merged_chars": 0}
    origins = _base_origins(doc_entries)
    if not origins:
        return {"probed": 0, "found": [], "merged_chars": 0}

    # Build candidate URL list, capped per origin to avoid noise.
    candidates: list[str] = []
    for origin in origins[:2]:  # cap origins for safety
        candidates.extend(_build_candidate_urls(origin)[:20])
    if not candidates:
        return {"probed": 0, "found": [], "merged_chars": 0}

    results = await asyncio.gather(
        *(_probe(u) for u in candidates),
        return_exceptions=True,
    )

    existing = state.get("doc_texts")
    dse_text = (existing.get("dse") if isinstance(existing, dict) else "") or ""

    # Keep only successful, not-yet-seen bodies (see docstring).
    found: list[tuple[str, str]] = []
    seen_texts: set[str] = set()
    for res in results:
        if not (isinstance(res, tuple) and res):
            continue  # None (miss) or an exception object from gather
        url, text = res
        if text in seen_texts or text in dse_text:
            continue
        seen_texts.add(text)
        found.append((url, text))
    if not found:
        return {"probed": len(candidates), "found": [], "merged_chars": 0}

    # Merge into DSE text.
    pieces = [dse_text]
    appended_chars = 0
    appended_urls: list[str] = []
    for url, text in found:
        pieces.append(
            f"\n\n--- ergänzt aus {url} (chatbot-policy-discovery) ---\n\n"
        )
        pieces.append(text)
        appended_chars += len(text)
        appended_urls.append(url)
    dse_text = "".join(pieces)

    if not isinstance(existing, dict):
        # Missing or unusable container: start a fresh mapping.
        existing = {}
        state["doc_texts"] = existing
    existing["dse"] = dse_text

    # Also record on the dse-entry (audit trail).
    for entry in doc_entries:
        if entry.get("doc_type") == "dse":
            entry["chatbot_policy_sources"] = appended_urls
            entry["text"] = dse_text
            break

    logger.info(
        "chatbot-policy enrichment: %d candidate(s) probed, %d found, "
        "+%d chars merged into DSE",
        len(candidates), len(found), appended_chars,
    )
    return {
        "probed": len(candidates),
        "found": appended_urls,
        "merged_chars": appended_chars,
    }
@@ -132,54 +132,102 @@ def _build_user_prompt(items: list[dict], doc_title: str,
)
async def _post_llm(body: dict) -> str:
    """Send one chat request to the LLM backend and return its text.

    Args:
        body: JSON payload posted to ``{OLLAMA_URL}/api/chat``.

    Returns:
        The ``message.content`` value of the JSON response, or ``""`` when
        it is missing/empty or when the request or decoding fails. Errors
        are logged as warnings and never propagated, so the caller can
        pick its fallback strategy.
    """
    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
            response = await client.post(f"{OLLAMA_URL}/api/chat", json=body)
            response.raise_for_status()
            message = response.json().get("message") or {}
            reply = message.get("content", "")
            return reply or ""
    except Exception as e:
        logger.warning("plausibility LLM call failed: %s", e)
        return ""
def _try_extract_json(content: str) -> dict | None:
"""Extract a JSON object from free-form LLM output. Handles
markdown-fenced and prose-wrapped responses."""
if not content:
return None
s = content.strip()
# Strip ```json … ``` fences
if s.startswith("```"):
s = s.strip("`")
if s.lower().startswith("json"):
s = s[4:]
s = s.strip()
# Heuristic: cut from first { to last }
first = s.find("{")
last = s.rfind("}")
if first >= 0 and last > first:
s = s[first:last + 1]
try:
return json.loads(s)
except Exception:
return None
async def _ask_llm_batch(items: list[dict], doc_title: str,
doc_excerpt: str) -> dict[str, dict]:
"""Send a batch of up to BATCH_SIZE findings to the LLM."""
body = {
"""Send a batch of up to BATCH_SIZE findings to the LLM.
Resilience strategy (P125 fix for empty-response bug):
A. format='json' (strict) — current default
B. If A returns empty: format='' (loose), extract JSON manually
C. If B also empty AND batch >2: split batch + recurse
D. Else: give up, return {} (callers stamp llm_skipped=true)
"""
user_prompt = _build_user_prompt(items, doc_title, doc_excerpt)
base_body = {
"model": MODEL,
"messages": [
{"role": "system", "content": _SYSTEM_PROMPT},
{"role": "user", "content": _build_user_prompt(
items, doc_title, doc_excerpt,
)},
{"role": "user", "content": user_prompt},
],
"format": "json",
"stream": False,
"options": {"temperature": 0.0, "seed": 42, "num_predict": 1500},
}
out: dict[str, dict] = {}
input_ids = [it["id"] for it in items]
try:
async with httpx.AsyncClient(timeout=TIMEOUT) as c:
r = await c.post(f"{OLLAMA_URL}/api/chat", json=body)
r.raise_for_status()
content = (r.json().get("message") or {}).get("content", "")
if not content:
# Single retry with smaller batch — qwen3 sometimes
# rejects ≥6-item prompts under format='json'.
if len(items) > 2:
half = len(items) // 2
logger.info(
"plausibility empty → retry split %d%dx2",
len(items), half,
)
first = await _ask_llm_batch(
items[:half], doc_title, doc_excerpt,
)
second = await _ask_llm_batch(
items[half:], doc_title, doc_excerpt,
)
out.update(first)
out.update(second)
return out
logger.warning("plausibility LLM returned empty content")
# Strategy A: format='json'
content = await _post_llm({**base_body, "format": "json"})
if not content:
# Strategy B: format-free, parse-on-our-side
logger.info(
"plausibility A→empty, trying B (format-free) batch=%d",
len(items),
)
content = await _post_llm(base_body)
if not content:
# Strategy C: split + recurse
if len(items) > 2:
half = len(items) // 2
logger.info(
"plausibility A+B empty → split %d%dx2",
len(items), half,
)
first = await _ask_llm_batch(
items[:half], doc_title, doc_excerpt,
)
second = await _ask_llm_batch(
items[half:], doc_title, doc_excerpt,
)
out.update(first)
out.update(second)
return out
try:
data = json.loads(content)
except json.JSONDecodeError as je:
# Strategy D: give up
logger.warning(
"plausibility gave up after A+B for batch=%d", len(items),
)
return out
data = _try_extract_json(content)
if data is None:
logger.warning(
"plausibility LLM JSON parse failed: %s; raw=%s",
je, content[:300],
"plausibility LLM JSON parse failed (after fallback); "
"raw=%s", content[:300],
)
return out
llm_findings = data.get("findings") or []
@@ -58,6 +58,8 @@ def compose_v2(state: dict) -> str:
state.get("url_slug_drift_html", ""),
# B17 Audit-Walk-Video (Beweis-Aufzeichnung)
state.get("audit_walk_html", ""),
# B18 Impressum-Specialist-Agent (Pattern + LLM)
state.get("impressum_agent_html", ""),
# Browser-Matrix (Stage 1.c)
state.get("browser_matrix_html", ""),
# All legacy build_*_html() wrapped in V2 sections — preserves
@@ -0,0 +1,166 @@
"""Impressum-Specialist-Agent Phase 2 — LLM-gestützt.
Komplementiert den Pattern-Match-Agent (impressum_agent.py) durch
eine LLM-Pass. Beide Output-Formate sind identisch, sodass das B-Wiring
beide kombinieren / dedupen kann.
LLM-Setup:
- Modell: qwen3:30b-a3b (Standard Ollama, siehe Plausibility-Check)
- System-Prompt: KB der § 5 TMG Pflichtangaben
- User-Prompt: Impressum-Text + business_scope-Hinweis
- Output: JSON {"findings": [...]}; je Element {field_id, severity, title, evidence, action}
Phase-2-Ziel: schwer-mit-Regex-erfassbare Lücken finden, z.B.
- "Geschäftsführer" wird genannt aber ohne Vor- oder Nachname
- Aufsichtsbehörde-Pflicht erkannt, aber für falsche Branche
- Vertretungsberechtigte einer GmbH bei mehreren Personen unvollständig
"""
from __future__ import annotations
import json
import logging
import os
import re
import httpx
logger = logging.getLogger(__name__)
# Base URL of the Ollama server; overridable via the OLLAMA_URL
# environment variable.
OLLAMA_URL = os.environ.get(
"OLLAMA_URL", "http://bp-core-ollama:11434",
)
# Model name sent in the chat request; overridable via
# IMPRESSUM_AGENT_MODEL.
MODEL = os.environ.get("IMPRESSUM_AGENT_MODEL", "qwen3:30b-a3b")
# Value passed as the httpx client timeout; overridable via
# IMPRESSUM_AGENT_TIMEOUT (parsed as float).
TIMEOUT = float(os.environ.get("IMPRESSUM_AGENT_TIMEOUT", "60"))
# German system prompt sent as the "system" message by evaluate_llm().
# It lists the mandatory Impressum disclosures and asks the model to
# answer with JSON only: an object whose "findings" list holds items with
# the keys field_id, severity, title, evidence and action, which are the
# keys evaluate_llm() reads. The "\\d" below is an escaped backslash, so
# the prompt text contains a literal "\d{9}".
_SYSTEM_PROMPT = """Du bist ein deutscher Datenschutz-Anwalt mit Fokus
§ 5 TMG / DDG (Anbieterkennzeichnung). Deine Aufgabe: einen Impressum-
Text auf Vollständigkeit der Pflichtangaben prüfen und Lücken /
Mängel strukturiert auflisten.
Pflichtangaben nach § 5 TMG (Standard):
- Anbieter-Name + Anschrift (juristische Person: Firma + Sitz)
- Vertretungsberechtigte (bei juristischen Personen: ALLE Geschäftsführer
mit Vor- und Nachname)
- E-Mail UND Telefon (Schnelle elektronische Kontaktaufnahme + UNMITTELBAR)
- Handelsregister-Eintrag (HRB/HRA + Registergericht)
- USt-IdNr. (falls vorhanden — DE\\d{9})
- Bei B2C/Onlineshop: Verbraucherschlichtung + OS-Plattform
- Bei reglementiertem Beruf: Berufsbezeichnung + Kammer
- Bei genehmigungspflichtigen Tätigkeiten: Aufsichtsbehörde
Ausgabe: NUR gültiges JSON mit Feld "findings", jedes Element:
{
"field_id": "kurzer-id",
"severity": "HIGH"|"MEDIUM"|"LOW",
"title": "kurze Lücken-Beschreibung",
"evidence": "wörtliches Zitat aus dem Impressum, das das Problem belegt",
"action": "konkrete Empfehlung"
}
Keine Erklärung außerhalb JSON. Keine Prosa. Wenn alles vollständig:
gib {"findings": []} zurück.
"""
def _user_prompt(impressum_text: str,
business_scope: set[str] | None) -> str:
scope_hint = ""
if business_scope:
scope_hint = (
f"BUSINESS-SCOPE-HINTS: "
f"{', '.join(sorted(business_scope))}\n\n"
)
return (
f"{scope_hint}"
f"IMPRESSUM-TEXT:\n"
f"{impressum_text[:4000]}\n\n"
"Liste Lücken nach § 5 TMG. Nur JSON."
)
def _parse_response(content: str) -> list[dict]:
"""Robust JSON extraction (handles ```json fences, prose-wrap)."""
if not content:
return []
s = content.strip()
if s.startswith("```"):
s = s.strip("`")
if s.lower().startswith("json"):
s = s[4:]
s = s.strip()
first = s.find("{")
last = s.rfind("}")
if first >= 0 and last > first:
s = s[first:last + 1]
try:
data = json.loads(s)
except Exception:
# Try array directly
first = content.find("[")
last = content.rfind("]")
if first >= 0 and last > first:
try:
arr = json.loads(content[first:last + 1])
return arr if isinstance(arr, list) else []
except Exception:
return []
return []
findings = data.get("findings") if isinstance(data, dict) else data
return findings if isinstance(findings, list) else []
async def evaluate_llm(
    impressum_text: str,
    business_scope: set[str] | None = None,
) -> list[dict]:
    """Run the LLM-backed Impressum analysis.

    Args:
        impressum_text: Impressum text to analyse. Texts that are empty
            or shorter than 100 characters after stripping are not sent
            to the model.
        business_scope: Optional scope hints forwarded into the prompt.

    Returns:
        A list of finding dicts with the keys ``check_id``, ``agent``,
        ``field_id``, ``severity``, ``severity_reason``, ``title``,
        ``norm``, ``evidence`` and ``action``. Returns ``[]`` when the
        text is too short, the request fails, or the model reports no
        findings. Request errors are logged, not raised.
    """
    if not impressum_text or len(impressum_text.strip()) < 100:
        return []

    body = {
        "model": MODEL,
        "messages": [
            {"role": "system", "content": _SYSTEM_PROMPT},
            {"role": "user", "content": _user_prompt(
                impressum_text, business_scope,
            )},
        ],
        "format": "json",
        "stream": False,
        "options": {"temperature": 0.0, "seed": 42, "num_predict": 1200},
    }
    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as c:
            r = await c.post(f"{OLLAMA_URL}/api/chat", json=body)
            r.raise_for_status()
            content = (r.json().get("message") or {}).get("content", "") or ""
    except Exception as e:
        logger.warning("impressum_agent_llm call failed: %s", e)
        return []

    allowed_severities = ("HIGH", "MEDIUM", "LOW", "INFO")
    out: list[dict] = []
    for raw in _parse_response(content):
        if not isinstance(raw, dict):
            continue
        # Model output is untrusted: coerce every field to str before
        # using string methods, so a numeric or otherwise non-string
        # value cannot raise and abort the whole analysis.
        fid = re.sub(r"[^\w\-]", "_",
                     str(raw.get("field_id") or "unknown"))[:40]
        sev = str(raw.get("severity") or "MEDIUM").strip().upper()
        if sev not in allowed_severities:
            sev = "MEDIUM"
        out.append({
            "check_id": f"IMPRESSUM-AGENT-LLM-{fid.upper()}",
            "agent": "impressum_agent_v2_llm",
            "field_id": fid,
            "severity": sev,
            "severity_reason": "missing",
            "title": str(raw.get("title") or "")[:200],
            "norm": "§ 5 TMG / DDG (LLM-Analyse)",
            "evidence": str(raw.get("evidence") or "")[:300],
            "action": str(raw.get("action") or "")[:400],
        })
    if out:
        logger.info("impressum_agent_llm: %d finding(s)", len(out))
    return out
@@ -44,6 +44,17 @@ _B2C_WEAK = (
"shop", "store", "kaufen", "produkt", "ware", "rechnung",
"agb", "widerrufsfrist", "widerrufsrecht", "wallbox", "hardware",
"abonnement", "tarif buchen", "naturstrom", "ladetarif",
# Versicherungs- / Finanz-B2C
"reiseversicherung", "versicherung abschließen",
"versicherung kaufen", "online abschließen", "online-antrag",
"antrag stellen", "police", "vertrag abschließen",
"tarifrechner", "beitrag berechnen", "jetzt online",
# Telekom / Energie / Mobilfunk B2C
"vertrag buchen", "tarif wechseln", "stromtarif",
"gastarif", "mobilfunkvertrag", "dsl-tarif",
# Reise / Hotel / Mobility B2C
"buchen", "reservieren", "buchung", "ticket kaufen",
"fahrkarte", "flug buchen",
)
# Hard B2B-only signals that override B2C-Verdacht.