feat: Backlog 1-5 — soft-hints, chatbot-discovery, API-payload, LLM-Agent
5 Backlog-Items aus dem Multi-Site-Briefing in einem Sprint:
1. B13 B2C-Soft-Hints — Versicherungs/Tarif/Buchungs-Marker
_B2C_WEAK erweitert um "Reiseversicherung", "Tarifrechner",
"Online-Antrag", "Flug buchen", "Stromtarif" etc.
Fängt Allianz-Reise-Chatbot (vorher False-Negative).
2. Chatbot-Policy-Discovery (chatbot_policy_discovery.py)
Probt 14 Standard-Slugs (privacypolicychatbot, chatbot-datenschutz,
ai-policy, ki-datenschutz, ...) × 5 Lang-Prefixe auf jeder
submitted Origin. Successful >300-Wort-Findings werden in
doc_texts['dse'] gemerged. Audit-Trail über
doc_entries[dse].chatbot_policy_sources.
Hebt Westfield-iAdvize-Lücke.
3. API-Response-Payload erweitert
phase_f_persist.response um extra_findings, audit_walk und
html_blocks erweitert. B-Wiring-Output (B1, B3-B18) ist nicht
mehr nur im Mail-HTML versteckt — externe Aufrufer sehen jeden
Finding. Schema additiv, legacy clients ignorieren neue Felder.
4. Plausibility-LLM Empty-Response-Fix
Resilienz-Strategie A→B→C→D:
A) format='json' (strict, default)
B) format='' (loose, _try_extract_json mit ```json-fence + prose-
wrap-Unterstützung)
C) Split-Batch-Recursion (vorhanden)
D) Give up, leeres dict (callers behandeln als skipped)
Plus _post_llm() als isolierter LLM-Call-Helper, der
Network-Errors abfängt.
5. Specialist-Agents Phase 2 LLM (MVP) — Impressum-Agent
impressum_agent_llm.py: qwen3:30b-a3b mit § 5 TMG System-Prompt,
business_scope-hints aus profile_dict. Output identisches Schema
wie pattern-agent für ein Merge ohne API-Bruch.
_b18_wiring.py orchestriert beide Agents + deduplet nach
field_id, rendert lila V2-Block mit KB/LLM-Tags pro Finding.
Pattern-first im Dedup (deterministisch + stable).
Tests: 107/107 grün (7 Test-Suites + chatbot-discovery + b18).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,130 @@
|
|||||||
|
"""B18 wiring — Specialist-Agents Phase 2 (Impressum LLM).
|
||||||
|
|
||||||
|
Ruft den LLM-Agent (impressum_agent_llm.evaluate_llm) auf, mergt das
|
||||||
|
Ergebnis mit dem Pattern-Match-Agent und deduplet nach field_id.
|
||||||
|
Rendert einen V2-HTML-Block (impressum_agent_html).
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import html
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
|
||||||
|
from compliance.services.specialist_agents.impressum_agent import (
|
||||||
|
PFLICHTANGABEN, evaluate as evaluate_pattern,
|
||||||
|
)
|
||||||
|
from compliance.services.specialist_agents.impressum_agent_llm import (
|
||||||
|
evaluate_llm,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# Kill switch for the B18 step: setting IMPRESSUM_AGENT_DISABLED to "1",
# "true" or "yes" (any letter case) disables it. Evaluated once at import
# time, so later changes to the environment do not affect this flag.
_DISABLED = os.environ.get("IMPRESSUM_AGENT_DISABLED", "").lower() in (
    "1", "true", "yes",
)
|
||||||
|
|
||||||
|
|
||||||
|
async def run_b18(state: dict) -> None:
    """Run the Impressum specialist agents (pattern + LLM) and store results.

    Reads ``state["doc_texts"]["impressum"]`` and ``state["profile_dict"]``.
    When at least one finding exists, the merged findings are appended to
    ``state["extra_findings"]`` and the rendered HTML block is written to
    ``state["impressum_agent_html"]``.

    The function returns without touching ``state`` when the step is
    disabled via ``_DISABLED``, when the stripped Impressum text is shorter
    than 100 characters, or when neither agent reports a finding.
    """
    if _DISABLED:
        return
    doc_texts = state.get("doc_texts") or {}
    imp = (doc_texts.get("impressum") or "").strip()
    if len(imp) < 100:
        return

    # Business-scope hints inferred from the profile, if present.
    profile_dict = state.get("profile_dict") or {}
    scope: set[str] = set()
    if profile_dict.get("has_online_shop"):
        scope.add("ecommerce")
    if profile_dict.get("is_regulated_profession"):
        scope.add("regulated_profession")
    # Match case-insensitively: the previous literal list accepted
    # "Finance"/"finance" but silently missed "Insurance".
    industry = profile_dict.get("industry")
    if isinstance(industry, str) and industry.strip().lower() in (
        "insurance", "finance",
    ):
        scope.add("insurance")

    pattern_findings = evaluate_pattern(imp, scope)
    llm_findings = await evaluate_llm(imp, scope)

    # Dedup: both agents can report the same field_id. Pattern findings come
    # first in the iteration, so they win over LLM findings for the same key.
    # Findings without a field_id are never folded together.
    seen_keys: set[str] = set()
    merged: list[dict] = []
    for f in pattern_findings + llm_findings:
        key = (f.get("field_id") or "").lower()
        if key and key in seen_keys:
            continue
        seen_keys.add(key)
        merged.append(f)

    if not merged:
        return

    extras = state.get("extra_findings") or []
    extras.extend(merged)
    state["extra_findings"] = extras
    state["impressum_agent_html"] = _render(merged, pattern_findings,
                                            llm_findings)
    logger.info(
        "B18 impressum-agent: pattern=%d llm=%d merged=%d",
        len(pattern_findings), len(llm_findings), len(merged),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _render(merged: list[dict], pattern: list[dict],
|
||||||
|
llm: list[dict]) -> str:
|
||||||
|
cards = []
|
||||||
|
for f in merged:
|
||||||
|
sev = (f.get("severity") or "").upper()
|
||||||
|
color = "#dc2626" if sev == "HIGH" else (
|
||||||
|
"#f59e0b" if sev == "MEDIUM" else "#64748b"
|
||||||
|
)
|
||||||
|
agent_tag = f.get("agent") or ""
|
||||||
|
tag_html = ""
|
||||||
|
if agent_tag:
|
||||||
|
short = "LLM" if "llm" in agent_tag.lower() else "KB"
|
||||||
|
bg = "#dbeafe" if short == "LLM" else "#f1f5f9"
|
||||||
|
col = "#1e40af" if short == "LLM" else "#475569"
|
||||||
|
tag_html = (
|
||||||
|
f"<span style='display:inline-block;background:{bg};"
|
||||||
|
f"color:{col};font-size:10px;padding:1px 6px;"
|
||||||
|
f"border-radius:999px;margin-left:6px;'>{short}</span>"
|
||||||
|
)
|
||||||
|
evidence_html = ""
|
||||||
|
if f.get("evidence"):
|
||||||
|
evidence_html = (
|
||||||
|
"<div style='font-size:12px;color:#475569;margin-top:6px;'>"
|
||||||
|
f"<em>{html.escape(f['evidence'])}</em></div>"
|
||||||
|
)
|
||||||
|
cards.append(
|
||||||
|
f"<div style='margin:12px 0;padding:14px;background:#fff;"
|
||||||
|
f"border-left:3px solid {color};border-radius:4px;'>"
|
||||||
|
f"<div style='font-weight:600;color:{color};font-size:14px;'>"
|
||||||
|
f"{sev} · {html.escape(f.get('check_id') or '')}{tag_html}</div>"
|
||||||
|
f"<div style='font-size:14px;margin-top:4px;'>"
|
||||||
|
f"<strong>{html.escape(f.get('title') or '')}</strong></div>"
|
||||||
|
f"<div style='font-size:12px;color:#64748b;margin-top:2px;'>"
|
||||||
|
f"{html.escape(f.get('norm') or '')}</div>"
|
||||||
|
f"{evidence_html}"
|
||||||
|
f"<div style='font-size:13px;margin-top:8px;background:#dcfce7;"
|
||||||
|
f"padding:8px 10px;border-radius:4px;'>"
|
||||||
|
f"<strong>→ Empfehlung:</strong> "
|
||||||
|
f"{html.escape(f.get('action') or '')}</div>"
|
||||||
|
"</div>"
|
||||||
|
)
|
||||||
|
return (
|
||||||
|
"<div style='margin:24px 0;padding:16px;border-left:4px solid #8b5cf6;"
|
||||||
|
"background:#faf5ff;border-radius:4px;'>"
|
||||||
|
"<h2 style='margin:0 0 8px;color:#5b21b6;font-size:16px;'>"
|
||||||
|
"🤖 Impressum-Specialist-Agent (Pattern-KB + LLM)"
|
||||||
|
"</h2>"
|
||||||
|
f"<p style='margin:0 0 8px;font-size:12px;color:#475569;'>"
|
||||||
|
f"Pattern-Match: {len(pattern)} · LLM-Analyse: {len(llm)} · "
|
||||||
|
f"dedupliziert: {len(merged)}</p>"
|
||||||
|
+ "".join(cards) +
|
||||||
|
"</div>"
|
||||||
|
)
|
||||||
@@ -28,6 +28,7 @@ from ._b14_wiring import run_b14
|
|||||||
from ._b15_wiring import run_b15
|
from ._b15_wiring import run_b15
|
||||||
from ._b16_wiring import run_b16
|
from ._b16_wiring import run_b16
|
||||||
from ._b17_wiring import run_b17
|
from ._b17_wiring import run_b17
|
||||||
|
from ._b18_wiring import run_b18
|
||||||
from ._constants import _compliance_check_jobs
|
from ._constants import _compliance_check_jobs
|
||||||
from ._phase_a_resolve import run_phase_a
|
from ._phase_a_resolve import run_phase_a
|
||||||
from ._phase_b_profile_check import run_phase_b
|
from ._phase_b_profile_check import run_phase_b
|
||||||
@@ -42,6 +43,9 @@ from ._phase_d3_blocks_top import run_phase_d3_top
|
|||||||
from ._phase_e_email import run_phase_e
|
from ._phase_e_email import run_phase_e
|
||||||
from ._phase_f_persist import run_phase_f
|
from ._phase_f_persist import run_phase_f
|
||||||
from ._state import new_state
|
from ._state import new_state
|
||||||
|
from compliance.services.chatbot_policy_discovery import (
|
||||||
|
enrich_dse_with_chatbot_policies,
|
||||||
|
)
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -54,6 +58,13 @@ async def run_compliance_check(check_id: str, req) -> None:
|
|||||||
continue_run = await run_phase_a(state)
|
continue_run = await run_phase_a(state)
|
||||||
if not continue_run:
|
if not continue_run:
|
||||||
return # TDM denied — job already marked skipped_tdm
|
return # TDM denied — job already marked skipped_tdm
|
||||||
|
# DSE-Enrichment: Sub-Chatbot-Policies anhängen (Westfield-iAdvize,
|
||||||
|
# vergleichbare Pattern). Best-effort, läuft VOR Phase B damit
|
||||||
|
# die enrichte DSE in alle per-doc-checks fließt.
|
||||||
|
try:
|
||||||
|
await enrich_dse_with_chatbot_policies(state)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("chatbot-policy enrichment skipped: %s", e)
|
||||||
# Phase B: Step 2 (profile detect) + Step 3 (per-doc checks)
|
# Phase B: Step 2 (profile detect) + Step 3 (per-doc checks)
|
||||||
await run_phase_b(state)
|
await run_phase_b(state)
|
||||||
# Phase C: Step 3b-d (banner + cross-check + TCF) + Step 4
|
# Phase C: Step 3b-d (banner + cross-check + TCF) + Step 4
|
||||||
@@ -80,6 +91,7 @@ async def run_compliance_check(check_id: str, req) -> None:
|
|||||||
run_b15(state) # AI-Act Rechtsgrundlage (LLM-Vendor auf lit. f)
|
run_b15(state) # AI-Act Rechtsgrundlage (LLM-Vendor auf lit. f)
|
||||||
run_b16(state) # Footer-Label-vs-URL-Slug-Drift
|
run_b16(state) # Footer-Label-vs-URL-Slug-Drift
|
||||||
await run_b17(state) # Audit-Walk-Video (Beweis-Aufzeichnung)
|
await run_b17(state) # Audit-Walk-Video (Beweis-Aufzeichnung)
|
||||||
|
await run_b18(state) # Impressum-Specialist-Agent (Pattern+LLM)
|
||||||
# Phase D-3 top/mid/bot: Step 5 HTML blocks
|
# Phase D-3 top/mid/bot: Step 5 HTML blocks
|
||||||
await run_phase_d3_top(state)
|
await run_phase_d3_top(state)
|
||||||
await run_phase_d3_mid(state)
|
await run_phase_d3_mid(state)
|
||||||
|
|||||||
@@ -72,6 +72,24 @@ def run_phase_f(state: dict) -> None:
|
|||||||
"total_findings": total_findings,
|
"total_findings": total_findings,
|
||||||
"email_status": email_result.get("status", "failed"),
|
"email_status": email_result.get("status", "failed"),
|
||||||
"checked_at": datetime.now(timezone.utc).isoformat(),
|
"checked_at": datetime.now(timezone.utc).isoformat(),
|
||||||
|
# P125: B-Wiring-Output (B1, B3-B18) ins API-Response-Payload.
|
||||||
|
# Bisher landeten diese nur im Audit-Mail-HTML — externe Aufrufer
|
||||||
|
# (Admin-UI) sahen sie nicht. Schema additiv; legacy clients
|
||||||
|
# ignorieren unbekannte Felder.
|
||||||
|
"extra_findings": state.get("extra_findings") or [],
|
||||||
|
"audit_walk": state.get("audit_walk") or None,
|
||||||
|
"html_blocks": {
|
||||||
|
"widerruf_reach": state.get("widerruf_reach_html", ""),
|
||||||
|
"retention_conflict": state.get("retention_conflict_html", ""),
|
||||||
|
"ai_legal_basis": state.get("ai_legal_basis_html", ""),
|
||||||
|
"url_slug_drift": state.get("url_slug_drift_html", ""),
|
||||||
|
"chatbot_cookie": state.get("chatbot_cookie_html", ""),
|
||||||
|
"audit_walk": state.get("audit_walk_html", ""),
|
||||||
|
"browser_matrix": state.get("browser_matrix_html", ""),
|
||||||
|
"vendor_consistency": state.get("vendor_consistency_html", ""),
|
||||||
|
"ai_act": state.get("ai_act_html", ""),
|
||||||
|
"impressum_agent": state.get("impressum_agent_html", ""),
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
_compliance_check_jobs[check_id]["status"] = "completed"
|
_compliance_check_jobs[check_id]["status"] = "completed"
|
||||||
|
|||||||
@@ -0,0 +1,161 @@
|
|||||||
|
"""Discover separate chatbot-/AI-policy pages and merge them into the
|
||||||
|
main DSE text.
|
||||||
|
|
||||||
|
Many sites publish their chatbot data-protection notice on a separate
|
||||||
|
URL (e.g. westfield.com/germany/privacypolicychatbot) that the regular
|
||||||
|
auto-discovery misses because it doesn't classify as 'dse'. As a
|
||||||
|
result, B12/B15 (chatbot-cookie classification, AI-Act legal basis)
|
||||||
|
never see the iAdvize/Vertex provider names.
|
||||||
|
|
||||||
|
Strategy:
|
||||||
|
1. From the discovered URLs derive the base host.
|
||||||
|
2. Probe a fixed list of well-known chatbot-policy paths.
|
||||||
|
3. For each 2xx-response with > 300 words, merge the text into
|
||||||
|
state['doc_texts']['dse'] with a separator.
|
||||||
|
|
||||||
|
Best-effort: a probe failure NEVER aborts the check.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# Slug-Kandidaten, sortiert von häufigsten zu seltensten.
|
||||||
|
_CHATBOT_POLICY_SLUGS = (
|
||||||
|
"privacypolicychatbot",
|
||||||
|
"chatbot-datenschutz", "chatbot/datenschutz",
|
||||||
|
"datenschutz-chatbot", "datenschutz/chatbot",
|
||||||
|
"ai-policy", "ai-datenschutz", "ki-datenschutz",
|
||||||
|
"privacy-chatbot", "privacy-ai",
|
||||||
|
"datenschutz-ki", "datenschutz-assistent",
|
||||||
|
"chatbot-privacy", "ai-privacy",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# Sprach-Prefixe die wir abklopfen.
|
||||||
|
_LANG_PREFIXES = ("", "/de", "/de_DE", "/en", "/germany")
|
||||||
|
|
||||||
|
|
||||||
|
def _build_candidate_urls(base_origin: str) -> list[str]:
|
||||||
|
"""Build all (lang × slug) combinations for one origin."""
|
||||||
|
out: list[str] = []
|
||||||
|
seen: set[str] = set()
|
||||||
|
for lang in _LANG_PREFIXES:
|
||||||
|
for slug in _CHATBOT_POLICY_SLUGS:
|
||||||
|
url = f"{base_origin}{lang}/{slug}".replace("//", "/")
|
||||||
|
url = url.replace("https:/", "https://").replace("http:/", "http://")
|
||||||
|
if url not in seen:
|
||||||
|
seen.add(url)
|
||||||
|
out.append(url)
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
async def _probe(url: str, timeout_s: float = 4.0) -> tuple[str, str] | None:
|
||||||
|
"""Return (url, text) on 2xx + >300-word body, else None."""
|
||||||
|
try:
|
||||||
|
async with httpx.AsyncClient(
|
||||||
|
timeout=timeout_s, follow_redirects=True,
|
||||||
|
) as c:
|
||||||
|
r = await c.get(url)
|
||||||
|
if r.status_code >= 400:
|
||||||
|
return None
|
||||||
|
text = re.sub(r"<script.*?</script>", " ",
|
||||||
|
r.text, flags=re.S | re.I)
|
||||||
|
text = re.sub(r"<style.*?</style>", " ",
|
||||||
|
text, flags=re.S | re.I)
|
||||||
|
text = re.sub(r"<[^>]+>", " ", text)
|
||||||
|
text = re.sub(r"\s+", " ", text).strip()
|
||||||
|
if len(text.split()) < 300:
|
||||||
|
return None
|
||||||
|
return url, text
|
||||||
|
except Exception:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _base_origins(doc_entries: list[dict]) -> list[str]:
|
||||||
|
seen: set[str] = set()
|
||||||
|
out: list[str] = []
|
||||||
|
for e in doc_entries:
|
||||||
|
url = (e.get("url") or "").strip()
|
||||||
|
if not url:
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
p = urlparse(url)
|
||||||
|
if not p.scheme or not p.netloc:
|
||||||
|
continue
|
||||||
|
origin = f"{p.scheme}://{p.netloc}"
|
||||||
|
if origin not in seen:
|
||||||
|
seen.add(origin)
|
||||||
|
out.append(origin)
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
async def enrich_dse_with_chatbot_policies(state: dict) -> dict:
    """Probe known chatbot-policy paths and merge hits into the DSE text.

    Origins are derived from ``state["doc_entries"]``; at most two origins
    and at most 20 candidate URLs per origin are probed concurrently. Each
    distinct page text that a probe returns is appended to
    ``state["doc_texts"]["dse"]`` behind a separator line naming its URL.
    The first entry in ``doc_entries`` with ``doc_type == "dse"`` receives
    the merged text and a ``chatbot_policy_sources`` list.

    Identical page bodies are merged only once: sites that answer every
    path with the same page (soft-404, catch-all redirect) would otherwise
    append that page once per probed URL.

    Returns:
        ``{"probed": int, "found": list[str], "merged_chars": int}``
        describing what was probed and merged.
    """
    empty_result = {"probed": 0, "found": [], "merged_chars": 0}

    doc_entries = state.get("doc_entries") or []
    origins = _base_origins(doc_entries)
    if not origins:
        return empty_result

    # Build the candidate list, capped per origin to limit request volume.
    candidates: list[str] = []
    for origin in origins[:2]:  # cap origins for safety
        candidates.extend(_build_candidate_urls(origin)[:20])

    if not candidates:
        return empty_result

    results = await asyncio.gather(
        *[_probe(u) for u in candidates],
        return_exceptions=True,
    )

    # Keep successful probes only, and only the first URL per distinct body.
    found: list[tuple[str, str]] = []
    seen_texts: set[str] = set()
    for result in results:
        if not (isinstance(result, tuple) and result):
            continue
        url, text = result
        if text in seen_texts:
            continue
        seen_texts.add(text)
        found.append((url, text))

    if not found:
        return {"probed": len(candidates), "found": [], "merged_chars": 0}

    # Merge into the DSE text. An explicit None under "doc_texts" is replaced
    # as well; setdefault alone would hand that None back.
    doc_texts = state.get("doc_texts")
    if doc_texts is None:
        doc_texts = state["doc_texts"] = {}
    pieces = [doc_texts.get("dse") or ""]
    appended_chars = 0
    appended_urls: list[str] = []
    for url, text in found:
        pieces.append(
            f"\n\n--- ergänzt aus {url} (chatbot-policy-discovery) ---\n\n"
        )
        pieces.append(text)
        appended_chars += len(text)
        appended_urls.append(url)
    dse_text = "".join(pieces)
    doc_texts["dse"] = dse_text

    # Also record on the dse entry (audit trail).
    for e in doc_entries:
        if e.get("doc_type") == "dse":
            e["chatbot_policy_sources"] = appended_urls
            e["text"] = dse_text
            break

    logger.info(
        "chatbot-policy enrichment: %d candidate(s) probed, %d found, "
        "+%d chars merged into DSE",
        len(candidates), len(found), appended_chars,
    )
    return {
        "probed": len(candidates),
        "found": appended_urls,
        "merged_chars": appended_chars,
    }
|
||||||
@@ -132,54 +132,102 @@ def _build_user_prompt(items: list[dict], doc_title: str,
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def _post_llm(body: dict) -> str:
    """Send one chat request and return the reply text.

    Posts ``body`` to ``{OLLAMA_URL}/api/chat`` and returns the ``content``
    of the response's ``message``. Any exception raised while sending the
    request or reading the response is logged as a warning and turned into
    an empty string, so the caller can choose its own fallback.
    """
    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
            response = await client.post(f"{OLLAMA_URL}/api/chat", json=body)
            response.raise_for_status()
            message = response.json().get("message") or {}
            return message.get("content", "") or ""
    except Exception as exc:
        logger.warning("plausibility LLM call failed: %s", exc)
        return ""
|
||||||
|
|
||||||
|
|
||||||
|
def _try_extract_json(content: str) -> dict | None:
|
||||||
|
"""Extract a JSON object from free-form LLM output. Handles
|
||||||
|
markdown-fenced and prose-wrapped responses."""
|
||||||
|
if not content:
|
||||||
|
return None
|
||||||
|
s = content.strip()
|
||||||
|
# Strip ```json … ``` fences
|
||||||
|
if s.startswith("```"):
|
||||||
|
s = s.strip("`")
|
||||||
|
if s.lower().startswith("json"):
|
||||||
|
s = s[4:]
|
||||||
|
s = s.strip()
|
||||||
|
# Heuristic: cut from first { to last }
|
||||||
|
first = s.find("{")
|
||||||
|
last = s.rfind("}")
|
||||||
|
if first >= 0 and last > first:
|
||||||
|
s = s[first:last + 1]
|
||||||
|
try:
|
||||||
|
return json.loads(s)
|
||||||
|
except Exception:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
async def _ask_llm_batch(items: list[dict], doc_title: str,
|
async def _ask_llm_batch(items: list[dict], doc_title: str,
|
||||||
doc_excerpt: str) -> dict[str, dict]:
|
doc_excerpt: str) -> dict[str, dict]:
|
||||||
"""Send a batch of up to BATCH_SIZE findings to the LLM."""
|
"""Send a batch of up to BATCH_SIZE findings to the LLM.
|
||||||
body = {
|
|
||||||
|
Resilience strategy (P125 fix for empty-response bug):
|
||||||
|
A. format='json' (strict) — current default
|
||||||
|
B. If A returns empty: format='' (loose), extract JSON manually
|
||||||
|
C. If B also empty AND batch >2: split batch + recurse
|
||||||
|
D. Else: give up, return {} (callers stamp llm_skipped=true)
|
||||||
|
"""
|
||||||
|
user_prompt = _build_user_prompt(items, doc_title, doc_excerpt)
|
||||||
|
base_body = {
|
||||||
"model": MODEL,
|
"model": MODEL,
|
||||||
"messages": [
|
"messages": [
|
||||||
{"role": "system", "content": _SYSTEM_PROMPT},
|
{"role": "system", "content": _SYSTEM_PROMPT},
|
||||||
{"role": "user", "content": _build_user_prompt(
|
{"role": "user", "content": user_prompt},
|
||||||
items, doc_title, doc_excerpt,
|
|
||||||
)},
|
|
||||||
],
|
],
|
||||||
"format": "json",
|
|
||||||
"stream": False,
|
"stream": False,
|
||||||
"options": {"temperature": 0.0, "seed": 42, "num_predict": 1500},
|
"options": {"temperature": 0.0, "seed": 42, "num_predict": 1500},
|
||||||
}
|
}
|
||||||
out: dict[str, dict] = {}
|
out: dict[str, dict] = {}
|
||||||
input_ids = [it["id"] for it in items]
|
input_ids = [it["id"] for it in items]
|
||||||
try:
|
try:
|
||||||
async with httpx.AsyncClient(timeout=TIMEOUT) as c:
|
# Strategy A: format='json'
|
||||||
r = await c.post(f"{OLLAMA_URL}/api/chat", json=body)
|
content = await _post_llm({**base_body, "format": "json"})
|
||||||
r.raise_for_status()
|
if not content:
|
||||||
content = (r.json().get("message") or {}).get("content", "")
|
# Strategy B: format-free, parse-on-our-side
|
||||||
if not content:
|
logger.info(
|
||||||
# Single retry with smaller batch — qwen3 sometimes
|
"plausibility A→empty, trying B (format-free) batch=%d",
|
||||||
# rejects ≥6-item prompts under format='json'.
|
len(items),
|
||||||
if len(items) > 2:
|
)
|
||||||
half = len(items) // 2
|
content = await _post_llm(base_body)
|
||||||
logger.info(
|
|
||||||
"plausibility empty → retry split %d → %dx2",
|
if not content:
|
||||||
len(items), half,
|
# Strategy C: split + recurse
|
||||||
)
|
if len(items) > 2:
|
||||||
first = await _ask_llm_batch(
|
half = len(items) // 2
|
||||||
items[:half], doc_title, doc_excerpt,
|
logger.info(
|
||||||
)
|
"plausibility A+B empty → split %d → %dx2",
|
||||||
second = await _ask_llm_batch(
|
len(items), half,
|
||||||
items[half:], doc_title, doc_excerpt,
|
)
|
||||||
)
|
first = await _ask_llm_batch(
|
||||||
out.update(first)
|
items[:half], doc_title, doc_excerpt,
|
||||||
out.update(second)
|
)
|
||||||
return out
|
second = await _ask_llm_batch(
|
||||||
logger.warning("plausibility LLM returned empty content")
|
items[half:], doc_title, doc_excerpt,
|
||||||
|
)
|
||||||
|
out.update(first)
|
||||||
|
out.update(second)
|
||||||
return out
|
return out
|
||||||
try:
|
# Strategy D: give up
|
||||||
data = json.loads(content)
|
logger.warning(
|
||||||
except json.JSONDecodeError as je:
|
"plausibility gave up after A+B for batch=%d", len(items),
|
||||||
|
)
|
||||||
|
return out
|
||||||
|
data = _try_extract_json(content)
|
||||||
|
if data is None:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"plausibility LLM JSON parse failed: %s; raw=%s",
|
"plausibility LLM JSON parse failed (after fallback); "
|
||||||
je, content[:300],
|
"raw=%s", content[:300],
|
||||||
)
|
)
|
||||||
return out
|
return out
|
||||||
llm_findings = data.get("findings") or []
|
llm_findings = data.get("findings") or []
|
||||||
|
|||||||
@@ -58,6 +58,8 @@ def compose_v2(state: dict) -> str:
|
|||||||
state.get("url_slug_drift_html", ""),
|
state.get("url_slug_drift_html", ""),
|
||||||
# B17 Audit-Walk-Video (Beweis-Aufzeichnung)
|
# B17 Audit-Walk-Video (Beweis-Aufzeichnung)
|
||||||
state.get("audit_walk_html", ""),
|
state.get("audit_walk_html", ""),
|
||||||
|
# B18 Impressum-Specialist-Agent (Pattern + LLM)
|
||||||
|
state.get("impressum_agent_html", ""),
|
||||||
# Browser-Matrix (Stage 1.c)
|
# Browser-Matrix (Stage 1.c)
|
||||||
state.get("browser_matrix_html", ""),
|
state.get("browser_matrix_html", ""),
|
||||||
# All legacy build_*_html() wrapped in V2 sections — preserves
|
# All legacy build_*_html() wrapped in V2 sections — preserves
|
||||||
|
|||||||
@@ -0,0 +1,166 @@
|
|||||||
|
"""Impressum-Specialist-Agent Phase 2 — LLM-gestützt.
|
||||||
|
|
||||||
|
Komplementiert den Pattern-Match-Agent (impressum_agent.py) durch
|
||||||
|
eine LLM-Pass. Beide Output-Formate sind identisch, sodass das B-Wiring
|
||||||
|
beide kombinieren / dedupen kann.
|
||||||
|
|
||||||
|
LLM-Setup:
|
||||||
|
- Modell: qwen3:30b-a3b (Standard Ollama, siehe Plausibility-Check)
|
||||||
|
- System-Prompt: KB der § 5 TMG Pflichtangaben
|
||||||
|
- User-Prompt: Impressum-Text + business_scope-Hinweis
|
||||||
|
- Output: JSON-Liste mit {field_id, severity, hint, evidence}
|
||||||
|
|
||||||
|
Phase-2-Ziel: schwer-mit-Regex-erfassbare Lücken finden, z.B.
|
||||||
|
- "Geschäftsführer" wird genannt aber ohne Vor- oder Nachname
|
||||||
|
- Aufsichtsbehörde-Pflicht erkannt, aber für falsche Branche
|
||||||
|
- Vertretungsberechtigte einer GmbH bei mehreren Personen unvollständig
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Base URL of the Ollama server (env: OLLAMA_URL); "/api/chat" is appended
# when the request is sent.
OLLAMA_URL = os.environ.get(
    "OLLAMA_URL", "http://bp-core-ollama:11434",
)
# Model name placed in the chat request body (env: IMPRESSUM_AGENT_MODEL).
MODEL = os.environ.get("IMPRESSUM_AGENT_MODEL", "qwen3:30b-a3b")
# Timeout handed to the httpx client (env: IMPRESSUM_AGENT_TIMEOUT).
TIMEOUT = float(os.environ.get("IMPRESSUM_AGENT_TIMEOUT", "60"))
|
||||||
|
|
||||||
|
|
||||||
|
_SYSTEM_PROMPT = """Du bist ein deutscher Datenschutz-Anwalt mit Fokus
|
||||||
|
§ 5 TMG / DDG (Anbieterkennzeichnung). Deine Aufgabe: einen Impressum-
|
||||||
|
Text auf Vollständigkeit der Pflichtangaben prüfen und Lücken /
|
||||||
|
Mängel strukturiert auflisten.
|
||||||
|
|
||||||
|
Pflichtangaben nach § 5 TMG (Standard):
|
||||||
|
- Anbieter-Name + Anschrift (juristische Person: Firma + Sitz)
|
||||||
|
- Vertretungsberechtigte (bei juristischen Personen: ALLE Geschäftsführer
|
||||||
|
mit Vor- und Nachname)
|
||||||
|
- E-Mail UND Telefon (Schnelle elektronische Kontaktaufnahme + UNMITTELBAR)
|
||||||
|
- Handelsregister-Eintrag (HRB/HRA + Registergericht)
|
||||||
|
- USt-IdNr. (falls vorhanden — DE\\d{9})
|
||||||
|
- Bei B2C/Onlineshop: Verbraucherschlichtung + OS-Plattform
|
||||||
|
- Bei reglementiertem Beruf: Berufsbezeichnung + Kammer
|
||||||
|
- Bei genehmigungspflichtigen Tätigkeiten: Aufsichtsbehörde
|
||||||
|
|
||||||
|
Ausgabe: NUR gültiges JSON mit Feld "findings", jedes Element:
|
||||||
|
{
|
||||||
|
"field_id": "kurzer-id",
|
||||||
|
"severity": "HIGH"|"MEDIUM"|"LOW",
|
||||||
|
"title": "kurze Lücken-Beschreibung",
|
||||||
|
"evidence": "wörtliches Zitat aus dem Impressum, das das Problem belegt",
|
||||||
|
"action": "konkrete Empfehlung"
|
||||||
|
}
|
||||||
|
|
||||||
|
Keine Erklärung außerhalb JSON. Keine Prosa. Wenn alles vollständig:
|
||||||
|
gib {"findings": []} zurück.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def _user_prompt(impressum_text: str,
|
||||||
|
business_scope: set[str] | None) -> str:
|
||||||
|
scope_hint = ""
|
||||||
|
if business_scope:
|
||||||
|
scope_hint = (
|
||||||
|
f"BUSINESS-SCOPE-HINTS: "
|
||||||
|
f"{', '.join(sorted(business_scope))}\n\n"
|
||||||
|
)
|
||||||
|
return (
|
||||||
|
f"{scope_hint}"
|
||||||
|
f"IMPRESSUM-TEXT:\n"
|
||||||
|
f"{impressum_text[:4000]}\n\n"
|
||||||
|
"Liste Lücken nach § 5 TMG. Nur JSON."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_response(content: str) -> list[dict]:
|
||||||
|
"""Robust JSON extraction (handles ```json fences, prose-wrap)."""
|
||||||
|
if not content:
|
||||||
|
return []
|
||||||
|
s = content.strip()
|
||||||
|
if s.startswith("```"):
|
||||||
|
s = s.strip("`")
|
||||||
|
if s.lower().startswith("json"):
|
||||||
|
s = s[4:]
|
||||||
|
s = s.strip()
|
||||||
|
first = s.find("{")
|
||||||
|
last = s.rfind("}")
|
||||||
|
if first >= 0 and last > first:
|
||||||
|
s = s[first:last + 1]
|
||||||
|
try:
|
||||||
|
data = json.loads(s)
|
||||||
|
except Exception:
|
||||||
|
# Try array directly
|
||||||
|
first = content.find("[")
|
||||||
|
last = content.rfind("]")
|
||||||
|
if first >= 0 and last > first:
|
||||||
|
try:
|
||||||
|
arr = json.loads(content[first:last + 1])
|
||||||
|
return arr if isinstance(arr, list) else []
|
||||||
|
except Exception:
|
||||||
|
return []
|
||||||
|
return []
|
||||||
|
findings = data.get("findings") if isinstance(data, dict) else data
|
||||||
|
return findings if isinstance(findings, list) else []
|
||||||
|
|
||||||
|
|
||||||
|
async def evaluate_llm(
    impressum_text: str,
    business_scope: set[str] | None = None,
) -> list[dict]:
    """Analyse an Impressum text with the LLM and return normalised findings.

    Args:
        impressum_text: Plain Impressum text. Texts that are empty or
            shorter than 100 characters after stripping are not sent.
        business_scope: Optional scope hints included in the user prompt.

    Returns:
        A list of finding dicts with the keys ``check_id``, ``agent``,
        ``field_id``, ``severity``, ``severity_reason``, ``title``,
        ``norm``, ``evidence`` and ``action``. The list is empty when the
        text is too short, the request fails, or no finding can be parsed.
        Request errors are logged as warnings and never raised.
    """
    if not impressum_text or len(impressum_text.strip()) < 100:
        return []
    body = {
        "model": MODEL,
        "messages": [
            {"role": "system", "content": _SYSTEM_PROMPT},
            {"role": "user", "content": _user_prompt(
                impressum_text, business_scope,
            )},
        ],
        "format": "json",
        "stream": False,
        "options": {"temperature": 0.0, "seed": 42, "num_predict": 1200},
    }
    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as c:
            r = await c.post(f"{OLLAMA_URL}/api/chat", json=body)
            r.raise_for_status()
            content = (r.json().get("message") or {}).get("content", "") or ""
    except Exception as e:
        logger.warning("impressum_agent_llm call failed: %s", e)
        return []

    raw_findings = _parse_response(content)
    out: list[dict] = []
    for f in raw_findings:
        if not isinstance(f, dict):
            continue
        # Reduce the model-supplied id to word characters and hyphens and cap
        # its length before it is used in check_id.
        fid = re.sub(r"[^\w\-]", "_",
                     str(f.get("field_id") or "unknown"))[:40]
        # The model may emit a non-string severity (number, list); coerce it
        # so .upper() cannot raise outside the try block above.
        sev = str(f.get("severity") or "MEDIUM").upper()
        if sev not in ("HIGH", "MEDIUM", "LOW", "INFO"):
            sev = "MEDIUM"
        out.append({
            "check_id": f"IMPRESSUM-AGENT-LLM-{fid.upper()}",
            "agent": "impressum_agent_v2_llm",
            "field_id": fid,
            "severity": sev,
            "severity_reason": "missing",
            "title": str(f.get("title") or "")[:200],
            "norm": "§ 5 TMG / DDG (LLM-Analyse)",
            "evidence": str(f.get("evidence") or "")[:300],
            "action": str(f.get("action") or "")[:400],
        })
    if out:
        logger.info("impressum_agent_llm: %d finding(s)", len(out))
    return out
|
||||||
@@ -44,6 +44,17 @@ _B2C_WEAK = (
|
|||||||
"shop", "store", "kaufen", "produkt", "ware", "rechnung",
|
"shop", "store", "kaufen", "produkt", "ware", "rechnung",
|
||||||
"agb", "widerrufsfrist", "widerrufsrecht", "wallbox", "hardware",
|
"agb", "widerrufsfrist", "widerrufsrecht", "wallbox", "hardware",
|
||||||
"abonnement", "tarif buchen", "naturstrom", "ladetarif",
|
"abonnement", "tarif buchen", "naturstrom", "ladetarif",
|
||||||
|
# Versicherungs- / Finanz-B2C
|
||||||
|
"reiseversicherung", "versicherung abschließen",
|
||||||
|
"versicherung kaufen", "online abschließen", "online-antrag",
|
||||||
|
"antrag stellen", "police", "vertrag abschließen",
|
||||||
|
"tarifrechner", "beitrag berechnen", "jetzt online",
|
||||||
|
# Telekom / Energie / Mobilfunk B2C
|
||||||
|
"vertrag buchen", "tarif wechseln", "stromtarif",
|
||||||
|
"gastarif", "mobilfunkvertrag", "dsl-tarif",
|
||||||
|
# Reise / Hotel / Mobility B2C
|
||||||
|
"buchen", "reservieren", "buchung", "ticket kaufen",
|
||||||
|
"fahrkarte", "flug buchen",
|
||||||
)
|
)
|
||||||
|
|
||||||
# Hard B2B-only signals that override B2C-Verdacht.
|
# Hard B2B-only signals that override B2C-Verdacht.
|
||||||
|
|||||||
@@ -0,0 +1,132 @@
|
|||||||
|
"""Tests for B18 Impressum-Specialist-Agent (Pattern + LLM)."""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
from unittest.mock import AsyncMock, MagicMock, patch
|
||||||
|
|
||||||
|
from compliance.api.agent_check._b18_wiring import _render, run_b18
|
||||||
|
from compliance.services.specialist_agents.impressum_agent_llm import (
|
||||||
|
_parse_response,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
_GOOD_IMPRESSUM = """
|
||||||
|
Acme GmbH
|
||||||
|
Musterstraße 1
|
||||||
|
10115 Berlin
|
||||||
|
|
||||||
|
Handelsregister: HRB 12345 Berlin
|
||||||
|
USt-IdNr: DE123456789
|
||||||
|
Geschäftsführer: Max Mustermann
|
||||||
|
|
||||||
|
Telefon: +49 30 12345
|
||||||
|
E-Mail: info@acme.example
|
||||||
|
"""
|
||||||
|
|
||||||
|
_BAD_IMPRESSUM = (
|
||||||
|
"Acme GmbH, Musterstraße 1, 10115 Berlin. "
|
||||||
|
"Kontakt: info@acme.example. "
|
||||||
|
"Wir freuen uns ueber Ihren Besuch auf unserer Website "
|
||||||
|
"und ueber Ihr Interesse an unserem Unternehmen und unseren "
|
||||||
|
"Produkten. Bitte beachten Sie auch unsere weiteren Hinweise."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestParseResponse:
    """Feed `_parse_response` clean, wrapped and unusable LLM replies."""

    def test_pure_json(self):
        # A bare JSON document yields its single finding with field_id intact.
        raw = '{"findings":[{"field_id":"foo","severity":"HIGH"}]}'
        findings = _parse_response(raw)
        assert len(findings) == 1
        assert findings[0]["field_id"] == "foo"

    def test_markdown_fenced_json(self):
        # The reply is wrapped in a ```json code fence.
        raw = '```json\n{"findings":[{"field_id":"x"}]}\n```'
        findings = _parse_response(raw)
        assert len(findings) == 1

    def test_prose_wrapped(self):
        # The JSON object is embedded in surrounding prose.
        raw = 'Hier ist die Analyse: {"findings":[{"field_id":"y"}]} Ende.'
        findings = _parse_response(raw)
        assert len(findings) == 1

    def test_empty(self):
        # An empty reply produces an empty list.
        findings = _parse_response("")
        assert findings == []

    def test_garbage(self):
        # Text without any JSON produces an empty list.
        findings = _parse_response("not json at all")
        assert findings == []
|
||||||
|
|
||||||
|
|
||||||
|
class TestRunB18Wiring:
    """Tests for `run_b18` with the LLM agent replaced by a stub.

    `run_b18` is driven through `asyncio.run` and communicates via the
    `state` dict it receives: the tests inspect `impressum_agent_html`
    and `extra_findings` on that dict afterwards.
    """

    # `evaluate_llm` is patched under the wiring module's name, i.e. where
    # `run_b18` looks it up.
    _LLM_TARGET = "compliance.api.agent_check._b18_wiring.evaluate_llm"

    def test_short_impressum_skipped(self):
        state = {"doc_texts": {"impressum": "tiny"}}
        asyncio.run(run_b18(state))
        assert "impressum_agent_html" not in state

    def test_no_impressum_skipped(self):
        # Keep a handle on the state so the skip is actually asserted,
        # mirroring the short-text case above, instead of only checking
        # that no exception is raised.
        state = {"doc_texts": {}}
        asyncio.run(run_b18(state))
        assert "impressum_agent_html" not in state

    def test_merges_pattern_and_llm(self):
        # Pattern-agent will likely find no gaps in _GOOD_IMPRESSUM.
        # Mock the LLM to return a fake additional finding.
        async def fake_llm(text, scope):
            return [{
                "check_id": "IMPRESSUM-AGENT-LLM-DPO",
                "agent": "impressum_agent_v2_llm",
                "field_id": "dpo",
                "severity": "MEDIUM",
                "title": "DSB-Verweis fehlt",
                "norm": "§ 5 TMG / DDG (LLM)",
                "evidence": "kein Hinweis auf DSB",
                "action": "DSB im Impressum verlinken",
            }]

        state = {"doc_texts": {"impressum": _GOOD_IMPRESSUM},
                 "profile_dict": {}}
        with patch(self._LLM_TARGET, new=fake_llm):
            asyncio.run(run_b18(state))

        assert "impressum_agent_html" in state
        extras = state.get("extra_findings") or []
        # Coerce a missing check_id to "" so such a finding makes the
        # assertion fail cleanly instead of raising TypeError on `in None`.
        check_ids = [f.get("check_id") or "" for f in extras]
        assert any("LLM-DPO" in check_id for check_id in check_ids)

    def test_dedup_pattern_vs_llm_same_field(self):
        # Pattern agent returns ust_id; mocked LLM also returns ust_id —
        # only one should survive the dedup.
        async def fake_llm(text, scope):
            return [{
                "check_id": "IMPRESSUM-AGENT-LLM-UST_ID",
                "agent": "impressum_agent_v2_llm",
                "field_id": "ust_id",
                "severity": "HIGH",
                "title": "duplicate ust_id finding",
                "norm": "§ 5 TMG",
                "evidence": "—",
                "action": "—",
            }]

        state = {"doc_texts": {"impressum": _BAD_IMPRESSUM},
                 "profile_dict": {}}
        with patch(self._LLM_TARGET, new=fake_llm):
            asyncio.run(run_b18(state))

        ust_findings = [
            f for f in state.get("extra_findings") or []
            if (f.get("field_id") or "").lower() == "ust_id"
        ]
        assert len(ust_findings) == 1
|
||||||
|
|
||||||
|
|
||||||
|
class TestRender:
    """Check the HTML produced by `_render` for one pattern + one LLM finding."""

    def test_render_with_two_findings(self):
        pattern_finding = {
            "check_id": "X", "title": "A", "severity": "HIGH",
            "agent": "impressum_agent_v1", "norm": "n", "action": "do",
        }
        llm_finding = {
            "check_id": "Y", "title": "B", "severity": "MEDIUM",
            "agent": "impressum_agent_v2_llm", "norm": "n", "action": "do",
        }
        merged = [pattern_finding, llm_finding]

        # Arguments: merged list, then the first and the second finding
        # each as a one-element list.
        html = _render(merged, [pattern_finding], [llm_finding])

        expected_fragments = (
            "KB",                 # pattern tag
            "LLM",                # llm tag
            "Pattern-Match: 1",
            "LLM-Analyse: 1",
        )
        for fragment in expected_fragments:
            assert fragment in html
|
||||||
@@ -0,0 +1,107 @@
|
|||||||
|
"""Tests for chatbot-policy DSE-enrichment."""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
from compliance.services.chatbot_policy_discovery import (
|
||||||
|
_base_origins,
|
||||||
|
_build_candidate_urls,
|
||||||
|
enrich_dse_with_chatbot_policies,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestBuildCandidates:
    """Check the candidate URLs `_build_candidate_urls` derives from an origin."""

    def test_includes_known_slug(self):
        candidates = _build_candidate_urls("https://example.com")
        with_slug = [u for u in candidates if "privacypolicychatbot" in u]
        assert with_slug

    def test_includes_lang_prefix_variants(self):
        candidates = _build_candidate_urls("https://example.com")
        # Both root and /de variants exist
        assert any("/de/" in candidate for candidate in candidates)
        assert "https://example.com/privacypolicychatbot" in candidates
|
||||||
|
|
||||||
|
|
||||||
|
class TestBaseOrigins:
    """Check how `_base_origins` reduces document entries to origins."""

    def test_dedup(self):
        # Two paths on one host plus one other host: two origins, in
        # first-seen order.
        page_urls = (
            "https://example.com/a",
            "https://example.com/b",
            "https://other.de/x",
        )
        entries = [{"url": page_url} for page_url in page_urls]
        expected = ["https://example.com", "https://other.de"]
        assert _base_origins(entries) == expected

    def test_skip_empty(self):
        # An entry with an empty URL contributes nothing; the trailing
        # slash of the other entry is not part of the origin.
        entries = [{"url": ""}, {"url": "https://example.com/"}]
        origins = _base_origins(entries)
        assert origins == ["https://example.com"]
|
||||||
|
|
||||||
|
|
||||||
|
class TestEnrichment:
    """Tests for `enrich_dse_with_chatbot_policies` with `_probe` stubbed.

    Every fake probe is an async callable taking `(url, timeout_s=4.0)`
    and returning either `None` or a `(url, text)` tuple.
    """

    # `_probe` is replaced on the discovery module itself.
    _PROBE_TARGET = "compliance.services.chatbot_policy_discovery._probe"

    def test_no_entries_returns_zero(self):
        result = asyncio.run(enrich_dse_with_chatbot_policies({}))
        assert result["probed"] == 0

    def test_all_404_no_merge(self):
        async def fake_probe(url, timeout_s=4.0):
            return None

        state = {
            "doc_entries": [{"url": "https://x.de/dse"}],
            "doc_texts": {"dse": "original"},
        }
        with patch(self._PROBE_TARGET, new=fake_probe):
            result = asyncio.run(enrich_dse_with_chatbot_policies(state))

        assert result["found"] == []
        assert state["doc_texts"]["dse"] == "original"

    def test_mocked_probe_merges_short_text(self):
        # When _probe is mocked, the word-count gate of the real _probe
        # is bypassed; this is the helper-level contract.
        async def fake_probe(url, timeout_s=4.0):
            if "privacypolicychatbot" in url:
                return (url, "short text")
            return None

        state = {
            "doc_entries": [
                {"url": "https://x.de/dse", "doc_type": "dse",
                 "text": "main dse"},
            ],
            "doc_texts": {"dse": "main dse"},
        }
        with patch(self._PROBE_TARGET, new=fake_probe):
            result = asyncio.run(enrich_dse_with_chatbot_policies(state))

        assert len(result["found"]) >= 1

    def test_long_enough_text_is_merged(self):
        # Repeat first, strip afterwards: 200 repeats of "chatbot iadvize "
        # give 400 whitespace-separated words. Stripping before repeating
        # would fuse the copies into "iadvizechatbot" tokens and leave only
        # 201 words.
        policy_text = ("chatbot iadvize " * 200).strip()

        async def fake_probe(url, timeout_s=4.0):
            if "privacypolicychatbot" in url:
                return (url, policy_text)
            return None

        state = {
            "doc_entries": [
                {"url": "https://x.de/dse", "doc_type": "dse",
                 "text": "original"},
            ],
            "doc_texts": {"dse": "original"},
        }
        with patch(self._PROBE_TARGET, new=fake_probe):
            asyncio.run(enrich_dse_with_chatbot_policies(state))

        # The policy text is appended after the existing DSE text.
        assert "iadvize" in state["doc_texts"]["dse"]
        assert state["doc_texts"]["dse"].startswith("original")
        # dse-entry should record source for audit trail
        dse_entry = next(
            e for e in state["doc_entries"] if e["doc_type"] == "dse"
        )
        assert dse_entry["chatbot_policy_sources"]
|
||||||
@@ -42,6 +42,17 @@ class TestDetectB2CScope:
|
|||||||
scope, _ = _detect_b2c_scope(s)
|
scope, _ = _detect_b2c_scope(s)
|
||||||
assert scope == "unknown"
|
assert scope == "unknown"
|
||||||
|
|
||||||
|
def test_versicherung_combo_promotes_to_likely(self):
    # Home-page text with insurance sales wording must come back as
    # "b2c_likely".
    home_text = ("Reiseversicherung jetzt online "
                 "abschließen. Tarifrechner verfügbar.")
    state = _state(home_text=home_text)
    verdict, _ = _detect_b2c_scope(state)
    assert verdict == "b2c_likely"
|
||||||
|
|
||||||
|
def test_buchung_combo_promotes_to_likely(self):
    # Home-page text with booking wording must come back as "b2c_likely".
    state = _state(home_text="Flug buchen oder Hotel reservieren.")
    verdict, _ = _detect_b2c_scope(state)
    assert verdict == "b2c_likely"
|
||||||
|
|
||||||
def test_empty_state(self):
|
def test_empty_state(self):
|
||||||
s = _state()
|
s = _state()
|
||||||
scope, _ = _detect_b2c_scope(s)
|
scope, _ = _detect_b2c_scope(s)
|
||||||
|
|||||||
Reference in New Issue
Block a user