e8ff75cbfe
5 Backlog-Items aus dem Multi-Site-Briefing in einem Sprint:
1. B13 B2C-Soft-Hints — Versicherungs/Tarif/Buchungs-Marker
_B2C_WEAK erweitert um "Reiseversicherung", "Tarifrechner",
"Online-Antrag", "Flug buchen", "Stromtarif" etc.
Fängt Allianz-Reise-Chatbot (vorher False-Negative).
2. Chatbot-Policy-Discovery (chatbot_policy_discovery.py)
Probt 14 Standard-Slugs (privacypolicychatbot, chatbot-datenschutz,
ai-policy, ki-datenschutz, ...) × 5 Lang-Prefixe auf jeder
submitted Origin. Successful >300-Wort-Findings werden in
doc_texts['dse'] gemerged. Audit-Trail über
doc_entries[dse].chatbot_policy_sources.
Hebt Westfield-iAdvize-Lücke.
3. API-Response-Payload erweitert
phase_f_persist.response um extra_findings, audit_walk und
html_blocks erweitert. B-Wiring-Output (B1, B3-B18) ist nicht
mehr nur im Mail-HTML versteckt — externe Aufrufer sehen jeden
Finding. Schema additiv, legacy clients ignorieren neue Felder.
4. Plausibility-LLM Empty-Response-Fix
Resilienz-Strategie A→B→C→D:
A) format='json' (strict, default)
B) format='' (loose, _try_extract_json mit ```json-fence + prose-
wrap-Unterstützung)
C) Split-Batch-Recursion (vorhanden)
D) Give up, leeres dict (callers behandeln als skipped)
Plus _post_llm() als isolierter LLM-Call-Helper, der
Network-Errors abfängt.
5. Specialist-Agents Phase 2 LLM (MVP) — Impressum-Agent
impressum_agent_llm.py: qwen3:30b-a3b mit § 5 TMG System-Prompt,
business_scope-hints aus profile_dict. Output identisches Schema
wie pattern-agent für ein Merge ohne API-Bruch.
_b18_wiring.py orchestriert beide Agents + dedupliziert nach
field_id, rendert lila V2-Block mit KB/LLM-Tags pro Finding.
Pattern-first im Dedup (deterministisch + stable).
Tests: 107/107 grün (7 Test-Suites + chatbot-discovery + b18).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
108 lines
3.8 KiB
Python
108 lines
3.8 KiB
Python
"""Tests for chatbot-policy DSE-enrichment."""
|
|
|
|
import asyncio
|
|
from unittest.mock import patch
|
|
|
|
from compliance.services.chatbot_policy_discovery import (
|
|
_base_origins,
|
|
_build_candidate_urls,
|
|
enrich_dse_with_chatbot_policies,
|
|
)
|
|
|
|
|
|
class TestBuildCandidates:
    """Candidate-URL generation for a single origin."""

    def test_includes_known_slug(self):
        """At least one candidate mentions the ``privacypolicychatbot`` slug."""
        candidates = _build_candidate_urls("https://example.com")
        slug_hits = [c for c in candidates if "privacypolicychatbot" in c]
        assert slug_hits

    def test_includes_lang_prefix_variants(self):
        """A ``/de/`` variant and the un-prefixed root URL are both present."""
        candidates = _build_candidate_urls("https://example.com")
        # Language-prefixed variant.
        has_de_variant = any("/de/" in c for c in candidates)
        assert has_de_variant
        # Exact un-prefixed URL directly under the origin.
        assert "https://example.com/privacypolicychatbot" in candidates
|
|
|
|
|
|
class TestBaseOrigins:
    """Origin extraction from a list of document entries."""

    def test_dedup(self):
        """Entries sharing an origin collapse into a single item."""
        page_urls = (
            "https://example.com/a",
            "https://example.com/b",
            "https://other.de/x",
        )
        entries = [{"url": page_url} for page_url in page_urls]
        expected = ["https://example.com", "https://other.de"]
        assert _base_origins(entries) == expected

    def test_skip_empty(self):
        """An entry whose URL is empty contributes no origin."""
        entries = [{"url": ""}, {"url": "https://example.com/"}]
        origins = _base_origins(entries)
        assert origins == ["https://example.com"]
|
|
|
|
|
|
class TestEnrichment:
    """Checks for ``enrich_dse_with_chatbot_policies``.

    Wherever candidate URLs would be probed, ``_probe`` is replaced with an
    in-memory coroutine so the tests do not depend on real fetches.
    """

    # Dotted path of the probe coroutine that the tests swap out.
    _PROBE_TARGET = "compliance.services.chatbot_policy_discovery._probe"

    @staticmethod
    def _enrich(state, fake_probe):
        """Run the enrichment on *state* with ``_probe`` patched.

        Args:
            state: Pipeline state dict handed to the coroutine as-is, so any
                in-place changes are visible to the caller afterwards.
            fake_probe: Coroutine function used instead of ``_probe``.

        Returns:
            Whatever ``enrich_dse_with_chatbot_policies`` returns.
        """
        with patch(TestEnrichment._PROBE_TARGET, new=fake_probe):
            return asyncio.run(enrich_dse_with_chatbot_policies(state))

    def test_no_entries_returns_zero(self):
        """An empty state reports zero probed URLs."""
        result = asyncio.run(enrich_dse_with_chatbot_policies({}))
        assert result["probed"] == 0

    def test_all_404_no_merge(self):
        """When every probe yields nothing, the DSE text stays untouched."""
        async def fake_probe(url, timeout_s=4.0):
            return None

        state = {
            "doc_entries": [{"url": "https://x.de/dse"}],
            "doc_texts": {"dse": "original"},
        }
        result = self._enrich(state, fake_probe)

        assert result["found"] == []
        assert state["doc_texts"]["dse"] == "original"

    def test_mocked_probe_merges_short_text(self):
        """A hit from the mocked probe is reported even for a short text."""
        # When _probe is mocked, the word-count gate of the real _probe
        # is bypassed; this is the helper-level contract.
        async def fake_probe(url, timeout_s=4.0):
            if "privacypolicychatbot" in url:
                return (url, "short text")
            return None

        state = {
            "doc_entries": [
                {"url": "https://x.de/dse", "doc_type": "dse",
                 "text": "main dse"},
            ],
            "doc_texts": {"dse": "main dse"},
        }
        result = self._enrich(state, fake_probe)

        assert len(result["found"]) >= 1

    def test_long_enough_text_is_merged(self):
        """A long policy text is appended to the DSE text and recorded."""
        async def fake_probe(url, timeout_s=4.0):
            if "privacypolicychatbot" in url:
                # 200 space-separated repeats of a two-word phrase = 400
                # words. Repeating a stripped phrase instead would glue the
                # repeats together and yield only about half as many
                # whitespace-separated words.
                return (url, " ".join(["chatbot iadvize"] * 200))
            return None

        state = {
            "doc_entries": [
                {"url": "https://x.de/dse", "doc_type": "dse",
                 "text": "original"},
            ],
            "doc_texts": {"dse": "original"},
        }
        self._enrich(state, fake_probe)

        merged = state["doc_texts"]["dse"]
        assert "iadvize" in merged
        # The pre-existing text must remain at the front of the merged text.
        assert merged.startswith("original")

        # dse-entry should record source for audit trail
        dse_entry = next(
            e for e in state["doc_entries"] if e["doc_type"] == "dse"
        )
        assert dse_entry["chatbot_policy_sources"]
|