"""Tests for chatbot-policy DSE-enrichment.""" import asyncio from unittest.mock import patch from compliance.services.chatbot_policy_discovery import ( _base_origins, _build_candidate_urls, enrich_dse_with_chatbot_policies, ) class TestBuildCandidates: def test_includes_known_slug(self): urls = _build_candidate_urls("https://example.com") assert any("privacypolicychatbot" in u for u in urls) def test_includes_lang_prefix_variants(self): urls = _build_candidate_urls("https://example.com") # Both root and /de variants exist assert any("/de/" in u for u in urls) assert any("https://example.com/privacypolicychatbot" == u for u in urls) class TestBaseOrigins: def test_dedup(self): entries = [ {"url": "https://example.com/a"}, {"url": "https://example.com/b"}, {"url": "https://other.de/x"}, ] assert _base_origins(entries) == [ "https://example.com", "https://other.de", ] def test_skip_empty(self): entries = [{"url": ""}, {"url": "https://example.com/"}] assert _base_origins(entries) == ["https://example.com"] class TestEnrichment: def test_no_entries_returns_zero(self): result = asyncio.run(enrich_dse_with_chatbot_policies({})) assert result["probed"] == 0 def test_all_404_no_merge(self): async def fake_probe(url, timeout_s=4.0): return None with patch( "compliance.services.chatbot_policy_discovery._probe", new=fake_probe, ): state = { "doc_entries": [{"url": "https://x.de/dse"}], "doc_texts": {"dse": "original"}, } result = asyncio.run(enrich_dse_with_chatbot_policies(state)) assert result["found"] == [] assert state["doc_texts"]["dse"] == "original" def test_mocked_probe_merges_short_text(self): # When _probe is mocked, the word-count gate of the real _probe # is bypassed; this is the helper-level contract. async def fake_probe(url, timeout_s=4.0): if "privacypolicychatbot" in url: return (url, "short text") return None with patch( "compliance.services.chatbot_policy_discovery._probe", new=fake_probe, ): state = { "doc_entries": [ {"url": "https://x.de/dse", "doc_type": "dse", "text": "main dse"}, ], "doc_texts": {"dse": "main dse"}, } result = asyncio.run(enrich_dse_with_chatbot_policies(state)) assert len(result["found"]) >= 1 def test_long_enough_text_is_merged(self): async def fake_probe(url, timeout_s=4.0): if "privacypolicychatbot" in url: return (url, "chatbot iadvize ".strip() * 200) return None with patch( "compliance.services.chatbot_policy_discovery._probe", new=fake_probe, ): state = { "doc_entries": [ {"url": "https://x.de/dse", "doc_type": "dse", "text": "original"}, ], "doc_texts": {"dse": "original"}, } asyncio.run(enrich_dse_with_chatbot_policies(state)) # The text has 200 repeats of "chatbot iadvize " = 400 words assert "iadvize" in state["doc_texts"]["dse"] assert state["doc_texts"]["dse"].startswith("original") # dse-entry should record source for audit trail dse_entry = next( e for e in state["doc_entries"] if e["doc_type"] == "dse" ) assert dse_entry["chatbot_policy_sources"]