# breakpilot-compliance/backend-compliance/tests/test_chatbot_policy_discovery.py

"""Tests for chatbot-policy DSE-enrichment."""

import asyncio
from unittest.mock import patch

from compliance.services.chatbot_policy_discovery import (
    _base_origins,
    _build_candidate_urls,
    enrich_dse_with_chatbot_policies,
)


class TestBuildCandidates:
    """Checks on the candidate URL list built for one base origin."""

    def test_includes_known_slug(self):
        """Some candidate must contain the ``privacypolicychatbot`` slug."""
        candidates = _build_candidate_urls("https://example.com")
        slug = "privacypolicychatbot"
        assert any(slug in candidate for candidate in candidates)

    def test_includes_lang_prefix_variants(self):
        """Candidates cover a ``/de/`` path and the un-prefixed root path."""
        root_variant = "https://example.com/privacypolicychatbot"
        candidates = _build_candidate_urls("https://example.com")
        # Language-prefixed variant: any candidate with a "/de/" segment.
        assert any("/de/" in candidate for candidate in candidates)
        # Root variant: the slug directly under the origin, exact match.
        assert any(candidate == root_variant for candidate in candidates)


class TestBaseOrigins:
    """Checks on reducing document entries to their base origins."""

    def test_dedup(self):
        """Two URLs on the same origin yield that origin only once."""
        page_urls = (
            "https://example.com/a",
            "https://example.com/b",
            "https://other.de/x",
        )
        doc_entries = [{"url": page_url} for page_url in page_urls]
        expected = ["https://example.com", "https://other.de"]
        assert _base_origins(doc_entries) == expected

    def test_skip_empty(self):
        """An entry with an empty URL contributes no origin."""
        blank_entry = {"url": ""}
        real_entry = {"url": "https://example.com/"}
        origins = _base_origins([blank_entry, real_entry])
        assert origins == ["https://example.com"]


class TestEnrichment:
    """Tests for ``enrich_dse_with_chatbot_policies``.

    Every test except the empty-state one replaces the module's
    ``_probe`` coroutine with a local stub, so no network access happens.
    """

    # Dotted path of the probe coroutine that the stubs below replace.
    _PROBE_TARGET = "compliance.services.chatbot_policy_discovery._probe"

    def test_no_entries_returns_zero(self):
        """An empty state reports zero probed URLs."""
        result = asyncio.run(enrich_dse_with_chatbot_policies({}))
        assert result["probed"] == 0

    def test_all_404_no_merge(self):
        """When every probe yields ``None`` nothing is found or merged."""
        async def fake_probe(url, timeout_s=4.0):
            return None

        with patch(self._PROBE_TARGET, new=fake_probe):
            state = {
                "doc_entries": [{"url": "https://x.de/dse"}],
                "doc_texts": {"dse": "original"},
            }
            result = asyncio.run(enrich_dse_with_chatbot_policies(state))
        assert result["found"] == []
        # The existing DSE text must be left untouched.
        assert state["doc_texts"]["dse"] == "original"

    def test_mocked_probe_merges_short_text(self):
        """A stubbed probe hit is reported even for a very short text."""
        # When _probe is mocked, the word-count gate of the real _probe
        # is bypassed; this is the helper-level contract.
        async def fake_probe(url, timeout_s=4.0):
            if "privacypolicychatbot" in url:
                return (url, "short text")
            return None

        with patch(self._PROBE_TARGET, new=fake_probe):
            state = {
                "doc_entries": [
                    {"url": "https://x.de/dse", "doc_type": "dse",
                     "text": "main dse"},
                ],
                "doc_texts": {"dse": "main dse"},
            }
            result = asyncio.run(enrich_dse_with_chatbot_policies(state))
        assert len(result["found"]) >= 1

    def test_long_enough_text_is_merged(self):
        """A long policy text is appended to the DSE text and recorded."""
        # Repeat first, strip afterwards: 200 repeats of "chatbot iadvize "
        # give 400 whitespace-separated words.  Stripping before repeating
        # would fuse neighbouring words ("iadvizechatbot") and leave 201.
        policy_text = ("chatbot iadvize " * 200).strip()

        async def fake_probe(url, timeout_s=4.0):
            if "privacypolicychatbot" in url:
                return (url, policy_text)
            return None

        with patch(self._PROBE_TARGET, new=fake_probe):
            state = {
                "doc_entries": [
                    {"url": "https://x.de/dse", "doc_type": "dse",
                     "text": "original"},
                ],
                "doc_texts": {"dse": "original"},
            }
            asyncio.run(enrich_dse_with_chatbot_policies(state))
        # Merged text keeps the original DSE text as its prefix.
        assert "iadvize" in state["doc_texts"]["dse"]
        assert state["doc_texts"]["dse"].startswith("original")
        # dse-entry should record source for audit trail
        dse_entry = next(
            e for e in state["doc_entries"] if e["doc_type"] == "dse"
        )
        assert dse_entry["chatbot_policy_sources"]