# breakpilot-compliance/backend-compliance/compliance/services/chatbot_policy_discovery.py

"""Discover separate chatbot-/AI-policy pages and merge them into the
main DSE text.

Many sites publish their chatbot data-protection notice on a separate
URL (e.g. westfield.com/germany/privacypolicychatbot) that the regular
auto-discovery misses because it doesn't classify as 'dse'. As a
result, B12/B15 (chatbot-cookie classification, AI-Act legal basis)
never see the iAdvize/Vertex provider names.

Strategy:
  1. From the discovered URLs derive the base host.
  2. Probe a fixed list of well-known chatbot-policy paths.
  3. For each successful (non-error) response whose extracted text has at
     least 300 words, merge the text into state['doc_texts']['dse'] with a
     separator.

Best-effort: a probe failure NEVER aborts the check.
"""

from __future__ import annotations

import asyncio
import logging
import re
from urllib.parse import urlparse

import httpx

logger = logging.getLogger(__name__)


# Slug-Kandidaten, sortiert von häufigsten zu seltensten.
_CHATBOT_POLICY_SLUGS = (
    "privacypolicychatbot",
    "chatbot-datenschutz", "chatbot/datenschutz",
    "datenschutz-chatbot", "datenschutz/chatbot",
    "ai-policy", "ai-datenschutz", "ki-datenschutz",
    "privacy-chatbot", "privacy-ai",
    "datenschutz-ki", "datenschutz-assistent",
    "chatbot-privacy", "ai-privacy",
)


# Sprach-Prefixe die wir abklopfen.
_LANG_PREFIXES = ("", "/de", "/de_DE", "/en", "/germany")


def _build_candidate_urls(base_origin: str) -> list[str]:
    """Return every distinct (language prefix x slug) URL for one origin.

    Language prefixes form the outer loop and slugs the inner loop, so all
    slugs of the first prefix come before any slug of the second prefix.
    Duplicates are dropped while keeping first-seen order.
    """

    def _tidy(raw: str) -> str:
        # Collapse doubled slashes, then restore the "//" after the scheme,
        # which the collapse has shortened as well.
        squeezed = raw.replace("//", "/")
        return squeezed.replace("https:/", "https://").replace(
            "http:/", "http://"
        )

    combos = (
        _tidy(f"{base_origin}{prefix}/{slug}")
        for prefix in _LANG_PREFIXES
        for slug in _CHATBOT_POLICY_SLUGS
    )
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(combos))


def _html_to_text(markup: str) -> str:
    """Reduce an HTML document to whitespace-normalised plain text."""
    import html  # function-scope import keeps this block self-contained

    text = re.sub(r"<script.*?</script>", " ", markup, flags=re.S | re.I)
    text = re.sub(r"<style.*?</style>", " ", text, flags=re.S | re.I)
    text = re.sub(r"<[^>]+>", " ", text)
    # Decode entities (&nbsp;, &auml;, &amp; ...) only after the tags are
    # gone, so an escaped "&lt;" can never be mistaken for markup. This runs
    # before the whitespace collapse so a decoded non-breaking space is
    # folded like any other whitespace.
    text = html.unescape(text)
    return re.sub(r"\s+", " ", text).strip()


async def _probe(url: str, timeout_s: float = 4.0) -> tuple[str, str] | None:
    """Fetch *url* and return its plain text if it looks like a policy page.

    Args:
        url: Candidate URL to request (redirects are followed).
        timeout_s: Timeout in seconds handed to the httpx client.

    Returns:
        ``(url, text)`` when the final response is 2xx and the extracted
        text has at least 300 whitespace-separated words, otherwise ``None``.
        ``url`` is the requested URL as passed in. Never raises for ordinary
        errors: any ``Exception`` is logged at debug level and reported as
        ``None`` (best-effort probe).
    """
    try:
        async with httpx.AsyncClient(
            timeout=timeout_s, follow_redirects=True,
        ) as client:
            response = await client.get(url)
            # Only a genuine success counts; a final 1xx/3xx answer carries
            # no policy text worth merging.
            if not 200 <= response.status_code < 300:
                return None
            text = _html_to_text(response.text)
            if len(text.split()) < 300:
                return None
            return url, text
    except Exception as exc:
        # Deliberate best-effort boundary: a failing probe must never abort
        # the surrounding check, but leave a trace for debugging.
        logger.debug("chatbot-policy probe failed for %s: %s", url, exc)
        return None


def _base_origins(doc_entries: list[dict]) -> list[str]:
    seen: set[str] = set()
    out: list[str] = []
    for e in doc_entries:
        url = (e.get("url") or "").strip()
        if not url:
            continue
        try:
            p = urlparse(url)
            if not p.scheme or not p.netloc:
                continue
            origin = f"{p.scheme}://{p.netloc}"
            if origin not in seen:
                seen.add(origin)
                out.append(origin)
        except Exception:
            continue
    return out


async def enrich_dse_with_chatbot_policies(state: dict) -> dict:
    """Probe known chatbot-policy paths and merge the hits into the DSE text.

    Derives up to two base origins from ``state['doc_entries']``, probes up
    to 20 candidate URLs per origin concurrently and appends the text of
    every distinct hit to ``state['doc_texts']['dse']``, each preceded by a
    separator line naming its source URL. The first entry whose
    ``doc_type`` is ``'dse'`` also receives the merged text (``text``) and
    the list of merged URLs (``chatbot_policy_sources``).

    Args:
        state: Check state; mutated in place as described above.

    Returns:
        Metadata dict with ``probed`` (number of URLs requested), ``found``
        (URLs whose text was merged) and ``merged_chars`` (characters of
        page text added, separators not counted).
    """
    doc_entries = state.get("doc_entries") or []
    origins = _base_origins(doc_entries)
    if not origins:
        return {"probed": 0, "found": [], "merged_chars": 0}

    # Build candidate URL list, capped per origin to avoid noise.
    # NOTE(review): the candidate builder in this module emits every
    # language-prefix/slug combination with the prefix as the outer loop,
    # which is more than 20 URLs per origin. The cap therefore only reaches
    # the first prefixes; URLs under later ones (e.g. '/germany') are never
    # probed. Confirm whether the cap or the ordering should change.
    candidates: list[str] = []
    for origin in origins[:2]:  # cap origins for safety
        candidates.extend(_build_candidate_urls(origin)[:20])

    if not candidates:
        return {"probed": 0, "found": [], "merged_chars": 0}

    results = await asyncio.gather(
        *(_probe(u) for u in candidates),
        return_exceptions=True,
    )

    # Keep successful probes only, and merge each distinct body once:
    # because redirects are followed, a site with a catch-all route answers
    # many candidates with the very same page, which would otherwise be
    # appended to the DSE text once per candidate.
    found: list[tuple[str, str]] = []
    seen_texts: set[str] = set()
    for result in results:
        if not (isinstance(result, tuple) and result):
            continue  # None (no hit) or an exception returned by gather
        url, text = result
        if text in seen_texts:
            continue
        seen_texts.add(text)
        found.append((url, text))

    if not found:
        return {"probed": len(candidates), "found": [], "merged_chars": 0}

    # Merge into DSE text. An explicit None under 'doc_texts' is treated
    # like a missing key so the enrichment cannot abort the check.
    doc_texts = state.get("doc_texts")
    if doc_texts is None:
        doc_texts = state["doc_texts"] = {}

    pieces: list[str] = [doc_texts.get("dse") or ""]
    for url, text in found:
        pieces.append(
            f"\n\n--- ergänzt aus {url} (chatbot-policy-discovery) ---\n\n"
        )
        pieces.append(text)
    dse_text = "".join(pieces)
    doc_texts["dse"] = dse_text

    appended_urls = [url for url, _ in found]
    appended_chars = sum(len(text) for _, text in found)

    # Also record on the first dse-entry (audit trail).
    for entry in doc_entries:
        if entry.get("doc_type") == "dse":
            entry["chatbot_policy_sources"] = appended_urls
            entry["text"] = dse_text
            break

    logger.info(
        "chatbot-policy enrichment: %d candidate(s) probed, %d found, "
        "+%d chars merged into DSE",
        len(candidates), len(found), appended_chars,
    )
    return {
        "probed": len(candidates),
        "found": appended_urls,
        "merged_chars": appended_chars,
    }