"""Discover separate chatbot-/AI-policy pages and merge them into the main DSE text.

Many sites publish their chatbot data-protection notice on a separate URL
(e.g. westfield.com/germany/privacypolicychatbot) that the regular
auto-discovery misses because it doesn't classify as 'dse'. As a result,
B12/B15 (chatbot-cookie classification, AI-Act legal basis) never see the
iAdvize/Vertex provider names.

Strategy:
  1. From the discovered URLs derive the base host.
  2. Probe a fixed list of well-known chatbot-policy paths.
  3. For each successful (non-error) response with at least 300 words,
     merge the text into state['doc_texts']['dse'] with a separator.

Best-effort: a probe failure NEVER aborts the check.
"""

from __future__ import annotations

import asyncio
import logging
import re
from urllib.parse import urlparse

import httpx

logger = logging.getLogger(__name__)

# Slug candidates, ordered from most common to least common.
_CHATBOT_POLICY_SLUGS = (
    "privacypolicychatbot",
    "chatbot-datenschutz",
    "chatbot/datenschutz",
    "datenschutz-chatbot",
    "datenschutz/chatbot",
    "ai-policy",
    "ai-datenschutz",
    "ki-datenschutz",
    "privacy-chatbot",
    "privacy-ai",
    "datenschutz-ki",
    "datenschutz-assistent",
    "chatbot-privacy",
    "ai-privacy",
)

# Language/region path prefixes that are probed in front of each slug.
_LANG_PREFIXES = ("", "/de", "/de_DE", "/en", "/germany") def _build_candidate_urls(base_origin: str) -> list[str]: """Build all (lang × slug) combinations for one origin.""" out: list[str] = [] seen: set[str] = set() for lang in _LANG_PREFIXES: for slug in _CHATBOT_POLICY_SLUGS: url = f"{base_origin}{lang}/{slug}".replace("//", "/") url = url.replace("https:/", "https://").replace("http:/", "http://") if url not in seen: seen.add(url) out.append(url) return out async def _probe(url: str, timeout_s: float = 4.0) -> tuple[str, str] | None: """Return (url, text) on 2xx + >300-word body, else None.""" try: async with httpx.AsyncClient( timeout=timeout_s, follow_redirects=True, ) as c: r = await c.get(url) if r.status_code >= 400: return None text = re.sub(r"", " ", r.text, flags=re.S | re.I) text = re.sub(r"", " ", text, flags=re.S | re.I) text = re.sub(r"<[^>]+>", " ", text) text = re.sub(r"\s+", " ", text).strip() if len(text.split()) < 300: return None return url, text except Exception: return None def _base_origins(doc_entries: list[dict]) -> list[str]: seen: set[str] = set() out: list[str] = [] for e in doc_entries: url = (e.get("url") or "").strip() if not url: continue try: p = urlparse(url) if not p.scheme or not p.netloc: continue origin = f"{p.scheme}://{p.netloc}" if origin not in seen: seen.add(origin) out.append(origin) except Exception: continue return out async def enrich_dse_with_chatbot_policies(state: dict) -> dict: """Probe known chatbot-policy paths; merge findings into DSE text. Returns metadata dict describing what was merged (for logging / debugging). Mutates state['doc_texts']['dse'] in place. """ doc_entries = state.get("doc_entries") or [] origins = _base_origins(doc_entries) if not origins: return {"probed": 0, "found": [], "merged_chars": 0} # Build candidate URL list, capped per origin to avoid noise. 
candidates: list[str] = [] for origin in origins[:2]: # cap origins for safety candidates.extend(_build_candidate_urls(origin)[:20]) if not candidates: return {"probed": 0, "found": [], "merged_chars": 0} results = await asyncio.gather( *[_probe(u) for u in candidates], return_exceptions=True, ) found = [r for r in results if isinstance(r, tuple) and r] if not found: return {"probed": len(candidates), "found": [], "merged_chars": 0} # Merge into DSE text. doc_texts = state.setdefault("doc_texts", {}) dse_text = doc_texts.get("dse") or "" appended_chars = 0 appended_urls: list[str] = [] for url, text in found: sep = ( f"\n\n--- ergänzt aus {url} (chatbot-policy-discovery) ---\n\n" ) dse_text += sep + text appended_chars += len(text) appended_urls.append(url) doc_texts["dse"] = dse_text # Also record on the dse-entry (audit trail). for e in doc_entries: if e.get("doc_type") == "dse": e["chatbot_policy_sources"] = appended_urls e["text"] = dse_text break logger.info( "chatbot-policy enrichment: %d candidate(s) probed, %d found, " "+%d chars merged into DSE", len(candidates), len(found), appended_chars, ) return { "probed": len(candidates), "found": appended_urls, "merged_chars": appended_chars, }