# breakpilot-compliance/consent-tester/services/cmp_heuristic.py

"""
Generic Cookie-Policy JSON heuristic.

When a CMP we don't know yet returns a JSON payload, we can still recognize
"this JSON describes a cookie policy" by its shape. This module:

1. `looks_like_cookie_policy(data)` — fast shape-based classifier
2. `reconstruct_generic(data)` — walks the JSON, extracts every name/
   description/purpose/expiry field and emits a flat German Markdown text

The point: Phase A makes unknown CMPs work without hand-coding each one.
The named library (Phase B) still takes priority because it produces nicer
text, but the heuristic catches everything else.
"""

from __future__ import annotations

import html
import logging
import re
from typing import Any

logger = logging.getLogger(__name__)


# ── Shape classifier ────────────────────────────────────────────────

# Keys whose presence strongly suggests "this JSON is a cookie policy".
# We require at least ONE of these at top-level OR within first nesting.
_SHAPE_KEYS = {
    "cookies", "categories", "providers", "vendors", "purposes",
    "cookielist", "cookiegroups", "consentcategories",
    "cookiedeclaration", "groupedcookies", "groups",
    "policy", "policypage", "policypagemetadata",
}

# Field names that mark a "category-like" or "vendor-like" object.
_OBJECT_NAME_FIELDS = ("name", "title", "label", "displayname",
                       "categoryname", "groupname", "vendorname",
                       "cookiename", "providername")
_OBJECT_DESC_FIELDS = ("description", "desc", "purpose", "zweck",
                       "explanation", "info", "details",
                       "groupdescription", "categorydescription",
                       "vendordescription", "providerdescription",
                       "descriptionhtml", "descriptiontext")


def looks_like_cookie_policy(data: Any) -> bool:
    """Return True when the shape of `data` suggests a CMP cookie-policy payload.

    `data` must be a dict; anything else yields False. Either of the
    following is sufficient:

      a) `data` itself, or any dict stored directly under one of its keys
         (wrappers such as {"data": {...}}), passes
         `_has_cookie_policy_shape`.
      b) Vendor/purpose list shape: top-level `vendors` and `purposes` are
         both lists with at least two elements each.
    """
    if not isinstance(data, dict):
        return False

    # The payload itself first, then every dict one level below it.
    candidates = [data]
    candidates.extend(child for child in data.values() if isinstance(child, dict))
    if any(_has_cookie_policy_shape(candidate) for candidate in candidates):
        return True

    # Fallback: paired vendor and purpose lists at the top level.
    vendors = data.get("vendors")
    purposes = data.get("purposes")
    if not (isinstance(vendors, list) and isinstance(purposes, list)):
        return False
    return len(vendors) >= 2 and len(purposes) >= 2


def _has_cookie_policy_shape(d: dict) -> bool:
    """Return True when `d` holds a list that looks like categories or vendors.

    A key qualifies when its lowercased form is in `_SHAPE_KEYS` and its
    value is a list with at least two dict entries that each carry one of
    the `_OBJECT_NAME_FIELDS` (compared case-insensitively). Only a name
    field is required; a description is not checked.

    Every matching key is inspected, so keys that differ only by case
    (e.g. "Cookies" and "cookies") cannot shadow each other.
    """
    name_fields = frozenset(_OBJECT_NAME_FIELDS)

    for key, val in d.items():
        if key.lower() not in _SHAPE_KEYS:
            continue
        if not isinstance(val, list) or len(val) < 2:
            continue

        named_entries = 0
        for entry in val:
            if not isinstance(entry, dict):
                continue
            # Lowercase each entry key once and stop at the first name field.
            if not name_fields.isdisjoint(k.lower() for k in entry):
                named_entries += 1
                if named_entries >= 2:
                    return True
    return False


# ── Reconstruction ───────────────────────────────────────────────────

def reconstruct_generic(data: Any, max_words: int = 5000) -> str:
    """Flatten a cookie-policy JSON payload into Markdown-style text.

    The structure is traversed with `_walk` (nesting limit 6), which collects
    headings and description lines below a fixed "# Cookie-Richtlinie" title.
    Blank lines and lines that repeat an earlier one (compared stripped and
    case-insensitively) are dropped; the remainder is joined with newlines.

    Args:
        data: Parsed JSON value to traverse.
        max_words: Upper bound on whitespace-separated words in the result.
            Values <= 0 produce an empty string.

    Returns:
        The reconstructed text. When the word limit is hit, the text is cut
        after exactly `max_words` words and the line breaks of the retained
        part are preserved, so truncated and untruncated output share the
        same line-oriented format.
    """
    parts: list[str] = ["# Cookie-Richtlinie"]
    _walk(data, parts, depth=0, max_depth=6)

    remaining = max(max_words, 0)
    seen: set[str] = set()
    lines: list[str] = []
    for part in parts:
        # Duplicates often slip in (translations, repeated values).
        key = part.strip().lower()
        if not key or key in seen:
            continue
        seen.add(key)

        words = part.split()
        if len(words) > remaining:
            # Budget exhausted inside this line: keep only what still fits.
            if remaining:
                lines.append(" ".join(words[:remaining]))
            break
        lines.append(part)
        remaining -= len(words)

    return "\n".join(lines)


def _walk(node: Any, out: list[str], depth: int, max_depth: int) -> None:
    """Recursively collect heading and text lines from `node` into `out`.

    Args:
        node: Current JSON value. Dicts are inspected and recursed into,
            lists are recursed into, every other type is ignored.
        out: Output list that lines are appended to in traversal order.
        depth: Nesting level of `node`; each dict or list adds one level.
        max_depth: Nodes deeper than this are skipped.

    For a dict, the first non-empty name field becomes a "## " heading
    (preceded by an empty separator entry) and the first non-empty
    description field becomes a text line. A fixed set of standalone text
    fields is emitted afterwards, then all values are traversed.
    """
    if depth > max_depth:
        return

    if isinstance(node, list):
        for item in node:
            _walk(item, out, depth + 1, max_depth)
        return

    if not isinstance(node, dict):
        return

    name = _first_field(node, _OBJECT_NAME_FIELDS)
    desc = _first_field(node, _OBJECT_DESC_FIELDS)
    if name:
        out.append("")
        out.append(f"## {_clean(name)}")
    if desc:
        out.append(_clean(desc))

    # Common standalone fields
    for key in ("prologue", "epilogue", "subheading", "datacontroller",
                "expiresafter", "persistencedescription",
                "persistencepurposetext", "persistencepurposedescription"):
        val = _first_field(node, (key,))
        if val:
            out.append(_clean(val))

    # NOTE: there is no separate bullet line for vendor/provider entries.
    # "vendorname" and "providername" are part of _OBJECT_NAME_FIELDS, so a
    # node carrying one of them always has a name and gets a heading above.

    # Recurse into all values
    for child in node.values():
        _walk(child, out, depth + 1, max_depth)


def _first_field(d: dict, field_names: tuple[str, ...]) -> str:
    """Return first non-empty string value matching any of field_names (case-insensitive)."""
    lower_map = {k.lower(): k for k in d.keys()}
    for f in field_names:
        actual_key = lower_map.get(f)
        if actual_key:
            v = d[actual_key]
            if isinstance(v, str) and v.strip():
                return v
    return ""


_TAG_RE = None


def _clean(text: str) -> str:
    """Strip HTML tags and collapse whitespace."""
    global _TAG_RE
    if _TAG_RE is None:
        import re
        _TAG_RE = re.compile(r"<[^>]+>")
    no_tags = _TAG_RE.sub(" ", text)
    no_tags = (no_tags
               .replace("&nbsp;", " ").replace("&amp;", "&")
               .replace("&lt;", "<").replace("&gt;", ">")
               .replace("&quot;", '"').replace("&#39;", "'"))
    import re
    return re.sub(r"\s+", " ", no_tags).strip()