# breakpilot-compliance/backend-compliance/compliance/api/agent_check/_helpers.py

"""Pure helpers for the compliance-check route — no I/O, no async.

Grouped here because each is small and they share the same constants
imports. Splitting further would not improve readability.
"""

from __future__ import annotations

import logging
from urllib.parse import urlparse

from ._constants import (
    _ALL_DOC_TYPES,
    _COMPOUND_TLDS,
    _DISCOVERY_RULES,
    _DOC_TYPE_LABELS,
    _compliance_check_jobs,
)

logger = logging.getLogger(__name__)


def _update(check_id: str, msg: str, pct: int | None = None) -> None:
    """Store a progress message, and optionally a percentage, on a job entry.

    The entry is fetched from ``_compliance_check_jobs`` by subscript, so an
    unknown ``check_id`` is not handled here and propagates whatever that
    mapping raises.

    Args:
        check_id: Key of the job entry to update.
        msg: Text written to the entry's ``"progress"`` key.
        pct: Optional percentage. When given it is converted with ``int()``
            and clamped to the range 0..100 before being written to
            ``"progress_pct"``; when ``None`` that key is left untouched.
    """
    entry = _compliance_check_jobs[check_id]
    entry["progress"] = msg
    if pct is None:
        return

    # Clamp into the displayable 0..100 range.
    value = int(pct)
    if value > 100:
        value = 100
    elif value < 0:
        value = 0
    entry["progress_pct"] = value


def _doc_type_label(doc_type: str) -> str:
    """Return the display label for ``doc_type``.

    Looks the type up in ``_DOC_TYPE_LABELS``; when there is no entry the
    upper-cased ``doc_type`` itself is used as the label.
    """
    fallback = doc_type.upper()
    return _DOC_TYPE_LABELS.get(doc_type, fallback)


def _classify_discovered_doc(title: str, url: str) -> str | None:
    """Map a discovered document to a canonical doc type via keyword rules.

    The title and URL are joined into one search text. ``_DISCOVERY_RULES``
    is walked in order and the canonical type of the first rule with at
    least one keyword contained in that text is returned.

    Returns ``None`` when no rule matches.
    """
    # NOTE(review): the substring test is case-sensitive as written —
    # confirm that callers normalise title/url to the keywords' casing.
    text = f"{title} {url}"
    matches = (
        canonical
        for canonical, needles in _DISCOVERY_RULES
        if any(needle in text for needle in needles)
    )
    return next(matches, None)


def _extract_domain(doc_entries: list[dict]) -> str | None:
    """Extract base domain (without www) from first URL."""
    for entry in doc_entries:
        url = entry.get("url", "")
        if url and "://" in url:
            host = urlparse(url).netloc.lower()
            if host.startswith("www."):
                host = host[4:]
            return host or None
    return None


def _company_name_from_url(doc_entries: list[dict]) -> str | None:
    """Derive a display company name from the entered URLs.

    Heuristic: take the second-level domain (e.g. "bmw" from "www.bmw.de"),
    uppercase short acronyms (<=4 chars, no hyphens), title-case the rest.

    Examples:
      www.bmw.de              -> BMW
      mercedes-benz.de        -> Mercedes-Benz
      shop.example.co.uk      -> Example
      juris.de                -> Juris
    """
    for entry in doc_entries:
        url = entry.get("url", "")
        if not url or "://" not in url:
            continue
        host = urlparse(url).netloc.lower()
        if host.startswith("www."):
            host = host[4:]
        parts = host.split(".")
        if len(parts) < 2:
            continue
        # Handle compound TLDs (.co.uk etc.)
        if len(parts) >= 3 and ".".join(parts[-2:]) in _COMPOUND_TLDS:
            sld = parts[-3]
        else:
            sld = parts[-2]
        if not sld:
            continue
        if len(sld) <= 4 and "-" not in sld:
            return sld.upper()
        return "-".join(p.capitalize() for p in sld.split("-"))
    return None


def _get_skip_types(profile) -> dict[str, str]:
    """Return the doc types to skip entirely, each with a reason message.

    Currently this covers the OEM configurator pattern (BMW/Audi/Mercedes):
    when the site does no direct sales, terms and conditions, withdrawal
    information and terms of use are not mandatory on the website because
    they are handed out by the contracted dealer.

    Args:
        profile: Object whose optional ``no_direct_sales`` attribute is read;
            a missing attribute counts as ``False``.

    Returns:
        A mapping of doc type to the (shared) reason text for ``agb``,
        ``widerruf`` and ``nutzungsbedingungen`` when ``no_direct_sales`` is
        truthy, otherwise an empty dict.
    """
    if not getattr(profile, "no_direct_sales", False):
        return {}

    reason = (
        "Nicht anwendbar — die Webseite schliesst keinen Direkt-"
        "Kaufvertrag (OEM-Konfigurator-Pattern, Vertrag laeuft "
        "ueber Vertragshaendler). AGB/Widerruf werden beim "
        "Haendler ausgehaendigt."
    )
    skipped_types = ("agb", "widerruf", "nutzungsbedingungen")
    return dict.fromkeys(skipped_types, reason)


def _apply_profile_filter(result, profile, doc_type: str):
    """Adapt the checks of ``result`` in place to the business profile.

    Three rules are applied to every check, in this order; a later rule may
    overwrite the hint written by an earlier one:

    * ODR / OS-link checks (id contains ``odr`` or ``os-link``, or the label
      contains ``streitbeilegung``) are marked skipped unless
      ``profile.needs_odr``; when ODR is needed and the check failed, its
      hint is replaced with action-oriented guidance.
    * For ``doc_type == "widerruf"`` and a ``profile.business_type`` other
      than ``"b2c"`` / ``"unknown"``, the severity becomes ``"INFO"`` and
      failed checks get a hint that the document is not needed.
    * Chamber checks (id contains ``kammer``, or the label contains
      ``berufsordnung``) are marked skipped unless
      ``profile.is_regulated_profession``.

    Returns:
        The same ``result`` object that was passed in.
    """
    is_withdrawal_doc = doc_type == "widerruf"

    for item in result.checks:
        ident = item.id.lower()

        # ODR/OS link: only relevant for B2C online shops. Per the original
        # author's note the default hint explains why the check is NOT
        # relevant (B2B wording), so for B2C it has to be swapped for
        # concrete guidance or the report would contradict itself.
        concerns_odr = (
            "odr" in ident
            or "os-link" in ident
            or "streitbeilegung" in item.label.lower()
        )
        if concerns_odr:
            if not profile.needs_odr:
                item.skipped = True
                item.hint = "Nicht relevant (kein B2C Online-Shop)"
            elif not item.passed:
                item.hint = (
                    "Als B2C-Anbieter muessen Sie nach Art. 14 EU-VO 524/2013 "
                    "auf die OS-Plattform (https://ec.europa.eu/consumers/odr) "
                    "verlinken — klickbarer Link, nicht nur Text. Zusaetzlich "
                    "§36 VSBG: angeben, ob Sie an Verbraucher-"
                    "Streitbeilegungsverfahren teilnehmen (oder nicht)."
                )

        # Withdrawal document: downgrade every check for non-B2C profiles.
        if is_withdrawal_doc and profile.business_type not in ("b2c", "unknown"):
            item.severity = "INFO"
            if not item.passed:
                item.hint = (
                    "Als B2B-Unternehmen benoetigen Sie keine Widerrufsbelehrung "
                    "(§355 BGB gilt nur fuer Verbrauchervertraege). "
                    "Empfehlung: Entfernen Sie die Widerrufsbelehrung von "
                    "Ihrer Website, da sie Verwirrung stiften kann."
                )

        # Chamber information only applies to regulated professions.
        concerns_chamber = (
            "kammer" in ident or "berufsordnung" in item.label.lower()
        )
        if concerns_chamber and not profile.is_regulated_profession:
            item.skipped = True
            item.hint = "Nicht relevant (kein regulierter Beruf)"

    return result


def _pad_results_with_missing(
    results: list,
    discovery_attempted: set[str] | None = None,
) -> list:
    """Return ``results`` padded so every canonical doc type has an entry.

    The output follows the order of ``_ALL_DOC_TYPES``, which keeps the
    report layout stable. The doc types ``datenschutz`` and ``privacy`` are
    treated as ``dse`` when matching results to canonical types. A type
    without a matching result gets a placeholder ``DocCheckResult`` with
    ``scenario="missing"`` whose ``error`` text distinguishes:

      - the type is in ``discovery_attempted``: not found on the website
      - otherwise: nothing was submitted for it

    Results whose (alias-resolved) type is not in ``_ALL_DOC_TYPES`` are
    appended after the canonical ones, in their original order.

    NOTE(review): when several results resolve to the same canonical type
    only the last one is kept — confirm this is intended.

    Args:
        results: Existing check results; each must expose ``doc_type``.
        discovery_attempted: Doc types auto-discovery looked for; ``None``
            is treated as an empty set.
    """
    from ..agent_doc_check_routes import DocCheckResult

    def canonical(doc_type):
        # Fold the privacy-policy aliases into the single "dse" type.
        return "dse" if doc_type in ("datenschutz", "privacy") else doc_type

    tried = discovery_attempted or set()
    found = {canonical(res.doc_type): res for res in results}

    padded: list = []
    for kind in _ALL_DOC_TYPES:
        if kind not in found:
            if kind in tried:
                reason = ("Auf der Website nicht gefunden — bitte URL des "
                          "Dokuments manuell eintragen, falls vorhanden")
            else:
                reason = "Nicht eingereicht — Quelle nicht angegeben"
            entry = DocCheckResult(
                label=_doc_type_label(kind),
                url="",
                doc_type=kind,
                word_count=0,
                completeness_pct=0,
                correctness_pct=0,
                checks=[],
                findings_count=0,
                error=reason,
                scenario="missing",
            )
        else:
            entry = found[kind]
        padded.append(entry)

    # Non-canonical results keep their place at the end of the report.
    padded += [
        res for res in results
        if canonical(res.doc_type) not in _ALL_DOC_TYPES
    ]
    return padded


def _result_to_dict(r) -> dict:
    """Flatten a check result into a plain dict of its report fields.

    Copies the result's scalar attributes, turns every entry of
    ``r.checks`` into a dict of its check attributes, and reads the
    optional ``scenario`` attribute with ``""`` as the default.
    """
    check_attrs = ("id", "label", "passed", "severity", "matched_text",
                   "level", "parent", "skipped", "hint")
    leading_attrs = ("label", "url", "doc_type", "word_count",
                     "completeness_pct", "correctness_pct")

    payload = {attr: getattr(r, attr) for attr in leading_attrs}
    payload["checks"] = [
        {attr: getattr(chk, attr) for attr in check_attrs}
        for chk in r.checks
    ]
    payload["findings_count"] = r.findings_count
    payload["error"] = r.error
    payload["scenario"] = getattr(r, "scenario", "")
    return payload


def _build_profile_html(profile) -> str:
    """Render ``profile`` via ``build_profile_html`` from the report module.

    The report module is imported at call time rather than at module level.
    NOTE(review): presumably to avoid an import cycle — confirm.
    """
    from ..agent_doc_check_report import build_profile_html as render_profile

    return render_profile(profile)