"""Pure helpers for the compliance-check route — no I/O, no async. Grouped here because each is small and they share the same constants imports. Splitting further would not improve readability. """ from __future__ import annotations import logging from urllib.parse import urlparse from ._constants import ( _ALL_DOC_TYPES, _COMPOUND_TLDS, _DISCOVERY_RULES, _DOC_TYPE_LABELS, _compliance_check_jobs, ) logger = logging.getLogger(__name__) def _update(check_id: str, msg: str, pct: int | None = None) -> None: """Update the in-memory job entry with a progress message + pct.""" job = _compliance_check_jobs[check_id] job["progress"] = msg if pct is not None: job["progress_pct"] = max(0, min(100, int(pct))) def _doc_type_label(doc_type: str) -> str: return _DOC_TYPE_LABELS.get(doc_type, doc_type.upper()) def _classify_discovered_doc(title: str, url: str) -> str | None: """Map a discovered doc (by its title + URL) to one of our 8 canonical types.""" haystack = f"{title} {url}" for canon, keywords in _DISCOVERY_RULES: if any(kw in haystack for kw in keywords): return canon return None def _extract_domain(doc_entries: list[dict]) -> str | None: """Extract base domain (without www) from first URL.""" for entry in doc_entries: url = entry.get("url", "") if url and "://" in url: host = urlparse(url).netloc.lower() if host.startswith("www."): host = host[4:] return host or None return None def _company_name_from_url(doc_entries: list[dict]) -> str | None: """Derive a display company name from the entered URLs. Heuristic: take the second-level domain (e.g. "bmw" from "www.bmw.de"), uppercase short acronyms (<=4 chars, no hyphens), title-case the rest. Examples: www.bmw.de -> BMW mercedes-benz.de -> Mercedes-Benz shop.example.co.uk -> Example juris.de -> Juris """ for entry in doc_entries: url = entry.get("url", "") if not url or "://" not in url: continue host = urlparse(url).netloc.lower() if host.startswith("www."): host = host[4:] parts = host.split(".") if len(parts) < 2: continue # Handle compound TLDs (.co.uk etc.) if len(parts) >= 3 and ".".join(parts[-2:]) in _COMPOUND_TLDS: sld = parts[-3] else: sld = parts[-2] if not sld: continue if len(sld) <= 4 and "-" not in sld: return sld.upper() return "-".join(p.capitalize() for p in sld.split("-")) return None def _get_skip_types(profile) -> dict[str, str]: """Doc_types to skip entirely with a per-type reason message. Heute primaer fuer OEM-Konfigurator-Pattern (BMW/Audi/Mercedes): wenn die Site kein Direkt-Vertrieb macht, sind AGB/Widerruf/ Nutzungsbedingungen nicht Pflicht auf der Website — sie werden beim Vertragshaendler ausgehaendigt. """ if getattr(profile, "no_direct_sales", False): msg = ( "Nicht anwendbar — die Webseite schliesst keinen Direkt-" "Kaufvertrag (OEM-Konfigurator-Pattern, Vertrag laeuft " "ueber Vertragshaendler). AGB/Widerruf werden beim " "Haendler ausgehaendigt." ) return { "agb": msg, "widerruf": msg, "nutzungsbedingungen": msg, } return {} def _apply_profile_filter(result, profile, doc_type: str): """Adjust INFO-level checks based on business profile context. For example: ODR check only relevant for B2C online shops. """ for check in result.checks: cid = check.id.lower() # ODR/OS-Link: relevant ONLY for B2C online shops. The check's # default hint is written for B2B (it explains why it's not # relevant) — for B2C we must replace it with action-oriented # guidance, otherwise the report contradicts itself. if "odr" in cid or "os-link" in cid or "streitbeilegung" in check.label.lower(): if profile.needs_odr: if not check.passed: check.hint = ( "Als B2C-Anbieter muessen Sie nach Art. 14 EU-VO 524/2013 " "auf die OS-Plattform (https://ec.europa.eu/consumers/odr) " "verlinken — klickbarer Link, nicht nur Text. Zusaetzlich " "§36 VSBG: angeben, ob Sie an Verbraucher-" "Streitbeilegungsverfahren teilnehmen (oder nicht)." ) else: check.skipped = True check.hint = "Nicht relevant (kein B2C Online-Shop)" # Widerruf: Flag entire document as unnecessary for B2B if doc_type == "widerruf" and profile.business_type not in ("b2c", "unknown"): check.severity = "INFO" if not check.passed: check.hint = ( "Als B2B-Unternehmen benoetigen Sie keine Widerrufsbelehrung " "(§355 BGB gilt nur fuer Verbrauchervertraege). " "Empfehlung: Entfernen Sie die Widerrufsbelehrung von " "Ihrer Website, da sie Verwirrung stiften kann." ) # Regulated profession: check for Kammer info if "kammer" in cid or "berufsordnung" in check.label.lower(): if not profile.is_regulated_profession: check.skipped = True check.hint = "Nicht relevant (kein regulierter Beruf)" return result def _pad_results_with_missing( results: list, discovery_attempted: set[str] | None = None, ) -> list: """Ensure every canonical doc_type has an entry in the results list. Doc_types the user did not submit AND auto-discovery did not find get a placeholder DocCheckResult. The error message distinguishes: - 'Auf der Website nicht gefunden' (discovery was attempted) - 'Nicht eingereicht' (no submitted URLs to crawl from) Preserves the canonical ordering from _ALL_DOC_TYPES so the report layout is stable. """ from ..agent_doc_check_routes import DocCheckResult attempted = discovery_attempted or set() by_type: dict[str, object] = {} for r in results: canon = "dse" if r.doc_type in ("datenschutz", "privacy") else r.doc_type by_type[canon] = r ordered: list = [] for dt in _ALL_DOC_TYPES: if dt in by_type: ordered.append(by_type[dt]) continue if dt in attempted: msg = ("Auf der Website nicht gefunden — bitte URL des " "Dokuments manuell eintragen, falls vorhanden") else: msg = "Nicht eingereicht — Quelle nicht angegeben" ordered.append(DocCheckResult( label=_doc_type_label(dt), url="", doc_type=dt, word_count=0, completeness_pct=0, correctness_pct=0, checks=[], findings_count=0, error=msg, scenario="missing", )) extras = [r for r in results if (r.doc_type if r.doc_type not in ("datenschutz", "privacy") else "dse") not in _ALL_DOC_TYPES] ordered.extend(extras) return ordered def _result_to_dict(r) -> dict: """Convert DocCheckResult to JSON-serializable dict.""" fields = ("id", "label", "passed", "severity", "matched_text", "level", "parent", "skipped", "hint") return { "label": r.label, "url": r.url, "doc_type": r.doc_type, "word_count": r.word_count, "completeness_pct": r.completeness_pct, "correctness_pct": r.correctness_pct, "checks": [{f: getattr(c, f) for f in fields} for c in r.checks], "findings_count": r.findings_count, "error": r.error, "scenario": getattr(r, "scenario", ""), } def _build_profile_html(profile) -> str: from ..agent_doc_check_report import build_profile_html return build_profile_html(profile)