refactor(agent-check): split routes file (2692→347 LOC) + wire B1/B3/A1 [guardrail-change]
Phase-5 split of agent_compliance_check_routes.py — the 2700-line
monolith was decomposed into 19 modules in compliance/api/agent_check/:
- Phase A-F: resolve / profile+check / banner+TCF / vendors raw+finalize /
HTML blocks top+mid+bot / email / persist
- Helpers: _constants, _helpers, _fetch, _discovery, _single_check
- Schemas + State + thin _orchestrator
A1 native ZIP attachment in _phase_e_email: evidence_zip_builder.py bundles
slices + manifest.json + audit_metadata.json (SHA256 per slice +
build_sha + source_url). smtp_sender.py extended with an attachments parameter.
B1 COOKIE-CONSENT-UX-001 (Mobile Reachability): consent_reachability_check.py
parses footer anchors, classifies intent (reopen_cmp / info_only /
browser_deflect) + target (same_page_cmp / new_tab / external).
_b1_wiring.py fetches homepage with iPhone-UA + renders Art-7-Abs-3
severity-coloured block.
B3 TH-RETENTION (Cross-Doc Speicherdauer): retention_comparator.py
compares DSI claim ↔ cookie-table duration ↔ actual Max-Age/expires
with 5% tolerance + severity hierarchy (dsi_under_actual HIGH,
table_under_actual HIGH, dsi_vs_table MEDIUM, actual_under_table LOW
Safari-ITP-Hint). _b3_wiring.py + Top-10 mismatches table in mail.
Side-effects:
- Fixed silent UnboundLocalError in original Step 5 (gf_one_pager used
audit_quality_findings before declaration, caught by surrounding
except → block never rendered). New _phase_d3_blocks_bot.py runs
audit-quality FIRST.
- agent_compliance_check_routes.py removed from loc-exceptions.txt
("Phase 5 split target" — done).
Tests: 55/55 green (B1 22 + B3 27 + saving_scan 6).
E2E: smoke against Elli DSE+Cookie produced HIGH/missing B1 finding,
TH-RETENTION table (17 cookies / 3 ✓ / 3 ✗ / 11 ?), evidence-zip
with 2 slices + manifest + audit_metadata (12089B, SHA256-chained,
source verified), email sent (attachments=1).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,228 @@
|
||||
"""Pure helpers for the compliance-check route — no I/O, no async.
|
||||
|
||||
Grouped here because each is small and they share the same constants
|
||||
imports. Splitting further would not improve readability.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from ._constants import (
|
||||
_ALL_DOC_TYPES,
|
||||
_COMPOUND_TLDS,
|
||||
_DISCOVERY_RULES,
|
||||
_DOC_TYPE_LABELS,
|
||||
_compliance_check_jobs,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _update(check_id: str, msg: str, pct: int | None = None) -> None:
    """Store a progress message, and optionally a percentage, on a job.

    The job is looked up by ``check_id`` in ``_compliance_check_jobs``; an
    unknown id raises ``KeyError``. ``progress`` is always overwritten.
    ``progress_pct`` is only touched when ``pct`` is given, in which case
    the value is converted with ``int()`` and clamped to the range 0..100.
    """
    entry = _compliance_check_jobs[check_id]
    entry["progress"] = msg
    if pct is None:
        return
    capped = min(100, int(pct))
    entry["progress_pct"] = capped if capped > 0 else 0
|
||||
|
||||
|
||||
def _doc_type_label(doc_type: str) -> str:
    """Return the display label for ``doc_type``.

    Falls back to the upper-cased ``doc_type`` when ``_DOC_TYPE_LABELS``
    has no entry for it.
    """
    fallback = doc_type.upper()
    return _DOC_TYPE_LABELS.get(doc_type, fallback)
|
||||
|
||||
|
||||
def _classify_discovered_doc(title: str, url: str) -> str | None:
    """Classify a discovered document by its title and URL.

    Title and URL are joined into one search text. The rules in
    ``_DISCOVERY_RULES`` are tried in order, and the canonical type of the
    first rule that has a keyword occurring as a substring of that text is
    returned. ``None`` means no rule matched.

    NOTE(review): the substring test is case-sensitive as written —
    confirm that callers or the rule keywords account for that.
    """
    text = f"{title} {url}"
    return next(
        (
            doc_type
            for doc_type, needles in _DISCOVERY_RULES
            if any(needle in text for needle in needles)
        ),
        None,
    )
|
||||
|
||||
|
||||
def _extract_domain(doc_entries: list[dict]) -> str | None:
|
||||
"""Extract base domain (without www) from first URL."""
|
||||
for entry in doc_entries:
|
||||
url = entry.get("url", "")
|
||||
if url and "://" in url:
|
||||
host = urlparse(url).netloc.lower()
|
||||
if host.startswith("www."):
|
||||
host = host[4:]
|
||||
return host or None
|
||||
return None
|
||||
|
||||
|
||||
def _company_name_from_url(doc_entries: list[dict]) -> str | None:
|
||||
"""Derive a display company name from the entered URLs.
|
||||
|
||||
Heuristic: take the second-level domain (e.g. "bmw" from "www.bmw.de"),
|
||||
uppercase short acronyms (<=4 chars, no hyphens), title-case the rest.
|
||||
|
||||
Examples:
|
||||
www.bmw.de -> BMW
|
||||
mercedes-benz.de -> Mercedes-Benz
|
||||
shop.example.co.uk -> Example
|
||||
juris.de -> Juris
|
||||
"""
|
||||
for entry in doc_entries:
|
||||
url = entry.get("url", "")
|
||||
if not url or "://" not in url:
|
||||
continue
|
||||
host = urlparse(url).netloc.lower()
|
||||
if host.startswith("www."):
|
||||
host = host[4:]
|
||||
parts = host.split(".")
|
||||
if len(parts) < 2:
|
||||
continue
|
||||
# Handle compound TLDs (.co.uk etc.)
|
||||
if len(parts) >= 3 and ".".join(parts[-2:]) in _COMPOUND_TLDS:
|
||||
sld = parts[-3]
|
||||
else:
|
||||
sld = parts[-2]
|
||||
if not sld:
|
||||
continue
|
||||
if len(sld) <= 4 and "-" not in sld:
|
||||
return sld.upper()
|
||||
return "-".join(p.capitalize() for p in sld.split("-"))
|
||||
return None
|
||||
|
||||
|
||||
def _get_skip_types(profile) -> dict[str, str]:
|
||||
"""Doc_types to skip entirely with a per-type reason message.
|
||||
|
||||
Heute primaer fuer OEM-Konfigurator-Pattern (BMW/Audi/Mercedes):
|
||||
wenn die Site kein Direkt-Vertrieb macht, sind AGB/Widerruf/
|
||||
Nutzungsbedingungen nicht Pflicht auf der Website — sie werden
|
||||
beim Vertragshaendler ausgehaendigt.
|
||||
"""
|
||||
if getattr(profile, "no_direct_sales", False):
|
||||
msg = (
|
||||
"Nicht anwendbar — die Webseite schliesst keinen Direkt-"
|
||||
"Kaufvertrag (OEM-Konfigurator-Pattern, Vertrag laeuft "
|
||||
"ueber Vertragshaendler). AGB/Widerruf werden beim "
|
||||
"Haendler ausgehaendigt."
|
||||
)
|
||||
return {
|
||||
"agb": msg,
|
||||
"widerruf": msg,
|
||||
"nutzungsbedingungen": msg,
|
||||
}
|
||||
return {}
|
||||
|
||||
|
||||
def _apply_profile_filter(result, profile, doc_type: str):
|
||||
"""Adjust INFO-level checks based on business profile context.
|
||||
|
||||
For example: ODR check only relevant for B2C online shops.
|
||||
"""
|
||||
for check in result.checks:
|
||||
cid = check.id.lower()
|
||||
|
||||
# ODR/OS-Link: relevant ONLY for B2C online shops. The check's
|
||||
# default hint is written for B2B (it explains why it's not
|
||||
# relevant) — for B2C we must replace it with action-oriented
|
||||
# guidance, otherwise the report contradicts itself.
|
||||
if "odr" in cid or "os-link" in cid or "streitbeilegung" in check.label.lower():
|
||||
if profile.needs_odr:
|
||||
if not check.passed:
|
||||
check.hint = (
|
||||
"Als B2C-Anbieter muessen Sie nach Art. 14 EU-VO 524/2013 "
|
||||
"auf die OS-Plattform (https://ec.europa.eu/consumers/odr) "
|
||||
"verlinken — klickbarer Link, nicht nur Text. Zusaetzlich "
|
||||
"§36 VSBG: angeben, ob Sie an Verbraucher-"
|
||||
"Streitbeilegungsverfahren teilnehmen (oder nicht)."
|
||||
)
|
||||
else:
|
||||
check.skipped = True
|
||||
check.hint = "Nicht relevant (kein B2C Online-Shop)"
|
||||
|
||||
# Widerruf: Flag entire document as unnecessary for B2B
|
||||
if doc_type == "widerruf" and profile.business_type not in ("b2c", "unknown"):
|
||||
check.severity = "INFO"
|
||||
if not check.passed:
|
||||
check.hint = (
|
||||
"Als B2B-Unternehmen benoetigen Sie keine Widerrufsbelehrung "
|
||||
"(§355 BGB gilt nur fuer Verbrauchervertraege). "
|
||||
"Empfehlung: Entfernen Sie die Widerrufsbelehrung von "
|
||||
"Ihrer Website, da sie Verwirrung stiften kann."
|
||||
)
|
||||
|
||||
# Regulated profession: check for Kammer info
|
||||
if "kammer" in cid or "berufsordnung" in check.label.lower():
|
||||
if not profile.is_regulated_profession:
|
||||
check.skipped = True
|
||||
check.hint = "Nicht relevant (kein regulierter Beruf)"
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def _pad_results_with_missing(
    results: list,
    discovery_attempted: set[str] | None = None,
) -> list:
    """Return ``results`` padded so every canonical doc type has an entry.

    Canonical types (``_ALL_DOC_TYPES``) that have no result get a
    placeholder ``DocCheckResult`` with ``scenario="missing"``. Its error
    text distinguishes two situations:

    - the type is in ``discovery_attempted``: it was not found on the website
    - otherwise: it was not submitted

    The output follows the order of ``_ALL_DOC_TYPES`` so the report layout
    stays stable. Results whose type is not canonical are appended at the
    end in their original order. ``"datenschutz"`` and ``"privacy"`` are
    treated as aliases of ``"dse"``. If several results share a canonical
    type, only the last one is kept.
    """
    from ..agent_doc_check_routes import DocCheckResult

    def canonical(doc_type):
        # Fold the privacy-policy aliases onto the canonical key.
        return "dse" if doc_type in ("datenschutz", "privacy") else doc_type

    tried = discovery_attempted or set()
    present: dict[str, object] = {canonical(r.doc_type): r for r in results}

    padded: list = []
    for doc_type in _ALL_DOC_TYPES:
        if doc_type in present:
            padded.append(present[doc_type])
            continue

        reason = (
            "Auf der Website nicht gefunden — bitte URL des "
            "Dokuments manuell eintragen, falls vorhanden"
            if doc_type in tried
            else "Nicht eingereicht — Quelle nicht angegeben"
        )
        placeholder = DocCheckResult(
            label=_doc_type_label(doc_type),
            url="",
            doc_type=doc_type,
            word_count=0,
            completeness_pct=0,
            correctness_pct=0,
            checks=[],
            findings_count=0,
            error=reason,
            scenario="missing",
        )
        padded.append(placeholder)

    # Keep results of non-canonical types instead of dropping them.
    padded.extend(
        r for r in results if canonical(r.doc_type) not in _ALL_DOC_TYPES
    )
    return padded
|
||||
|
||||
|
||||
def _result_to_dict(r) -> dict:
|
||||
"""Convert DocCheckResult to JSON-serializable dict."""
|
||||
fields = ("id", "label", "passed", "severity", "matched_text",
|
||||
"level", "parent", "skipped", "hint")
|
||||
return {
|
||||
"label": r.label, "url": r.url, "doc_type": r.doc_type,
|
||||
"word_count": r.word_count, "completeness_pct": r.completeness_pct,
|
||||
"correctness_pct": r.correctness_pct,
|
||||
"checks": [{f: getattr(c, f) for f in fields} for c in r.checks],
|
||||
"findings_count": r.findings_count, "error": r.error,
|
||||
"scenario": getattr(r, "scenario", ""),
|
||||
}
|
||||
|
||||
|
||||
def _build_profile_html(profile) -> str:
    """Render ``profile`` via ``agent_doc_check_report.build_profile_html``.

    The report module is imported inside the function rather than at
    module load time.
    """
    from ..agent_doc_check_report import build_profile_html as render_profile

    return render_profile(profile)
|
||||
Reference in New Issue
Block a user