c2c8783fee
Phase-5 split of agent_compliance_check_routes.py — the 2700-line
monolith was decomposed into 19 modules in compliance/api/agent_check/:
- Phase A-F: resolve / profile+check / banner+TCF / vendors raw+finalize /
HTML blocks top+mid+bot / email / persist
- Helpers: _constants, _helpers, _fetch, _discovery, _single_check
- Schemas + State + thin _orchestrator
A1 native ZIP attachment in _phase_e_email: evidence_zip_builder.py bundles
slices + manifest.json + audit_metadata.json (SHA256 per slice +
build_sha + source_url). smtp_sender.py was extended with an attachments parameter.
B1 COOKIE-CONSENT-UX-001 (Mobile Reachability): consent_reachability_check.py
parses footer anchors, classifies intent (reopen_cmp / info_only /
browser_deflect) + target (same_page_cmp / new_tab / external).
_b1_wiring.py fetches homepage with iPhone-UA + renders Art-7-Abs-3
severity-coloured block.
B3 TH-RETENTION (Cross-Doc Speicherdauer): retention_comparator.py
compares DSI claim ↔ cookie-table duration ↔ actual Max-Age/expires
with 5% tolerance + severity hierarchy (dsi_under_actual HIGH,
table_under_actual HIGH, dsi_vs_table MEDIUM, actual_under_table LOW
Safari-ITP-Hint). _b3_wiring.py + Top-10 mismatches table in mail.
Side-effects:
- Fixed silent UnboundLocalError in original Step 5 (gf_one_pager used
audit_quality_findings before declaration, caught by surrounding
except → block never rendered). New _phase_d3_blocks_bot.py runs
audit-quality FIRST.
- agent_compliance_check_routes.py removed from loc-exceptions.txt
("Phase 5 split target" — done).
Tests: 55/55 green (B1 22 + B3 27 + saving_scan 6).
E2E: smoke against Elli DSE+Cookie produced HIGH/missing B1 finding,
TH-RETENTION table (17 cookies / 3 ✓ / 3 ✗ / 11 ?), evidence-zip
with 2 slices + manifest + audit_metadata (12089B, SHA256-chained,
source verified), email sent (attachments=1).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
94 lines
4.1 KiB
Python
94 lines
4.1 KiB
Python
"""Module-level constants + shared job state for the compliance-check route.

`_compliance_check_jobs` is the SINGLE source of truth for in-flight
job progress. Other modules MUST import the same object — never
re-declare it — otherwise progress updates land in a detached dict.
"""

from __future__ import annotations

# Base URL of the consent-tester service, addressed via the internal
# hostname of its container.
CONSENT_TESTER_URL = "http://bp-compliance-consent-tester:8094"

# In-memory job registry, keyed by check_id. Each value is a dict shaped like:
#   {"status": "running" | "completed" | "failed" | "skipped_tdm",
#    "progress": str, "progress_pct": int, "result": dict, ...}
# Read/written by:
#   - agent_compliance_check_routes (start/status/_run/_update)
#   - saving_scan_routes (start)
#   - agent_migration_routes (status mirror)
# Always import this exact object; a re-declared copy would be a detached
# dict that in-flight progress updates never reach.
_compliance_check_jobs: dict[str, dict] = {}


# Canonical doc types, in the same order the frontend ComplianceCheckTab
# renders them. The route pads `results` so that it always includes an
# entry for each one; missing rows are flagged as 'Nicht eingereicht' or
# 'Auf der Website nicht gefunden'.
#
# DSB-Kontakt is NOT canonical: per GDPR practice the DSB is named inside
# the DSI/datenschutz document (email or contact block), not on a separate
# page. 'DSB benannt' is checked as a sub-check of the DSE instead.
_ALL_DOC_TYPES: list[str] = [
    "dse", "impressum", "social_media", "cookie",
    "agb", "nutzungsbedingungen", "widerruf",
]


# Human-readable labels per doc_type. Used in the report + emails.
# Several keys ("dse", "datenschutz", "privacy") deliberately map to the
# same label.
_DOC_TYPE_LABELS: dict[str, str] = {
    "dse": "Datenschutzerklaerung",
    "datenschutz": "Datenschutzerklaerung",
    "privacy": "Datenschutzerklaerung",
    "impressum": "Impressum",
    "agb": "AGB",
    "widerruf": "Widerrufsbelehrung",
    "cookie": "Cookie-Richtlinie",
    "avv": "Auftragsverarbeitung",
    "loeschkonzept": "Loeschkonzept",
    "dsfa": "Datenschutz-Folgenabschaetzung",
    "social_media": "Social Media Datenschutz",
    "nutzungsbedingungen": "Nutzungsbedingungen",
    "dsb": "DSB-Kontakt",
    # P74: legal notice / "Rechtliche Hinweise" (IP, forward-looking, risk)
    "legal_notice": "Rechtliche Hinweise",
    # P96: Digital Services Act mandatory disclosures (Art. 12+17 DSA)
    "dsa": "DSA-Pflichtangaben",
    # P97: third-party licence notices (OSS compliance)
    "lizenzhinweise": "Lizenzhinweise Dritter",
}


# Title/URL keywords → canonical doc_type. Order matters: most-specific first.
# Each entry is (doc_type, keywords).
_DISCOVERY_RULES: list[tuple[str, tuple[str, ...]]] = [
    ("cookie", ("cookie", "kuche", "biscuit", "cookies-")),
    ("widerruf", ("widerruf", "rueckgabe", "rückgabe", "cancellation",
                  "right-of-withdrawal", "ruecktritts", "rücktritts")),
    ("social_media", ("social-media", "soziale-medien", "social_media",
                      "social-media-policy")),
    # P23: 'terms-and-conditions' can mean either general terms and
    # conditions (AGB) OR terms of use (Nutzungsbedingungen). The discovery
    # function classifies more precisely later, by title + content. This is
    # only a URL hint:
    ("agb", ("/agb", "geschaeftsbedingungen", "geschäftsbedingungen",
             "general-terms")),
    ("nutzungsbedingungen", ("nutzungsbedingung", "nutzungsbedingungen",
                             "terms-of-use", "terms-and-conditions",
                             "nutzungsordnung", "terms-of-service",
                             "allgemeine-nutzungsbedingungen")),
    ("dsb", ("datenschutzbeauftragt", "data-protection-officer",
             "dpo-contact", "/dsb")),
    ("impressum", ("impressum", "imprint", "legal-notice", "site-notice",
                   "anbieterkennzeichnung", "legal-disclaimer-pool")),
    ("dse", ("data-privacy", "datenschutz", "data-protection",
             "privacy-policy", "privacy-notice", "dsgvo",
             "data_privacy", "datenschutzinformation")),
]


# Compound TLDs that count as 2 labels when extracting the second-level
# domain (e.g. shop.example.co.uk → 'example', not 'co').
_COMPOUND_TLDS: set[str] = {
    "co.uk", "co.jp", "co.nz", "co.kr", "co.za", "co.in",
    "com.au", "com.br", "com.mx", "com.tr", "com.sg",
}