"""Module-level constants + shared job state for the compliance-check route. `_compliance_check_jobs` is the SINGLE source of truth for in-flight job progress. Other modules MUST import the same object — never re-declare it — otherwise progress updates land in a detached dict. """ from __future__ import annotations # Internal hostname of the consent-tester container. CONSENT_TESTER_URL = "http://bp-compliance-consent-tester:8094" # In-memory job registry. Keyed by check_id. Values: # {"status": "running"|"completed"|"failed"|"skipped_tdm", # "progress": str, "progress_pct": int, "result": dict, ...} # Read/written by: # - agent_compliance_check_routes (start/status/_run/_update) # - saving_scan_routes (start) # - agent_migration_routes (status mirror) _compliance_check_jobs: dict[str, dict] = {} # Canonical doc types in the same order the frontend # ComplianceCheckTab renders them. The route pads `results` to always # include an entry for each — missing rows are flagged as 'Nicht # eingereicht' or 'Auf der Website nicht gefunden'. # # DSB-Kontakt is NOT canonical: per GDPR practice the DSB is named # inside the DSI/datenschutz document (email or contact block), not as # a separate page. We check 'DSB benannt' as a sub-check of the DSE. _ALL_DOC_TYPES = [ "dse", "impressum", "social_media", "cookie", "agb", "nutzungsbedingungen", "widerruf", ] # Human-readable labels per doc_type. Used in the report + emails. 
_DOC_TYPE_LABELS: dict[str, str] = {
    # "dse", "datenschutz" and "privacy" are aliases sharing one label.
    "dse": "Datenschutzerklaerung",
    "datenschutz": "Datenschutzerklaerung",
    "privacy": "Datenschutzerklaerung",
    "impressum": "Impressum",
    "agb": "AGB",
    "widerruf": "Widerrufsbelehrung",
    "cookie": "Cookie-Richtlinie",
    "avv": "Auftragsverarbeitung",
    "loeschkonzept": "Loeschkonzept",
    "dsfa": "Datenschutz-Folgenabschaetzung",
    "social_media": "Social Media Datenschutz",
    "nutzungsbedingungen": "Nutzungsbedingungen",
    "dsb": "DSB-Kontakt",
    # P74: Legal notice (IP, forward-looking statements, risk)
    "legal_notice": "Rechtliche Hinweise",
    # P96: Digital Services Act mandatory disclosures (Art. 12+17 DSA)
    "dsa": "DSA-Pflichtangaben",
    # P97: Third-party licence notices (OSS compliance)
    "lizenzhinweise": "Lizenzhinweise Dritter",
}

# Title/URL keywords → canonical doc_type. Order matters: most-specific first.
# NOTE(review): the matching code is not in this module; keep every keyword
# (including ones that look redundant) until the matcher's semantics are
# confirmed.
_DISCOVERY_RULES: list[tuple[str, tuple[str, ...]]] = [
    ("cookie", ("cookie", "kuche", "biscuit", "cookies-")),
    (
        "widerruf",
        (
            "widerruf",
            "rueckgabe",
            "rückgabe",
            "cancellation",
            "right-of-withdrawal",
            "ruecktritts",
            "rücktritts",
        ),
    ),
    (
        "social_media",
        ("social-media", "soziale-medien", "social_media", "social-media-policy"),
    ),
    # P23: 'terms-and-conditions' can mean either general terms and
    # conditions (AGB) OR terms of use (Nutzungsbedingungen). The discovery
    # function classifies more precisely later, by title + content.
    # Here it is only a URL hint:
    (
        "agb",
        ("/agb", "geschaeftsbedingungen", "geschäftsbedingungen", "general-terms"),
    ),
    (
        "nutzungsbedingungen",
        (
            "nutzungsbedingung",
            "nutzungsbedingungen",
            "terms-of-use",
            "terms-and-conditions",
            "nutzungsordnung",
            "terms-of-service",
            "allgemeine-nutzungsbedingungen",
        ),
    ),
    # "dsb" is listed ahead of "dse" so that 'datenschutzbeauftragt' /
    # 'data-protection-officer' come before the broader 'datenschutz' /
    # 'data-protection' keywords.
    (
        "dsb",
        ("datenschutzbeauftragt", "data-protection-officer", "dpo-contact", "/dsb"),
    ),
    (
        "impressum",
        (
            "impressum",
            "imprint",
            "legal-notice",
            "site-notice",
            "anbieterkennzeichnung",
            "legal-disclaimer-pool",
        ),
    ),
    (
        "dse",
        (
            "data-privacy",
            "datenschutz",
            "data-protection",
            "privacy-policy",
            "privacy-notice",
            "dsgvo",
            "data_privacy",
            "datenschutzinformation",
        ),
    ),
]

# Compound TLDs that count as 2 labels when extracting the second-level
# domain (e.g. shop.example.co.uk → 'example', not 'co').
_COMPOUND_TLDS: set[str] = {
    "co.uk",
    "co.jp",
    "co.nz",
    "co.kr",
    "co.za",
    "co.in",
    "com.au",
    "com.br",
    "com.mx",
    "com.tr",
    "com.sg",
}