# Provenance: recovered from git patch 345ea708445019d441462de2c6b8d74a4bb91237
# (Benjamin Admin, Sun, 10 May 2026), subject
# "fix: Add 'impressum' to DSI keywords for self-extraction",
# target path consent-tester/services/dsi_keywords.py.
"""
DSI keywords — legal-document keywords in 26 EU/EEA official languages.

The table is used to recognise links/pages that are legal documents:
privacy notices (German "DSI"), terms and conditions ("AGB"), terms of use
("Nutzungsbedingungen"), withdrawal/cancellation notices ("Widerruf"),
cookie policies and imprints ("Impressum").

Coverage differs per language: German and English list every category,
most other languages only list privacy, terms and cookie keywords.
"""

DSI_KEYWORDS: dict[str, list[str]] = {
    "de": [
        # Privacy / data protection
        "datenschutz", "datenschutzerklaerung", "datenschutzinformation",
        "datenschutzhinweis", "datenschutzrichtlinie", "dsgvo", "privatsphäre",
        "datenschutzbestimmung", "verarbeitung personenbezogener daten",
        # Terms and conditions / terms of use
        "allgemeine geschäftsbedingungen", "agb", "nutzungsbedingungen",
        "nutzungsordnung", "geschäftsbedingungen",
        # Withdrawal / cancellation
        "widerrufsbelehrung", "widerrufsrecht", "widerrufsformular",
        "widerruf", "rücktrittsrecht",
        # Cookies
        "cookie-richtlinie", "cookie-policy", "cookie-hinweis",
        # Imprint. Per the originating commit, leaving "impressum" out made
        # /impressum URLs skip self-extraction and return the linked privacy
        # text instead, so these entries must stay in the table.
        "impressum", "anbieterkennzeichnung", "kontakt",
    ],
    "en": [
        "privacy policy", "privacy notice", "data protection", "data policy",
        "privacy statement", "gdpr", "personal data", "cookie policy",
        "terms of service", "terms and conditions", "terms of use",
        "cancellation policy", "right of withdrawal", "refund policy",
        "cookie notice",
        # Imprint / legal notice (added together with the German entries)
        "imprint", "legal notice", "site notice", "legal information",
    ],
    "fr": [
        "politique de confidentialité", "protection des données",
        "données personnelles", "vie privée", "rgpd",
        "conditions générales", "conditions d'utilisation",
        "droit de rétractation", "politique de cookies",
    ],
    "es": [
        "política de privacidad", "protección de datos",
        "datos personales", "aviso de privacidad",
        "términos y condiciones", "condiciones de uso",
        "derecho de desistimiento", "política de cookies",
    ],
    "it": [
        "informativa sulla privacy", "protezione dei dati",
        "dati personali", "privacy policy",
        "termini e condizioni", "condizioni d'uso",
        "diritto di recesso", "politica dei cookie",
    ],
    "nl": [
        "privacybeleid", "gegevensbescherming", "privacyverklaring",
        "persoonsgegevens", "avg",
        "algemene voorwaarden", "gebruiksvoorwaarden",
        "herroepingsrecht", "cookiebeleid",
    ],
    "pl": [
        "polityka prywatności", "ochrona danych osobowych",
        "dane osobowe", "rodo",
        "regulamin", "warunki korzystania",
        "prawo odstąpienia", "polityka cookies",
    ],
    "pt": [
        "política de privacidade", "proteção de dados",
        "dados pessoais", "lgpd",
        "termos e condições", "condições de utilização",
        "direito de resolução", "política de cookies",
    ],
    "sv": [
        "integritetspolicy", "dataskydd", "personuppgifter",
        "sekretesspolicy",
        "allmänna villkor", "användarvillkor",
        "ångerrätt", "cookiepolicy",
    ],
    "da": [
        "privatlivspolitik", "databeskyttelse", "personoplysninger",
        "persondatapolitik",
        "handelsbetingelser", "brugsbetingelser",
        "fortrydelsesret", "cookiepolitik",
    ],
    "fi": [
        "tietosuojaseloste", "tietosuoja", "henkilötiedot",
        "rekisteriseloste",
        "yleiset ehdot", "käyttöehdot",
        "peruutusoikeus", "evästekäytäntö",
    ],
    "cs": [
        "zásady ochrany osobních údajů", "ochrana osobních údajů",
        "zpracování osobních údajů", "obchodní podmínky", "zásady cookies",
    ],
    "el": [
        "πολιτική απορρήτου", "προστασία δεδομένων",
        "προσωπικά δεδομένα", "όροι χρήσης", "πολιτική cookies",
    ],
    "hu": [
        "adatvédelmi szabályzat", "adatvédelem", "személyes adatok",
        "általános szerződési feltételek", "cookie szabályzat",
    ],
    "ro": [
        "politica de confidențialitate", "protecția datelor",
        "date cu caracter personal", "termeni și condiții", "politica cookies",
    ],
    "bg": [
        "политика за поверителност", "защита на данните",
        "лични данни", "общи условия", "политика за бисквитки",
    ],
    "hr": [
        "politika privatnosti", "zaštita podataka", "osobni podaci",
        "opći uvjeti", "politika kolačića",
    ],
    "sk": [
        "zásady ochrany osobných údajov", "ochrana osobných údajov",
        "obchodné podmienky", "zásady cookies",
    ],
    "sl": [
        "politika zasebnosti", "varstvo podatkov", "osebni podatki",
        "splošni pogoji", "politika piškotkov",
    ],
    "et": [
        "privaatsuspoliitika", "andmekaitse", "isikuandmed",
        "kasutustingimused", "küpsiste poliitika",
    ],
    "lt": [
        "privatumo politika", "duomenų apsauga", "asmens duomenys",
        "naudojimosi sąlygos", "slapukų politika",
    ],
    "lv": [
        "privātuma politika", "datu aizsardzība", "personas dati",
        "lietošanas noteikumi", "sīkdatņu politika",
    ],
    "mt": [
        "politika tal-privatezza", "protezzjoni tad-data",
        "termini u kundizzjonijiet",
    ],
    "ga": [
        "polasaí príobháideachais", "cosaint sonraí",
        "téarmaí agus coinníollacha",
    ],
    "is": [
        "persónuverndarstefna", "persónuvernd",
        "skilmálar og skilyrði",
    ],
    "no": [
        "personvernerklæring", "personvern", "personopplysninger",
        "brukervilkår", "angrerett", "informasjonskapsler",
    ],
}

# Flat list of every keyword, in table order, for callers that do not care
# about the language. Keywords shared by several languages appear once per
# language (e.g. "privacy policy" is listed under both "en" and "it").
ALL_DSI_KEYWORDS: list[str] = [
    kw for kw_list in DSI_KEYWORDS.values() for kw in kw_list
]

# Separators that typically replace spaces in URL slugs and file names.
_SEPARATOR_TABLE = str.maketrans({"-": " ", "_": " "})


def _normalize_separators(value: str) -> str:
    """Map ``-``/``_`` to spaces and collapse every whitespace run to one space."""
    # str.split() without arguments also splits on non-breaking spaces, which
    # are common in text scraped from HTML.
    return " ".join(value.translate(_SEPARATOR_TABLE).split())


def matches_dsi_keyword(text: str) -> tuple[bool, str]:
    """Check whether *text* contains any legal-document keyword.

    Matching is a case-insensitive substring search performed in two passes:

    1. Literal pass: the lower-cased text is searched for each keyword exactly
       as written in ``DSI_KEYWORDS``.
    2. Separator-insensitive pass (only if pass 1 found nothing): hyphens,
       underscores and whitespace runs are treated as a single space in both
       the text and the keywords, so slugs such as ``/privacy-policy`` or
       ``terms_of_service`` match the multi-word keywords.

    Languages are tried in the insertion order of ``DSI_KEYWORDS`` and the
    first language with a hit wins, so a keyword listed under several
    languages is always attributed to the earliest one.

    NOTE(review): because this is a plain substring search, short acronyms can
    match inside unrelated words (e.g. "rodo" inside "prodotto").

    Args:
        text: Link text, heading, URL or similar string to inspect.

    Returns:
        ``(True, language_code)`` for the first matching language, otherwise
        ``(False, "")``. Empty or missing text never matches.
    """
    if not text:
        return False, ""

    text_lower = text.lower().strip()

    # Pass 1: literal substring match against the keywords as written.
    for lang, keywords in DSI_KEYWORDS.items():
        if any(kw in text_lower for kw in keywords):
            return True, lang

    # Pass 2: retry with separators unified. It runs only after pass 1 has
    # failed, so it can add matches but never change an existing result.
    text_normalized = _normalize_separators(text_lower)
    for lang, keywords in DSI_KEYWORDS.items():
        for kw in keywords:
            kw_normalized = _normalize_separators(kw)
            # Guard against a keyword that consists of separators only; an
            # empty needle would match every text.
            if kw_normalized and kw_normalized in text_normalized:
                return True, lang

    return False, ""