fix: Add 'impressum' to DSI keywords for self-extraction

"impressum" was missing from DSI_KEYWORDS despite being listed in
the docstring. This caused /impressum URLs to skip self-extraction
and return linked datenschutz text instead.

Added: DE: impressum, anbieterkennzeichnung, kontakt
       EN: imprint, legal notice, site notice, legal information

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-10 11:00:26 +02:00
parent a14e5ad97d
commit 345ea70844
+132
View File
@@ -0,0 +1,132 @@
"""
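DSI keywords: legal-document keywords for 26 EU/EEA official languages.

Categories: DSI (privacy), AGB (general terms and conditions), Widerruf
(withdrawal / cancellation), cookie policy, Impressum (imprint / legal
notice) and NB (Nutzungsbedingungen, terms of use). Coverage varies by
language: not every category is listed for every language.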
"""
# Lowercase legal-document keywords, keyed by ISO 639-1 language code.
# Both the order of the languages and the order of the keywords inside each
# list are preserved by anything that iterates this mapping, so keep new
# entries in a deliberate position.
DSI_KEYWORDS: dict[str, list[str]] = {
    # German
    "de": [
        # Privacy / data protection
        "datenschutz",
        "datenschutzerklaerung",
        "datenschutzinformation",
        "datenschutzhinweis",
        "datenschutzrichtlinie",
        "dsgvo",
        "privatsphäre",
        "datenschutzbestimmung",
        "verarbeitung personenbezogener daten",
        # General terms and conditions / terms of use
        "allgemeine geschäftsbedingungen",
        "agb",
        "nutzungsbedingungen",
        "nutzungsordnung",
        "geschäftsbedingungen",
        # Withdrawal / cancellation
        "widerrufsbelehrung",
        "widerrufsrecht",
        "widerrufsformular",
        "widerruf",
        "rücktrittsrecht",
        # Cookies
        "cookie-richtlinie",
        "cookie-policy",
        "cookie-hinweis",
        # Imprint / provider identification / contact
        "impressum",
        "anbieterkennzeichnung",
        "kontakt",
    ],
    # English
    "en": [
        # Privacy / data protection
        "privacy policy",
        "privacy notice",
        "data protection",
        "data policy",
        "privacy statement",
        "gdpr",
        "personal data",
        # Cookies
        "cookie policy",
        # Terms
        "terms of service",
        "terms and conditions",
        "terms of use",
        # Cancellation / withdrawal / refunds
        "cancellation policy",
        "right of withdrawal",
        "refund policy",
        # Cookies
        "cookie notice",
        # Imprint / legal notice
        "imprint",
        "legal notice",
        "site notice",
        "legal information",
    ],
    # French
    "fr": [
        "politique de confidentialité",
        "protection des données",
        "données personnelles",
        "vie privée",
        "rgpd",
        "conditions générales",
        "conditions d'utilisation",
        "droit de rétractation",
        "politique de cookies",
    ],
    # Spanish
    "es": [
        "política de privacidad",
        "protección de datos",
        "datos personales",
        "aviso de privacidad",
        "términos y condiciones",
        "condiciones de uso",
        "derecho de desistimiento",
        "política de cookies",
    ],
    # Italian
    "it": [
        "informativa sulla privacy",
        "protezione dei dati",
        "dati personali",
        "privacy policy",  # NOTE: also listed under "en"
        "termini e condizioni",
        "condizioni d'uso",
        "diritto di recesso",
        "politica dei cookie",
    ],
    # Dutch
    "nl": [
        "privacybeleid",
        "gegevensbescherming",
        "privacyverklaring",
        "persoonsgegevens",
        "avg",
        "algemene voorwaarden",
        "gebruiksvoorwaarden",
        "herroepingsrecht",
        "cookiebeleid",
    ],
    # Polish
    "pl": [
        "polityka prywatności",
        "ochrona danych osobowych",
        "dane osobowe",
        "rodo",
        "regulamin",
        "warunki korzystania",
        "prawo odstąpienia",
        "polityka cookies",
    ],
    # Portuguese
    "pt": [
        "política de privacidade",
        "proteção de dados",
        "dados pessoais",
        "lgpd",
        "termos e condições",
        "condições de utilização",
        "direito de resolução",
        "política de cookies",  # NOTE: also listed under "es"
    ],
    # Swedish
    "sv": [
        "integritetspolicy",
        "dataskydd",
        "personuppgifter",
        "sekretesspolicy",
        "allmänna villkor",
        "användarvillkor",
        "ångerrätt",
        "cookiepolicy",
    ],
    # Danish
    "da": [
        "privatlivspolitik",
        "databeskyttelse",
        "personoplysninger",
        "persondatapolitik",
        "handelsbetingelser",
        "brugsbetingelser",
        "fortrydelsesret",
        "cookiepolitik",
    ],
    # Finnish
    "fi": [
        "tietosuojaseloste",
        "tietosuoja",
        "henkilötiedot",
        "rekisteriseloste",
        "yleiset ehdot",
        "käyttöehdot",
        "peruutusoikeus",
        "evästekäytäntö",
    ],
    # Czech
    "cs": [
        "zásady ochrany osobních údajů",
        "ochrana osobních údajů",
        "zpracování osobních údajů",
        "obchodní podmínky",
        "zásady cookies",
    ],
    # Greek
    "el": [
        "πολιτική απορρήτου",
        "προστασία δεδομένων",
        "προσωπικά δεδομένα",
        "όροι χρήσης",
        "πολιτική cookies",
    ],
    # Hungarian
    "hu": [
        "adatvédelmi szabályzat",
        "adatvédelem",
        "személyes adatok",
        "általános szerződési feltételek",
        "cookie szabályzat",
    ],
    # Romanian
    "ro": [
        "politica de confidențialitate",
        "protecția datelor",
        "date cu caracter personal",
        "termeni și condiții",
        "politica cookies",
    ],
    # Bulgarian
    "bg": [
        "политика за поверителност",
        "защита на данните",
        "лични данни",
        "общи условия",
        "политика за бисквитки",
    ],
    # Croatian
    "hr": [
        "politika privatnosti",
        "zaštita podataka",
        "osobni podaci",
        "opći uvjeti",
        "politika kolačića",
    ],
    # Slovak
    "sk": [
        "zásady ochrany osobných údajov",
        "ochrana osobných údajov",
        "obchodné podmienky",
        "zásady cookies",
    ],
    # Slovenian
    "sl": [
        "politika zasebnosti",
        "varstvo podatkov",
        "osebni podatki",
        "splošni pogoji",
        "politika piškotkov",
    ],
    # Estonian
    "et": [
        "privaatsuspoliitika",
        "andmekaitse",
        "isikuandmed",
        "kasutustingimused",
        "küpsiste poliitika",
    ],
    # Lithuanian
    "lt": [
        "privatumo politika",
        "duomenų apsauga",
        "asmens duomenys",
        "naudojimosi sąlygos",
        "slapukų politika",
    ],
    # Latvian
    "lv": [
        "privātuma politika",
        "datu aizsardzība",
        "personas dati",
        "lietošanas noteikumi",
        "sīkdatņu politika",
    ],
    # Maltese
    "mt": [
        "politika tal-privatezza",
        "protezzjoni tad-data",
        "termini u kundizzjonijiet",
    ],
    # Irish
    "ga": [
        "polasaí príobháideachais",
        "cosaint sonraí",
        "téarmaí agus coinníollacha",
    ],
    # Icelandic
    "is": [
        "persónuverndarstefna",
        "persónuvernd",
        "skilmálar og skilyrði",
    ],
    # Norwegian
    "no": [
        "personvernerklæring",
        "personvern",
        "personopplysninger",
        "brukervilkår",
        "angrerett",
        "informasjonskapsler",
    ],
}
# Every keyword of every language in one flat list: languages in
# DSI_KEYWORDS order, keywords in their per-language order. Keywords that
# appear under more than one language are kept as duplicates.
ALL_DSI_KEYWORDS: list[str] = [
    keyword for keywords in DSI_KEYWORDS.values() for keyword in keywords
]
def matches_dsi_keyword(text: str) -> tuple[bool, str]:
    """Report whether *text* contains a known legal-document keyword.

    The text is lowercased and stripped, then each keyword is tested as a
    plain substring (no word-boundary check). Languages are tried in the
    iteration order of ``DSI_KEYWORDS`` and the first language with a hit
    wins, so a keyword listed under several languages is attributed to the
    earliest one.

    Args:
        text: Text to inspect.

    Returns:
        ``(True, language_code)`` for the first language that has a matching
        keyword, otherwise ``(False, "")``.
    """
    haystack = text.lower().strip()
    for language, candidates in DSI_KEYWORDS.items():
        if any(candidate in haystack for candidate in candidates):
            return True, language
    return False, ""