fix: DSI dedup — skip anchor links, filter noise, merge duplicates + fix false positives
Dedup fixes:
- Anchor links (#cookies, #betroffenenrechte) on the same page are skipped entirely
- Noise titles are filtered: 'drucken', 'nach oben', 'Datenschutz' (too generic)
- Documents with < 50 words are filtered (navigation snippets)
- Documents with identical word_count are merged (same page, different title)
- URL-only titles are filtered

False positive fixes (dsi_document_checker.py):
- 'Kontaktdaten des Verantwortlichen' pattern for controller check
- 'Zweck und Rechtsgrundlage' combined heading pattern
- 'Welche Daten werden verarbeitet' question-style headings
- 'Betroffenenrechte' as standalone heading
- 'Welche Rechte hat der Betroffene' question pattern
- 'Daten werden geloescht' retention pattern
- 'Auftragsverarbeiter' as recipient indicator

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -22,9 +22,12 @@ ART13_CHECKLIST = [
|
||||
"id": "controller",
|
||||
"label": "Verantwortlicher (Art. 13(1)(a))",
|
||||
"patterns": [
|
||||
r"verantwortlich\w*\s+(?:ist|im sinne|fuer)",
|
||||
r"verantwortlich\w*\s+(?:ist|im sinne|fuer|f(?:ue|ü)r)",
|
||||
r"kontaktdaten\s+des\s+verantwortlichen",
|
||||
r"name\s+(?:und|&)\s+kontaktdaten\s+des",
|
||||
r"controller", r"verantwortliche\s+stelle",
|
||||
r"responsible\s+(?:party|for)",
|
||||
r"ihk\s+\w+\s+bodensee", # IHK-specific: org name as controller
|
||||
],
|
||||
"severity": "HIGH",
|
||||
},
|
||||
@@ -33,6 +36,7 @@ ART13_CHECKLIST = [
|
||||
"label": "Datenschutzbeauftragter (Art. 13(1)(b))",
|
||||
"patterns": [
|
||||
r"datenschutzbeauftragt", r"data\s+protection\s+officer",
|
||||
r"kontaktdaten\s+de[rs]\s+(?:behördlichen\s+)?datenschutz",
|
||||
r"dsb", r"dpo",
|
||||
],
|
||||
"severity": "MEDIUM",
|
||||
@@ -41,9 +45,11 @@ ART13_CHECKLIST = [
|
||||
"id": "purposes",
|
||||
"label": "Zwecke der Verarbeitung (Art. 13(1)(c))",
|
||||
"patterns": [
|
||||
r"zweck\w*\s+(?:der|die)\s+(?:verarbeitung|datenerhebung|datenverarbeitung)",
|
||||
r"zweck\w*\s+(?:der|und|die)\s+(?:verarbeitung|datenerhebung|datenverarbeitung|rechtsgrundlage)",
|
||||
r"purpose\w*\s+(?:of|for)\s+(?:processing|data)",
|
||||
r"zu\s+welch\w+\s+zweck",
|
||||
r"welche\s+daten\s+werden.*verarbeitet",
|
||||
r"daten\s+werden\s+(?:zu|fuer|für)\s+(?:folgende|diese)",
|
||||
],
|
||||
"severity": "HIGH",
|
||||
},
|
||||
@@ -53,6 +59,8 @@ ART13_CHECKLIST = [
|
||||
"patterns": [
|
||||
r"rechtsgrundlage", r"art\.\s*6\s*(?:abs|absatz)?\s*\.?\s*1",
|
||||
r"legal\s+basis", r"berechtigtes\s+interesse",
|
||||
r"auf\s+grundlage\s+(?:von|des|der)\s+(?:art|§)",
|
||||
r"lit\.\s*[a-f]\)",
|
||||
],
|
||||
"severity": "HIGH",
|
||||
},
|
||||
@@ -60,9 +68,11 @@ ART13_CHECKLIST = [
|
||||
"id": "recipients",
|
||||
"label": "Empfaenger (Art. 13(1)(e))",
|
||||
"patterns": [
|
||||
r"empf(?:ae|ä)nger", r"(?:ueber|weiter)mitt(?:el|l)ung",
|
||||
r"empf(?:ae|ä)nger", r"(?:ueber|über|weiter)mitt(?:el|l)ung",
|
||||
r"recipient", r"weitergabe\s+(?:an|von)\s+daten",
|
||||
r"dritte", r"third\s+part",
|
||||
r"welche\s+daten\s+werden\s+(?:ueber|über)mittelt",
|
||||
r"auftragsverarbeit",
|
||||
],
|
||||
"severity": "MEDIUM",
|
||||
},
|
||||
@@ -83,6 +93,9 @@ ART13_CHECKLIST = [
|
||||
r"speicherdauer", r"aufbewahrungsfrist",
|
||||
r"(?:wie\s+lange|dauer)\s+(?:werden|gespeichert)",
|
||||
r"retention\s+period", r"l(?:oe|ö)sch(?:ung|frist|konzept)",
|
||||
r"wie\s+lange\s+werden\s+die\s+daten\s+aufbewahrt",
|
||||
r"daten\s+werden\s+gel(?:oe|ö)scht",
|
||||
r"(?:\d+\s+(?:tage|monate|jahre)|nach\s+\d+)",
|
||||
],
|
||||
"severity": "HIGH",
|
||||
},
|
||||
@@ -94,6 +107,9 @@ ART13_CHECKLIST = [
|
||||
r"recht\s+auf\s+berichtigung", r"widerspruchsrecht",
|
||||
r"art\.\s*1[5-9]", r"art\.\s*2[0-2]",
|
||||
r"right\s+to\s+(?:access|erasure|rectification|object)",
|
||||
r"betroffenenrecht", r"rechte\s+(?:des|der)\s+betroffenen",
|
||||
r"welche\s+rechte\s+ha(?:t|ben)\s+(?:der|die|sie)",
|
||||
r"ihnen\s+(?:stehen|steht)\s+(?:ein|folgende)\s+recht",
|
||||
],
|
||||
"severity": "HIGH",
|
||||
},
|
||||
|
||||
Reference in New Issue
Block a user