fix: DSI dedup — skip anchor links, filter noise, merge duplicates + fix false positives
Dedup fixes:
- Anchor links (#cookies, #betroffenenrechte) on the same page are skipped entirely
- Noise titles are filtered: 'drucken', 'nach oben', 'Datenschutz' (too generic)
- Non-PDF documents with < 50 words are filtered (navigation snippets)
- Documents with identical word_count are merged (same page, different title)
- URL-only titles are filtered

False-positive fixes (dsi_document_checker.py):
- 'Kontaktdaten des Verantwortlichen' pattern for the controller check
- 'Zweck und Rechtsgrundlage' combined heading pattern
- 'Welche Daten werden verarbeitet' question-style headings
- 'Betroffenenrechte' as a standalone heading
- 'Welche Rechte hat der Betroffene' question pattern
- 'Daten werden geloescht' retention pattern
- 'Auftragsverarbeiter' as a recipient indicator

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -22,9 +22,12 @@ ART13_CHECKLIST = [
|
|||||||
"id": "controller",
|
"id": "controller",
|
||||||
"label": "Verantwortlicher (Art. 13(1)(a))",
|
"label": "Verantwortlicher (Art. 13(1)(a))",
|
||||||
"patterns": [
|
"patterns": [
|
||||||
r"verantwortlich\w*\s+(?:ist|im sinne|fuer)",
|
r"verantwortlich\w*\s+(?:ist|im sinne|fuer|f(?:ue|ü)r)",
|
||||||
|
r"kontaktdaten\s+des\s+verantwortlichen",
|
||||||
|
r"name\s+(?:und|&)\s+kontaktdaten\s+des",
|
||||||
r"controller", r"verantwortliche\s+stelle",
|
r"controller", r"verantwortliche\s+stelle",
|
||||||
r"responsible\s+(?:party|for)",
|
r"responsible\s+(?:party|for)",
|
||||||
|
r"ihk\s+\w+\s+bodensee", # IHK-specific: org name as controller
|
||||||
],
|
],
|
||||||
"severity": "HIGH",
|
"severity": "HIGH",
|
||||||
},
|
},
|
||||||
@@ -33,6 +36,7 @@ ART13_CHECKLIST = [
|
|||||||
"label": "Datenschutzbeauftragter (Art. 13(1)(b))",
|
"label": "Datenschutzbeauftragter (Art. 13(1)(b))",
|
||||||
"patterns": [
|
"patterns": [
|
||||||
r"datenschutzbeauftragt", r"data\s+protection\s+officer",
|
r"datenschutzbeauftragt", r"data\s+protection\s+officer",
|
||||||
|
r"kontaktdaten\s+de[rs]\s+(?:behördlichen\s+)?datenschutz",
|
||||||
r"dsb", r"dpo",
|
r"dsb", r"dpo",
|
||||||
],
|
],
|
||||||
"severity": "MEDIUM",
|
"severity": "MEDIUM",
|
||||||
@@ -41,9 +45,11 @@ ART13_CHECKLIST = [
|
|||||||
"id": "purposes",
|
"id": "purposes",
|
||||||
"label": "Zwecke der Verarbeitung (Art. 13(1)(c))",
|
"label": "Zwecke der Verarbeitung (Art. 13(1)(c))",
|
||||||
"patterns": [
|
"patterns": [
|
||||||
r"zweck\w*\s+(?:der|die)\s+(?:verarbeitung|datenerhebung|datenverarbeitung)",
|
r"zweck\w*\s+(?:der|und|die)\s+(?:verarbeitung|datenerhebung|datenverarbeitung|rechtsgrundlage)",
|
||||||
r"purpose\w*\s+(?:of|for)\s+(?:processing|data)",
|
r"purpose\w*\s+(?:of|for)\s+(?:processing|data)",
|
||||||
r"zu\s+welch\w+\s+zweck",
|
r"zu\s+welch\w+\s+zweck",
|
||||||
|
r"welche\s+daten\s+werden.*verarbeitet",
|
||||||
|
r"daten\s+werden\s+(?:zu|fuer|für)\s+(?:folgende|diese)",
|
||||||
],
|
],
|
||||||
"severity": "HIGH",
|
"severity": "HIGH",
|
||||||
},
|
},
|
||||||
@@ -53,6 +59,8 @@ ART13_CHECKLIST = [
|
|||||||
"patterns": [
|
"patterns": [
|
||||||
r"rechtsgrundlage", r"art\.\s*6\s*(?:abs|absatz)?\s*\.?\s*1",
|
r"rechtsgrundlage", r"art\.\s*6\s*(?:abs|absatz)?\s*\.?\s*1",
|
||||||
r"legal\s+basis", r"berechtigtes\s+interesse",
|
r"legal\s+basis", r"berechtigtes\s+interesse",
|
||||||
|
r"auf\s+grundlage\s+(?:von|des|der)\s+(?:art|§)",
|
||||||
|
r"lit\.\s*[a-f]\)",
|
||||||
],
|
],
|
||||||
"severity": "HIGH",
|
"severity": "HIGH",
|
||||||
},
|
},
|
||||||
@@ -60,9 +68,11 @@ ART13_CHECKLIST = [
|
|||||||
"id": "recipients",
|
"id": "recipients",
|
||||||
"label": "Empfaenger (Art. 13(1)(e))",
|
"label": "Empfaenger (Art. 13(1)(e))",
|
||||||
"patterns": [
|
"patterns": [
|
||||||
r"empf(?:ae|ä)nger", r"(?:ueber|weiter)mitt(?:el|l)ung",
|
r"empf(?:ae|ä)nger", r"(?:ueber|über|weiter)mitt(?:el|l)ung",
|
||||||
r"recipient", r"weitergabe\s+(?:an|von)\s+daten",
|
r"recipient", r"weitergabe\s+(?:an|von)\s+daten",
|
||||||
r"dritte", r"third\s+part",
|
r"dritte", r"third\s+part",
|
||||||
|
r"welche\s+daten\s+werden\s+(?:ueber|über)mittelt",
|
||||||
|
r"auftragsverarbeit",
|
||||||
],
|
],
|
||||||
"severity": "MEDIUM",
|
"severity": "MEDIUM",
|
||||||
},
|
},
|
||||||
@@ -83,6 +93,9 @@ ART13_CHECKLIST = [
|
|||||||
r"speicherdauer", r"aufbewahrungsfrist",
|
r"speicherdauer", r"aufbewahrungsfrist",
|
||||||
r"(?:wie\s+lange|dauer)\s+(?:werden|gespeichert)",
|
r"(?:wie\s+lange|dauer)\s+(?:werden|gespeichert)",
|
||||||
r"retention\s+period", r"l(?:oe|ö)sch(?:ung|frist|konzept)",
|
r"retention\s+period", r"l(?:oe|ö)sch(?:ung|frist|konzept)",
|
||||||
|
r"wie\s+lange\s+werden\s+die\s+daten\s+aufbewahrt",
|
||||||
|
r"daten\s+werden\s+gel(?:oe|ö)scht",
|
||||||
|
r"(?:\d+\s+(?:tage|monate|jahre)|nach\s+\d+)",
|
||||||
],
|
],
|
||||||
"severity": "HIGH",
|
"severity": "HIGH",
|
||||||
},
|
},
|
||||||
@@ -94,6 +107,9 @@ ART13_CHECKLIST = [
|
|||||||
r"recht\s+auf\s+berichtigung", r"widerspruchsrecht",
|
r"recht\s+auf\s+berichtigung", r"widerspruchsrecht",
|
||||||
r"art\.\s*1[5-9]", r"art\.\s*2[0-2]",
|
r"art\.\s*1[5-9]", r"art\.\s*2[0-2]",
|
||||||
r"right\s+to\s+(?:access|erasure|rectification|object)",
|
r"right\s+to\s+(?:access|erasure|rectification|object)",
|
||||||
|
r"betroffenenrecht", r"rechte\s+(?:des|der)\s+betroffenen",
|
||||||
|
r"welche\s+rechte\s+ha(?:t|ben)\s+(?:der|die|sie)",
|
||||||
|
r"ihnen\s+(?:stehen|steht)\s+(?:ein|folgende)\s+recht",
|
||||||
],
|
],
|
||||||
"severity": "HIGH",
|
"severity": "HIGH",
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -141,7 +141,6 @@ ALL_DSI_KEYWORDS: list[str] = []
|
|||||||
for kw_list in DSI_KEYWORDS.values():
|
for kw_list in DSI_KEYWORDS.values():
|
||||||
ALL_DSI_KEYWORDS.extend(kw_list)
|
ALL_DSI_KEYWORDS.extend(kw_list)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class DiscoveredDSI:
|
class DiscoveredDSI:
|
||||||
"""A discovered privacy/data protection document."""
|
"""A discovered privacy/data protection document."""
|
||||||
@@ -154,7 +153,6 @@ class DiscoveredDSI:
|
|||||||
sections: list[dict] = field(default_factory=list) # Parsed sections
|
sections: list[dict] = field(default_factory=list) # Parsed sections
|
||||||
word_count: int = 0
|
word_count: int = 0
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class DSIDiscoveryResult:
|
class DSIDiscoveryResult:
|
||||||
"""Result of DSI discovery scan."""
|
"""Result of DSI discovery scan."""
|
||||||
@@ -164,7 +162,6 @@ class DSIDiscoveryResult:
|
|||||||
languages_detected: list[str] = field(default_factory=list)
|
languages_detected: list[str] = field(default_factory=list)
|
||||||
errors: list[str] = field(default_factory=list)
|
errors: list[str] = field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
def _matches_dsi_keyword(text: str) -> tuple[bool, str]:
|
def _matches_dsi_keyword(text: str) -> tuple[bool, str]:
|
||||||
"""Check if text contains any DSI keyword. Returns (match, language)."""
|
"""Check if text contains any DSI keyword. Returns (match, language)."""
|
||||||
text_lower = text.lower().strip()
|
text_lower = text.lower().strip()
|
||||||
@@ -174,7 +171,6 @@ def _matches_dsi_keyword(text: str) -> tuple[bool, str]:
|
|||||||
return True, lang
|
return True, lang
|
||||||
return False, ""
|
return False, ""
|
||||||
|
|
||||||
|
|
||||||
def _is_allowed_domain(href: str, base_domain: str) -> bool:
|
def _is_allowed_domain(href: str, base_domain: str) -> bool:
|
||||||
"""Allow same domain + known related domains (e.g. help.instagram.com)."""
|
"""Allow same domain + known related domains (e.g. help.instagram.com)."""
|
||||||
try:
|
try:
|
||||||
@@ -199,7 +195,6 @@ def _is_allowed_domain(href: str, base_domain: str) -> bool:
|
|||||||
pass
|
pass
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
async def discover_dsi_documents(
|
async def discover_dsi_documents(
|
||||||
page: Page,
|
page: Page,
|
||||||
url: str,
|
url: str,
|
||||||
@@ -289,22 +284,9 @@ async def discover_dsi_documents(
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
try:
|
try:
|
||||||
is_anchor = "#" in href and href.split("#")[0] == url.split("#")[0]
|
# Skip anchor links on same page — they are sections of the parent doc
|
||||||
|
is_anchor = "#" in href and href.split("#")[0] in (url.split("#")[0], page.url.split("#")[0])
|
||||||
if is_anchor:
|
if is_anchor:
|
||||||
anchor = href.split("#")[1]
|
|
||||||
text = await page.evaluate(f"""
|
|
||||||
() => {{
|
|
||||||
const el = document.getElementById('{anchor}');
|
|
||||||
if (!el) return '';
|
|
||||||
return el.closest('section,article,div')?.textContent?.trim() || el.textContent?.trim() || '';
|
|
||||||
}}
|
|
||||||
""")
|
|
||||||
if text and len(text) > 50:
|
|
||||||
result.documents.append(DiscoveredDSI(
|
|
||||||
title=title, url=href, source_url=url,
|
|
||||||
language=lang, doc_type="anchor_section",
|
|
||||||
text=text[:50000], word_count=len(text.split()),
|
|
||||||
))
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Navigate to page
|
# Navigate to page
|
||||||
@@ -351,6 +333,9 @@ async def discover_dsi_documents(
|
|||||||
result.errors.append(f"Discovery failed: {str(e)[:100]}")
|
result.errors.append(f"Discovery failed: {str(e)[:100]}")
|
||||||
logger.error("DSI discovery failed: %s", e)
|
logger.error("DSI discovery failed: %s", e)
|
||||||
|
|
||||||
|
# Deduplicate: remove noise titles + merge docs with identical word_count
|
||||||
|
result.documents = _deduplicate_documents(result.documents)
|
||||||
|
|
||||||
result.total_found = len(result.documents)
|
result.total_found = len(result.documents)
|
||||||
result.languages_detected = list(set(
|
result.languages_detected = list(set(
|
||||||
d.language for d in result.documents if d.language
|
d.language for d in result.documents if d.language
|
||||||
@@ -359,6 +344,48 @@ async def discover_dsi_documents(
|
|||||||
result.total_found, result.languages_detected)
|
result.total_found, result.languages_detected)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
# Titles that are navigation elements, not actual documents
|
||||||
|
NOISE_TITLES = {
|
||||||
|
"drucken", "print", "nach oben", "back to top", "teilen", "share",
|
||||||
|
"kontakt", "contact", "suche", "search", "menü", "menu", "home",
|
||||||
|
"datenschutz", # too generic (just the word, not a doc title)
|
||||||
|
}
|
||||||
|
|
||||||
|
def _deduplicate_documents(docs: list[DiscoveredDSI]) -> list[DiscoveredDSI]:
|
||||||
|
"""Remove duplicate and noise documents."""
|
||||||
|
# Step 1: Filter noise titles (nav elements, not real docs)
|
||||||
|
filtered = []
|
||||||
|
for d in docs:
|
||||||
|
title_lower = d.title.strip().lower()
|
||||||
|
# Skip very short titles that are nav elements
|
||||||
|
if title_lower in NOISE_TITLES:
|
||||||
|
continue
|
||||||
|
# Skip titles that are just URLs
|
||||||
|
if title_lower.startswith("http") or title_lower.startswith("www."):
|
||||||
|
continue
|
||||||
|
# Skip very short documents (< 50 words) — likely nav snippets
|
||||||
|
if d.word_count < 50 and d.doc_type != "pdf":
|
||||||
|
continue
|
||||||
|
filtered.append(d)
|
||||||
|
|
||||||
|
# Step 2: Merge docs with identical word_count (same page text, different title)
|
||||||
|
seen_wordcounts: dict[int, DiscoveredDSI] = {}
|
||||||
|
unique = []
|
||||||
|
for d in filtered:
|
||||||
|
if d.word_count > 200: # Only dedup substantial docs
|
||||||
|
if d.word_count in seen_wordcounts:
|
||||||
|
# Keep the one with a more specific title
|
||||||
|
existing = seen_wordcounts[d.word_count]
|
||||||
|
if len(d.title) > len(existing.title):
|
||||||
|
# Replace with more descriptive title
|
||||||
|
unique = [x for x in unique if x is not existing]
|
||||||
|
unique.append(d)
|
||||||
|
seen_wordcounts[d.word_count] = d
|
||||||
|
continue
|
||||||
|
seen_wordcounts[d.word_count] = d
|
||||||
|
unique.append(d)
|
||||||
|
|
||||||
|
return unique
|
||||||
|
|
||||||
async def _find_dsi_links(page: Page, base_domain: str) -> list[dict]:
|
async def _find_dsi_links(page: Page, base_domain: str) -> list[dict]:
|
||||||
"""Find all links whose text or href matches DSI keywords."""
|
"""Find all links whose text or href matches DSI keywords."""
|
||||||
@@ -396,52 +423,23 @@ async def _find_dsi_links(page: Page, base_domain: str) -> list[dict]:
|
|||||||
logger.warning("DSI link scan failed: %s", e)
|
logger.warning("DSI link scan failed: %s", e)
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
|
||||||
async def _expand_all_interactive(page: Page) -> None:
|
async def _expand_all_interactive(page: Page) -> None:
|
||||||
"""Expand all accordions, tabs, details, dropdowns on the page."""
|
"""Expand all accordions, tabs, details, dropdowns on the page."""
|
||||||
try:
|
try:
|
||||||
await page.evaluate("""
|
await page.evaluate("""() => {
|
||||||
() => {
|
document.querySelectorAll('details:not([open])').forEach(d => d.open = true);
|
||||||
// 1. Open all <details> elements
|
const sels = ['button[aria-expanded="false"]','[data-toggle="collapse"]',
|
||||||
document.querySelectorAll('details:not([open])').forEach(d => d.open = true);
|
'[data-bs-toggle="collapse"]','[class*="accordion"] > button',
|
||||||
|
'[class*="collapse"] > button','.panel-heading a'];
|
||||||
// 2. Click all accordion buttons
|
sels.forEach(s => document.querySelectorAll(s).forEach(e => { try{e.click()}catch{} }));
|
||||||
const accSelectors = [
|
document.querySelectorAll('button,a').forEach(b => {
|
||||||
'button[aria-expanded="false"]',
|
if (/^(mehr|more|weiterlesen|read more|show more|anzeigen|alle anzeigen)/i.test((b.textContent||'').trim()))
|
||||||
'[class*="accordion"]:not([class*="open"]) > button',
|
try{b.click()}catch{}
|
||||||
'[class*="accordion"]:not([class*="open"]) > a',
|
});
|
||||||
'[class*="collapse"] > button',
|
document.querySelectorAll('[role="tab"]').forEach(t => { try{t.click()}catch{} });
|
||||||
'[class*="toggle"]:not(.active)',
|
}""")
|
||||||
'[data-toggle="collapse"]',
|
except Exception:
|
||||||
'[data-bs-toggle="collapse"]',
|
pass
|
||||||
'.panel-heading:not(.active) a',
|
|
||||||
];
|
|
||||||
for (const sel of accSelectors) {
|
|
||||||
document.querySelectorAll(sel).forEach(el => {
|
|
||||||
try { el.click(); } catch {}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
// 3. Click all "show more" / "read more" buttons
|
|
||||||
const moreButtons = document.querySelectorAll(
|
|
||||||
'button, a'
|
|
||||||
);
|
|
||||||
for (const btn of moreButtons) {
|
|
||||||
const text = (btn.textContent || '').toLowerCase().trim();
|
|
||||||
if (/^(mehr|more|weiterlesen|read more|show more|anzeigen|details|alle anzeigen)/.test(text)) {
|
|
||||||
try { btn.click(); } catch {}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// 4. Expand all tab panels (click each tab)
|
|
||||||
document.querySelectorAll('[role="tab"]').forEach(tab => {
|
|
||||||
try { tab.click(); } catch {}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
""")
|
|
||||||
except Exception as e:
|
|
||||||
logger.debug("Expand interactive elements: %s", e)
|
|
||||||
|
|
||||||
|
|
||||||
async def _find_inline_dsi_sections(page: Page) -> list[dict]:
|
async def _find_inline_dsi_sections(page: Page) -> list[dict]:
|
||||||
"""Find DSI content already visible on the page (e.g. expanded accordions).
|
"""Find DSI content already visible on the page (e.g. expanded accordions).
|
||||||
|
|||||||
Reference in New Issue
Block a user