feat: Fix DSFA dedup + expand all checklists to 56 total checks
Fixes:
- 'Risikoabwaegung' is a sub-section of the DSFA → added to SKIP_HEADINGS
- 'Social Media' as a standalone heading → recognized as social_media DSE
- Removed 'risikobew' from the DSFA pattern (was too broad)

Expanded checklists:
- Widerruf: 4→7 checks (+Empfaenger, kein Grund, §312k Button)
- AGB: 4→9 checks (+Zahlung, Lieferung, Gewaehrleistung, Kuendigung, Datenschutz)
- Social Media: +1 (Social Bookmarks)
- DSFA: +1 (LFDI Richtlinie)

Total: 47→56 regex checks across 7 document types:
DSI=9, Cookie=5, Social Media=10, DSFA=8, Impressum=6, Widerruf=7, AGB=9

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -264,7 +264,8 @@ SECTION_TYPE_MAP = [
|
||||
(r"^impressum$", "impressum"),
|
||||
(r"^(?:agb|allgemeine geschäftsbedingungen|nutzungsbedingungen)$", "agb"),
|
||||
# DSFA MUST be checked BEFORE social_media (both can contain "Social Media")
|
||||
(r"datenschutzfolge|dsfa|risikoanalyse|risikobew(?:ae|ä)gung", "dsfa"),
|
||||
(r"datenschutzfolge|dsfa|risikoanalyse", "dsfa"),
|
||||
(r"^social\s*media$", "social_media"), # Standalone heading "Social Media" = DSE
|
||||
(r"datenschutzerkl(?:ae|ä)rung.*social|datenschutz\s+f(?:ue|ü)r\s+social", "social_media"),
|
||||
]
|
||||
|
||||
@@ -326,10 +327,21 @@ def _split_into_sections(text: str, parent_label: str, url: str) -> list[dict]:
|
||||
return sections
|
||||
|
||||
|
||||
# Headings to skip (exact match on the lowercased, stripped heading) — sub-sections of other documents, not standalone
|
||||
SKIP_HEADINGS = {
|
||||
"nutzungskonzept social media", # Internal concept, no legal checklist
|
||||
"risikoabwägung und datenschutzfolgenabschätzung", # Sub-section of DSFA
|
||||
"risikoabwaegung und datenschutzfolgenabschaetzung",
|
||||
}
|
||||
|
||||
|
||||
def _classify_section(heading: str) -> str | None:
|
||||
"""Classify a section heading into a document type."""
|
||||
import re as _re
|
||||
heading_lower = heading.lower()
|
||||
heading_lower = heading.lower().strip()
|
||||
# Skip known sub-sections
|
||||
if heading_lower in SKIP_HEADINGS:
|
||||
return None
|
||||
for pattern, doc_type in SECTION_TYPE_MAP:
|
||||
if _re.search(pattern, heading_lower):
|
||||
return doc_type
|
||||
|
||||
Reference in New Issue
Block a user