6aad774fc1
Erkennt: in derselben DSE / Cookie-Richtlinie nennt der Anbieter für
DIESELBE Datenkategorie mehrere unterschiedliche Speicherdauern.
GT-Anker (Elli): Logfiles "7 Tage" + "30 Tage" in derselben DSE → eine
Angabe ist falsch oder veraltet.
Heuristik:
- Satz-Boundary-Scope (kein ±N-Zeichen-Fenster) verhindert
Cross-Category-Leakage
- Pro Satz: Kategorie-Anchor + Retention-Werte beide drin
- Tag-Cluster mit ±20 %-Toleranz: "30 Tage" und "1 Monat" =
1 Cluster; "7 Tage" und "30 Tage" = 2 Cluster → Finding
Kategorien (Phase 1):
- logfile, contact_form, application, newsletter, invoice,
session_cookie
Severity: MEDIUM (DSGVO Art. 5 Abs. 1 lit. a + Art. 13 Abs. 2 lit. a).
Tests: 11/11 grün (Cluster-Logik 5, Check-Pfade 6, inkl. Cross-
Category-Leakage-Regression).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
87 lines
3.3 KiB
Python
87 lines
3.3 KiB
Python
"""Tests for B14 retention-conflict-Detector (GT TH-RETENTION-001)."""
|
|
|
|
from compliance.services.retention_conflict_check import (
|
|
_cluster_values,
|
|
check_retention_conflicts,
|
|
)
|
|
|
|
|
|
class TestClusterValues:
    """Unit tests for the ``_cluster_values`` grouping helper.

    The expectations below treat durations (in days) that lie within the
    20% tolerance of each other as belonging to the same cluster.
    """

    def test_empty(self):
        """No input values yield no clusters."""
        result = _cluster_values([])
        assert result == []

    def test_single_value(self):
        """A lone value ends up in a cluster of its own."""
        result = _cluster_values([7])
        assert result == [[7]]

    def test_two_close_values_one_cluster(self):
        """30 and 31 days are within the 20% tolerance: one cluster."""
        near_values = [30, 31]
        assert _cluster_values(near_values) == [[30, 31]]

    def test_two_distant_values_two_clusters(self):
        """7 and 30 days are far outside the 20% tolerance: two clusters."""
        group_count = len(_cluster_values([7, 30]))
        assert group_count == 2

    def test_equivalent_durations_collapse(self):
        """"30 Tage" and "1 Monat" (== 30 days) fall into one cluster."""
        same_duration_twice = [30, 30]
        assert _cluster_values(same_duration_twice) == [[30, 30]]
|
|
|
|
|
|
class TestCheckRetentionConflicts:
    """End-to-end tests for ``check_retention_conflicts``.

    Each test passes a context dict whose ``"doc_texts"`` entry maps a
    document type (e.g. ``"dse"``, ``"cookie"``) to its plain text and
    inspects the list of finding dicts that comes back.
    """

    def test_no_doc_no_findings(self):
        """An empty context produces no findings."""
        assert check_retention_conflicts({}) == []

    def test_logfile_7_vs_30_finding(self):
        """Two clearly different logfile durations in one document are flagged."""
        text = (
            "Server-Logfiles werden für 7 Tage gespeichert. "
            "Bei Sicherheitsvorfällen werden die Logfiles bis zu 30 Tage "
            "aufbewahrt."
        )
        findings = check_retention_conflicts({"doc_texts": {"dse": text}})
        assert len(findings) == 1
        finding = findings[0]
        assert finding["check_id"] == "RETENTION-CONFLICT-001"
        assert finding["category"] == "logfile"
        assert finding["doc_type"] == "dse"
        # Both conflicting durations must be reported, expressed in days.
        assert 7.0 in finding["values_days"]
        assert 30.0 in finding["values_days"]

    def test_logfile_single_value_no_finding(self):
        """A single duration for a category cannot conflict with itself."""
        text = "Logfiles werden 7 Tage aufbewahrt."
        assert check_retention_conflicts({"doc_texts": {"dse": text}}) == []

    def test_logfile_close_values_no_finding(self):
        """"30 Tage" and "1 Monat" for logfiles collapse into one cluster."""
        # Both sentences carry the "Logfiles" anchor on purpose. With
        # sentence-scoped matching, a second sentence without the anchor
        # would never be attributed to the logfile category, so the test
        # would pass without exercising the clustering at all.
        text = (
            "Logfiles werden 30 Tage gespeichert. "
            "Die Aufbewahrungsdauer der Logfiles beträgt 1 Monat."
        )
        # NOTE: per the original author, parse_duration_to_days('1 Monat')
        # yields 30 days, i.e. the same cluster as "30 Tage".
        findings = check_retention_conflicts({"doc_texts": {"dse": text}})
        logfile_findings = [f for f in findings if f["category"] == "logfile"]
        assert logfile_findings == []

    def test_only_categorisations_with_two_clusters_emit(self):
        """Only a category with two distinct duration clusters is reported."""
        # Logfile has two values; contact_form has one -> only logfile fires.
        text = (
            "Server-Logfiles werden 7 Tage gespeichert. "
            "Außerdem speichern wir Logfiles bis zu 90 Tage. "
            "Kontaktformular-Daten werden 6 Monate aufbewahrt."
        )
        findings = check_retention_conflicts({"doc_texts": {"dse": text}})
        categories = [f["category"] for f in findings]
        assert "logfile" in categories
        assert "contact_form" not in categories

    def test_dse_and_cookie_doc_separately(self):
        """Documents are evaluated independently of each other."""
        text_dse = "Logfiles werden 7 Tage gespeichert. Logfiles 30 Tage."
        text_cookie = "Session-Cookie läuft nach 1 Tag ab."
        findings = check_retention_conflicts({
            "doc_texts": {"dse": text_dse, "cookie": text_cookie}
        })
        # Only the logfile conflict in the dse document; nothing for cookie.
        assert len(findings) == 1
        assert findings[0]["doc_type"] == "dse"
|