"""B7 — Doc-Staleness: Datum extrahieren + Aktualität bewerten. Findings, wenn ein rechtliches Dokument (AGB, Nutzungsbedingungen, Widerruf, DSE) über N Jahre alt ist. Default-Cap: 3 Jahre für AGB/ Nutzungsbedingungen (TERMS-STALENESS-001), 2 Jahre für DSE. Heuristik für Datumsextraktion: - "Stand: November 2018" / "Stand November 2018" / "Stand: Dezember 2018" - "Letzte Aktualisierung: 2018-12-01" - "Version vom 1.12.2018" - "Last updated: December 2018" """ from __future__ import annotations import logging import re from datetime import datetime logger = logging.getLogger(__name__) _MONTHS_DE = { "januar": 1, "februar": 2, "märz": 3, "maerz": 3, "april": 4, "mai": 5, "juni": 6, "juli": 7, "august": 8, "september": 9, "oktober": 10, "november": 11, "dezember": 12, } _MONTHS_EN = { "january": 1, "february": 2, "march": 3, "april": 4, "may": 5, "june": 6, "july": 7, "august": 8, "september": 9, "october": 10, "november": 11, "december": 12, } # Match patterns like "Stand: Dezember 2018" / "Stand November 2018" _PAT_STAND = re.compile( r"(?:stand|version|letzte\s+aktualisierung|last\s+updated|" r"last\s+revised)\s*[:\-]?\s*" r"(?:vom\s+)?" r"(?:(?P\d{1,2})[.\-/])?" r"(?P" r"januar|februar|m[äa]rz|april|mai|juni|juli|august|september|" r"oktober|november|dezember|" r"january|february|march|april|may|june|july|august|september|" r"october|november|december|" r"\d{1,2}" r")" r"[.\s\-/]+" r"(?P\d{4})", re.I, ) _AGE_THRESHOLDS_YEARS = { "agb": 3, "nutzungsbedingungen": 3, "widerruf": 2, "dse": 2, "impressum": 5, # less critical "cookie": 2, } def _extract_date(text: str) -> tuple[int, int, int] | None: """Return (year, month, day) of the most recent revision date.""" if not text: return None candidates: list[tuple[int, int, int]] = [] for m in _PAT_STAND.finditer(text): try: year = int(m.group("year")) mon_str = (m.group("month") or "").lower() day = int(m.group("day") or 1) if mon_str.isdigit(): month = int(mon_str) else: month = (_MONTHS_DE.get(mon_str) or _MONTHS_EN.get(mon_str)) if not month or not (1 <= month <= 12): continue if year < 2000 or year > 2100: continue candidates.append((year, month, day)) except (ValueError, TypeError): continue if not candidates: return None # newest date wins candidates.sort(reverse=True) return candidates[0] def check_staleness(state: dict) -> list[dict]: """Run staleness check across legal docs.""" findings: list[dict] = [] doc_texts = state.get("doc_texts") or {} today = datetime.utcnow() for doc_type, text in doc_texts.items(): threshold_years = _AGE_THRESHOLDS_YEARS.get(doc_type) if not threshold_years: continue date = _extract_date(text) if not date: continue year, month, day = date try: doc_date = datetime(year, month, min(day, 28)) except ValueError: continue age_years = (today - doc_date).days / 365.25 if age_years < threshold_years: continue sev = "HIGH" if age_years > threshold_years * 2 else "MEDIUM" findings.append({ "check_id": f"DOC-STALENESS-{doc_type.upper()}", "doc_type": doc_type, "severity": sev, "severity_reason": "incomplete", "title": ( f"{doc_type.title()} ist {int(age_years)} Jahre alt " f"(Stand {year:04d}-{month:02d})" ), "norm": "Sorgfaltspflicht (laufende Anpassung an Rechtsänderungen)", "doc_date": f"{year:04d}-{month:02d}-{day:02d}", "age_years": round(age_years, 1), "threshold_years": threshold_years, "action": ( f"{doc_type.title()} überprüfen und an aktuelle " "Gesetzeslage anpassen (DSGVO-Updates, AI Act, DSA, " "neue BGH-Rechtsprechung). Stand-Datum aktualisieren." ), }) if findings: logger.info("B7 staleness: %d findings", len(findings)) return findings