fix(audit): parse_flat_cookie_text — Anchor-Pattern fuer VW-textContent

VW Cookie-Doc-textContent verkettet HTML-Tabellen-Zellen OHNE Whitespace: 'Permanent/Protokoll_fbcTracking Cookies (Marketing)...' Neues Pattern hat 2 Anker: * Davor: typisches End-Token einer vorherigen Zelle (Permanent/Protokoll, Session Cookie, Persistent Cookie, TagePersistent, ...) * Danach: Kategorie-Token (Tracking Cookies, Funktionscookie, Marketing, Analytics, Necessary) Dazwischen: Cookie-Name (3-50 Zeichen, alphanum/_/-) VW-Test (snapshot 4a465783): findet jetzt 40 unique Cookie-Namen, aggregiert zu 6 Vendors (Google, DoubleClick, Cloudflare, Borlabs, Meta, Unbekannter Anbieter mit 22 VW-internen Cookies). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
feat(licenses): Stufe 2 — auto-attribution footer in compliance PDF
2026-05-21 21:33:58 +02:00 · 2026-05-21 21:30:02 +02:00
2 changed files with 90 additions and 28 deletions
@@ -82,6 +82,8 @@ class CompliancePDFGenerator:
        self._add_consent_section(story, ss, tenant_id)
        # Org Roles
        self._add_role_section(story, ss, tenant_id, project_id)
        # Stufe 2 — Quellen- und Lizenz-Footer (Attribution-Renderer Task #23)
        self._add_attribution_footer(story, ss)
        # Footer
        story.append(Spacer(1, 15 * mm))
        story.append(Paragraph("Erstellt mit BreakPilot Compliance SDK", ss["Small"]))
@@ -214,3 +216,64 @@ class CompliancePDFGenerator:
                story.append(Paragraph("Keine Rollen zugewiesen.", ss["Body2"]))
        except Exception:
            story.append(Paragraph("Rollen-Tabelle nicht vorhanden.", ss["Small"]))
    def _add_attribution_footer(self, story, ss) -> None:
        """Append the "Quellen & Lizenzen" provenance section to the story.

        Stufe 2 of the attribution renderer (Task #23). The method queries
        ``compliance.canonical_controls`` grouped by ``license_rule`` and
        renders, per rule, a heading with the number of controls, up to
        eight source regulations and -- for rule 2 only -- the mandatory
        attribution notice. For the other rules the source list serves as a
        brief reference for auditability.

        The section is added to every generated compliance PDF so each
        export carries its own provenance footer; according to
        project_attribution_strategy.md, blanket notices in the terms and
        conditions / imprint are not legally sufficient.

        Args:
            story: Container of flowables; new elements are appended in place.
            ss: Style lookup indexed by name; the entries "Section",
                "Body2" and "Small" are read.

        Returns:
            None. If the query raises, a warning is logged and nothing is
            appended; if the query yields no rows, nothing is appended.
        """
        # Local import: keeps the block self-contained without touching the
        # file-level import section.
        from xml.sax.saxutils import escape

        max_sources = 8  # cap on source names listed per license rule

        try:
            # COUNT(DISTINCT cc.id) instead of COUNT(*): the LEFT JOIN yields
            # one row per parent link, so COUNT(*) would count a control with
            # several links multiple times while the label says "Controls".
            rows = self.db.execute(text("""
                SELECT cc.license_rule, COUNT(DISTINCT cc.id) AS n,
                       array_agg(DISTINCT cpl.source_regulation ORDER BY cpl.source_regulation)
                         FILTER (WHERE cpl.source_regulation IS NOT NULL) AS sources
                FROM compliance.canonical_controls cc
                LEFT JOIN compliance.control_parent_links cpl ON cpl.control_uuid = cc.id
                WHERE cc.license_rule IS NOT NULL
                GROUP BY cc.license_rule
                ORDER BY cc.license_rule
            """)).fetchall()
        except Exception as e:
            # Best effort: a missing table or failed query must not abort the
            # PDF export, so the footer is skipped.
            logger.warning("attribution footer skipped: %s", e)
            return
        if not rows:
            return

        rule_labels = {
            1: "Hoheitsrecht/Public Domain (woertlich)",
            2: "Mit Attribution (CC-BY u.ae.)",
            3: "Nur Identifier-Verweis",
        }

        story.append(Spacer(1, 8 * mm))
        story.append(Paragraph("Quellen &amp; Lizenzen", ss["Section"]))
        story.append(Paragraph(
            "Dieser Bericht stuetzt sich auf klassifizierte Compliance-Controls "
            "aus den folgenden Quellen. Jede Quelle ist deterministisch in eine "
            "der drei Lizenzregeln (R1-R3) eingeordnet.", ss["Body2"]))

        for r in rows:
            rule = int(r.license_rule)
            # The FILTERed array_agg is NULL when a rule has no sources.
            all_sources = r.sources or []
            shown = all_sources[:max_sources]
            label = rule_labels.get(rule, f"Regel {rule}")
            head = f"<b>R{rule} — {label}</b> &nbsp; ({r.n} Controls)"
            story.append(Paragraph(head, ss["Body2"]))
            if shown:
                # Source names come from the database and are fed into
                # Paragraph markup, so escape &, < and > before rendering.
                src_text = "; ".join(escape(str(s)) for s in shown)
                hidden = len(all_sources) - len(shown)
                if hidden > 0:
                    src_text += f" und {hidden} weitere"
                story.append(Paragraph(src_text, ss["Small"]))
            if rule == 2:
                story.append(Paragraph(
                    "Pflicht-Attribution: Inhalte aus den oben genannten Quellen sind "
                    "unter den jeweiligen freien Lizenzen (z.B. CC-BY-SA, OECD-Public, "
                    "Apache-2.0) wiedergegeben. Original-Urheber bleibt in jeder "
                    "Weiterverwendung zu nennen.", ss["Small"]))
            story.append(Spacer(1, 2 * mm))
@@ -189,35 +189,41 @@ def parse_cookie_table(text: str) -> list[dict]:
    return out
 # textContent-Output von HTML-Tabellen verkettet Zellen ohne Whitespace
 # (z.B. VW: "Permanent/Protokoll_fbcTracking Cookies (Marketing)..."). Wir
 # erkennen Cookie-Eintraege ueber 2 Anker:
 #   - Davor: typisches End-Token einer vorherigen Tabellen-Zelle
 #     (Speicherdauer-Suffix wie Permanent/Protokoll, Session Cookie, ...)
 #   - Danach: Kategorie-Token (Tracking Cookies, Funktionscookie, ...)
 # Dazwischen: der Cookie-Name (Regex: 2-41 Zeichen, alphanum/underscore/dash/Punkt; Namen < 3 Zeichen werden spaeter verworfen).
 _FLAT_ROW_RE = re.compile(
-    r"\b([A-Za-z_][A-Za-z0-9_\-\.]{1,40})\s+"
+    r"(?:Permanent/Protokoll|Session Cookie|Persistent Cookie|"
-    r"((?:Tracking|Session|Funktional|Marketing|Analytics|Performance|"
+    r"TagePersistent|TageSitzungs-Cookie|TageSession Cookie|"
-    r"Notwendig|Strictly\s+Necessary|Statistik|Personalisierung)"
+    r"MinutenPersistent|MinutenSession Cookie|StundenPersistent|"
-    r"[A-Za-zäöüÄÖÜß \-\(\)]*?Cookies?[^A-Z]{0,400}?)"
+    r"MonatePersistent|JahrePersistent)"
-    r"(?:(\d+)\s*(Sekunde|Minute|Stunde|Tag|Woche|Monat|Jahr|day|month|year)|"
+    r"([A-Za-z_][A-Za-z0-9_\-\.]{1,40}?)"
-    r"\b(Session|Permanent)\b)",
+    r"(?=Tracking Cookies|Session Cookies|Funktionscookie|Funktional|"
-    re.I | re.S,
+    r"Marketing|Analytics|Necessary)",
    re.I,
 )
 def parse_flat_cookie_text(text: str) -> list[dict]:
    """Variante fuer Sites wie VW die ihre Cookie-Tabelle als flachen
-    Text liefern (Cookie-Name + Kategorie + Beschreibung + Dauer in
+    Text liefern (textContent-Output ohne Whitespace zwischen Zellen).
    einem Block hintereinander, ohne klare Trenner).
-    Regex sucht nach 'NAME [Tracking|Session|Funktional...] Cookies
+    Regex anchored auf vorherige Speicherdauer-Suffixe + folgende
-    ... [13 Monate|Session|Permanent]' und behandelt jeden Match als
+    Kategorie-Token → extrahiert den Cookie-Namen dazwischen.
    eine Tabellen-Zeile.
    """
    if not text or len(text) < 500:
        return []
-    matches = list(_FLAT_ROW_RE.finditer(text))
+    names = _FLAT_ROW_RE.findall(text)
-    if len(matches) < 3:
+    if len(names) < 3:
        return []
    by_vendor: dict[str, dict] = {}
    seen_names: set[str] = set()
-    for m in matches:
+    for raw in names:
-        name = m.group(1).strip()
+        name = raw.strip()
        nl = name.lower()
        if nl in seen_names:
            continue
@@ -226,30 +232,23 @@ def parse_flat_cookie_text(text: str) -> list[dict]:
                   "marketing", "analytics", "werbung", "anbieter",
                   "tracking", "cookie", "cookies", "und", "von",
                   "einer", "ist", "alle", "noch", "auch", "name",
-                   "art", "zweck", "dauer"):
+                   "art", "zweck", "dauer", "test"):
            continue
        if len(name) < 3 or len(name) > 60:
            continue
        seen_names.add(nl)
        category = _normalize_category(m.group(2) or "")
        persistence = ""
        if m.group(3):
            persistence = f"{m.group(3)} {m.group(4)}"
        elif m.group(5):
            persistence = m.group(5)
        purpose = (m.group(2) or "").strip()[:300]
        vendor = _guess_vendor(name) or "Unbekannter Anbieter"
        entry = by_vendor.setdefault(vendor, {
            "name": vendor, "country": "",
-            "purpose": purpose, "category": category,
+            "purpose": "", "category": "",
            "opt_out_url": "", "privacy_policy_url": "",
-            "persistence": persistence,
+            "persistence": "",
            "cookies": [],
            "source": "flat_pattern",
        })
        entry["cookies"].append({
-            "name": name, "purpose": purpose[:200],
+            "name": name, "purpose": "",
-            "expiry": persistence, "is_third_party": True,
+            "expiry": "", "is_third_party": True,
        })
    out = list(by_vendor.values())
    logger.info("parse_flat_cookie_text: %d vendors / %d cookies",