Compare commits

...

2 Commits

Author SHA1 Message Date
Benjamin Admin 4434e3827b fix(audit): parse_flat_cookie_text — Anchor-Pattern fuer VW-textContent
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / detect-changes (push) Successful in 10s
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 17s
CI / loc-budget (push) Failing after 17s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 40s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
VW Cookie-Doc-textContent verkettet HTML-Tabellen-Zellen OHNE Whitespace:
'Permanent/Protokoll_fbcTracking Cookies (Marketing)...'

Neues Pattern hat 2 Anker:
* Davor: typisches End-Token einer vorherigen Zelle (Permanent/Protokoll,
  Session Cookie, Persistent Cookie, TagePersistent, ...)
* Danach: Kategorie-Token (Tracking Cookies, Funktionscookie, Marketing,
  Analytics, Necessary)
Dazwischen: Cookie-Name (Regex: 2-41 Zeichen, alphanum/_/-/.; Namen mit
weniger als 3 Zeichen werden anschliessend verworfen)

VW-Test (snapshot 4a465783): findet jetzt 40 unique Cookie-Namen,
aggregiert zu 6 Vendors (Google, DoubleClick, Cloudflare, Borlabs,
Meta, Unbekannter Anbieter mit 22 VW-internen Cookies).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-21 21:33:58 +02:00
Benjamin Admin 07cc00da11 feat(licenses): Stufe 2 — auto-attribution footer in compliance PDF
Extends CompliancePDFGenerator with a "Quellen & Lizenzen" section
appended to every generated compliance PDF.

The footer is built from compliance.canonical_controls + control_parent_links
directly (no HTTP hop to /licenses/aggregate — same DB connection
already open in the generator). It groups by license_rule and lists
the top 8 source regulations per bucket.

For Rule-2 entries (CC-BY-SA, OECD-Public, Apache, etc.) it emits the
mandatory attribution paragraph required by the underlying licenses.
For Rule 1 a brief reference list satisfies the auditability goal
without legal obligation. Rule 3 is identifier-only by design.

Architecture decision: this is a PLATFORM-level footer (which sources
the platform draws on overall), not a per-export filter of "only the
sources actually cited in THIS document". The latter would require
control-uuid tracking across all sections (TOM/VVT/DSFA/etc.) which
the current PDF generator does not surface — that's a follow-up scope.
The platform-level footer fulfils the immediate legal mandate that
attribution be present on the work, not buried in AGB/Impressum.

Part of Attribution-Renderer Task #23. Stufe 1 (overview page) +
Stufe 3 (SourceBadge component) already shipped in commit dfac940.
Stufe 4 (tech-file appendix) remains for the IACE tech-file generator
in a separate iteration.
2026-05-21 21:30:02 +02:00
2 changed files with 90 additions and 28 deletions
@@ -82,6 +82,8 @@ class CompliancePDFGenerator:
self._add_consent_section(story, ss, tenant_id)
# Org Roles
self._add_role_section(story, ss, tenant_id, project_id)
# Stufe 2 — Quellen- und Lizenz-Footer (Attribution-Renderer Task #23)
self._add_attribution_footer(story, ss)
# Footer
story.append(Spacer(1, 15 * mm))
story.append(Paragraph("Erstellt mit BreakPilot Compliance SDK", ss["Small"]))
@@ -214,3 +216,64 @@ class CompliancePDFGenerator:
story.append(Paragraph("Keine Rollen zugewiesen.", ss["Body2"]))
except Exception:
story.append(Paragraph("Rollen-Tabelle nicht vorhanden.", ss["Small"]))
def _add_attribution_footer(self, story, ss) -> None:
    """Append the "Quellen & Lizenzen" section (attribution renderer, stage 2, Task #23).

    Queries ``compliance.canonical_controls`` left-joined with
    ``compliance.control_parent_links`` through ``self.db``, grouped by
    ``license_rule``. For every rule found it appends to ``story``:

    * a heading with the rule number, its label and the number of controls,
    * the first 8 source regulations in alphabetical order (plus an
      "und N weitere" suffix when more exist),
    * for rule 2 only, the attribution paragraph.

    The section is best-effort: if the query raises, a warning is logged
    and nothing is appended; if the query returns no rows, nothing is
    appended either.

    Args:
        story: Sequence the Spacer/Paragraph elements are appended to.
        ss: Style mapping; the keys "Section", "Body2" and "Small" are read.
    """
    # Function-scope import keeps this block self-contained; escape() turns
    # &, < and > into entities so database-supplied names cannot break the
    # markup that Paragraph parses.
    from xml.sax.saxutils import escape

    max_sources = 8  # cap per license rule so the footer stays compact

    try:
        # COUNT(DISTINCT cc.id): the LEFT JOIN yields one row per parent
        # link, so COUNT(*) would count a control once per link and
        # overstate the "Controls" figure shown in the heading.
        rows = self.db.execute(text("""
            SELECT cc.license_rule, COUNT(DISTINCT cc.id) AS n,
                   array_agg(DISTINCT cpl.source_regulation ORDER BY cpl.source_regulation)
                     FILTER (WHERE cpl.source_regulation IS NOT NULL) AS sources
            FROM compliance.canonical_controls cc
            LEFT JOIN compliance.control_parent_links cpl ON cpl.control_uuid = cc.id
            WHERE cc.license_rule IS NOT NULL
            GROUP BY cc.license_rule
            ORDER BY cc.license_rule
        """)).fetchall()
    except Exception as e:
        # Deliberate best-effort: a missing table or column must not abort
        # the whole PDF export.
        logger.warning("attribution footer skipped: %s", e)
        return
    if not rows:
        return

    rule_labels = {
        1: "Hoheitsrecht/Public Domain (woertlich)",
        2: "Mit Attribution (CC-BY u.ae.)",
        3: "Nur Identifier-Verweis",
    }

    story.append(Spacer(1, 8 * mm))
    story.append(Paragraph("Quellen &amp; Lizenzen", ss["Section"]))
    story.append(Paragraph(
        "Dieser Bericht stuetzt sich auf klassifizierte Compliance-Controls "
        "aus den folgenden Quellen. Jede Quelle ist deterministisch in eine "
        "der drei Lizenzregeln (R1-R3) eingeordnet.", ss["Body2"]))

    for r in rows:
        rule = int(r.license_rule)
        all_sources = r.sources or []  # NULL when no control of this rule has a source
        label = rule_labels.get(rule, f"Regel {rule}")
        # Separator between rule number and label; previously the two were
        # rendered fused together ("R1Hoheitsrecht...").
        story.append(Paragraph(
            f"<b>R{rule}: {label}</b> &nbsp; ({r.n} Controls)", ss["Body2"]))
        if all_sources:
            src_text = "; ".join(escape(str(s)) for s in all_sources[:max_sources])
            hidden = len(all_sources) - max_sources
            if hidden > 0:
                src_text += f" und {hidden} weitere"
            story.append(Paragraph(src_text, ss["Small"]))
        if rule == 2:
            story.append(Paragraph(
                "Pflicht-Attribution: Inhalte aus den oben genannten Quellen sind "
                "unter den jeweiligen freien Lizenzen (z.B. CC-BY-SA, OECD-Public, "
                "Apache-2.0) wiedergegeben. Original-Urheber bleibt in jeder "
                "Weiterverwendung zu nennen.", ss["Small"]))
        story.append(Spacer(1, 2 * mm))
@@ -189,35 +189,41 @@ def parse_cookie_table(text: str) -> list[dict]:
return out
# textContent output of HTML tables concatenates cells without whitespace
# (e.g. VW: "Permanent/Protokoll_fbcTracking Cookies (Marketing)..."). Cookie
# entries are recognised via two anchors:
# - Before: a typical end token of the preceding table cell
#   (storage-duration suffix such as Permanent/Protokoll, Session Cookie, ...)
# - After: a category token (Tracking Cookies, Funktionscookie, ...)
# In between: the cookie name. The pattern accepts 2-41 characters
# (alphanum/underscore/dash/dot); parse_flat_cookie_text then drops names
# shorter than 3 characters.
_FLAT_ROW_RE = re.compile(
r"\b([A-Za-z_][A-Za-z0-9_\-\.]{1,40})\s+"
r"((?:Tracking|Session|Funktional|Marketing|Analytics|Performance|"
r"Notwendig|Strictly\s+Necessary|Statistik|Personalisierung)"
r"[A-Za-zäöüÄÖÜß \-\(\)]*?Cookies?[^A-Z]{0,400}?)"
r"(?:(\d+)\s*(Sekunde|Minute|Stunde|Tag|Woche|Monat|Jahr|day|month|year)|"
r"\b(Session|Permanent)\b)",
re.I | re.S,
r"(?:Permanent/Protokoll|Session Cookie|Persistent Cookie|"
r"TagePersistent|TageSitzungs-Cookie|TageSession Cookie|"
r"MinutenPersistent|MinutenSession Cookie|StundenPersistent|"
r"MonatePersistent|JahrePersistent)"
r"([A-Za-z_][A-Za-z0-9_\-\.]{1,40}?)"
r"(?=Tracking Cookies|Session Cookies|Funktionscookie|Funktional|"
r"Marketing|Analytics|Necessary)",
re.I,
)
def parse_flat_cookie_text(text: str) -> list[dict]:
"""Variante fuer Sites wie VW die ihre Cookie-Tabelle als flachen
Text liefern (Cookie-Name + Kategorie + Beschreibung + Dauer in
einem Block hintereinander, ohne klare Trenner).
Text liefern (textContent-Output ohne Whitespace zwischen Zellen).
Regex sucht nach 'NAME [Tracking|Session|Funktional...] Cookies
... [13 Monate|Session|Permanent]' und behandelt jeden Match als
eine Tabellen-Zeile.
Regex anchored auf vorherige Speicherdauer-Suffixe + folgende
Kategorie-Token → extrahiert den Cookie-Namen dazwischen.
"""
if not text or len(text) < 500:
return []
matches = list(_FLAT_ROW_RE.finditer(text))
if len(matches) < 3:
names = _FLAT_ROW_RE.findall(text)
if len(names) < 3:
return []
by_vendor: dict[str, dict] = {}
seen_names: set[str] = set()
for m in matches:
name = m.group(1).strip()
for raw in names:
name = raw.strip()
nl = name.lower()
if nl in seen_names:
continue
@@ -226,30 +232,23 @@ def parse_flat_cookie_text(text: str) -> list[dict]:
"marketing", "analytics", "werbung", "anbieter",
"tracking", "cookie", "cookies", "und", "von",
"einer", "ist", "alle", "noch", "auch", "name",
"art", "zweck", "dauer"):
"art", "zweck", "dauer", "test"):
continue
if len(name) < 3 or len(name) > 60:
continue
seen_names.add(nl)
category = _normalize_category(m.group(2) or "")
persistence = ""
if m.group(3):
persistence = f"{m.group(3)} {m.group(4)}"
elif m.group(5):
persistence = m.group(5)
purpose = (m.group(2) or "").strip()[:300]
vendor = _guess_vendor(name) or "Unbekannter Anbieter"
entry = by_vendor.setdefault(vendor, {
"name": vendor, "country": "",
"purpose": purpose, "category": category,
"purpose": "", "category": "",
"opt_out_url": "", "privacy_policy_url": "",
"persistence": persistence,
"persistence": "",
"cookies": [],
"source": "flat_pattern",
})
entry["cookies"].append({
"name": name, "purpose": purpose[:200],
"expiry": persistence, "is_third_party": True,
"name": name, "purpose": "",
"expiry": "", "is_third_party": True,
})
out = list(by_vendor.values())
logger.info("parse_flat_cookie_text: %d vendors / %d cookies",