Compare commits
2 Commits
cb5dad1a2f
...
1451873194
| Author | SHA1 | Date | |
|---|---|---|---|
| 1451873194 | |||
| dfac940272 |
@@ -0,0 +1,160 @@
|
||||
'use client'
|
||||
|
||||
import { useEffect, useState } from 'react'
|
||||
|
||||
// Stufe 1 of the Attribution Renderer (Task #23): the global
|
||||
// "Quellen & Lizenzen" overview. Aggregates all 314k canonical_controls
|
||||
// by their license_rule and shows the source regulations behind each
|
||||
// bucket. Drives the footer link and gives auditors a one-page view of
|
||||
// what licence classes the platform is operating under.
|
||||
|
||||
/** One source regulation inside a licence-rule bucket, with its control count. */
type SourceCount = {
  regulation_id: string
  regulation_name_de: string | null
  license_rule: number
  license_type: string | null
  attribution: string | null
  jurisdiction: string | null
  source_type: string | null
  n_controls: number
}

/** All sources that share one licence rule, plus the rule's render flags. */
type RuleBucket = {
  rule: number
  label_de: string
  label_en: string
  attribution_required: boolean
  render_full_text: boolean
  total_controls: number
  distinct_sources: number
  sources: SourceCount[]
}

/** Payload of GET /api/sdk/v1/compliance/licenses/overview. */
type Overview = {
  total_controls: number
  buckets: RuleBucket[]
}
|
||||
|
||||
/** Card border/background classes per licence rule (1-3). */
const RULE_COLOR: Record<number, string> = {
  1: 'border-emerald-200 bg-emerald-50',
  2: 'border-amber-200 bg-amber-50',
  3: 'border-slate-200 bg-slate-50',
}

/** Round "R1/R2/R3" badge classes per licence rule (1-3). */
const RULE_BADGE: Record<number, string> = {
  1: 'bg-emerald-600 text-white',
  2: 'bg-amber-600 text-white',
  3: 'bg-slate-600 text-white',
}
|
||||
|
||||
/**
 * Global "Quellen & Lizenzen" page.
 *
 * Loads the licence overview once on mount and renders, per licence rule,
 * a summary card followed by a table of the contributing sources. Shows a
 * loading placeholder until data arrives and an error message if the
 * request fails.
 */
export default function LicensesPage() {
  const [data, setData] = useState<Overview | null>(null)
  const [error, setError] = useState<string | null>(null)

  useEffect(() => {
    // Abort on unmount so a late response never touches state of a
    // component that is gone (also covers the StrictMode double-invoke).
    const controller = new AbortController()
    fetch('/api/sdk/v1/compliance/licenses/overview', { signal: controller.signal })
      .then((r) => (r.ok ? r.json() : Promise.reject(`HTTP ${r.status}`)))
      .then((body: Overview) => setData(body))
      .catch((e: unknown) => {
        if (controller.signal.aborted) return
        setError(String(e))
      })
    return () => controller.abort()
  }, [])

  if (error) {
    return (
      <div className="p-6">
        <h1 className="text-xl font-semibold mb-2">Quellen & Lizenzen</h1>
        <p className="text-red-600">Fehler beim Laden: {error}</p>
      </div>
    )
  }
  if (!data) {
    return (
      <div className="p-6">
        <h1 className="text-xl font-semibold">Quellen & Lizenzen</h1>
        <p className="text-slate-500 mt-2">Lade …</p>
      </div>
    )
  }

  return (
    <div className="p-6 max-w-7xl">
      <header className="mb-6">
        <h1 className="text-2xl font-semibold">Quellen & Lizenzen</h1>
        <p className="text-sm text-slate-600 mt-1">
          Diese Plattform stützt sich auf {data.total_controls.toLocaleString('de-DE')}{' '}
          klassifizierte Compliance-Controls aus den unten genannten Quellen.
          Jeder Control trägt eine deterministische Lizenzregel (R1–R3), die das
          Render-Verhalten in Berichten und im Frontend steuert.
        </p>
      </header>

      {/* Summary cards: one per licence rule. */}
      <section className="mb-8">
        <h2 className="text-lg font-medium mb-3">Klassifizierungs-Schema</h2>
        <div className="grid grid-cols-1 md:grid-cols-3 gap-3 text-sm">
          {data.buckets.map((b) => (
            <div key={b.rule} className={`rounded border ${RULE_COLOR[b.rule] ?? 'border-slate-200'} p-3`}>
              <div className="flex items-center gap-2 mb-2">
                <span className={`inline-flex items-center justify-center w-7 h-7 rounded-full text-xs font-bold ${RULE_BADGE[b.rule] ?? 'bg-slate-600 text-white'}`}>
                  R{b.rule}
                </span>
                <span className="font-medium">{b.label_de}</span>
              </div>
              <ul className="text-xs text-slate-700 space-y-1">
                <li>{b.total_controls.toLocaleString('de-DE')} Controls</li>
                <li>{b.distinct_sources} Quellen</li>
                <li>{b.render_full_text ? 'Volltext-Anzeige erlaubt' : 'Nur Identifier-Verweis'}</li>
                <li>{b.attribution_required ? 'Attribution-Pflicht in Output' : 'keine Attribution-Pflicht'}</li>
              </ul>
            </div>
          ))}
        </div>
      </section>

      {/* Detail tables: the sources behind each licence rule. */}
      {data.buckets.map((b) => (
        <section key={b.rule} className="mb-8">
          <h2 className="text-lg font-medium mb-3 flex items-center gap-2">
            <span className={`inline-flex items-center justify-center w-7 h-7 rounded-full text-xs font-bold ${RULE_BADGE[b.rule] ?? 'bg-slate-600 text-white'}`}>
              R{b.rule}
            </span>
            {b.label_de}{' '}
            <span className="text-sm text-slate-500 font-normal">
              ({b.total_controls.toLocaleString('de-DE')} Controls aus {b.distinct_sources} Quellen)
            </span>
          </h2>

          <div className="overflow-x-auto border rounded">
            <table className="w-full text-sm">
              <thead className="bg-slate-100 text-slate-700">
                <tr>
                  <th className="text-left p-2">Quelle</th>
                  <th className="text-left p-2">Lizenztyp</th>
                  <th className="text-left p-2">Rechtsraum</th>
                  <th className="text-left p-2">Attribution</th>
                  <th className="text-right p-2">Controls</th>
                </tr>
              </thead>
              <tbody>
                {b.sources.map((s) => (
                  <tr key={`${b.rule}-${s.regulation_id}`} className="border-t">
                    <td className="p-2">{s.regulation_name_de ?? s.regulation_id}</td>
                    <td className="p-2 text-slate-600">{s.license_type ?? '—'}</td>
                    <td className="p-2 text-slate-600">{s.jurisdiction ?? '—'}</td>
                    <td className="p-2 text-slate-600">{s.attribution ?? '—'}</td>
                    <td className="p-2 text-right tabular-nums">{s.n_controls.toLocaleString('de-DE')}</td>
                  </tr>
                ))}
              </tbody>
            </table>
          </div>
        </section>
      ))}

      <footer className="text-xs text-slate-500 border-t pt-4 mt-8">
        Klassifizierung: deterministisch über parent_control_uuid-Vererbung,
        control_parent_links → regulation_registry, source_citation,
        canonical_processed_chunks (Pipeline-Ground-Truth) und LLM-Aggregat-
        Identifikation für eigene Werke. Audit-Skripte unter
        breakpilot-core/control-pipeline/scripts/.
      </footer>
    </div>
  )
}
|
||||
@@ -0,0 +1,138 @@
|
||||
'use client'
|
||||
|
||||
import { useEffect, useState } from 'react'
|
||||
|
||||
// Stufe 3 of the Attribution Renderer (Task #23): an inline source
|
||||
// badge that any rendered control/hazard/measure can attach to itself.
|
||||
//
|
||||
// Visually a small license-rule pill (R1/R2/R3); on hover/click it
|
||||
// reveals the underlying regulation, license type, and — for Rule 2 —
|
||||
// the mandatory attribution string.
|
||||
//
|
||||
// Usage:
|
||||
// <SourceBadge controlUuid={hazard.id} />
|
||||
//
|
||||
// The component lazily fetches /licenses/source-info/{uuid} on first
|
||||
// expand so the surrounding list view stays cheap.
|
||||
|
||||
/** Payload of GET /api/sdk/v1/compliance/licenses/source-info/{uuid}. */
type SourceInfo = {
  control_uuid: string
  license_rule: number | null
  license_label_de: string | null
  attribution_required: boolean
  render_full_text: boolean
  regulation_id: string | null
  regulation_name_de: string | null
  license_type: string | null
  attribution: string | null
  source_url: string | null
}

/** Pill classes (background, text, border) per licence rule (1-3). */
const RULE_BADGE: Record<number, string> = {
  1: 'bg-emerald-100 text-emerald-800 border-emerald-300',
  2: 'bg-amber-100 text-amber-800 border-amber-300',
  3: 'bg-slate-100 text-slate-700 border-slate-300',
}

/** Native tooltip text per licence rule (1-3). */
const RULE_TITLE: Record<number, string> = {
  1: 'R1 — wörtlich übernehmbar',
  2: 'R2 — wörtlich mit Attribution',
  3: 'R3 — nur Identifier zitieren',
}

interface SourceBadgeProps {
  /** UUID of the control whose source information is shown. */
  controlUuid: string
  /** Optional: skip the fetch and render from already-known data. */
  prefetched?: SourceInfo
  /** Compact mode for tight UI rows (smaller pill). */
  compact?: boolean
}
|
||||
|
||||
/**
 * Inline licence-rule pill (R1/R2/R3) with a click-to-open popover.
 *
 * Source info is fetched lazily the first time the popover opens, unless
 * `prefetched` already provides it. Fetched data is remembered together
 * with the UUID it belongs to, so a badge re-used for a different
 * `controlUuid` never shows the previous control's data.
 */
export function SourceBadge({ controlUuid, prefetched, compact }: SourceBadgeProps) {
  const [fetched, setFetched] = useState<{ uuid: string; info: SourceInfo } | null>(null)
  const [open, setOpen] = useState(false)
  const [loading, setLoading] = useState(false)
  const [error, setError] = useState<string | null>(null)

  // Fetched data only counts while it matches the current control;
  // otherwise fall back to whatever the caller prefetched.
  const data: SourceInfo | null =
    fetched !== null && fetched.uuid === controlUuid ? fetched.info : prefetched ?? null
  const hasData = data !== null

  useEffect(() => {
    if (!open || hasData) return
    const controller = new AbortController()
    let cancelled = false
    setError(null) // clear a previous failure before retrying
    setLoading(true)
    fetch(`/api/sdk/v1/compliance/licenses/source-info/${encodeURIComponent(controlUuid)}`, {
      signal: controller.signal,
    })
      .then((r) => (r.ok ? r.json() : Promise.reject(`HTTP ${r.status}`)))
      .then((info: SourceInfo) => {
        if (cancelled) return
        setFetched({ uuid: controlUuid, info })
        setLoading(false)
      })
      .catch((e: unknown) => {
        if (cancelled) return
        setError(String(e))
        setLoading(false)
      })
    return () => {
      // Popover closed, control changed, or unmounted: drop the response.
      cancelled = true
      controller.abort()
      setLoading(false)
    }
  }, [open, hasData, controlUuid])

  const rule = data?.license_rule ?? prefetched?.license_rule ?? null
  const badgeClass = rule ? RULE_BADGE[rule] ?? RULE_BADGE[3] : 'bg-slate-100 text-slate-500 border-slate-200'
  const sizeClass = compact ? 'text-[10px] px-1.5 py-0.5' : 'text-xs px-2 py-0.5'

  return (
    <span className="relative inline-block">
      <button
        type="button"
        onClick={() => setOpen((v) => !v)}
        className={`inline-flex items-center gap-1 rounded border font-medium ${sizeClass} ${badgeClass} hover:opacity-80 transition`}
        title={(rule ? RULE_TITLE[rule] : undefined) ?? 'Lizenz unbekannt'}
        aria-expanded={open}
      >
        <svg width="10" height="10" viewBox="0 0 16 16" fill="currentColor" aria-hidden>
          <path d="M8 0a8 8 0 1 0 0 16A8 8 0 0 0 8 0Zm0 4.5a1 1 0 1 1 0 2 1 1 0 0 1 0-2ZM7 8h2v4.5H7V8Z" />
        </svg>
        {rule ? `R${rule}` : '?'}
      </button>

      {open && (
        <div className="absolute left-0 mt-1 z-40 w-80 rounded-md border border-slate-200 bg-white shadow-lg p-3 text-xs">
          {loading && <p className="text-slate-500">Lade Quellen-Info…</p>}
          {error && <p className="text-red-600">Fehler: {error}</p>}
          {data && (
            <div className="space-y-2">
              <div className="font-semibold text-slate-800">
                {data.license_label_de ?? 'Lizenz unbekannt'}
              </div>
              {data.regulation_name_de && (
                <div>
                  <span className="text-slate-500">Quelle:</span>{' '}
                  <span className="text-slate-800">{data.regulation_name_de}</span>
                </div>
              )}
              {data.license_type && (
                <div>
                  <span className="text-slate-500">Lizenztyp:</span>{' '}
                  <span className="text-slate-700">{data.license_type}</span>
                </div>
              )}
              {data.attribution && (
                <div className="rounded bg-amber-50 border border-amber-200 px-2 py-1.5">
                  <div className="text-[10px] font-semibold text-amber-800 uppercase tracking-wide">
                    Attribution-Pflicht
                  </div>
                  <div className="text-amber-900">{data.attribution}</div>
                </div>
              )}
              {!data.render_full_text && (
                <div className="text-[10px] text-slate-500 italic">
                  Volltext wird im Output nicht gerendert — nur Identifier-Verweis.
                </div>
              )}
              {data.source_url && (
                <a
                  href={data.source_url}
                  target="_blank"
                  rel="noopener noreferrer"
                  className="inline-block text-[10px] text-blue-600 hover:underline mt-1"
                >
                  Originalquelle öffnen ↗
                </a>
              )}
            </div>
          )}
        </div>
      )}
    </span>
  )
}
|
||||
|
||||
export default SourceBadge
|
||||
@@ -72,6 +72,7 @@ _ROUTER_MODULES = [
|
||||
"whistleblower_routes",
|
||||
"tcf_routes",
|
||||
"founding_wizard_routes",
|
||||
"licenses_routes",
|
||||
]
|
||||
|
||||
_loaded_count = 0
|
||||
|
||||
@@ -862,16 +862,19 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
|
||||
except Exception as e:
|
||||
logger.warning("html_table parse failed: %s", e)
|
||||
|
||||
# B — cookies_table_parser auch auf gecrawltem Cookie-Text
|
||||
# (nicht nur bei User-Paste). Wenn der Crawler Tab/Pipe-
|
||||
# getrennte Tabellen-Reihen erhalten hat, parsen wir sie
|
||||
# deterministisch und mergen die Vendor-Records.
|
||||
# B — cookies_table_parser auch auf gecrawltem Cookie-Text.
|
||||
# Erst Standard-Parse (Tab/Pipe-getrennt). Wenn der nichts
|
||||
# findet (kein Separator), Flat-Pattern-Parse fuer Sites wie
|
||||
# VW die ihre Tabelle als flachen Text liefern.
|
||||
if cookie_text and len(cookie_text) >= 500:
|
||||
try:
|
||||
from compliance.services.cookies_table_parser import (
|
||||
parse_cookie_table as _parse_ct,
|
||||
parse_flat_cookie_text as _parse_flat,
|
||||
)
|
||||
crawled_table_vendors = _parse_ct(cookie_text)
|
||||
if not crawled_table_vendors:
|
||||
crawled_table_vendors = _parse_flat(cookie_text)
|
||||
if crawled_table_vendors:
|
||||
existing = {(v.get("name") or "").strip().lower()
|
||||
for v in cmp_vendors}
|
||||
|
||||
@@ -0,0 +1,306 @@
|
||||
"""License attribution endpoints — Task #23 Stufe 1-4.
|
||||
|
||||
The audit (Task #22) classified all 314,811 canonical_controls into
|
||||
license_rule 1/2/3. The frontend, PDF renderer, and tech-file generator
|
||||
now need to surface that classification in the form of:
|
||||
|
||||
- Stufe 1: a global /licenses overview page
|
||||
- Stufe 2: an auto-footer in every exported PDF
|
||||
- Stufe 3: an inline source badge on every rendered hazard/measure
|
||||
- Stufe 4: a sources appendix in tech-file bundles
|
||||
|
||||
This module exposes three endpoints that all four stages consume:
|
||||
|
||||
GET /api/compliance/licenses/overview
|
||||
Global aggregation by rule + per-source counts. Drives Stufe 1.
|
||||
|
||||
POST /api/compliance/licenses/aggregate
|
||||
Body: {"control_uuids": ["uuid1", ...]}.
|
||||
Returns per-rule grouping with source breakdown. Used by PDF
|
||||
footer (Stufe 2) and tech-file appendix (Stufe 4) to build the
|
||||
"sources used in this document" list.
|
||||
|
||||
GET /api/compliance/licenses/source-info/{control_uuid}
|
||||
Single-control lookup for the inline source badge tooltip
|
||||
(Stufe 3). Returns rule, source regulation, attribution text.
|
||||
|
||||
Why a new module instead of extending canonical_control_routes:
|
||||
- canonical_control_routes serves the legacy SPDX-style license matrix
|
||||
(canonical_control_licenses + canonical_control_sources, ~10 rows).
|
||||
- This module is built on regulation_registry (252 rows) + the
|
||||
license_rule on each control. Both schemas coexist; this module
|
||||
doesn't disturb the legacy endpoints.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import Any, Optional
|
||||
from uuid import UUID
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
from pydantic import BaseModel
|
||||
from sqlalchemy import text
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from classroom_engine.database import get_db
|
||||
|
||||
router = APIRouter(prefix="/licenses", tags=["licenses"])
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Rule labels — used by frontend renderer
|
||||
# ============================================================================
|
||||
|
||||
# Per-rule metadata keyed by license_rule (1, 2, 3): short code, German and
# English labels, and the two flags that steer rendering (full text allowed,
# attribution required).
RULE_LABELS = {
    1: {
        "code": "R1",
        "label_de": "Wörtlich übernehmbar",
        "label_en": "Verbatim, no attribution required",
        "render_full_text": True,
        "attribution_required": False,
    },
    2: {
        "code": "R2",
        "label_de": "Wörtlich mit Attribution",
        "label_en": "Verbatim with attribution",
        "render_full_text": True,
        "attribution_required": True,
    },
    3: {
        "code": "R3",
        "label_de": "Nur Identifier zitieren",
        "label_en": "Identifier citation only",
        "render_full_text": False,
        "attribution_required": False,
    },
}
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Response Schemas
|
||||
# ============================================================================
|
||||
|
||||
|
||||
class SourceCount(BaseModel):
    """One source regulation within a rule bucket, with its control count."""

    regulation_id: str
    regulation_name_de: Optional[str]
    license_rule: int
    license_type: Optional[str]
    attribution: Optional[str]
    jurisdiction: Optional[str]
    source_type: Optional[str]
    n_controls: int
|
||||
|
||||
|
||||
class RuleBucket(BaseModel):
    """All sources sharing one license rule, plus that rule's labels and flags."""

    rule: int
    label_de: str
    label_en: str
    attribution_required: bool
    render_full_text: bool
    total_controls: int
    distinct_sources: int
    sources: list[SourceCount]
|
||||
|
||||
|
||||
class OverviewResponse(BaseModel):
    """Response of GET /licenses/overview."""

    total_controls: int
    buckets: list[RuleBucket]
|
||||
|
||||
|
||||
class AggregateRequest(BaseModel):
    """Request body of POST /licenses/aggregate."""

    control_uuids: list[UUID]
|
||||
|
||||
|
||||
class AggregateResponse(BaseModel):
    """Response of POST /licenses/aggregate."""

    total_in_request: int
    matched: int
    buckets: list[RuleBucket]
|
||||
|
||||
|
||||
class SourceInfo(BaseModel):
    """Response of GET /licenses/source-info/{control_uuid}."""

    control_uuid: UUID
    license_rule: Optional[int]
    license_label_de: Optional[str]
    attribution_required: bool
    render_full_text: bool
    regulation_id: Optional[str]
    regulation_name_de: Optional[str]
    license_type: Optional[str]
    attribution: Optional[str]
    source_url: Optional[str]
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Endpoints
|
||||
# ============================================================================
|
||||
|
||||
|
||||
def _bucket(rule: int, sources: list[SourceCount]) -> RuleBucket:
    """Build the RuleBucket for ``rule`` from its per-source counts.

    Labels and flags come from RULE_LABELS; a rule number that is not
    listed there falls back to the R3 metadata. ``total_controls`` is the
    sum of the sources' ``n_controls`` and ``distinct_sources`` is the
    number of entries in ``sources``.
    """
    meta = RULE_LABELS.get(rule, RULE_LABELS[3])
    return RuleBucket(
        rule=rule,
        label_de=meta["label_de"],
        label_en=meta["label_en"],
        attribution_required=meta["attribution_required"],
        render_full_text=meta["render_full_text"],
        total_controls=sum(s.n_controls for s in sources),
        distinct_sources=len(sources),
        sources=sources,
    )
|
||||
|
||||
|
||||
@router.get("/overview", response_model=OverviewResponse)
def licenses_overview(db: Session = Depends(get_db)) -> OverviewResponse:
    """Global aggregation: total controls by rule, with per-source breakdown.

    Drives Stufe 1 (the /licenses page).

    Per-source counts are per regulation: a control linked to several
    source regulations appears once under each of them. The bucket totals
    and the global total count each control exactly once.
    """
    rows = db.execute(text("""
        SELECT
            COALESCE(cpl.source_regulation, '(no source)') AS regulation_name,
            cc.license_rule,
            COUNT(DISTINCT cc.id) AS n
        FROM compliance.canonical_controls cc
        LEFT JOIN compliance.control_parent_links cpl ON cpl.control_uuid = cc.id
        WHERE cc.license_rule IS NOT NULL
        GROUP BY 1, 2
    """)).fetchall()

    # Distinct controls per rule. Summing the per-source rows instead would
    # count a control once per source regulation and inflate the totals.
    total_rows = db.execute(text("""
        SELECT license_rule, COUNT(DISTINCT id) AS n
        FROM compliance.canonical_controls
        WHERE license_rule IS NOT NULL
        GROUP BY license_rule
    """)).fetchall()
    rule_totals = {int(r.license_rule): int(r.n) for r in total_rows}

    reg_rows = db.execute(text("""
        SELECT regulation_name_de, regulation_id, license_type, attribution,
               jurisdiction, source_type
        FROM compliance.regulation_registry
    """)).fetchall()
    reg_by_name = {r.regulation_name_de: r for r in reg_rows if r.regulation_name_de}

    # Rules 1-3 always get a bucket, even when empty.
    by_rule: dict[int, list[SourceCount]] = {1: [], 2: [], 3: []}
    for row in rows:
        # GROUP BY yields each (regulation_name, rule) pair exactly once.
        rule = int(row.license_rule)
        name = row.regulation_name
        reg = reg_by_name.get(name)
        by_rule.setdefault(rule, []).append(SourceCount(
            # Without a registry match the raw source name doubles as id.
            regulation_id=reg.regulation_id if reg else name,
            regulation_name_de=name,
            license_rule=rule,
            license_type=reg.license_type if reg else None,
            attribution=reg.attribution if reg else None,
            jurisdiction=reg.jurisdiction if reg else None,
            source_type=reg.source_type if reg else None,
            n_controls=int(row.n),
        ))

    buckets: list[RuleBucket] = []
    for rule, sources in sorted(by_rule.items()):
        sources.sort(key=lambda s: -s.n_controls)
        bucket = _bucket(rule, sources)
        # Replace the summed per-source count with the distinct count.
        bucket.total_controls = rule_totals.get(rule, 0)
        buckets.append(bucket)
    return OverviewResponse(total_controls=sum(rule_totals.values()), buckets=buckets)
|
||||
|
||||
|
||||
@router.post("/aggregate", response_model=AggregateResponse)
def aggregate_for_controls(
    body: AggregateRequest,
    db: Session = Depends(get_db),
) -> AggregateResponse:
    """Per-control license aggregation for PDF footer (Stufe 2) and
    tech-file sources appendix (Stufe 4).

    Returns a per-rule breakdown of which sources contributed to the
    supplied control set. The frontend renderer turns this into the
    "Verwendete Quellen" footer.

    ``total_in_request`` is the length of the submitted list (duplicates
    included). ``matched`` counts each matching classified control once,
    so it cannot exceed the number of distinct UUIDs submitted. Buckets
    without any source are omitted.
    """
    if not body.control_uuids:
        return AggregateResponse(total_in_request=0, matched=0, buckets=[])

    # De-duplicate while keeping order; ANY() ignores duplicates anyway.
    # NOTE(review): ids are bound as a list of strings. If
    # canonical_controls.id is a uuid column, some drivers need an explicit
    # CAST(:ids AS uuid[]) here — confirm against the schema/driver.
    ids = [str(u) for u in dict.fromkeys(body.control_uuids)]

    rows = db.execute(text("""
        SELECT
            COALESCE(cpl.source_regulation, '(unknown)') AS regulation_name,
            cc.license_rule,
            COUNT(DISTINCT cc.id) AS n
        FROM compliance.canonical_controls cc
        LEFT JOIN compliance.control_parent_links cpl ON cpl.control_uuid = cc.id
        WHERE cc.id = ANY(:ids) AND cc.license_rule IS NOT NULL
        GROUP BY 1, 2
    """), {"ids": ids}).fetchall()

    # Distinct matched controls per rule. Summing the per-source rows would
    # count a control once per source regulation.
    total_rows = db.execute(text("""
        SELECT cc.license_rule, COUNT(DISTINCT cc.id) AS n
        FROM compliance.canonical_controls cc
        WHERE cc.id = ANY(:ids) AND cc.license_rule IS NOT NULL
        GROUP BY cc.license_rule
    """), {"ids": ids}).fetchall()
    rule_totals = {int(r.license_rule): int(r.n) for r in total_rows}

    reg_rows = db.execute(text("""
        SELECT regulation_name_de, regulation_id, license_type, attribution,
               jurisdiction, source_type
        FROM compliance.regulation_registry
    """)).fetchall()
    reg_by_name = {r.regulation_name_de: r for r in reg_rows if r.regulation_name_de}

    by_rule: dict[int, list[SourceCount]] = {1: [], 2: [], 3: []}
    for row in rows:
        rule = int(row.license_rule)
        reg = reg_by_name.get(row.regulation_name)
        by_rule.setdefault(rule, []).append(SourceCount(
            # Without a registry match the raw source name doubles as id.
            regulation_id=reg.regulation_id if reg else row.regulation_name,
            regulation_name_de=row.regulation_name,
            license_rule=rule,
            license_type=reg.license_type if reg else None,
            attribution=reg.attribution if reg else None,
            jurisdiction=reg.jurisdiction if reg else None,
            source_type=reg.source_type if reg else None,
            n_controls=int(row.n),
        ))

    buckets: list[RuleBucket] = []
    for rule, sources in sorted(by_rule.items()):
        if not sources:
            continue
        sources.sort(key=lambda s: -s.n_controls)
        bucket = _bucket(rule, sources)
        # Replace the summed per-source count with the distinct count.
        bucket.total_controls = rule_totals.get(rule, 0)
        buckets.append(bucket)
    return AggregateResponse(
        total_in_request=len(body.control_uuids),
        matched=sum(rule_totals.values()),
        buckets=buckets,
    )
|
||||
|
||||
|
||||
@router.get("/source-info/{control_uuid}", response_model=SourceInfo)
def source_info_for_control(
    control_uuid: UUID,
    db: Session = Depends(get_db),
) -> SourceInfo:
    """Single-control source info for the inline source badge (Stufe 3).

    Used by the React `<SourceBadge>` component to populate its tooltip.

    When a control has several parent links, one source is reported:
    rows with a registry match come first, then ordering is by source
    name, so repeated calls return the same source.

    Raises:
        HTTPException: 404 if no control with this UUID exists.
    """
    row = db.execute(text("""
        SELECT cc.license_rule, cpl.source_regulation AS regulation_name,
               r.regulation_id, r.license_type, r.attribution, r.url AS source_url
        FROM compliance.canonical_controls cc
        LEFT JOIN compliance.control_parent_links cpl ON cpl.control_uuid = cc.id
        LEFT JOIN compliance.regulation_registry r ON r.regulation_name_de = cpl.source_regulation
        WHERE cc.id = :uuid
        ORDER BY (r.regulation_id IS NULL), cpl.source_regulation NULLS LAST, r.regulation_id
        LIMIT 1
    """), {"uuid": str(control_uuid)}).fetchone()
    if row is None:
        raise HTTPException(status_code=404, detail="control not found")

    rule = int(row.license_rule) if row.license_rule is not None else None
    # Unclassified or unknown rules get no label and both flags False.
    meta = RULE_LABELS.get(rule, {}) if rule is not None else {}
    return SourceInfo(
        control_uuid=control_uuid,
        license_rule=rule,
        license_label_de=meta.get("label_de"),
        attribution_required=meta.get("attribution_required", False),
        render_full_text=meta.get("render_full_text", False),
        regulation_id=row.regulation_id,
        regulation_name_de=row.regulation_name,
        license_type=row.license_type,
        attribution=row.attribution,
        source_url=row.source_url,
    )
|
||||
@@ -189,6 +189,74 @@ def parse_cookie_table(text: str) -> list[dict]:
|
||||
return out
|
||||
|
||||
|
||||
_FLAT_ROW_RE = re.compile(
|
||||
r"\b([A-Za-z_][A-Za-z0-9_\-\.]{1,40})\s+"
|
||||
r"((?:Tracking|Session|Funktional|Marketing|Analytics|Performance|"
|
||||
r"Notwendig|Strictly\s+Necessary|Statistik|Personalisierung)"
|
||||
r"[A-Za-zäöüÄÖÜß \-\(\)]*?Cookies?[^A-Z]{0,400}?)"
|
||||
r"(?:(\d+)\s*(Sekunde|Minute|Stunde|Tag|Woche|Monat|Jahr|day|month|year)|"
|
||||
r"\b(Session|Permanent)\b)",
|
||||
re.I | re.S,
|
||||
)
|
||||
|
||||
|
||||
# Words the name group can capture that are ordinary German words or table
# headers rather than cookie names. Compared against the lower-cased name.
_FLAT_NAME_STOPWORDS = frozenset((
    "dieser", "diese", "ein", "der", "die", "das",
    "session", "permanent", "funktional", "notwendig",
    "marketing", "analytics", "werbung", "anbieter",
    "tracking", "cookie", "cookies", "und", "von",
    "einer", "ist", "alle", "noch", "auch", "name",
    "art", "zweck", "dauer",
))


def parse_flat_cookie_text(text: str) -> list[dict]:
    """Parse cookie tables that sites (e.g. VW) deliver as flat text.

    In such text, cookie name, category, description and duration follow
    each other in one block without clear separators. Every match of
    _FLAT_ROW_RE ('NAME [Tracking|Session|Funktional...] Cookies ...
    [13 Monate|Session|Permanent]') is treated as one table row.

    Args:
        text: Raw cookie text; inputs shorter than 500 characters are
            ignored.

    Returns:
        One dict per guessed vendor, each with a ``cookies`` list. Vendor
        level purpose, category and persistence come from the first cookie
        seen for that vendor. Returns an empty list when the text is too
        short or fewer than three rows match.
    """
    if not text or len(text) < 500:
        return []
    matches = list(_FLAT_ROW_RE.finditer(text))
    # Fewer than three raw matches is treated as "not a cookie table".
    if len(matches) < 3:
        return []

    by_vendor: dict[str, dict] = {}
    seen_names: set[str] = set()
    for m in matches:
        name = m.group(1).strip()
        nl = name.lower()
        if nl in seen_names or nl in _FLAT_NAME_STOPWORDS:
            continue
        if len(name) < 3 or len(name) > 60:
            continue
        seen_names.add(nl)

        category = _normalize_category(m.group(2) or "")
        if m.group(3):
            persistence = f"{m.group(3)} {m.group(4)}"
        elif m.group(5):
            persistence = m.group(5)
        else:
            persistence = ""
        purpose = (m.group(2) or "").strip()[:300]
        vendor = _guess_vendor(name) or "Unbekannter Anbieter"

        # The first cookie of a vendor creates the vendor entry.
        entry = by_vendor.setdefault(vendor, {
            "name": vendor, "country": "",
            "purpose": purpose, "category": category,
            "opt_out_url": "", "privacy_policy_url": "",
            "persistence": persistence,
            "cookies": [],
            "source": "flat_pattern",
        })
        entry["cookies"].append({
            "name": name, "purpose": purpose[:200],
            "expiry": persistence, "is_third_party": True,
        })

    out = list(by_vendor.values())
    logger.info("parse_flat_cookie_text: %d vendors / %d cookies",
                len(out), sum(len(v["cookies"]) for v in out))
    return out
|
||||
|
||||
|
||||
_VENDOR_GUESS = (
|
||||
("_ga", "Google"), ("_gid", "Google"), ("_gcl_", "Google"),
|
||||
("ANID", "Google"), ("AID", "Google"), ("FPGCLDC", "Google"),
|
||||
|
||||
@@ -182,6 +182,35 @@ class DSIDiscoveryResult:
|
||||
# not the homepage navigation that DOM extraction returns.
|
||||
cmp_cookie_text: str = ""
|
||||
|
||||
async def _extract_dom_tables(page) -> list[list[str]]:
|
||||
"""D — extrahiert alle <table>-Elemente aus dem aktuellen DOM als
|
||||
list[list[str]] (jede Tabelle = Array von Tab-getrennten Zeilen).
|
||||
|
||||
Wird VOR der Navigation woandershin von jeder Document-Loading-
|
||||
Funktion aufgerufen damit jede DiscoveredDSI ihre Tabellen behaelt.
|
||||
"""
|
||||
try:
|
||||
return await page.evaluate("""
|
||||
() => {
|
||||
const out = [];
|
||||
document.querySelectorAll('table').forEach(t => {
|
||||
const rows = [];
|
||||
t.querySelectorAll('tr').forEach(tr => {
|
||||
const cells = [];
|
||||
tr.querySelectorAll('th, td').forEach(c => {
|
||||
cells.push((c.innerText || c.textContent || '').trim().replace(/\\s+/g, ' '));
|
||||
});
|
||||
if (cells.length >= 2) rows.push(cells.join('\\t'));
|
||||
});
|
||||
if (rows.length >= 3) out.push(rows);
|
||||
});
|
||||
return out.slice(0, 10);
|
||||
}
|
||||
""") or []
|
||||
except Exception:
|
||||
return []
|
||||
|
||||
|
||||
def _matches_dsi_keyword(text: str) -> tuple[bool, str]:
|
||||
"""Check if text contains any DSI keyword. Returns (match, language)."""
|
||||
text_lower = text.lower().strip()
|
||||
|
||||
Reference in New Issue
Block a user