feat(vendor-assessment): AVV/SCC/TOM/Sub-Processor checklists + assessment service

Phase 1-3 of the Vendor Contract Assessment: Backend checklists (Doc-Check L1/L2 engine compatible): - avv_checks.py: 28 checks (11 L1 + 17 L2) for Art. 28(3) DSGVO - scc_checks.py: 7 checks for EU SCC 2021 (modules, annexes, TIA) - tom_annex_checks.py: 12 checks for Art. 32 (8 control objectives) - sub_processor_checks.py: 7 checks for sub-processor list completeness Assessment service: - POST /vendor-compliance/assessments — async contract analysis - GET /vendor-compliance/assessments/{id} — poll status - Cross-check engine: detects missing SCC when AVV mentions third-country, missing TOM annex, missing sub-processor list All checklists registered in runner.py CHECKLIST_MAP (27 doc_types total). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-12 23:14:54 +02:00
parent c867478791
commit 0326d5baab
8 changed files with 1359 additions and 0 deletions
@@ -0,0 +1,408 @@
+"""
+Vendor Contract Assessment Routes — Automated vendor document analysis.
+
+Uploads vendor contracts (AVV, SCC, TOM annex, sub-processor list),
+runs them through the Doc-Check L1/L2 engine + LLM verification,
+and produces a professional Pruefprotokoll.
+
+POST /vendor-compliance/assessments       — Start assessment (async)
+GET  /vendor-compliance/assessments       — List assessments
+GET  /vendor-compliance/assessments/{id}  — Poll status / get result
+POST /vendor-compliance/assessments/{id}/approve — DSB approval
+"""
+
+import asyncio
+import logging
+import uuid as _uuid
+from datetime import datetime, timezone
+from typing import Optional
+
+from fastapi import APIRouter
+from pydantic import BaseModel
+
+from compliance.services.dsi_document_checker import (
+    check_document_completeness,
+)
+from compliance.services.vendor_assessment_cross_check import (
+    cross_check_documents,
+)
+
+# Module-level logger, named after this module's import path.
+logger = logging.getLogger(__name__)
+
+# Every route declared in this module is served under /vendor-compliance.
+router = APIRouter(prefix="/vendor-compliance", tags=["vendor-assessment"])
+
+
+# ── Request / Response Models ───────────────────────────────────────
+
+class DocumentEntry(BaseModel):
+    """One vendor document submitted for assessment, referenced by URL."""
+
+    # Document kind; "auto" makes the background task detect it by keywords.
+    doc_type: str = "auto"  # avv, scc, tom_annex, sub_processor_list, agb, auto
+    # Optional display name; when empty a label is derived from the URL.
+    label: str = ""
+    # Location the document text is extracted from.
+    url: str
+
+
+class AssessmentRequest(BaseModel):
+    """Request body for starting a vendor contract assessment."""
+
+    # Vendor name; copied into the result and handed to the cross-check.
+    vendor_name: str
+    # Documents to analyze, processed in the order given.
+    documents: list[DocumentEntry]
+    # NOTE(review): not read by any code visible in this module — confirm use.
+    recipient: str = ""
+
+
+class AssessmentStartResponse(BaseModel):
+    """Response returned immediately after an assessment has been scheduled."""
+
+    # Identifier to use when polling GET /assessments/{id}.
+    assessment_id: str
+    # A freshly started job is always reported as "running".
+    status: str = "running"
+
+
+class FindingItem(BaseModel):
+    """A single finding derived from a failed (non-skipped) checklist check."""
+
+    # Built as "<first 8 chars of the assessment id>-<check id>".
+    id: str
+    # Finding category as returned by _check_to_category.
+    category: str
+    # Severity copied from the check; "MEDIUM" when the check has none.
+    severity: str
+    # The background task emits "GAP" for CRITICAL/HIGH, otherwise "RISK".
+    type: str  # OK, GAP, RISK
+    # Taken from the check's "label".
+    title: str
+    # Description and recommendation are both filled from the check's "hint".
+    description: str = ""
+    recommendation: str = ""
+    # Label and type of the document the check ran against.
+    document_label: str = ""
+    document_type: str = ""
+    # Identifier of the originating check.
+    check_id: str = ""
+    # Holds the check's "matched_text" when one is present, else empty.
+    citations: list[str] = []
+
+
+class DocumentResult(BaseModel):
+    """Per-document outcome of the checklist run."""
+
+    # Display label (request label, or one derived from the URL).
+    label: str
+    # URL the document was fetched from.
+    url: str
+    # Resolved document type; stays as requested when extraction failed.
+    doc_type: str
+    # Word count reported by the text extraction step.
+    word_count: int = 0
+    # Percentages taken from the checker result ("completeness_pct" /
+    # "correctness_pct"); 0 when the checker did not report them.
+    completeness_pct: int = 0
+    correctness_pct: int = 0
+    # Raw check dicts as returned by the checker.
+    checks: list[dict] = []
+    # Number of checks that neither passed nor were skipped.
+    findings_count: int = 0
+    # Set when no usable text could be extracted for this document.
+    error: str = ""
+
+
+class AssessmentResult(BaseModel):
+    """Complete result of one assessment across all submitted documents."""
+
+    # Vendor name copied from the request.
+    vendor_name: str
+    # One entry per submitted document, including those that failed to load.
+    documents: list[DocumentResult]
+    # Findings from failed checks of all documents.
+    findings: list[FindingItem]
+    # Overall score, clamped to 0..100 by _calculate_overall_score.
+    overall_score: int = 0
+    # Percentage of passed checks per finding category.
+    category_scores: dict[str, int] = {}
+    # Findings returned by the cross-document check, kept as raw dicts.
+    cross_check_findings: list[dict] = []
+    # ISO-8601 timestamp (UTC) taken when the result was assembled.
+    checked_at: str = ""
+
+
+class AssessmentStatusResponse(BaseModel):
+    """Polling response describing the state of one assessment job."""
+
+    assessment_id: str
+    # Values set in this module: "running", "completed", "failed",
+    # "approved", and "not_found" for unknown ids.
+    status: str
+    # Human-readable progress text while the job is running.
+    progress: str = ""
+    # Populated once the background task has stored a result on the job.
+    result: Optional[AssessmentResult] = None
+    # Error message for failed or unknown jobs.
+    error: str = ""
+
+
+# ── In-memory job store ─────────────────────────────────────────────
+
+# Maps assessment_id -> job dict with the keys "status", "progress",
+# "result" and "error". The store lives in process memory only; nothing in
+# this module removes entries or persists them.
+_assessment_jobs: dict[str, dict] = {}
+
+
+# ── Endpoints ───────────────────────────────────────────────────────
+
+@router.post("/assessments", response_model=AssessmentStartResponse)
+async def start_assessment(req: AssessmentRequest):
+    """Start an async vendor contract assessment.
+
+    Registers a new job in the in-memory store, schedules the analysis on
+    the running event loop and returns immediately. Clients poll
+    GET /assessments/{id} for progress and the final result.
+
+    Args:
+        req: Vendor name and the documents to analyze.
+
+    Returns:
+        AssessmentStartResponse carrying the generated assessment id.
+    """
+    assessment_id = str(_uuid.uuid4())
+    job = {
+        "status": "running",
+        "progress": "Initialisierung...",
+        "result": None,
+        "error": "",
+    }
+    # Register the job before scheduling: the worker looks it up by id.
+    _assessment_jobs[assessment_id] = job
+
+    # The event loop keeps only weak references to tasks, so a task nobody
+    # else references may be garbage-collected before it finishes. Parking
+    # it on the job entry keeps it alive for as long as the job is stored.
+    job["task"] = asyncio.create_task(_run_assessment(assessment_id, req))
+    return AssessmentStartResponse(assessment_id=assessment_id)
+
+
+@router.get("/assessments/{assessment_id}", response_model=AssessmentStatusResponse)
+async def get_assessment_status(assessment_id: str):
+    """Report the current state of one assessment job.
+
+    Unknown ids are answered with a regular response whose status is
+    "not_found"; known jobs return their stored status, progress text,
+    result (if any) and error message.
+    """
+    entry = _assessment_jobs.get(assessment_id)
+    if entry:
+        return AssessmentStatusResponse(
+            assessment_id=assessment_id,
+            status=entry["status"],
+            progress=entry.get("progress", ""),
+            result=entry.get("result"),
+            error=entry.get("error", ""),
+        )
+
+    # No such job: answer with a sentinel status instead of raising.
+    return AssessmentStatusResponse(
+        assessment_id=assessment_id,
+        status="not_found",
+        error="Assessment nicht gefunden",
+    )
+
+
+@router.get("/assessments")
+async def list_assessments():
+    """Return a summary row for every job held in the in-memory store.
+
+    Jobs without a stored result yet are listed with an empty vendor name
+    and zeroed counters.
+    """
+
+    def _summarize(job_id: str, entry: dict) -> dict:
+        # Build one summary row; result-derived fields fall back to blanks.
+        outcome = entry.get("result")
+        if outcome:
+            vendor = outcome.vendor_name
+            score = outcome.overall_score
+            n_docs = len(outcome.documents)
+            n_findings = len(outcome.findings)
+        else:
+            vendor, score, n_docs, n_findings = "", 0, 0, 0
+        return {
+            "assessment_id": job_id,
+            "status": entry["status"],
+            "vendor_name": vendor,
+            "overall_score": score,
+            "document_count": n_docs,
+            "findings_count": n_findings,
+        }
+
+    rows = [_summarize(job_id, entry) for job_id, entry in _assessment_jobs.items()]
+    return {"assessments": rows}
+
+
+@router.post("/assessments/{assessment_id}/approve")
+async def approve_assessment(assessment_id: str):
+    """Flip a completed assessment to "approved" (DSB sign-off).
+
+    Only jobs whose status is exactly "completed" can be approved; unknown
+    ids and jobs in any other state get an error payload instead.
+    """
+    entry = _assessment_jobs.get(assessment_id)
+    approvable = bool(entry) and entry["status"] == "completed"
+    if approvable:
+        entry["status"] = "approved"
+        return {"status": "approved", "assessment_id": assessment_id}
+    return {"error": "Assessment nicht abgeschlossen"}
+
+
+# ── Background Processing ──────────────────────────────────────────
+
+# Base URL of the consent-tester service that _extract_text posts to.
+# NOTE(review): hard-coded; the hostname looks like an internal service
+# name — confirm it is valid in every deployment environment.
+CONSENT_TESTER_URL = "http://bp-compliance-consent-tester:8094"
+
+# Doc-type auto-detection keywords
+# Each doc type maps to lowercase substrings that _detect_doc_type looks for
+# in the lowercased document text and label. Order matters: on equal hit
+# counts the doc type listed first here wins.
+_DOC_TYPE_KEYWORDS = {
+    # "dpa " carries a trailing space — presumably to avoid matching inside
+    # longer words; verify this is intended.
+    "avv": ["auftragsverarbeit", "auftrags-verarbeit", "data processing agreement",
+            "dpa ", "art. 28", "art.28", "artikel 28"],
+    "scc": ["standardvertragsklausel", "standard contractual clauses",
+            "2021/914", "klausel 14", "module 2", "modul 2"],
+    "tom_annex": ["technische und organisatorische", "tom-anlage",
+                  "art. 32", "zutrittskontrolle", "zugangskontrolle",
+                  "zugriffskontrolle", "verfuegbarkeitskontrolle"],
+    "sub_processor_list": ["unterauftragnehmer", "sub-processor",
+                           "subprocessor", "unterauftragsverarbeiter"],
+    "agb": ["allgemeine geschaeftsbedingungen", "nutzungsbedingungen",
+            "terms of service", "terms and conditions"],
+}
+
+
+def _detect_doc_type(text: str, label: str) -> str:
+    """Guess the document type from keyword hits in content and label.
+
+    Only the first 3000 characters of ``text`` plus the label are searched,
+    case-insensitively. The type with the most distinct keyword hits wins;
+    ties go to the type listed first in ``_DOC_TYPE_KEYWORDS``. Without any
+    hit the result is "agb".
+    """
+    haystack = f"{text[:3000]} {label}".lower()
+
+    best_type = "agb"  # fallback when nothing matches
+    best_hits = 0
+    for candidate, needles in _DOC_TYPE_KEYWORDS.items():
+        hits = len([needle for needle in needles if needle in haystack])
+        # Strict comparison keeps the earliest candidate on equal counts.
+        if hits > best_hits:
+            best_type, best_hits = candidate, hits
+    return best_type
+
+
+async def _extract_text(url: str) -> tuple[str, int]:
+    """Extract text from a URL via consent-tester or direct fetch.
+
+    The consent-tester service is asked first; its first discovered
+    document is used when it carries more than 50 characters, otherwise
+    its full-page capture. If that yields nothing usable, the URL is
+    fetched directly.
+
+    Args:
+        url: Location of the vendor document.
+
+    Returns:
+        ``(text, word_count)``. ``("", 0)`` when the direct fetch fails too,
+        including when the server answers with an HTTP error status.
+    """
+    import httpx
+
+    # NOTE(review): ``url`` is taken from the request body and fetched
+    # server-side without an allow-list (SSRF surface) — confirm callers are
+    # trusted or validate the URL upstream.
+
+    # Try consent-tester first (handles JS-rendered pages)
+    try:
+        async with httpx.AsyncClient(timeout=30.0) as client:
+            resp = await client.post(
+                f"{CONSENT_TESTER_URL}/dsi-discovery",
+                json={"url": url, "max_documents": 1},
+            )
+            if resp.status_code == 200:
+                data = resp.json()
+                docs = data.get("documents", [])
+                if docs:
+                    text = docs[0].get("full_text", "")
+                    wc = docs[0].get("word_count", 0)
+                    if len(text) > 50:
+                        return text, wc
+                # Fallback to full page
+                fp = data.get("html_full_page", "")
+                if len(fp) > 50:
+                    return fp, len(fp.split())
+    except Exception as e:
+        # Best effort: any failure here just moves on to the direct fetch.
+        logger.warning("consent-tester failed for %s: %s", url, e)
+
+    # Direct fetch fallback
+    try:
+        # httpx does not follow redirects unless asked to; without this a
+        # moved document would come back as an empty 3xx body.
+        async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
+            resp = await client.get(url)
+            # An HTTP error page is not the vendor's contract: treat it as a
+            # failed extraction instead of running the checklist on it.
+            resp.raise_for_status()
+            text = resp.text
+            return text, len(text.split())
+    except Exception as e:
+        logger.error("Direct fetch failed for %s: %s", url, e)
+        return "", 0
+
+
+async def _run_assessment(assessment_id: str, req: AssessmentRequest):
+    """Background task: analyze all documents and produce Pruefprotokoll.
+
+    For every document the text is extracted, the document type is resolved
+    (keyword detection for "auto"), the checklist is run and each failed,
+    non-skipped check becomes a finding. Afterwards the documents are
+    cross-checked, scores are calculated and the result is stored on the
+    job. Exceptions raised during processing are logged and recorded on the
+    job (status "failed") rather than propagated.
+
+    Args:
+        assessment_id: Key of the job entry in ``_assessment_jobs``.
+        req: The original request (vendor name and documents).
+    """
+    job = _assessment_jobs[assessment_id]
+    doc_results: list[DocumentResult] = []
+    all_findings: list[FindingItem] = []
+    doc_texts: dict[str, str] = {}  # doc_type → text (for cross-check)
+
+    try:
+        total = len(req.documents)
+
+        for i, entry in enumerate(req.documents):
+            job["progress"] = f"Dokument {i+1}/{total}: {entry.label or entry.url[:40]}..."
+
+            # 1. Extract text
+            text, word_count = await _extract_text(entry.url)
+            if not text or len(text) < 50:
+                # Too little text to assess: record the failure and move on.
+                doc_results.append(DocumentResult(
+                    label=entry.label or entry.url,
+                    url=entry.url,
+                    doc_type=entry.doc_type,
+                    error="Text konnte nicht extrahiert werden",
+                ))
+                continue
+
+            # 2. Detect doc_type if auto
+            doc_type = entry.doc_type
+            if doc_type == "auto":
+                doc_type = _detect_doc_type(text, entry.label)
+                logger.info("Auto-detected doc_type=%s for %s", doc_type, entry.label)
+
+            # Several documents may share one type (e.g. a contract plus an
+            # addendum). Append instead of overwriting so the cross-check
+            # sees all of them, not only the last one.
+            # NOTE(review): assumes cross_check_documents only inspects the
+            # text content per type — confirm against its implementation.
+            if doc_type in doc_texts:
+                doc_texts[doc_type] = f"{doc_texts[doc_type]}\n\n{text}"
+            else:
+                doc_texts[doc_type] = text
+
+            # 3. Run checklist
+            label = entry.label or f"{doc_type.upper()}: {entry.url[:50]}"
+            result = check_document_completeness(text, doc_type, label, entry.url)
+
+            checks = result.get("checks", [])
+            completeness = result.get("completeness_pct", 0)
+            correctness = result.get("correctness_pct", 0)
+
+            # 4. Extract findings from failed checks
+            failed_checks = [c for c in checks if not c.get("passed") and not c.get("skipped")]
+            for fc in failed_checks:
+                # Tolerate checks without an id, as the score helper does,
+                # instead of failing the whole assessment with a KeyError.
+                check_id = fc.get("id", "")
+                severity = fc.get("severity", "MEDIUM")
+                ftype = "GAP" if severity in ("CRITICAL", "HIGH") else "RISK"
+
+                all_findings.append(FindingItem(
+                    id=f"{assessment_id[:8]}-{check_id}",
+                    category=_check_to_category(check_id, doc_type),
+                    severity=severity,
+                    type=ftype,
+                    title=fc.get("label", ""),
+                    description=fc.get("hint", ""),
+                    recommendation=fc.get("hint", ""),
+                    document_label=label,
+                    document_type=doc_type,
+                    check_id=check_id,
+                    citations=[fc.get("matched_text", "")] if fc.get("matched_text") else [],
+                ))
+
+            doc_results.append(DocumentResult(
+                label=label,
+                url=entry.url,
+                doc_type=doc_type,
+                word_count=word_count,
+                completeness_pct=completeness,
+                correctness_pct=correctness,
+                checks=checks,
+                findings_count=len(failed_checks),
+            ))
+
+        # 5. Cross-check between documents
+        job["progress"] = "Cross-Check zwischen Dokumenten..."
+        cross_findings = cross_check_documents(doc_texts, req.vendor_name)
+
+        # 6. Calculate scores
+        category_scores = _calculate_category_scores(doc_results)
+        overall = _calculate_overall_score(category_scores, all_findings, cross_findings)
+
+        # 7. Build result
+        result = AssessmentResult(
+            vendor_name=req.vendor_name,
+            documents=doc_results,
+            findings=all_findings,
+            overall_score=overall,
+            category_scores=category_scores,
+            cross_check_findings=cross_findings,
+            checked_at=datetime.now(timezone.utc).isoformat(),
+        )
+
+        job["status"] = "completed"
+        job["progress"] = ""
+        job["result"] = result
+        logger.info("Assessment %s completed: %d docs, %d findings, score=%d%%",
+                     assessment_id, len(doc_results), len(all_findings), overall)
+
+    except Exception as e:
+        # Top-level boundary of the background task: record the failure on
+        # the job so that polling clients see it.
+        logger.exception("Assessment %s failed", assessment_id)
+        job["status"] = "failed"
+        job["error"] = str(e)
+
+
+# ── Helpers ─────────────────────────────────────────────────────────
+
+def _check_to_category(check_id: str, doc_type: str) -> str:
+    """Translate a check id into its finding category.
+
+    The id is compared against a fixed, ordered list of prefixes and the
+    first match decides. Ids matching no prefix fall back to the
+    upper-cased document type.
+    """
+    # Ordered (prefix, category) pairs; the first matching prefix wins.
+    prefix_rules = (
+        ("avv_instruction", "INSTRUCTION"),
+        ("avv_confidentiality", "CONFIDENTIALITY"),
+        ("avv_tom", "TOM"),
+        ("avv_subprocessor", "SUBPROCESSOR"),
+        ("avv_data_subject", "DATA_SUBJECT_RIGHTS"),
+        ("avv_dpia", "GENERAL"),
+        ("avv_deletion", "DELETION"),
+        ("avv_audit", "AUDIT_RIGHTS"),
+        ("avv_breach", "INCIDENT"),
+        ("avv_liability", "LIABILITY"),
+        ("avv_subject", "AVV_CONTENT"),
+        ("scc_", "TRANSFER"),
+        ("tom_", "TOM"),
+        ("sub_", "SUBPROCESSOR"),
+    )
+    matched = next(
+        (category for prefix, category in prefix_rules if check_id.startswith(prefix)),
+        None,
+    )
+    if matched is not None:
+        return matched
+    return doc_type.upper()
+
+
+def _calculate_category_scores(docs: list[DocumentResult]) -> dict[str, int]:
+    """Compute the share of passed checks per finding category.
+
+    Skipped checks are ignored. Every other check counts towards the
+    category derived from its id and the document type; the score is the
+    rounded percentage of passed checks in that category.
+    """
+    # category -> [passed, evaluated]
+    tally: dict[str, list[int]] = {}
+
+    for document in docs:
+        relevant = (item for item in document.checks if not item.get("skipped"))
+        for item in relevant:
+            category = _check_to_category(item.get("id", ""), document.doc_type)
+            counts = tally.setdefault(category, [0, 0])
+            counts[1] += 1
+            if item.get("passed"):
+                counts[0] += 1
+
+    return {
+        category: round(n_passed / n_evaluated * 100) if n_evaluated > 0 else 0
+        for category, (n_passed, n_evaluated) in tally.items()
+    }
+
+
+def _calculate_overall_score(
+    category_scores: dict[str, int],
+    findings: list[FindingItem],
+    cross_findings: list[dict],
+) -> int:
+    """Derive the overall compliance score (0..100).
+
+    The base is a weighted average of the category scores in which a fixed
+    set of categories counts double. Every CRITICAL finding, from checks or
+    from the cross-check, then costs 5 points. Without any category score
+    the result is 0.
+    """
+    if not category_scores:
+        return 0
+
+    # Categories that enter the average with weight 2 instead of 1.
+    double_weighted = {"INSTRUCTION", "TOM", "SUBPROCESSOR", "DELETION", "INCIDENT", "TRANSFER"}
+    weights = {cat: (2 if cat in double_weighted else 1) for cat in category_scores}
+
+    weight_total = sum(weights.values())
+    weighted_total = sum(score * weights[cat] for cat, score in category_scores.items())
+    base_score = round(weighted_total / weight_total) if weight_total > 0 else 0
+
+    # Deduct points for critical findings from both sources.
+    n_critical = len([f for f in findings if f.severity == "CRITICAL"])
+    n_critical += len([f for f in cross_findings if f.get("severity") == "CRITICAL"])
+    deduction = 5 * n_critical
+
+    return min(100, max(0, base_score - deduction))