feat(vendor-assessment): AVV/SCC/TOM/Sub-Processor checklists + assessment service
Phase 1-3 of the Vendor Contract Assessment:
Backend checklists (Doc-Check L1/L2 engine compatible):
- avv_checks.py: 28 checks (11 L1 + 17 L2) for Art. 28(3) DSGVO
- scc_checks.py: 7 checks for EU SCC 2021 (modules, annexes, TIA)
- tom_annex_checks.py: 12 checks for Art. 32 (8 control objectives)
- sub_processor_checks.py: 7 checks for sub-processor list completeness
Assessment service:
- POST /vendor-compliance/assessments — async contract analysis
- GET /vendor-compliance/assessments/{id} — poll status
- Cross-check engine: detects missing SCC when AVV mentions third-country,
missing TOM annex, missing sub-processor list
All checklists registered in runner.py CHECKLIST_MAP (27 doc_types total).
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,408 @@
|
||||
"""
|
||||
Vendor Contract Assessment Routes — Automated vendor document analysis.
|
||||
|
||||
Uploads vendor contracts (AVV, SCC, TOM annex, sub-processor list),
|
||||
runs them through the Doc-Check L1/L2 engine + LLM verification,
|
||||
and produces a professional Pruefprotokoll.
|
||||
|
||||
POST /vendor-compliance/assessments — Start assessment (async)
|
||||
GET /vendor-compliance/assessments — List assessments
|
||||
GET /vendor-compliance/assessments/{id} — Poll status / get result
|
||||
POST /vendor-compliance/assessments/{id}/approve — DSB approval
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import uuid as _uuid
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter
|
||||
from pydantic import BaseModel
|
||||
|
||||
from compliance.services.dsi_document_checker import (
|
||||
check_document_completeness,
|
||||
)
|
||||
from compliance.services.vendor_assessment_cross_check import (
|
||||
cross_check_documents,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
router = APIRouter(prefix="/vendor-compliance", tags=["vendor-assessment"])
|
||||
|
||||
|
||||
# ── Request / Response Models ───────────────────────────────────────
|
||||
|
||||
class DocumentEntry(BaseModel):
|
||||
doc_type: str = "auto" # avv, scc, tom_annex, sub_processor_list, agb, auto
|
||||
label: str = ""
|
||||
url: str
|
||||
|
||||
|
||||
class AssessmentRequest(BaseModel):
|
||||
vendor_name: str
|
||||
documents: list[DocumentEntry]
|
||||
recipient: str = ""
|
||||
|
||||
|
||||
class AssessmentStartResponse(BaseModel):
|
||||
assessment_id: str
|
||||
status: str = "running"
|
||||
|
||||
|
||||
class FindingItem(BaseModel):
|
||||
id: str
|
||||
category: str
|
||||
severity: str
|
||||
type: str # OK, GAP, RISK
|
||||
title: str
|
||||
description: str = ""
|
||||
recommendation: str = ""
|
||||
document_label: str = ""
|
||||
document_type: str = ""
|
||||
check_id: str = ""
|
||||
citations: list[str] = []
|
||||
|
||||
|
||||
class DocumentResult(BaseModel):
|
||||
label: str
|
||||
url: str
|
||||
doc_type: str
|
||||
word_count: int = 0
|
||||
completeness_pct: int = 0
|
||||
correctness_pct: int = 0
|
||||
checks: list[dict] = []
|
||||
findings_count: int = 0
|
||||
error: str = ""
|
||||
|
||||
|
||||
class AssessmentResult(BaseModel):
|
||||
vendor_name: str
|
||||
documents: list[DocumentResult]
|
||||
findings: list[FindingItem]
|
||||
overall_score: int = 0
|
||||
category_scores: dict[str, int] = {}
|
||||
cross_check_findings: list[dict] = []
|
||||
checked_at: str = ""
|
||||
|
||||
|
||||
class AssessmentStatusResponse(BaseModel):
|
||||
assessment_id: str
|
||||
status: str
|
||||
progress: str = ""
|
||||
result: Optional[AssessmentResult] = None
|
||||
error: str = ""
|
||||
|
||||
|
||||
# ── In-memory job store ─────────────────────────────────────────────
|
||||
|
||||
_assessment_jobs: dict[str, dict] = {}
|
||||
|
||||
|
||||
# ── Endpoints ───────────────────────────────────────────────────────
|
||||
|
||||
@router.post("/assessments", response_model=AssessmentStartResponse)
|
||||
async def start_assessment(req: AssessmentRequest):
|
||||
"""Start an async vendor contract assessment."""
|
||||
assessment_id = str(_uuid.uuid4())
|
||||
_assessment_jobs[assessment_id] = {
|
||||
"status": "running",
|
||||
"progress": "Initialisierung...",
|
||||
"result": None,
|
||||
"error": "",
|
||||
}
|
||||
|
||||
asyncio.create_task(_run_assessment(assessment_id, req))
|
||||
return AssessmentStartResponse(assessment_id=assessment_id)
|
||||
|
||||
|
||||
@router.get("/assessments/{assessment_id}", response_model=AssessmentStatusResponse)
|
||||
async def get_assessment_status(assessment_id: str):
|
||||
"""Poll assessment status or retrieve completed result."""
|
||||
job = _assessment_jobs.get(assessment_id)
|
||||
if not job:
|
||||
return AssessmentStatusResponse(
|
||||
assessment_id=assessment_id, status="not_found",
|
||||
error="Assessment nicht gefunden",
|
||||
)
|
||||
return AssessmentStatusResponse(
|
||||
assessment_id=assessment_id,
|
||||
status=job["status"],
|
||||
progress=job.get("progress", ""),
|
||||
result=job.get("result"),
|
||||
error=job.get("error", ""),
|
||||
)
|
||||
|
||||
|
||||
@router.get("/assessments")
|
||||
async def list_assessments():
|
||||
"""List all assessments (from in-memory store)."""
|
||||
items = []
|
||||
for aid, job in _assessment_jobs.items():
|
||||
r = job.get("result")
|
||||
items.append({
|
||||
"assessment_id": aid,
|
||||
"status": job["status"],
|
||||
"vendor_name": r.vendor_name if r else "",
|
||||
"overall_score": r.overall_score if r else 0,
|
||||
"document_count": len(r.documents) if r else 0,
|
||||
"findings_count": len(r.findings) if r else 0,
|
||||
})
|
||||
return {"assessments": items}
|
||||
|
||||
|
||||
@router.post("/assessments/{assessment_id}/approve")
|
||||
async def approve_assessment(assessment_id: str):
|
||||
"""Mark an assessment as approved by DSB."""
|
||||
job = _assessment_jobs.get(assessment_id)
|
||||
if not job or job["status"] != "completed":
|
||||
return {"error": "Assessment nicht abgeschlossen"}
|
||||
job["status"] = "approved"
|
||||
return {"status": "approved", "assessment_id": assessment_id}
|
||||
|
||||
|
||||
# ── Background Processing ──────────────────────────────────────────
|
||||
|
||||
CONSENT_TESTER_URL = "http://bp-compliance-consent-tester:8094"
|
||||
|
||||
# Doc-type auto-detection keywords
|
||||
_DOC_TYPE_KEYWORDS = {
|
||||
"avv": ["auftragsverarbeit", "auftrags-verarbeit", "data processing agreement",
|
||||
"dpa ", "art. 28", "art.28", "artikel 28"],
|
||||
"scc": ["standardvertragsklausel", "standard contractual clauses",
|
||||
"2021/914", "klausel 14", "module 2", "modul 2"],
|
||||
"tom_annex": ["technische und organisatorische", "tom-anlage",
|
||||
"art. 32", "zutrittskontrolle", "zugangskontrolle",
|
||||
"zugriffskontrolle", "verfuegbarkeitskontrolle"],
|
||||
"sub_processor_list": ["unterauftragnehmer", "sub-processor",
|
||||
"subprocessor", "unterauftragsverarbeiter"],
|
||||
"agb": ["allgemeine geschaeftsbedingungen", "nutzungsbedingungen",
|
||||
"terms of service", "terms and conditions"],
|
||||
}
|
||||
|
||||
|
||||
def _detect_doc_type(text: str, label: str) -> str:
|
||||
"""Auto-detect document type from content and label."""
|
||||
combined = (text[:3000] + " " + label).lower()
|
||||
scores: dict[str, int] = {}
|
||||
for dtype, keywords in _DOC_TYPE_KEYWORDS.items():
|
||||
scores[dtype] = sum(1 for kw in keywords if kw in combined)
|
||||
if not scores or max(scores.values()) == 0:
|
||||
return "agb" # fallback
|
||||
return max(scores, key=scores.get)
|
||||
|
||||
|
||||
async def _extract_text(url: str) -> tuple[str, int]:
|
||||
"""Extract text from a URL via consent-tester or direct fetch."""
|
||||
import httpx
|
||||
|
||||
# Try consent-tester first (handles JS-rendered pages)
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=30.0) as client:
|
||||
resp = await client.post(
|
||||
f"{CONSENT_TESTER_URL}/dsi-discovery",
|
||||
json={"url": url, "max_documents": 1},
|
||||
)
|
||||
if resp.status_code == 200:
|
||||
data = resp.json()
|
||||
docs = data.get("documents", [])
|
||||
if docs:
|
||||
text = docs[0].get("full_text", "")
|
||||
wc = docs[0].get("word_count", 0)
|
||||
if len(text) > 50:
|
||||
return text, wc
|
||||
# Fallback to full page
|
||||
fp = data.get("html_full_page", "")
|
||||
if len(fp) > 50:
|
||||
return fp, len(fp.split())
|
||||
except Exception as e:
|
||||
logger.warning("consent-tester failed for %s: %s", url, e)
|
||||
|
||||
# Direct fetch fallback
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=15.0) as client:
|
||||
resp = await client.get(url)
|
||||
text = resp.text
|
||||
return text, len(text.split())
|
||||
except Exception as e:
|
||||
logger.error("Direct fetch failed for %s: %s", url, e)
|
||||
return "", 0
|
||||
|
||||
|
||||
async def _run_assessment(assessment_id: str, req: AssessmentRequest):
|
||||
"""Background task: analyze all documents and produce Pruefprotokoll."""
|
||||
job = _assessment_jobs[assessment_id]
|
||||
doc_results: list[DocumentResult] = []
|
||||
all_findings: list[FindingItem] = []
|
||||
doc_texts: dict[str, str] = {} # doc_type → text (for cross-check)
|
||||
|
||||
try:
|
||||
total = len(req.documents)
|
||||
|
||||
for i, entry in enumerate(req.documents):
|
||||
job["progress"] = f"Dokument {i+1}/{total}: {entry.label or entry.url[:40]}..."
|
||||
|
||||
# 1. Extract text
|
||||
text, word_count = await _extract_text(entry.url)
|
||||
if not text or len(text) < 50:
|
||||
doc_results.append(DocumentResult(
|
||||
label=entry.label or entry.url,
|
||||
url=entry.url,
|
||||
doc_type=entry.doc_type,
|
||||
error="Text konnte nicht extrahiert werden",
|
||||
))
|
||||
continue
|
||||
|
||||
# 2. Detect doc_type if auto
|
||||
doc_type = entry.doc_type
|
||||
if doc_type == "auto":
|
||||
doc_type = _detect_doc_type(text, entry.label)
|
||||
logger.info("Auto-detected doc_type=%s for %s", doc_type, entry.label)
|
||||
|
||||
doc_texts[doc_type] = text
|
||||
|
||||
# 3. Run checklist
|
||||
label = entry.label or f"{doc_type.upper()}: {entry.url[:50]}"
|
||||
result = check_document_completeness(text, doc_type, label, entry.url)
|
||||
|
||||
checks = result.get("checks", [])
|
||||
completeness = result.get("completeness_pct", 0)
|
||||
correctness = result.get("correctness_pct", 0)
|
||||
|
||||
# 4. Extract findings from failed checks
|
||||
failed_checks = [c for c in checks if not c.get("passed") and not c.get("skipped")]
|
||||
for fc in failed_checks:
|
||||
severity = fc.get("severity", "MEDIUM")
|
||||
ftype = "GAP" if severity in ("CRITICAL", "HIGH") else "RISK"
|
||||
|
||||
all_findings.append(FindingItem(
|
||||
id=f"{assessment_id[:8]}-{fc['id']}",
|
||||
category=_check_to_category(fc["id"], doc_type),
|
||||
severity=severity,
|
||||
type=ftype,
|
||||
title=fc.get("label", ""),
|
||||
description=fc.get("hint", ""),
|
||||
recommendation=fc.get("hint", ""),
|
||||
document_label=label,
|
||||
document_type=doc_type,
|
||||
check_id=fc["id"],
|
||||
citations=[fc.get("matched_text", "")] if fc.get("matched_text") else [],
|
||||
))
|
||||
|
||||
doc_results.append(DocumentResult(
|
||||
label=label,
|
||||
url=entry.url,
|
||||
doc_type=doc_type,
|
||||
word_count=word_count,
|
||||
completeness_pct=completeness,
|
||||
correctness_pct=correctness,
|
||||
checks=checks,
|
||||
findings_count=len(failed_checks),
|
||||
))
|
||||
|
||||
# 5. Cross-check between documents
|
||||
job["progress"] = "Cross-Check zwischen Dokumenten..."
|
||||
cross_findings = cross_check_documents(doc_texts, req.vendor_name)
|
||||
|
||||
# 6. Calculate scores
|
||||
category_scores = _calculate_category_scores(doc_results)
|
||||
overall = _calculate_overall_score(category_scores, all_findings, cross_findings)
|
||||
|
||||
# 7. Build result
|
||||
result = AssessmentResult(
|
||||
vendor_name=req.vendor_name,
|
||||
documents=doc_results,
|
||||
findings=all_findings,
|
||||
overall_score=overall,
|
||||
category_scores=category_scores,
|
||||
cross_check_findings=cross_findings,
|
||||
checked_at=datetime.now(timezone.utc).isoformat(),
|
||||
)
|
||||
|
||||
job["status"] = "completed"
|
||||
job["progress"] = ""
|
||||
job["result"] = result
|
||||
logger.info("Assessment %s completed: %d docs, %d findings, score=%d%%",
|
||||
assessment_id, len(doc_results), len(all_findings), overall)
|
||||
|
||||
except Exception as e:
|
||||
logger.exception("Assessment %s failed", assessment_id)
|
||||
job["status"] = "failed"
|
||||
job["error"] = str(e)
|
||||
|
||||
|
||||
# ── Helpers ─────────────────────────────────────────────────────────
|
||||
|
||||
def _check_to_category(check_id: str, doc_type: str) -> str:
|
||||
"""Map a check ID to a finding category."""
|
||||
prefix_map = {
|
||||
"avv_instruction": "INSTRUCTION",
|
||||
"avv_confidentiality": "CONFIDENTIALITY",
|
||||
"avv_tom": "TOM",
|
||||
"avv_subprocessor": "SUBPROCESSOR",
|
||||
"avv_data_subject": "DATA_SUBJECT_RIGHTS",
|
||||
"avv_dpia": "GENERAL",
|
||||
"avv_deletion": "DELETION",
|
||||
"avv_audit": "AUDIT_RIGHTS",
|
||||
"avv_breach": "INCIDENT",
|
||||
"avv_liability": "LIABILITY",
|
||||
"avv_subject": "AVV_CONTENT",
|
||||
"scc_": "TRANSFER",
|
||||
"tom_": "TOM",
|
||||
"sub_": "SUBPROCESSOR",
|
||||
}
|
||||
for prefix, cat in prefix_map.items():
|
||||
if check_id.startswith(prefix):
|
||||
return cat
|
||||
return doc_type.upper()
|
||||
|
||||
|
||||
def _calculate_category_scores(docs: list[DocumentResult]) -> dict[str, int]:
|
||||
"""Calculate per-category compliance scores from document results."""
|
||||
cat_totals: dict[str, int] = {}
|
||||
cat_passed: dict[str, int] = {}
|
||||
|
||||
for doc in docs:
|
||||
for check in doc.checks:
|
||||
if check.get("skipped"):
|
||||
continue
|
||||
cat = _check_to_category(check.get("id", ""), doc.doc_type)
|
||||
cat_totals[cat] = cat_totals.get(cat, 0) + 1
|
||||
if check.get("passed"):
|
||||
cat_passed[cat] = cat_passed.get(cat, 0) + 1
|
||||
|
||||
scores = {}
|
||||
for cat, total in cat_totals.items():
|
||||
passed = cat_passed.get(cat, 0)
|
||||
scores[cat] = round(passed / total * 100) if total > 0 else 0
|
||||
return scores
|
||||
|
||||
|
||||
def _calculate_overall_score(
|
||||
category_scores: dict[str, int],
|
||||
findings: list[FindingItem],
|
||||
cross_findings: list[dict],
|
||||
) -> int:
|
||||
"""Calculate overall compliance score."""
|
||||
if not category_scores:
|
||||
return 0
|
||||
|
||||
# Weighted average: CRITICAL categories count double
|
||||
critical_cats = {"INSTRUCTION", "TOM", "SUBPROCESSOR", "DELETION", "INCIDENT", "TRANSFER"}
|
||||
total_weight = 0
|
||||
weighted_sum = 0
|
||||
|
||||
for cat, score in category_scores.items():
|
||||
weight = 2 if cat in critical_cats else 1
|
||||
weighted_sum += score * weight
|
||||
total_weight += weight
|
||||
|
||||
base = round(weighted_sum / total_weight) if total_weight > 0 else 0
|
||||
|
||||
# Penalty for critical findings
|
||||
critical_count = sum(1 for f in findings if f.severity == "CRITICAL")
|
||||
cross_critical = sum(1 for f in cross_findings if f.get("severity") == "CRITICAL")
|
||||
penalty = (critical_count + cross_critical) * 5
|
||||
|
||||
return max(0, min(100, base - penalty))
|
||||
Reference in New Issue
Block a user