""" Vendor Contract Assessment Routes — Automated vendor document analysis. Uploads vendor contracts (AVV, SCC, TOM annex, sub-processor list), runs them through the Doc-Check L1/L2 engine + LLM verification, and produces a professional Pruefprotokoll. POST /vendor-compliance/assessments — Start assessment (async) GET /vendor-compliance/assessments — List assessments GET /vendor-compliance/assessments/{id} — Poll status / get result POST /vendor-compliance/assessments/{id}/approve — DSB approval """ import asyncio import logging import uuid as _uuid from datetime import datetime, timezone from typing import Optional from fastapi import APIRouter from pydantic import BaseModel from compliance.services.dsi_document_checker import ( check_document_completeness, ) from compliance.services.vendor_assessment_cross_check import ( cross_check_documents, ) from compliance.services.vendor_assessment_report import ( build_pruefprotokoll, ) logger = logging.getLogger(__name__) router = APIRouter(prefix="/vendor-compliance", tags=["vendor-assessment"]) # ── Request / Response Models ─────────────────────────────────────── class DocumentEntry(BaseModel): doc_type: str = "auto" # avv, scc, tom_annex, sub_processor_list, agb, auto label: str = "" url: str class AssessmentRequest(BaseModel): vendor_name: str documents: list[DocumentEntry] recipient: str = "" class AssessmentStartResponse(BaseModel): assessment_id: str status: str = "running" class FindingItem(BaseModel): id: str category: str severity: str type: str # OK, GAP, RISK title: str description: str = "" recommendation: str = "" document_label: str = "" document_type: str = "" check_id: str = "" citations: list[str] = [] class DocumentResult(BaseModel): label: str url: str doc_type: str word_count: int = 0 completeness_pct: int = 0 correctness_pct: int = 0 checks: list[dict] = [] findings_count: int = 0 error: str = "" class AssessmentResult(BaseModel): vendor_name: str documents: list[DocumentResult] findings: list[FindingItem] overall_score: int = 0 category_scores: dict[str, int] = {} cross_check_findings: list[dict] = [] report_html: str = "" checked_at: str = "" class AssessmentStatusResponse(BaseModel): assessment_id: str status: str progress: str = "" result: Optional[AssessmentResult] = None error: str = "" # ── In-memory job store ───────────────────────────────────────────── _assessment_jobs: dict[str, dict] = {} # ── Endpoints ─────────────────────────────────────────────────────── @router.post("/assessments", response_model=AssessmentStartResponse) async def start_assessment(req: AssessmentRequest): """Start an async vendor contract assessment.""" assessment_id = str(_uuid.uuid4()) _assessment_jobs[assessment_id] = { "status": "running", "progress": "Initialisierung...", "result": None, "error": "", } asyncio.create_task(_run_assessment(assessment_id, req)) return AssessmentStartResponse(assessment_id=assessment_id) @router.get("/assessments/{assessment_id}", response_model=AssessmentStatusResponse) async def get_assessment_status(assessment_id: str): """Poll assessment status or retrieve completed result.""" job = _assessment_jobs.get(assessment_id) if not job: return AssessmentStatusResponse( assessment_id=assessment_id, status="not_found", error="Assessment nicht gefunden", ) return AssessmentStatusResponse( assessment_id=assessment_id, status=job["status"], progress=job.get("progress", ""), result=job.get("result"), error=job.get("error", ""), ) @router.get("/assessments") async def list_assessments(): """List all assessments 
import asyncio
import logging
import uuid as _uuid
from datetime import datetime, timezone
from typing import Optional

from fastapi import APIRouter
from pydantic import BaseModel

from compliance.services.dsi_document_checker import (
    check_document_completeness,
)
from compliance.services.vendor_assessment_cross_check import (
    cross_check_documents,
)
from compliance.services.vendor_assessment_report import (
    build_pruefprotokoll,
)

logger = logging.getLogger(__name__)

router = APIRouter(prefix="/vendor-compliance", tags=["vendor-assessment"])


# ── Request / Response Models ───────────────────────────────────────

class DocumentEntry(BaseModel):
    doc_type: str = "auto"  # avv, scc, tom_annex, sub_processor_list, agb, auto
    label: str = ""
    url: str


class AssessmentRequest(BaseModel):
    vendor_name: str
    documents: list[DocumentEntry]
    recipient: str = ""


class AssessmentStartResponse(BaseModel):
    assessment_id: str
    status: str = "running"


class FindingItem(BaseModel):
    id: str
    category: str
    severity: str
    type: str  # OK, GAP, RISK
    title: str
    description: str = ""
    recommendation: str = ""
    document_label: str = ""
    document_type: str = ""
    check_id: str = ""
    citations: list[str] = []


class DocumentResult(BaseModel):
    label: str
    url: str
    doc_type: str
    word_count: int = 0
    completeness_pct: int = 0
    correctness_pct: int = 0
    checks: list[dict] = []
    findings_count: int = 0
    error: str = ""


class AssessmentResult(BaseModel):
    vendor_name: str
    documents: list[DocumentResult]
    findings: list[FindingItem]
    overall_score: int = 0
    category_scores: dict[str, int] = {}
    cross_check_findings: list[dict] = []
    report_html: str = ""
    checked_at: str = ""


class AssessmentStatusResponse(BaseModel):
    assessment_id: str
    status: str
    progress: str = ""
    result: Optional[AssessmentResult] = None
    error: str = ""


# ── In-memory job store ─────────────────────────────────────────────

_assessment_jobs: dict[str, dict] = {}


# ── Endpoints ───────────────────────────────────────────────────────

@router.post("/assessments", response_model=AssessmentStartResponse)
async def start_assessment(req: AssessmentRequest):
    """Start an async vendor contract assessment."""
    assessment_id = str(_uuid.uuid4())
    _assessment_jobs[assessment_id] = {
        "status": "running",
        "progress": "Initialisierung...",
        "result": None,
        "error": "",
    }
    # Keep a reference to the task so the fire-and-forget job is not
    # garbage-collected mid-run (a known asyncio.create_task pitfall).
    _assessment_jobs[assessment_id]["task"] = asyncio.create_task(
        _run_assessment(assessment_id, req)
    )
    return AssessmentStartResponse(assessment_id=assessment_id)


@router.get("/assessments/{assessment_id}", response_model=AssessmentStatusResponse)
async def get_assessment_status(assessment_id: str):
    """Poll assessment status or retrieve completed result."""
    job = _assessment_jobs.get(assessment_id)
    if not job:
        return AssessmentStatusResponse(
            assessment_id=assessment_id,
            status="not_found",
            error="Assessment nicht gefunden",
        )
    return AssessmentStatusResponse(
        assessment_id=assessment_id,
        status=job["status"],
        progress=job.get("progress", ""),
        result=job.get("result"),
        error=job.get("error", ""),
    )


@router.get("/assessments")
async def list_assessments():
    """List all assessments (from in-memory store)."""
    items = []
    for aid, job in _assessment_jobs.items():
        r = job.get("result")
        items.append({
            "assessment_id": aid,
            "status": job["status"],
            "vendor_name": r.vendor_name if r else "",
            "overall_score": r.overall_score if r else 0,
            "document_count": len(r.documents) if r else 0,
            "findings_count": len(r.findings) if r else 0,
        })
    return {"assessments": items}


@router.post("/assessments/{assessment_id}/approve")
async def approve_assessment(assessment_id: str):
    """Mark an assessment as approved by DSB."""
    job = _assessment_jobs.get(assessment_id)
    if not job or job["status"] != "completed":
        return {"error": "Assessment nicht abgeschlossen"}
    job["status"] = "approved"
    return {"status": "approved", "assessment_id": assessment_id}


# ── Background Processing ───────────────────────────────────────────

CONSENT_TESTER_URL = "http://bp-compliance-consent-tester:8094"

# Doc-type auto-detection keywords
_DOC_TYPE_KEYWORDS = {
    "avv": [
        "auftragsverarbeit", "auftrags-verarbeit", "data processing agreement",
        "dpa ", "art. 28", "art.28", "artikel 28",
    ],
    "scc": [
        "standardvertragsklausel", "standard contractual clauses", "2021/914",
        "klausel 14", "module 2", "modul 2",
    ],
    "tom_annex": [
        "technische und organisatorische", "tom-anlage", "art. 32",
        "zutrittskontrolle", "zugangskontrolle", "zugriffskontrolle",
        "verfuegbarkeitskontrolle",
    ],
    "sub_processor_list": [
        "unterauftragnehmer", "sub-processor", "subprocessor",
        "unterauftragsverarbeiter",
    ],
    "agb": [
        "allgemeine geschaeftsbedingungen", "nutzungsbedingungen",
        "terms of service", "terms and conditions",
    ],
}


def _detect_doc_type(text: str, label: str) -> str:
    """Auto-detect document type from content and label."""
    combined = (text[:3000] + " " + label).lower()
    scores: dict[str, int] = {}
    for dtype, keywords in _DOC_TYPE_KEYWORDS.items():
        scores[dtype] = sum(1 for kw in keywords if kw in combined)
    if not scores or max(scores.values()) == 0:
        return "agb"  # fallback
    return max(scores, key=scores.get)
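# Illustration of the keyword scoring above (hypothetical input): a document
# whose first 3000 characters contain "standardvertragsklauseln" and
# "2021/914" scores scc=2 while the other types score 0, so "scc" wins;
# a text matching no keyword list at all falls back to "agb".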
async def _extract_text(url: str) -> tuple[str, int]:
    """Extract text from a URL via consent-tester or direct fetch."""
    import httpx

    # Try consent-tester first (handles JS-rendered pages)
    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            resp = await client.post(
                f"{CONSENT_TESTER_URL}/dsi-discovery",
                json={"url": url, "max_documents": 1},
            )
            if resp.status_code == 200:
                data = resp.json()
                docs = data.get("documents", [])
                if docs:
                    text = docs[0].get("full_text", "")
                    wc = docs[0].get("word_count", 0)
                    if len(text) > 50:
                        return text, wc
                # Fallback to full page
                fp = data.get("html_full_page", "")
                if len(fp) > 50:
                    return fp, len(fp.split())
    except Exception as e:
        logger.warning("consent-tester failed for %s: %s", url, e)

    # Direct fetch fallback
    try:
        async with httpx.AsyncClient(timeout=15.0) as client:
            resp = await client.get(url)
            text = resp.text
            return text, len(text.split())
    except Exception as e:
        logger.error("Direct fetch failed for %s: %s", url, e)
    return "", 0


async def _run_assessment(assessment_id: str, req: AssessmentRequest):
    """Background task: analyze all documents and produce Pruefprotokoll."""
    job = _assessment_jobs[assessment_id]
    doc_results: list[DocumentResult] = []
    all_findings: list[FindingItem] = []
    doc_texts: dict[str, str] = {}  # doc_type → text (for cross-check)

    try:
        total = len(req.documents)
        for i, entry in enumerate(req.documents):
            job["progress"] = f"Dokument {i+1}/{total}: {entry.label or entry.url[:40]}..."

            # 1. Extract text
            text, word_count = await _extract_text(entry.url)
            if not text or len(text) < 50:
                doc_results.append(DocumentResult(
                    label=entry.label or entry.url,
                    url=entry.url,
                    doc_type=entry.doc_type,
                    error="Text konnte nicht extrahiert werden",
                ))
                continue

            # 2. Detect doc_type if auto
            doc_type = entry.doc_type
            if doc_type == "auto":
                doc_type = _detect_doc_type(text, entry.label)
                logger.info("Auto-detected doc_type=%s for %s", doc_type, entry.label)
            doc_texts[doc_type] = text

            # 3. Run checklist
            label = entry.label or f"{doc_type.upper()}: {entry.url[:50]}"
            check_result = check_document_completeness(text, doc_type, label, entry.url)
            checks = check_result.get("checks", [])
            completeness = check_result.get("completeness_pct", 0)
            correctness = check_result.get("correctness_pct", 0)

            # 4. Extract findings from failed checks
            failed_checks = [c for c in checks if not c.get("passed") and not c.get("skipped")]
            for fc in failed_checks:
                severity = fc.get("severity", "MEDIUM")
                ftype = "GAP" if severity in ("CRITICAL", "HIGH") else "RISK"
                all_findings.append(FindingItem(
                    id=f"{assessment_id[:8]}-{fc['id']}",
                    category=_check_to_category(fc["id"], doc_type),
                    severity=severity,
                    type=ftype,
                    title=fc.get("label", ""),
                    description=fc.get("hint", ""),
                    recommendation=fc.get("hint", ""),
                    document_label=label,
                    document_type=doc_type,
                    check_id=fc["id"],
                    citations=[fc.get("matched_text", "")] if fc.get("matched_text") else [],
                ))

            doc_results.append(DocumentResult(
                label=label,
                url=entry.url,
                doc_type=doc_type,
                word_count=word_count,
                completeness_pct=completeness,
                correctness_pct=correctness,
                checks=checks,
                findings_count=len(failed_checks),
            ))

        # 5. Cross-check between documents
        job["progress"] = "Cross-Check zwischen Dokumenten..."
        cross_findings = cross_check_documents(doc_texts, req.vendor_name)

        # 6. Calculate scores
        category_scores = _calculate_category_scores(doc_results)
        overall = _calculate_overall_score(category_scores, all_findings, cross_findings)

        # 7. Build result
        result = AssessmentResult(
            vendor_name=req.vendor_name,
            documents=doc_results,
            findings=all_findings,
            overall_score=overall,
            category_scores=category_scores,
            cross_check_findings=cross_findings,
            checked_at=datetime.now(timezone.utc).isoformat(),
        )

        # 8. Generate Pruefprotokoll HTML
        try:
            result.report_html = build_pruefprotokoll(result.model_dump())
        except Exception as e:
            logger.warning("Report generation failed: %s", e)

        job["status"] = "completed"
        job["progress"] = ""
        job["result"] = result
        logger.info("Assessment %s completed: %d docs, %d findings, score=%d%%",
                    assessment_id, len(doc_results), len(all_findings), overall)
    except Exception as e:
        logger.exception("Assessment %s failed", assessment_id)
        job["status"] = "failed"
        job["error"] = str(e)


# ── Helpers ─────────────────────────────────────────────────────────

def _check_to_category(check_id: str, doc_type: str) -> str:
    """Map a check ID to a finding category."""
    prefix_map = {
        "avv_instruction": "INSTRUCTION",
        "avv_confidentiality": "CONFIDENTIALITY",
        "avv_tom": "TOM",
        "avv_subprocessor": "SUBPROCESSOR",
        "avv_data_subject": "DATA_SUBJECT_RIGHTS",
        "avv_dpia": "GENERAL",
        "avv_deletion": "DELETION",
        "avv_audit": "AUDIT_RIGHTS",
        "avv_breach": "INCIDENT",
        "avv_liability": "LIABILITY",
        "avv_subject": "AVV_CONTENT",
        "scc_": "TRANSFER",
        "tom_": "TOM",
        "sub_": "SUBPROCESSOR",
    }
    for prefix, cat in prefix_map.items():
        if check_id.startswith(prefix):
            return cat
    return doc_type.upper()


def _calculate_category_scores(docs: list[DocumentResult]) -> dict[str, int]:
    """Calculate per-category compliance scores from document results."""
    cat_totals: dict[str, int] = {}
    cat_passed: dict[str, int] = {}
    for doc in docs:
        for check in doc.checks:
            if check.get("skipped"):
                continue
            cat = _check_to_category(check.get("id", ""), doc.doc_type)
            cat_totals[cat] = cat_totals.get(cat, 0) + 1
            if check.get("passed"):
                cat_passed[cat] = cat_passed.get(cat, 0) + 1
    scores = {}
    for cat, total in cat_totals.items():
        passed = cat_passed.get(cat, 0)
        scores[cat] = round(passed / total * 100) if total > 0 else 0
    return scores
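# Worked example (hypothetical numbers): if the checked documents contribute
# four non-skipped TOM checks of which three passed, the TOM category scores
# round(3 / 4 * 100) = 75.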
def _calculate_overall_score(
    category_scores: dict[str, int],
    findings: list[FindingItem],
    cross_findings: list[dict],
) -> int:
    """Calculate overall compliance score."""
    if not category_scores:
        return 0

    # Weighted average: CRITICAL categories count double
    critical_cats = {"INSTRUCTION", "TOM", "SUBPROCESSOR", "DELETION", "INCIDENT", "TRANSFER"}
    total_weight = 0
    weighted_sum = 0
    for cat, score in category_scores.items():
        weight = 2 if cat in critical_cats else 1
        weighted_sum += score * weight
        total_weight += weight
    base = round(weighted_sum / total_weight) if total_weight > 0 else 0

    # Penalty for critical findings
    critical_count = sum(1 for f in findings if f.severity == "CRITICAL")
    cross_critical = sum(1 for f in cross_findings if f.get("severity") == "CRITICAL")
    penalty = (critical_count + cross_critical) * 5
    return max(0, min(100, base - penalty))
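# Worked example (hypothetical numbers): category_scores {"TOM": 50,
# "GENERAL": 100} gives base = round((50 * 2 + 100 * 1) / 3) = 67; two
# CRITICAL findings subtract 2 * 5 = 10, so the overall score is 57.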