feat: Vorbereitung-Module auf 100% — Persistenz, Backend-Services, UCCA Frontend
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-ai-compliance (push) Successful in 37s
CI / test-python-backend-compliance (push) Successful in 32s
CI / test-python-document-crawler (push) Successful in 22s
CI / test-python-dsms-gateway (push) Successful in 18s

Phase A: PostgreSQL State Store (sdk_states Tabelle, InMemory-Fallback)
Phase B: Modules dynamisch vom Backend, Scope DB-Persistenz, Source Policy State
Phase C: UCCA Frontend (3 Seiten, Wizard, RiskScoreGauge), Obligations Live-Daten
Phase D: Document Import (PDF/LLM/Gap-Analyse), System Screening (SBOM/OSV.dev)
Phase E: Company Profile CRUD mit Audit-Logging
Phase F: Tests (Python + TypeScript), flow-data.ts DB-Tabellen aktualisiert

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-02 11:04:31 +01:00
parent cd15ab0932
commit e6d666b89b
38 changed files with 4195 additions and 420 deletions

View File

@@ -0,0 +1,344 @@
"""
FastAPI routes for Company Profile CRUD with audit logging.
Endpoints:
- GET /v1/company-profile: Get company profile for a tenant
- POST /v1/company-profile: Create or update company profile
- GET /v1/company-profile/audit: Get audit log for a tenant
"""
import json
import logging
import uuid
from typing import Optional
from fastapi import APIRouter, HTTPException, Header
from pydantic import BaseModel
from database import SessionLocal
# Module-level logger for the company-profile routes.
logger = logging.getLogger(__name__)
# All endpoints below are mounted under /v1/company-profile.
router = APIRouter(prefix="/v1/company-profile", tags=["company-profile"])
# =============================================================================
# REQUEST/RESPONSE MODELS
# =============================================================================
class CompanyProfileRequest(BaseModel):
    """Payload for creating or updating a tenant's company profile.

    Every field carries a default so partial wizard submissions still
    validate. The mutable list defaults are safe here: pydantic copies
    field defaults per instance.
    """

    # Basic company facts
    company_name: str = ""
    legal_form: str = "GmbH"
    industry: str = ""
    founded_year: Optional[int] = None
    business_model: str = "B2B"
    offerings: list[str] = []
    # Size classification
    company_size: str = "small"
    employee_count: str = "1-9"
    annual_revenue: str = "< 2 Mio"
    # Locations and markets
    headquarters_country: str = "DE"
    headquarters_city: str = ""
    has_international_locations: bool = False
    international_countries: list[str] = []
    target_markets: list[str] = ["DE"]
    primary_jurisdiction: str = "DE"
    # Data-protection / AI roles
    is_data_controller: bool = True
    is_data_processor: bool = False
    uses_ai: bool = False
    ai_use_cases: list[str] = []
    # Contact persons (all optional)
    dpo_name: Optional[str] = None
    dpo_email: Optional[str] = None
    legal_contact_name: Optional[str] = None
    legal_contact_email: Optional[str] = None
    # Free-form extension block for machine-builder specifics (schema not fixed here)
    machine_builder: Optional[dict] = None
    # True once the profile wizard finished; drives the completed_at column
    is_complete: bool = False
class CompanyProfileResponse(BaseModel):
    """Full company profile as stored in compliance_company_profiles.

    Field order mirrors the SELECT column order used by the routes in this
    module; timestamps are serialized to strings by row_to_response.
    """

    id: str
    tenant_id: str
    company_name: str
    legal_form: str
    industry: str
    founded_year: Optional[int]
    business_model: str
    offerings: list[str]
    company_size: str
    employee_count: str
    annual_revenue: str
    headquarters_country: str
    headquarters_city: str
    has_international_locations: bool
    international_countries: list[str]
    target_markets: list[str]
    primary_jurisdiction: str
    is_data_controller: bool
    is_data_processor: bool
    uses_ai: bool
    ai_use_cases: list[str]
    dpo_name: Optional[str]
    dpo_email: Optional[str]
    legal_contact_name: Optional[str]
    legal_contact_email: Optional[str]
    machine_builder: Optional[dict]
    is_complete: bool
    # Stringified timestamps; completed_at is None while the wizard is unfinished
    completed_at: Optional[str]
    created_at: str
    updated_at: str
class AuditEntryResponse(BaseModel):
    """One row of the company-profile audit log."""

    id: str
    # "create" or "update", as determined by the upsert route
    action: str
    # Snapshot of the submitted payload; None when it could not be decoded as a dict
    changed_fields: Optional[dict]
    changed_by: Optional[str]
    created_at: str
class AuditListResponse(BaseModel):
    """Audit entries for one tenant plus the count of returned entries."""

    entries: list[AuditEntryResponse]
    total: int
# =============================================================================
# HELPERS
# =============================================================================
def row_to_response(row) -> CompanyProfileResponse:
    """Map one compliance_company_profiles row (positional) to the response model.

    The column order must match the SELECT lists used by the routes in this
    module. NULL text/bool columns are normalized to the same defaults the
    request model uses; JSON columns are only trusted when they decoded to
    the expected list/dict shape.
    """
    (row_id, tenant, name, form, industry, founded, model, offerings, size,
     employees, revenue, hq_country, hq_city, has_intl, intl_countries,
     markets, jurisdiction, controller, processor, uses_ai_flag, ai_cases,
     dpo_name, dpo_email, legal_name, legal_email, machine, complete,
     completed, created, updated) = row
    return CompanyProfileResponse(
        id=str(row_id),
        tenant_id=tenant,
        company_name=name or "",
        legal_form=form or "GmbH",
        industry=industry or "",
        founded_year=founded,
        business_model=model or "B2B",
        offerings=offerings if isinstance(offerings, list) else [],
        company_size=size or "small",
        employee_count=employees or "1-9",
        annual_revenue=revenue or "< 2 Mio",
        headquarters_country=hq_country or "DE",
        headquarters_city=hq_city or "",
        has_international_locations=has_intl or False,
        international_countries=intl_countries if isinstance(intl_countries, list) else [],
        target_markets=markets if isinstance(markets, list) else ["DE"],
        primary_jurisdiction=jurisdiction or "DE",
        # NULL means "never answered" and defaults to True (matches the request model)
        is_data_controller=controller if controller is not None else True,
        is_data_processor=processor or False,
        uses_ai=uses_ai_flag or False,
        ai_use_cases=ai_cases if isinstance(ai_cases, list) else [],
        dpo_name=dpo_name,
        dpo_email=dpo_email,
        legal_contact_name=legal_name,
        legal_contact_email=legal_email,
        machine_builder=machine if isinstance(machine, dict) else None,
        is_complete=complete or False,
        completed_at=str(completed) if completed else None,
        created_at=str(created),
        updated_at=str(updated),
    )
def log_audit(db, tenant_id: str, action: str, changed_fields: dict | None, changed_by: str | None):
"""Write an audit log entry."""
try:
db.execute(
"""INSERT INTO compliance_company_profile_audit
(tenant_id, action, changed_fields, changed_by)
VALUES (:tenant_id, :action, :fields::jsonb, :changed_by)""",
{
"tenant_id": tenant_id,
"action": action,
"fields": json.dumps(changed_fields) if changed_fields else None,
"changed_by": changed_by,
},
)
except Exception as e:
logger.warning(f"Failed to write audit log: {e}")
# =============================================================================
# ROUTES
# =============================================================================
@router.get("", response_model=CompanyProfileResponse)
async def get_company_profile(
tenant_id: str = "default",
x_tenant_id: Optional[str] = Header(None, alias="X-Tenant-ID"),
):
"""Get company profile for a tenant."""
tid = x_tenant_id or tenant_id
db = SessionLocal()
try:
result = db.execute(
"""SELECT id, tenant_id, company_name, legal_form, industry, founded_year,
business_model, offerings, company_size, employee_count, annual_revenue,
headquarters_country, headquarters_city, has_international_locations,
international_countries, target_markets, primary_jurisdiction,
is_data_controller, is_data_processor, uses_ai, ai_use_cases,
dpo_name, dpo_email, legal_contact_name, legal_contact_email,
machine_builder, is_complete, completed_at, created_at, updated_at
FROM compliance_company_profiles WHERE tenant_id = :tenant_id""",
{"tenant_id": tid},
)
row = result.fetchone()
if not row:
raise HTTPException(status_code=404, detail="Company profile not found")
return row_to_response(row)
finally:
db.close()
@router.post("", response_model=CompanyProfileResponse)
async def upsert_company_profile(
profile: CompanyProfileRequest,
tenant_id: str = "default",
x_tenant_id: Optional[str] = Header(None, alias="X-Tenant-ID"),
):
"""Create or update company profile (upsert)."""
tid = x_tenant_id or tenant_id
db = SessionLocal()
try:
# Check if profile exists
existing = db.execute(
"SELECT id FROM compliance_company_profiles WHERE tenant_id = :tid",
{"tid": tid},
).fetchone()
action = "update" if existing else "create"
completed_at_clause = ", completed_at = NOW()" if profile.is_complete else ", completed_at = NULL"
db.execute(
f"""INSERT INTO compliance_company_profiles
(tenant_id, company_name, legal_form, industry, founded_year,
business_model, offerings, company_size, employee_count, annual_revenue,
headquarters_country, headquarters_city, has_international_locations,
international_countries, target_markets, primary_jurisdiction,
is_data_controller, is_data_processor, uses_ai, ai_use_cases,
dpo_name, dpo_email, legal_contact_name, legal_contact_email,
machine_builder, is_complete)
VALUES (:tid, :company_name, :legal_form, :industry, :founded_year,
:business_model, :offerings::jsonb, :company_size, :employee_count, :annual_revenue,
:hq_country, :hq_city, :has_intl, :intl_countries::jsonb,
:target_markets::jsonb, :jurisdiction,
:is_controller, :is_processor, :uses_ai, :ai_use_cases::jsonb,
:dpo_name, :dpo_email, :legal_name, :legal_email,
:machine_builder::jsonb, :is_complete)
ON CONFLICT (tenant_id) DO UPDATE SET
company_name = EXCLUDED.company_name,
legal_form = EXCLUDED.legal_form,
industry = EXCLUDED.industry,
founded_year = EXCLUDED.founded_year,
business_model = EXCLUDED.business_model,
offerings = EXCLUDED.offerings,
company_size = EXCLUDED.company_size,
employee_count = EXCLUDED.employee_count,
annual_revenue = EXCLUDED.annual_revenue,
headquarters_country = EXCLUDED.headquarters_country,
headquarters_city = EXCLUDED.headquarters_city,
has_international_locations = EXCLUDED.has_international_locations,
international_countries = EXCLUDED.international_countries,
target_markets = EXCLUDED.target_markets,
primary_jurisdiction = EXCLUDED.primary_jurisdiction,
is_data_controller = EXCLUDED.is_data_controller,
is_data_processor = EXCLUDED.is_data_processor,
uses_ai = EXCLUDED.uses_ai,
ai_use_cases = EXCLUDED.ai_use_cases,
dpo_name = EXCLUDED.dpo_name,
dpo_email = EXCLUDED.dpo_email,
legal_contact_name = EXCLUDED.legal_contact_name,
legal_contact_email = EXCLUDED.legal_contact_email,
machine_builder = EXCLUDED.machine_builder,
is_complete = EXCLUDED.is_complete,
updated_at = NOW()
{completed_at_clause}""",
{
"tid": tid,
"company_name": profile.company_name,
"legal_form": profile.legal_form,
"industry": profile.industry,
"founded_year": profile.founded_year,
"business_model": profile.business_model,
"offerings": json.dumps(profile.offerings),
"company_size": profile.company_size,
"employee_count": profile.employee_count,
"annual_revenue": profile.annual_revenue,
"hq_country": profile.headquarters_country,
"hq_city": profile.headquarters_city,
"has_intl": profile.has_international_locations,
"intl_countries": json.dumps(profile.international_countries),
"target_markets": json.dumps(profile.target_markets),
"jurisdiction": profile.primary_jurisdiction,
"is_controller": profile.is_data_controller,
"is_processor": profile.is_data_processor,
"uses_ai": profile.uses_ai,
"ai_use_cases": json.dumps(profile.ai_use_cases),
"dpo_name": profile.dpo_name,
"dpo_email": profile.dpo_email,
"legal_name": profile.legal_contact_name,
"legal_email": profile.legal_contact_email,
"machine_builder": json.dumps(profile.machine_builder) if profile.machine_builder else None,
"is_complete": profile.is_complete,
},
)
# Audit log
log_audit(db, tid, action, profile.model_dump(), None)
db.commit()
# Fetch and return
result = db.execute(
"""SELECT id, tenant_id, company_name, legal_form, industry, founded_year,
business_model, offerings, company_size, employee_count, annual_revenue,
headquarters_country, headquarters_city, has_international_locations,
international_countries, target_markets, primary_jurisdiction,
is_data_controller, is_data_processor, uses_ai, ai_use_cases,
dpo_name, dpo_email, legal_contact_name, legal_contact_email,
machine_builder, is_complete, completed_at, created_at, updated_at
FROM compliance_company_profiles WHERE tenant_id = :tid""",
{"tid": tid},
)
row = result.fetchone()
return row_to_response(row)
except Exception as e:
db.rollback()
logger.error(f"Failed to upsert company profile: {e}")
raise HTTPException(status_code=500, detail="Failed to save company profile")
finally:
db.close()
@router.get("/audit", response_model=AuditListResponse)
async def get_audit_log(
tenant_id: str = "default",
x_tenant_id: Optional[str] = Header(None, alias="X-Tenant-ID"),
):
"""Get audit log for company profile changes."""
tid = x_tenant_id or tenant_id
db = SessionLocal()
try:
result = db.execute(
"""SELECT id, action, changed_fields, changed_by, created_at
FROM compliance_company_profile_audit
WHERE tenant_id = :tid
ORDER BY created_at DESC
LIMIT 100""",
{"tid": tid},
)
rows = result.fetchall()
entries = [
AuditEntryResponse(
id=str(r[0]),
action=r[1],
changed_fields=r[2] if isinstance(r[2], dict) else None,
changed_by=r[3],
created_at=str(r[4]),
)
for r in rows
]
return AuditListResponse(entries=entries, total=len(entries))
finally:
db.close()

View File

@@ -0,0 +1,380 @@
"""
FastAPI routes for Document Import and Gap Analysis.
Endpoints:
- POST /v1/import/analyze: Upload and analyze a compliance document
- GET /v1/import/documents: List imported documents for a tenant
- GET /v1/import/gap-analysis/{document_id}: Get gap analysis for a document
"""
import logging
import os
import uuid
from typing import Optional
import httpx
from fastapi import APIRouter, File, Form, UploadFile, HTTPException
from pydantic import BaseModel
from database import SessionLocal
# Module-level logger for the document-import routes.
logger = logging.getLogger(__name__)
# All endpoints below are mounted under /v1/import.
router = APIRouter(prefix="/v1/import", tags=["document-import"])
# Ollama endpoint and model for the optional LLM document classification;
# both overridable via environment variables.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
LLM_MODEL = os.getenv("COMPLIANCE_LLM_MODEL", "qwen3:30b-a3b")
# =============================================================================
# DOCUMENT TYPE DETECTION
# =============================================================================
# Keyword lists per document category; matching is case-insensitive substring.
DOCUMENT_TYPE_KEYWORDS = {
    "DSFA": ["datenschutz-folgenabschaetzung", "dsfa", "dpia", "privacy impact"],
    "TOM": ["technisch-organisatorische", "tom", "massnahmen", "technical measures"],
    "VVT": ["verarbeitungsverzeichnis", "vvt", "processing activities", "art. 30"],
    "PRIVACY_POLICY": ["datenschutzerklaerung", "privacy policy", "datenschutzhinweis"],
    "AGB": ["allgemeine geschaeftsbedingungen", "agb", "terms and conditions"],
    "COOKIE_POLICY": ["cookie", "tracking", "einwilligung"],
    "RISK_ASSESSMENT": ["risikobewertung", "risk assessment", "risikoanalyse"],
    "AUDIT_REPORT": ["audit", "pruefbericht", "zertifizierung"],
}


def detect_document_type(text: str) -> tuple[str, float]:
    """Detect the document type via keyword hits.

    Returns (type, confidence): confidence grows by 0.15 per keyword hit
    starting from 0.5, capped at 0.95; ("OTHER", 0.3) when nothing matches.
    """
    haystack = text.lower()
    hits = {
        doc_type: sum(kw in haystack for kw in keywords)
        for doc_type, keywords in DOCUMENT_TYPE_KEYWORDS.items()
    }
    hits = {doc_type: n for doc_type, n in hits.items() if n}
    if not hits:
        return "OTHER", 0.3
    best = max(hits, key=hits.get)
    return best, min(0.95, 0.5 + hits[best] * 0.15)
# =============================================================================
# GAP ANALYSIS
# =============================================================================
# Rule table: a rule applies when any check_keyword is present in the text;
# it reports a gap when none of its gap_if_missing terms appear.
GAP_RULES = [
    {
        "category": "AI Act Compliance",
        "regulation": "EU AI Act Art. 6",
        "check_keywords": ["ki", "ai", "kuenstliche intelligenz", "machine learning"],
        "gap_if_missing": ["risikoklassifizierung", "risk classification", "risikokategorie"],
        "severity": "CRITICAL",
        "action": "Risikoklassifizierung fuer KI-Systeme durchfuehren",
    },
    {
        "category": "Transparenz",
        "regulation": "DSGVO Art. 13, 14, 22",
        "check_keywords": ["automatisiert", "automated", "profiling"],
        "gap_if_missing": ["informationspflicht", "information obligation", "transparenz"],
        "severity": "HIGH",
        "action": "Informationspflichten bei automatisierten Entscheidungen ergaenzen",
    },
    {
        "category": "TOMs",
        "regulation": "DSGVO Art. 32",
        "check_keywords": ["ki", "ai", "cloud", "saas"],
        "gap_if_missing": ["technische massnahmen", "verschluesselung", "encryption"],
        "severity": "MEDIUM",
        "action": "Technisch-organisatorische Massnahmen um KI-Aspekte erweitern",
    },
    {
        "category": "VVT",
        "regulation": "DSGVO Art. 30",
        "check_keywords": ["verarbeitung", "processing", "daten"],
        "gap_if_missing": ["verarbeitungsverzeichnis", "vvt", "processing activities"],
        "severity": "HIGH",
        "action": "Verarbeitungsverzeichnis aktualisieren",
    },
    {
        "category": "Menschliche Aufsicht",
        "regulation": "EU AI Act Art. 14",
        "check_keywords": ["ki", "ai", "autonom", "autonomous"],
        "gap_if_missing": ["menschliche aufsicht", "human oversight", "human-in-the-loop"],
        "severity": "MEDIUM",
        "action": "Prozesse fuer menschliche Aufsicht definieren",
    },
]


def analyze_gaps(text: str, doc_type: str) -> list[dict]:
    """Run the GAP_RULES table over the document text.

    Returns one finding dict per triggered rule; findings carry random short
    ids and reference the document type as related_step_id.
    """
    lowered = text.lower()
    findings: list[dict] = []
    for rule in GAP_RULES:
        if not any(kw in lowered for kw in rule["check_keywords"]):
            continue  # rule not relevant to this document
        if any(kw in lowered for kw in rule["gap_if_missing"]):
            continue  # required element present — no gap
        findings.append({
            "id": f"gap-{uuid.uuid4().hex[:8]}",
            "category": rule["category"],
            "description": f"{rule['category']}: Luecke erkannt",
            "severity": rule["severity"],
            "regulation": rule["regulation"],
            "required_action": rule["action"],
            "related_step_id": doc_type.lower(),
        })
    return findings
# =============================================================================
# TEXT EXTRACTION
# =============================================================================
def extract_text_from_pdf(content: bytes) -> str:
    """Extract plain text from a PDF byte stream via PyMuPDF.

    Returns "" when PyMuPDF is not installed or when the document cannot be
    parsed; both cases are logged rather than raised.
    """
    try:
        import fitz  # PyMuPDF — optional dependency, imported lazily
    except ImportError:
        logger.warning("PyMuPDF not available, returning empty text")
        return ""
    try:
        doc = fitz.open(stream=content, filetype="pdf")
        pages = [page.get_text() for page in doc]
        doc.close()
        return "\n".join(pages)
    except Exception as e:
        logger.error(f"PDF extraction failed: {e}")
        return ""
# =============================================================================
# LLM CLASSIFICATION (optional enhancement)
# =============================================================================
async def classify_with_llm(text: str) -> Optional[tuple[str, float]]:
    """Use Ollama LLM to classify document type (optional, falls back to keywords).

    Returns (category, 0.85) when the model answers with one of the known
    category names; otherwise None so the caller can fall back to keyword
    detection. Any transport or model error is logged and treated as
    "no answer".
    """
    try:
        # The continuation lines are part of the prompt literal and therefore
        # intentionally not indented.
        prompt = f"""Klassifiziere das folgende Dokument in eine dieser Kategorien:
DSFA, TOM, VVT, PRIVACY_POLICY, AGB, COOKIE_POLICY, RISK_ASSESSMENT, AUDIT_REPORT, OTHER
Antworte NUR mit dem Kategorienamen, nichts anderes.
Dokumenttext (erste 2000 Zeichen):
{text[:2000]}"""
        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.post(
                f"{OLLAMA_URL}/api/generate",
                json={
                    "model": LLM_MODEL,
                    "prompt": prompt,
                    "stream": False,
                    # Low temperature + tiny output budget: we only want a label.
                    "options": {"temperature": 0.1, "num_predict": 20},
                },
            )
            if response.status_code == 200:
                result = response.json()
                answer = result.get("response", "").strip().upper()
                # Validate answer against the closed category set.
                valid_types = {"DSFA", "TOM", "VVT", "PRIVACY_POLICY", "AGB",
                               "COOKIE_POLICY", "RISK_ASSESSMENT", "AUDIT_REPORT", "OTHER"}
                if answer in valid_types:
                    return answer, 0.85
    except Exception as e:
        logger.warning(f"LLM classification failed, using keyword fallback: {e}")
    return None
# =============================================================================
# RESPONSE MODELS
# =============================================================================
class DocumentAnalysisResponse(BaseModel):
    """Result of analyzing one uploaded compliance document."""

    document_id: str
    filename: str
    detected_type: str
    # 1.0 when the caller supplied an explicit type; heuristic/LLM confidence otherwise
    confidence: float
    # Regulation keywords spotted in the text (e.g. "DSGVO", "AI Act")
    extracted_entities: list[str]
    recommendations: list[str]
    # Summary dict produced by the gap analysis (counts plus the gap list)
    gap_analysis: dict
class DocumentListResponse(BaseModel):
    """Imported documents of one tenant plus the count of returned entries."""

    documents: list[dict]
    total: int
# =============================================================================
# ROUTES
# =============================================================================
@router.post("/analyze", response_model=DocumentAnalysisResponse)
async def analyze_document(
file: UploadFile = File(...),
document_type: str = Form("OTHER"),
tenant_id: str = Form("default"),
):
"""Upload and analyze a compliance document."""
if not file.filename:
raise HTTPException(status_code=400, detail="No file provided")
# Read file content
content = await file.read()
file_size = len(content)
# Extract text
if file.content_type == "application/pdf" or (file.filename and file.filename.endswith(".pdf")):
text = extract_text_from_pdf(content)
else:
# Try to decode as text
try:
text = content.decode("utf-8")
except UnicodeDecodeError:
text = ""
# Detect document type
if document_type == "OTHER" and text:
# Try LLM first, fallback to keywords
llm_result = await classify_with_llm(text)
if llm_result:
detected_type, confidence = llm_result
else:
detected_type, confidence = detect_document_type(text)
else:
detected_type = document_type
confidence = 1.0
# Extract key entities
entities = []
entity_keywords = ["DSGVO", "AI Act", "ISO 27001", "NIS2", "BDSG",
"Personenbezogene Daten", "Auftragsverarbeitung", "DSFA"]
for kw in entity_keywords:
if kw.lower() in text.lower():
entities.append(kw)
# Analyze gaps
gaps = analyze_gaps(text, detected_type)
# Generate recommendations
recommendations = []
if gaps:
recommendations = [g["required_action"] for g in gaps[:5]]
if not recommendations:
recommendations = ["Dokument erscheint vollstaendig"]
# Persist to database
doc_id = str(uuid.uuid4())
db = SessionLocal()
try:
db.execute(
"""INSERT INTO compliance_imported_documents
(id, tenant_id, filename, file_type, file_size, detected_type, detection_confidence,
extracted_text, extracted_entities, recommendations, status, analyzed_at)
VALUES (:id, :tenant_id, :filename, :file_type, :file_size, :detected_type, :confidence,
:text, :entities::jsonb, :recommendations::jsonb, 'analyzed', NOW())""",
{
"id": doc_id,
"tenant_id": tenant_id,
"filename": file.filename,
"file_type": file.content_type or "unknown",
"file_size": file_size,
"detected_type": detected_type,
"confidence": confidence,
"text": text[:50000], # Limit stored text
"entities": str(entities).replace("'", '"'),
"recommendations": str(recommendations).replace("'", '"'),
},
)
# Save gap analysis
total_gaps = len(gaps)
gap_analysis_result = {
"id": f"analysis-{doc_id[:8]}",
"total_gaps": total_gaps,
"critical_gaps": len([g for g in gaps if g["severity"] == "CRITICAL"]),
"high_gaps": len([g for g in gaps if g["severity"] == "HIGH"]),
"medium_gaps": len([g for g in gaps if g["severity"] == "MEDIUM"]),
"low_gaps": len([g for g in gaps if g["severity"] == "LOW"]),
"gaps": gaps,
"recommended_packages": ["analyse", "dokumentation"] if total_gaps > 0 else [],
}
if total_gaps > 0:
import json
db.execute(
"""INSERT INTO compliance_gap_analyses
(tenant_id, document_id, total_gaps, critical_gaps, high_gaps, medium_gaps, low_gaps, gaps, recommended_packages)
VALUES (:tenant_id, :document_id, :total, :critical, :high, :medium, :low, :gaps::jsonb, :packages::jsonb)""",
{
"tenant_id": tenant_id,
"document_id": doc_id,
"total": gap_analysis_result["total_gaps"],
"critical": gap_analysis_result["critical_gaps"],
"high": gap_analysis_result["high_gaps"],
"medium": gap_analysis_result["medium_gaps"],
"low": gap_analysis_result["low_gaps"],
"gaps": json.dumps(gaps),
"packages": json.dumps(gap_analysis_result["recommended_packages"]),
},
)
db.commit()
except Exception as e:
db.rollback()
logger.error(f"Failed to persist document analysis: {e}")
finally:
db.close()
return DocumentAnalysisResponse(
document_id=doc_id,
filename=file.filename or "unknown",
detected_type=detected_type,
confidence=confidence,
extracted_entities=entities,
recommendations=recommendations,
gap_analysis=gap_analysis_result,
)
@router.get("/documents", response_model=DocumentListResponse)
async def list_documents(tenant_id: str = "default"):
"""List all imported documents for a tenant."""
db = SessionLocal()
try:
result = db.execute(
"""SELECT id, filename, file_type, file_size, detected_type, detection_confidence,
extracted_entities, recommendations, status, analyzed_at, created_at
FROM compliance_imported_documents
WHERE tenant_id = :tenant_id
ORDER BY created_at DESC""",
{"tenant_id": tenant_id},
)
rows = result.fetchall()
documents = []
for row in rows:
documents.append({
"id": str(row[0]),
"filename": row[1],
"file_type": row[2],
"file_size": row[3],
"detected_type": row[4],
"confidence": row[5],
"extracted_entities": row[6] or [],
"recommendations": row[7] or [],
"status": row[8],
"analyzed_at": str(row[9]) if row[9] else None,
"created_at": str(row[10]),
})
return DocumentListResponse(documents=documents, total=len(documents))
finally:
db.close()

View File

@@ -0,0 +1,608 @@
"""
FastAPI routes for System Screening (SBOM Generation + Vulnerability Scan).
Endpoints:
- POST /v1/screening/scan: Upload dependency file, generate SBOM, scan for vulnerabilities
- GET /v1/screening/{screening_id}: Get screening result by ID
- GET /v1/screening: List screenings for a tenant
"""
import json
import logging
import re
import uuid
from datetime import datetime, timezone
from typing import Optional
import httpx
from fastapi import APIRouter, File, Form, UploadFile, HTTPException
from pydantic import BaseModel
from database import SessionLocal
# Module-level logger for the system-screening routes.
logger = logging.getLogger(__name__)
# All endpoints below are mounted under /v1/screening.
router = APIRouter(prefix="/v1/screening", tags=["system-screening"])
# Public OSV.dev single-package vulnerability query endpoint.
OSV_API_URL = "https://api.osv.dev/v1/query"
# =============================================================================
# RESPONSE MODELS
# =============================================================================
class SecurityIssueResponse(BaseModel):
    """One vulnerability finding for a scanned component."""

    id: str
    # CRITICAL / HIGH / MEDIUM / LOW (derived from OSV data)
    severity: str
    title: str
    description: Optional[str] = None
    # CVE alias if OSV lists one
    cve: Optional[str] = None
    # Approximate CVSS score derived from the severity bucket
    cvss: Optional[float] = None
    affected_component: str
    affected_version: Optional[str] = None
    # First version that fixes the issue, when OSV records one
    fixed_in: Optional[str] = None
    remediation: Optional[str] = None
    status: str = "OPEN"
class SBOMComponentResponse(BaseModel):
    """One component entry of the generated SBOM."""

    name: str
    version: str
    type: str
    # Package URL, e.g. pkg:npm/lodash@4.17.21
    purl: str
    licenses: list[str]
    vulnerabilities: list[dict]
class ScreeningResponse(BaseModel):
    """Full result of one dependency screening run."""

    id: str
    status: str
    # SBOM format metadata (CycloneDX spec version)
    sbom_format: str
    sbom_version: str
    total_components: int
    # Issue counts broken down by severity
    total_issues: int
    critical_issues: int
    high_issues: int
    medium_issues: int
    low_issues: int
    components: list[SBOMComponentResponse]
    issues: list[SecurityIssueResponse]
    started_at: Optional[str] = None
    completed_at: Optional[str] = None
class ScreeningListResponse(BaseModel):
    """Screenings of one tenant plus the count of returned entries."""

    screenings: list[dict]
    total: int
# =============================================================================
# DEPENDENCY PARSING
# =============================================================================
def parse_package_lock(content: str) -> list[dict]:
    """Parse a package-lock.json (v1, v2 or v3) into component dicts.

    Returns [] for invalid JSON. v2/v3 "packages" entries are preferred;
    the flat v1 "dependencies" map is only consulted when the former yields
    nothing.
    """
    try:
        data = json.loads(content)
    except json.JSONDecodeError:
        return []
    found: list[dict] = []
    # v2/v3 lockfiles list everything under "packages", keyed by install path.
    for path, info in data.get("packages", {}).items():
        if not path:  # "" is the root project itself
            continue
        name = path.split("node_modules/")[-1] if "node_modules/" in path else path
        version = info.get("version", "unknown")
        if name and version != "unknown":
            found.append({
                "name": name,
                "version": version,
                "type": "library",
                "ecosystem": "npm",
                "license": info.get("license", "unknown"),
            })
    if found:
        return found
    # Fallback: v1 format (flat "dependencies" map, no license info).
    for name, info in data.get("dependencies", {}).items():
        if isinstance(info, dict):
            found.append({
                "name": name,
                "version": info.get("version", "unknown"),
                "type": "library",
                "ecosystem": "npm",
                "license": "unknown",
            })
    return found
def parse_requirements_txt(content: str) -> list[dict]:
    """Parse requirements.txt lines into component dicts.

    Recognizes "name<op>version" pins (==, >=, <=, ~=, != …) and bare names
    (stored with version "latest"); blanks, comments and pip options are
    skipped.
    """
    pinned = re.compile(r'^([a-zA-Z0-9_.-]+)\s*([>=<~!]+)\s*([a-zA-Z0-9_.*-]+)')
    bare = re.compile(r'^[a-zA-Z0-9_.-]+$')
    found = []
    for raw in content.strip().split("\n"):
        entry = raw.strip()
        # Skip blanks, comments, and pip options such as "-r base.txt".
        if not entry or entry[0] in "#-":
            continue
        m = pinned.match(entry)
        if m:
            name, version = m.group(1), m.group(3)
        elif bare.match(entry):
            name, version = entry, "latest"
        else:
            continue
        found.append({
            "name": name,
            "version": version,
            "type": "library",
            "ecosystem": "PyPI",
            "license": "unknown",
        })
    return found
def parse_yarn_lock(content: str) -> list[dict]:
    """Parse yarn.lock (classic v1 format) into component dicts (basic).

    A descriptor line such as `"lodash@^4.17.0":` names the package; the
    version comes from the indented `version "x.y.z"` line that follows.
    Scoped packages (`"@babel/core@^7.0.0":`) are now handled — the previous
    regex `^"?([^@]+)@` backtracked on the leading "@" and captured the quote
    character as the name. Multi-descriptor keys (`"a@1", "a@2":`) are still
    not matched, as before.
    """
    components: list[dict] = []
    current_name: str | None = None
    # Optional quote, optional @scope/ prefix, then the bare name up to the
    # "@range" part of the descriptor.
    descriptor = re.compile(r'^"?((?:@[^/"]+/)?[^@"]+)@[^"]*"?:')
    version_line = re.compile(r'\s+version\s+"?([^"]+)"?')
    for line in content.split("\n"):
        match = descriptor.match(line)
        if match:
            current_name = match.group(1).strip()
        elif current_name and line.strip().startswith("version "):
            version_match = version_line.match(line)
            if version_match:
                components.append({
                    "name": current_name,
                    "version": version_match.group(1),
                    "type": "library",
                    "ecosystem": "npm",
                    "license": "unknown",
                })
                current_name = None
    return components
def detect_and_parse(filename: str, content: str) -> tuple[list[dict], str]:
    """Pick a parser from the filename and return (components, ecosystem).

    Unknown .json files are probed first as an npm lockfile, then with the
    requirements.txt syntax; anything unparseable yields ([], "unknown").
    """
    fname = filename.lower()
    if "package-lock" in fname or fname.endswith("package-lock.json"):
        return parse_package_lock(content), "npm"
    if fname == "requirements.txt" or fname.endswith("/requirements.txt"):
        return parse_requirements_txt(content), "PyPI"
    if "yarn.lock" in fname:
        return parse_yarn_lock(content), "npm"
    if fname.endswith(".json"):
        for parser, eco in ((parse_package_lock, "npm"), (parse_requirements_txt, "PyPI")):
            parsed = parser(content)
            if parsed:
                return parsed, eco
    return [], "unknown"
# =============================================================================
# SBOM GENERATION (CycloneDX format)
# =============================================================================
def generate_sbom(components: list[dict], ecosystem: str) -> dict:
    """Build a CycloneDX 1.5 SBOM document from parsed components.

    The purl is synthesized as pkg:<ecosystem>/<name>@<version>; "unknown"
    licenses are emitted as an empty license list.
    """
    eco = ecosystem.lower()
    entries = []
    for comp in components:
        if comp.get("license") != "unknown":
            licenses = [comp.get("license", "unknown")]
        else:
            licenses = []
        entries.append({
            "type": "library",
            "name": comp["name"],
            "version": comp["version"],
            "purl": f"pkg:{eco}/{comp['name']}@{comp['version']}",
            "licenses": licenses,
        })
    return {
        "bomFormat": "CycloneDX",
        "specVersion": "1.5",
        "version": 1,
        "metadata": {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "tools": [{"name": "breakpilot-screening", "version": "1.0.0"}],
        },
        "components": entries,
    }
# =============================================================================
# VULNERABILITY SCANNING (OSV.dev API)
# =============================================================================
async def query_osv(name: str, version: str, ecosystem: str) -> list[dict]:
    """Ask OSV.dev for known vulnerabilities of one package version.

    Returns the raw `vulns` list from the API; any transport error, non-200
    response or undecodable body results in [] (logged at WARNING level).
    """
    payload = {
        "package": {"name": name, "ecosystem": ecosystem},
        "version": version,
    }
    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.post(OSV_API_URL, json=payload)
            if resp.status_code == 200:
                return resp.json().get("vulns", [])
    except Exception as e:
        logger.warning(f"OSV query failed for {name}@{version}: {e}")
    return []
def map_osv_severity(vuln: dict) -> tuple[str, float]:
    """Extract a severity bucket and an approximate CVSS score from OSV data.

    Only `database_specific.severity` is consulted: CVSS_V3 entries in the
    `severity` array carry a vector string, not a numeric base score, so the
    original parsing loop was dead code (unused variable, bare `pass`) and has
    been removed. Unknown/missing severities default to MEDIUM. The returned
    CVSS value is derived from the bucket, not from a real vector calculation.
    """
    severity = "MEDIUM"
    db_specific = vuln.get("database_specific", {})
    # str() guards against non-string values in third-party data.
    sev_str = str(db_specific.get("severity", "")).upper()
    if sev_str in ("CRITICAL", "HIGH", "MEDIUM", "LOW"):
        severity = sev_str
    # Representative mid-range score per bucket.
    cvss_map = {"CRITICAL": 9.5, "HIGH": 7.5, "MEDIUM": 5.0, "LOW": 2.5}
    return severity, cvss_map.get(severity, 5.0)
def extract_fix_version(vuln: dict, package_name: str) -> Optional[str]:
    """Return the first 'fixed' version recorded for *package_name*, else None.

    Package names are compared case-insensitively against the OSV `affected`
    entries.
    """
    target = package_name.lower()
    for affected in vuln.get("affected", []):
        if affected.get("package", {}).get("name", "").lower() != target:
            continue
        for rng in affected.get("ranges", []):
            for event in rng.get("events", []):
                if "fixed" in event:
                    return event["fixed"]
    return None
async def scan_vulnerabilities(components: list[dict], ecosystem: str) -> list[dict]:
    """Scan components for vulnerabilities via OSV.dev.

    At most 50 components are queried to keep total request time bounded,
    and components with unpinned versions ("latest", "unknown", "*") are
    skipped because OSV needs a concrete version to match against.
    """
    unpinned = ("latest", "unknown", "*")
    findings: list[dict] = []
    # Batch: scan up to 50 components to avoid timeouts
    for component in components[:50]:
        if component["version"] in unpinned:
            continue
        for vuln in await query_osv(component["name"], component["version"], ecosystem):
            vuln_id = vuln.get("id", f"OSV-{uuid.uuid4().hex[:8]}")
            cve = next((a for a in vuln.get("aliases", []) if a.startswith("CVE-")), None)
            severity, cvss = map_osv_severity(vuln)
            fixed_in = extract_fix_version(vuln, component["name"])
            if fixed_in:
                remediation = f"Upgrade {component['name']} to {fixed_in}"
            else:
                remediation = f"Check {vuln_id} for remediation steps"
            findings.append({
                "id": str(uuid.uuid4()),
                "severity": severity,
                "title": vuln.get("summary", vuln_id),
                "description": vuln.get("details", "")[:500],
                "cve": cve,
                "cvss": cvss,
                "affected_component": component["name"],
                "affected_version": component["version"],
                "fixed_in": fixed_in,
                "remediation": remediation,
                "status": "OPEN",
            })
    return findings
# =============================================================================
# ROUTES
# =============================================================================
@router.post("/scan", response_model=ScreeningResponse)
async def scan_dependencies(
    file: UploadFile = File(...),
    tenant_id: str = Form("default"),
):
    """Upload a dependency file, generate SBOM, and scan for vulnerabilities.

    Pipeline: decode the upload -> parse the manifest into components ->
    build a CycloneDX 1.5 SBOM -> query OSV.dev per pinned component ->
    persist the screening and its issues (best-effort) -> return the
    aggregated result.

    Raises:
        HTTPException 400: missing filename, non-UTF-8 content, or a
            manifest format that ``detect_and_parse`` cannot handle.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="No file provided")
    content = await file.read()
    try:
        text = content.decode("utf-8")
    except UnicodeDecodeError:
        raise HTTPException(status_code=400, detail="File must be a text-based dependency file")
    # Parse dependencies: format is detected from the filename, yielding the
    # component list and its package ecosystem (e.g. npm, PyPI).
    components, ecosystem = detect_and_parse(file.filename, text)
    if not components:
        raise HTTPException(
            status_code=400,
            detail="Could not parse dependencies. Supported: package-lock.json, requirements.txt, yarn.lock",
        )
    # Generate SBOM (CycloneDX document built from the parsed components)
    sbom = generate_sbom(components, ecosystem)
    # Scan for vulnerabilities; timestamps bracket the OSV.dev queries
    started_at = datetime.now(timezone.utc)
    issues = await scan_vulnerabilities(components, ecosystem)
    completed_at = datetime.now(timezone.utc)
    # Count severities for the summary columns and response fields
    critical = len([i for i in issues if i["severity"] == "CRITICAL"])
    high = len([i for i in issues if i["severity"] == "HIGH"])
    medium = len([i for i in issues if i["severity"] == "MEDIUM"])
    low = len([i for i in issues if i["severity"] == "LOW"])
    # Persist to database. Best-effort: a failure is rolled back and logged,
    # but the scan result is still returned to the caller (with an id that
    # then has no DB row behind it).
    # NOTE(review): the ::jsonb cast assumes PostgreSQL — confirm the
    # InMemory/SQLite fallback never routes through here.
    screening_id = str(uuid.uuid4())
    db = SessionLocal()
    try:
        db.execute(
            """INSERT INTO compliance_screenings
               (id, tenant_id, status, sbom_format, sbom_version,
                total_components, total_issues, critical_issues, high_issues, medium_issues, low_issues,
                sbom_data, started_at, completed_at)
               VALUES (:id, :tenant_id, 'completed', 'CycloneDX', '1.5',
                       :total_components, :total_issues, :critical, :high, :medium, :low,
                       :sbom_data::jsonb, :started_at, :completed_at)""",
            {
                "id": screening_id,
                "tenant_id": tenant_id,
                "total_components": len(components),
                "total_issues": len(issues),
                "critical": critical,
                "high": high,
                "medium": medium,
                "low": low,
                "sbom_data": json.dumps(sbom),
                "started_at": started_at,
                "completed_at": completed_at,
            },
        )
        # Persist security issues (one row per finding, FK to the screening)
        for issue in issues:
            db.execute(
                """INSERT INTO compliance_security_issues
                   (id, screening_id, severity, title, description, cve, cvss,
                    affected_component, affected_version, fixed_in, remediation, status)
                   VALUES (:id, :screening_id, :severity, :title, :description, :cve, :cvss,
                           :component, :version, :fixed_in, :remediation, :status)""",
                {
                    "id": issue["id"],
                    "screening_id": screening_id,
                    "severity": issue["severity"],
                    # Truncate to fit the column sizes
                    "title": issue["title"][:500],
                    "description": issue.get("description", "")[:1000],
                    "cve": issue.get("cve"),
                    "cvss": issue.get("cvss"),
                    "component": issue["affected_component"],
                    "version": issue.get("affected_version"),
                    "fixed_in": issue.get("fixed_in"),
                    "remediation": issue.get("remediation"),
                    "status": issue["status"],
                },
            )
        db.commit()
    except Exception as e:
        db.rollback()
        logger.error(f"Failed to persist screening: {e}")
    finally:
        db.close()
    # Build response: group the flat issue list by component name so each
    # SBOM component carries its own vulnerability list.
    sbom_components = []
    comp_vulns: dict[str, list[dict]] = {}
    for issue in issues:
        comp_name = issue["affected_component"]
        if comp_name not in comp_vulns:
            comp_vulns[comp_name] = []
        comp_vulns[comp_name].append({
            # Prefer the CVE id for display; fall back to the internal id
            "id": issue.get("cve") or issue["id"],
            "cve": issue.get("cve"),
            "severity": issue["severity"],
            "title": issue["title"],
            "cvss": issue.get("cvss"),
            "fixedIn": issue.get("fixed_in"),
        })
    for sc in sbom["components"]:
        sbom_components.append(SBOMComponentResponse(
            name=sc["name"],
            version=sc["version"],
            type=sc["type"],
            purl=sc["purl"],
            licenses=sc.get("licenses", []),
            vulnerabilities=comp_vulns.get(sc["name"], []),
        ))
    issue_responses = [
        SecurityIssueResponse(
            id=i["id"],
            severity=i["severity"],
            title=i["title"],
            description=i.get("description"),
            cve=i.get("cve"),
            cvss=i.get("cvss"),
            affected_component=i["affected_component"],
            affected_version=i.get("affected_version"),
            fixed_in=i.get("fixed_in"),
            remediation=i.get("remediation"),
            status=i["status"],
        )
        for i in issues
    ]
    return ScreeningResponse(
        id=screening_id,
        status="completed",
        sbom_format="CycloneDX",
        sbom_version="1.5",
        total_components=len(components),
        total_issues=len(issues),
        critical_issues=critical,
        high_issues=high,
        medium_issues=medium,
        low_issues=low,
        components=sbom_components,
        issues=issue_responses,
        started_at=started_at.isoformat(),
        completed_at=completed_at.isoformat(),
    )
@router.get("/{screening_id}", response_model=ScreeningResponse)
async def get_screening(screening_id: str):
    """Get a screening result by ID.

    Loads the screening row and its security issues, then reconstructs the
    per-component vulnerability lists from the stored SBOM JSON.  Row values
    are accessed positionally, so the SELECT column order below is the
    contract for the index accesses further down.

    Raises:
        HTTPException 404: no screening with this ID exists.
    """
    db = SessionLocal()
    try:
        # Columns (by index): 0=id, 1=status, 2=sbom_format, 3=sbom_version,
        # 4=total_components, 5=total_issues, 6=critical, 7=high,
        # 8=medium, 9=low, 10=sbom_data, 11=started_at, 12=completed_at
        result = db.execute(
            """SELECT id, status, sbom_format, sbom_version,
                      total_components, total_issues, critical_issues, high_issues,
                      medium_issues, low_issues, sbom_data, started_at, completed_at
               FROM compliance_screenings WHERE id = :id""",
            {"id": screening_id},
        )
        row = result.fetchone()
        if not row:
            raise HTTPException(status_code=404, detail="Screening not found")
        # Fetch issues belonging to this screening
        issues_result = db.execute(
            """SELECT id, severity, title, description, cve, cvss,
                      affected_component, affected_version, fixed_in, remediation, status
               FROM compliance_security_issues WHERE screening_id = :id""",
            {"id": screening_id},
        )
        issues_rows = issues_result.fetchall()
        issues = [
            SecurityIssueResponse(
                id=str(r[0]), severity=r[1], title=r[2], description=r[3],
                cve=r[4], cvss=r[5], affected_component=r[6],
                affected_version=r[7], fixed_in=r[8], remediation=r[9], status=r[10],
            )
            for r in issues_rows
        ]
        # Reconstruct components from SBOM data.
        # NOTE(review): assumes the driver deserializes the jsonb column to a
        # dict (psycopg behavior) — confirm if another driver is used.
        sbom_data = row[10] or {}
        components = []
        # Group issues by component name, mirroring the scan response shape
        comp_vulns: dict[str, list[dict]] = {}
        for issue in issues:
            if issue.affected_component not in comp_vulns:
                comp_vulns[issue.affected_component] = []
            comp_vulns[issue.affected_component].append({
                # Prefer the CVE id for display; fall back to the internal id
                "id": issue.cve or issue.id,
                "cve": issue.cve,
                "severity": issue.severity,
                "title": issue.title,
                "cvss": issue.cvss,
                "fixedIn": issue.fixed_in,
            })
        for sc in sbom_data.get("components", []):
            components.append(SBOMComponentResponse(
                name=sc["name"],
                version=sc["version"],
                type=sc.get("type", "library"),
                purl=sc.get("purl", ""),
                licenses=sc.get("licenses", []),
                vulnerabilities=comp_vulns.get(sc["name"], []),
            ))
        return ScreeningResponse(
            id=str(row[0]),
            status=row[1],
            # Fall back to current defaults for rows persisted without format info
            sbom_format=row[2] or "CycloneDX",
            sbom_version=row[3] or "1.5",
            total_components=row[4] or 0,
            total_issues=row[5] or 0,
            critical_issues=row[6] or 0,
            high_issues=row[7] or 0,
            medium_issues=row[8] or 0,
            low_issues=row[9] or 0,
            components=components,
            issues=issues,
            started_at=str(row[11]) if row[11] else None,
            completed_at=str(row[12]) if row[12] else None,
        )
    finally:
        db.close()
@router.get("", response_model=ScreeningListResponse)
async def list_screenings(tenant_id: str = "default"):
    """Return every screening recorded for *tenant_id*, newest first."""
    session = SessionLocal()
    try:
        rows = session.execute(
            """SELECT id, status, total_components, total_issues,
                      critical_issues, high_issues, medium_issues, low_issues,
                      started_at, completed_at, created_at
               FROM compliance_screenings
               WHERE tenant_id = :tenant_id
               ORDER BY created_at DESC""",
            {"tenant_id": tenant_id},
        ).fetchall()
        # Column names in the same order as the SELECT above
        columns = (
            "id", "status", "total_components", "total_issues",
            "critical_issues", "high_issues", "medium_issues", "low_issues",
            "started_at", "completed_at", "created_at",
        )
        screenings = []
        for row in rows:
            record = dict(zip(columns, row))
            # Stringify UUID and timestamps for JSON serialization;
            # started_at/completed_at may be NULL for unfinished scans.
            record["id"] = str(record["id"])
            record["started_at"] = str(record["started_at"]) if record["started_at"] else None
            record["completed_at"] = str(record["completed_at"]) if record["completed_at"] else None
            record["created_at"] = str(record["created_at"])
            screenings.append(record)
        return ScreeningListResponse(screenings=screenings, total=len(screenings))
    finally:
        session.close()