# mypy: disable-error-code="arg-type,assignment,union-attr,no-any-return"
"""
System screening service — SBOM generation + OSV vulnerability scan.

Phase 1 Step 4: pure parsing/SBOM/OSV helpers extracted from
``compliance.api.screening_routes``. Persistence and the streaming scan
handler stay in the route module so existing test mocks
(``patch("compliance.api.screening_routes.SessionLocal", ...)``,
``patch("compliance.api.screening_routes.scan_vulnerabilities", ...)``) keep
working without test edits.

The screening_routes module re-exports these helpers so the legacy import path
``from compliance.api.screening_routes import parse_package_lock`` continues
to work.
"""

import json
import logging
import re
import uuid
from typing import Any, Optional

import httpx
from sqlalchemy import text
from sqlalchemy.orm import Session

from compliance.domain import NotFoundError
from compliance.schemas.screening import (
    ScreeningListResponse,
    ScreeningResponse,
    SBOMComponentResponse,
    SecurityIssueResponse,
)

logger = logging.getLogger(__name__)

OSV_API_URL = "https://api.osv.dev/v1/query"

# "name[extras] <op> version" — extras are optional and not captured.
_REQ_PINNED_RE = re.compile(
    r"^([a-zA-Z0-9_.-]+)\s*(?:\[[^\]]*\])?\s*([>=<~!]+)\s*([a-zA-Z0-9_.*-]+)"
)
# A bare "name" or "name[extras]" without any version specifier.
_REQ_BARE_RE = re.compile(r"^([a-zA-Z0-9_.-]+)\s*(?:\[[^\]]*\])?$")
# Whitespace followed by "#" starts an inline comment in requirements files.
_REQ_INLINE_COMMENT_RE = re.compile(r"\s+#.*$")

# Unindented yarn.lock entry header, e.g. `foo@^1.0.0:` or
# `"@scope/foo@^1.0.0", "@scope/foo@^1.1.0":`. Captures the first package name,
# including a leading "@" for scoped packages.
_YARN_ENTRY_RE = re.compile(r'^"?(@?[^@\s",]+)@.*:\s*$')
# Indented `version "1.2.3"` (classic) or `version: 1.2.3` (YAML-style) line.
_YARN_VERSION_RE = re.compile(r'^\s+version:?\s+"?([^"\s]+)"?')

# OSV ``database_specific.severity`` labels mapped to the labels used here.
_SEVERITY_ALIASES = {
    "CRITICAL": "CRITICAL",
    "HIGH": "HIGH",
    "MEDIUM": "MEDIUM",
    "MODERATE": "MEDIUM",
    "LOW": "LOW",
}
# Representative CVSS score reported for each severity label.
_SEVERITY_CVSS = {"CRITICAL": 9.5, "HIGH": 7.5, "MEDIUM": 5.0, "LOW": 2.5}


# ============================================================================
# Dependency parsing
# ============================================================================

def parse_package_lock(content: str) -> list[dict[str, Any]]:
    """Parse a package-lock.json document and extract its dependencies.

    The ``packages`` map (keyed by install path) is read first; if it yields
    nothing, the older top-level ``dependencies`` map is used instead.

    Args:
        content: Raw text of the lock file.

    Returns:
        One dict per dependency with the keys ``name``, ``version``, ``type``,
        ``ecosystem`` and ``license``. Returns an empty list when ``content``
        is not valid JSON or its top level is not a JSON object.
    """
    try:
        data = json.loads(content)
    except json.JSONDecodeError:
        return []
    # Valid JSON is not necessarily an object (e.g. "[]" or "5").
    if not isinstance(data, dict):
        return []

    components: list[dict[str, Any]] = []

    packages = data.get("packages")
    if isinstance(packages, dict):
        for path, info in packages.items():
            # The empty path is the root project itself; malformed entries
            # are skipped instead of raising.
            if not path or not isinstance(info, dict):
                continue
            name = (
                path.split("node_modules/")[-1]
                if "node_modules/" in path
                else path
            )
            version = info.get("version", "unknown")
            if name and version != "unknown":
                components.append({
                    "name": name,
                    "version": version,
                    "type": "library",
                    "ecosystem": "npm",
                    "license": info.get("license", "unknown"),
                })

    if not components:
        # Fallback: v1 format (dependencies field)
        dependencies = data.get("dependencies")
        if isinstance(dependencies, dict):
            for name, info in dependencies.items():
                if isinstance(info, dict):
                    components.append({
                        "name": name,
                        "version": info.get("version", "unknown"),
                        "type": "library",
                        "ecosystem": "npm",
                        "license": "unknown",
                    })

    return components


def parse_requirements_txt(content: str) -> list[dict[str, Any]]:
    """Parse requirements.txt text and extract its dependencies.

    Blank lines, comment lines and option lines (starting with ``-``) are
    skipped. Inline comments are removed and extras (``pkg[extra]``) are
    accepted but not recorded. For a versioned requirement the version after
    the first operator is stored; a bare name is stored with version
    ``"latest"``. Lines matching neither form are ignored.

    Args:
        content: Raw text of the requirements file.

    Returns:
        One dict per recognised requirement with the keys ``name``,
        ``version``, ``type``, ``ecosystem`` and ``license``.
    """
    components: list[dict[str, Any]] = []
    for raw_line in content.splitlines():
        line = raw_line.strip()
        if not line or line.startswith(("#", "-")):
            continue
        line = _REQ_INLINE_COMMENT_RE.sub("", line).strip()

        pinned = _REQ_PINNED_RE.match(line)
        if pinned:
            name, version = pinned.group(1), pinned.group(3)
        else:
            bare = _REQ_BARE_RE.match(line)
            if not bare:
                continue
            name, version = bare.group(1), "latest"

        components.append({
            "name": name,
            "version": version,
            "type": "library",
            "ecosystem": "PyPI",
            "license": "unknown",
        })
    return components


def parse_yarn_lock(content: str) -> list[dict[str, Any]]:
    """Parse yarn.lock text and extract its dependencies (basic).

    An unindented entry header sets the current package name (scoped names
    such as ``@scope/pkg`` and headers listing several specifiers are
    supported); the next indented ``version`` line completes the component.

    Args:
        content: Raw text of the lock file.

    Returns:
        One dict per resolved entry with the keys ``name``, ``version``,
        ``type``, ``ecosystem`` and ``license``.
    """
    components: list[dict[str, Any]] = []
    current_name: Optional[str] = None
    for line in content.splitlines():
        header = _YARN_ENTRY_RE.match(line)
        if header:
            current_name = header.group(1)
            continue
        if current_name is None:
            continue
        version_match = _YARN_VERSION_RE.match(line)
        if version_match:
            components.append({
                "name": current_name,
                "version": version_match.group(1),
                "type": "library",
                "ecosystem": "npm",
                "license": "unknown",
            })
            # Wait for the next header before accepting another version line.
            current_name = None
    return components


def detect_and_parse(filename: str, content: str) -> tuple[list[dict[str, Any]], str]:
    """Detect the dependency file type from its name and parse it.

    Known names are dispatched directly. Anything else is tried as a
    package-lock document (only for ``.json`` files) and then as
    requirements.txt text.

    Args:
        filename: Name or path of the uploaded file; matched case-insensitively.
        content: Raw text of the file.

    Returns:
        ``(components, ecosystem)`` where ecosystem is ``"npm"``, ``"PyPI"``,
        or ``"unknown"`` (with an empty list) when nothing could be parsed.
    """
    fname = filename.lower()

    if "package-lock" in fname:
        return parse_package_lock(content), "npm"
    if fname == "requirements.txt" or fname.endswith("/requirements.txt"):
        return parse_requirements_txt(content), "PyPI"
    if "yarn.lock" in fname:
        return parse_yarn_lock(content), "npm"

    if fname.endswith(".json"):
        comps = parse_package_lock(content)
        if comps:
            return comps, "npm"

    comps = parse_requirements_txt(content)
    if comps:
        return comps, "PyPI"

    return [], "unknown"


# ============================================================================
# SBOM generation (CycloneDX)
# ============================================================================

def generate_sbom(components: list[dict[str, Any]], ecosystem: str) -> dict[str, Any]:
    """Generate a CycloneDX 1.5 SBOM from parsed components.

    Args:
        components: Component dicts carrying at least ``name`` and ``version``;
            ``license`` is optional.
        ecosystem: Ecosystem label; its lower-cased form becomes the purl type.

    Returns:
        A dict with ``bomFormat``, ``specVersion``, ``version``, ``metadata``
        (UTC timestamp and tool info) and ``components``. A component's
        ``licenses`` list is empty when its license is missing, empty or
        ``"unknown"``.
    """
    from datetime import datetime, timezone

    purl_type = ecosystem.lower()
    sbom_components = []
    for comp in components:
        license_value = comp.get("license")
        has_license = bool(license_value) and license_value != "unknown"
        sbom_components.append({
            "type": "library",
            "name": comp["name"],
            "version": comp["version"],
            "purl": f"pkg:{purl_type}/{comp['name']}@{comp['version']}",
            "licenses": [license_value] if has_license else [],
        })

    return {
        "bomFormat": "CycloneDX",
        "specVersion": "1.5",
        "version": 1,
        "metadata": {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "tools": [{"name": "breakpilot-screening", "version": "1.0.0"}],
        },
        "components": sbom_components,
    }


# ============================================================================
# OSV.dev vulnerability scanning
# ============================================================================

async def query_osv(name: str, version: str, ecosystem: str) -> list[dict[str, Any]]:
    """Query the OSV.dev API for vulnerabilities of a single package.

    Best-effort: any failure (network error, non-200 status, unexpected
    payload) is logged as a warning and reported as "no vulnerabilities".

    Args:
        name: Package name.
        version: Exact package version.
        ecosystem: OSV ecosystem label, e.g. ``"npm"`` or ``"PyPI"``.

    Returns:
        The ``vulns`` list of the response, or an empty list.
    """
    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            response = await client.post(
                OSV_API_URL,
                json={
                    "package": {"name": name, "ecosystem": ecosystem},
                    "version": version,
                },
            )
        if response.status_code == 200:
            # "vulns" may be absent or null when nothing is known.
            return response.json().get("vulns") or []
        logger.warning(
            "OSV query for %s@%s returned HTTP %s",
            name, version, response.status_code,
        )
    except Exception as exc:  # noqa: BLE001
        logger.warning("OSV query failed for %s@%s: %s", name, version, exc)
    return []


def map_osv_severity(vuln: dict[str, Any]) -> tuple[str, float]:
    """Extract a severity label and a representative CVSS score from OSV data.

    Reads ``database_specific.severity``. Unknown, missing or non-string
    values fall back to ``"MEDIUM"``; ``"MODERATE"`` is treated as
    ``"MEDIUM"``.

    Args:
        vuln: A single OSV vulnerability record.

    Returns:
        ``(severity, cvss)`` where severity is one of ``CRITICAL``, ``HIGH``,
        ``MEDIUM``, ``LOW`` and cvss is the fixed score assigned to it.
    """
    severity = "MEDIUM"
    db_specific = vuln.get("database_specific")
    if isinstance(db_specific, dict):
        raw = db_specific.get("severity")
        if isinstance(raw, str):
            severity = _SEVERITY_ALIASES.get(raw.upper(), severity)
    return severity, _SEVERITY_CVSS.get(severity, 5.0)


def extract_fix_version(vuln: dict[str, Any], package_name: str) -> Optional[str]:
    """Extract the fixed-in version for ``package_name`` from OSV data.

    Args:
        vuln: A single OSV vulnerability record.
        package_name: Package to look for; compared case-insensitively.

    Returns:
        The first ``fixed`` event value found in the ranges of a matching
        ``affected`` entry, or ``None`` when there is none.
    """
    wanted = package_name.lower()
    for affected in vuln.get("affected") or []:
        pkg = affected.get("package") or {}
        if (pkg.get("name") or "").lower() != wanted:
            continue
        for rng in affected.get("ranges") or []:
            for event in rng.get("events") or []:
                if "fixed" in event:
                    return event["fixed"]
    return None


async def scan_vulnerabilities(
    components: list[dict[str, Any]],
    ecosystem: str,
    *,
    max_components: int = 50,
) -> list[dict[str, Any]]:
    """Scan components for vulnerabilities via OSV.dev.

    Only the first ``max_components`` entries are considered. Entries without
    a concrete version (``latest``, ``unknown``, ``*``) are skipped but still
    count toward that limit. Packages are queried one after another.

    Args:
        components: Component dicts carrying ``name`` and ``version``.
        ecosystem: OSV ecosystem label passed to every query.
        max_components: Upper bound on the number of components considered.

    Returns:
        One issue dict per reported vulnerability, with status ``"OPEN"`` and
        the description truncated to 500 characters.
    """
    issues: list[dict[str, Any]] = []

    for comp in components[:max(max_components, 0)]:
        if comp["version"] in ("latest", "unknown", "*"):
            continue

        vulns = await query_osv(comp["name"], comp["version"], ecosystem)

        for vuln in vulns:
            vuln_id = vuln.get("id", f"OSV-{uuid.uuid4().hex[:8]}")
            # Optional OSV fields may be absent or null.
            aliases = vuln.get("aliases") or []
            cve = next(
                (
                    alias
                    for alias in aliases
                    if isinstance(alias, str) and alias.startswith("CVE-")
                ),
                None,
            )
            severity, cvss = map_osv_severity(vuln)
            fixed_in = extract_fix_version(vuln, comp["name"])

            issues.append({
                "id": str(uuid.uuid4()),
                "severity": severity,
                "title": vuln.get("summary") or vuln_id,
                "description": (vuln.get("details") or "")[:500],
                "cve": cve,
                "cvss": cvss,
                "affected_component": comp["name"],
                "affected_version": comp["version"],
                "fixed_in": fixed_in,
                "remediation": (
                    f"Upgrade {comp['name']} to {fixed_in}"
                    if fixed_in
                    else f"Check {vuln_id} for remediation steps"
                ),
                "status": "OPEN",
            })

    return issues


# ============================================================================
# Service (lookup endpoints; scan persistence stays in the route module)
# ============================================================================


class ScreeningService:
    """Read-side queries for screenings and the security issues they found."""

    def __init__(self, db: Session) -> None:
        """Store the database session used by every lookup."""
        self.db = db

    def get_screening(self, screening_id: str) -> ScreeningResponse:
        """Load one screening together with its issues and SBOM components.

        Args:
            screening_id: Value matched against ``compliance_screenings.id``.

        Returns:
            The screening, its security issues, and the components of the
            stored SBOM, each annotated with the issues recorded for it.

        Raises:
            NotFoundError: No screening row exists for ``screening_id``.
        """
        params = {"id": screening_id}

        screening_query = text(
            "SELECT id, status, sbom_format, sbom_version, "
            "total_components, total_issues, critical_issues, high_issues, "
            "medium_issues, low_issues, sbom_data, started_at, completed_at "
            "FROM compliance_screenings WHERE id = :id"
        )
        row = self.db.execute(screening_query, params).fetchone()
        if not row:
            raise NotFoundError("Screening not found")

        issues_query = text(
            "SELECT id, severity, title, description, cve, cvss, "
            "affected_component, affected_version, fixed_in, remediation, status "
            "FROM compliance_security_issues WHERE screening_id = :id"
        )
        issue_rows = self.db.execute(issues_query, params).fetchall()

        issues = []
        for rec in issue_rows:
            issues.append(
                SecurityIssueResponse(
                    id=str(rec[0]),
                    severity=rec[1],
                    title=rec[2],
                    description=rec[3],
                    cve=rec[4],
                    cvss=rec[5],
                    affected_component=rec[6],
                    affected_version=rec[7],
                    fixed_in=rec[8],
                    remediation=rec[9],
                    status=rec[10],
                )
            )

        # Group a compact summary of each issue under its component name.
        vulns_by_component: dict[str, list[dict[str, Any]]] = {}
        for issue in issues:
            summary = {
                "id": issue.cve or issue.id,
                "cve": issue.cve,
                "severity": issue.severity,
                "title": issue.title,
                "cvss": issue.cvss,
                "fixedIn": issue.fixed_in,
            }
            vulns_by_component.setdefault(issue.affected_component, []).append(summary)

        sbom = row[10] or {}
        components = []
        for entry in sbom.get("components", []):
            components.append(
                SBOMComponentResponse(
                    name=entry["name"],
                    version=entry["version"],
                    type=entry.get("type", "library"),
                    purl=entry.get("purl", ""),
                    licenses=entry.get("licenses", []),
                    vulnerabilities=vulns_by_component.get(entry["name"], []),
                )
            )

        started_at = str(row[11]) if row[11] else None
        completed_at = str(row[12]) if row[12] else None

        return ScreeningResponse(
            id=str(row[0]),
            status=row[1],
            sbom_format=row[2] or "CycloneDX",
            sbom_version=row[3] or "1.5",
            total_components=row[4] or 0,
            total_issues=row[5] or 0,
            critical_issues=row[6] or 0,
            high_issues=row[7] or 0,
            medium_issues=row[8] or 0,
            low_issues=row[9] or 0,
            components=components,
            issues=issues,
            started_at=started_at,
            completed_at=completed_at,
        )

    def list_screenings(self, tenant_id: str) -> ScreeningListResponse:
        """List the screenings of one tenant, newest first.

        Args:
            tenant_id: Value matched against ``compliance_screenings.tenant_id``.

        Returns:
            Summary entries ordered by ``created_at`` descending, plus their
            count as ``total``.
        """
        listing_query = text(
            "SELECT id, status, total_components, total_issues, "
            "critical_issues, high_issues, medium_issues, low_issues, "
            "started_at, completed_at, created_at "
            "FROM compliance_screenings "
            "WHERE tenant_id = :tenant_id "
            "ORDER BY created_at DESC"
        )
        rows = self.db.execute(listing_query, {"tenant_id": tenant_id}).fetchall()

        screenings = []
        for rec in rows:
            screenings.append({
                "id": str(rec[0]),
                "status": rec[1],
                "total_components": rec[2],
                "total_issues": rec[3],
                "critical_issues": rec[4],
                "high_issues": rec[5],
                "medium_issues": rec[6],
                "low_issues": rec[7],
                "started_at": str(rec[8]) if rec[8] else None,
                "completed_at": str(rec[9]) if rec[9] else None,
                "created_at": str(rec[10]),
            })

        return ScreeningListResponse(screenings=screenings, total=len(screenings))