Files
breakpilot-compliance/backend-compliance/compliance/services/screening_service.py
Sharang Parnerkar e613af1a7d refactor(backend/api): extract ScreeningService (Step 4 — file 8 of 18)
compliance/api/screening_routes.py (597 LOC) -> 233 LOC thin routes +
353-line ScreeningService + 60-line schemas file. Manages SBOM generation
(CycloneDX 1.5) and OSV.dev vulnerability scanning.

Pure helpers (parse_package_lock, parse_requirements_txt, parse_yarn_lock,
detect_and_parse, generate_sbom, query_osv, map_osv_severity,
extract_fix_version, scan_vulnerabilities) moved to the service module.
The two lookup endpoints (get_screening, list_screenings) delegate to
the new ScreeningService class.

Test-mock compatibility: tests/test_screening_routes.py uses
`patch("compliance.api.screening_routes.SessionLocal", ...)` and
`patch("compliance.api.screening_routes.scan_vulnerabilities", ...)`.
Both names are re-imported and re-exported from the route module so the
patches still take effect. The scan handler keeps direct
`SessionLocal()` usage; the lookup handlers also use SessionLocal so the
test mocks intercept them.

Latent bug fixed: the original scan handler had
    text = content.decode("utf-8")
on line 339, shadowing the imported `sqlalchemy.text` so that the
subsequent `text("INSERT ...")` calls would have raised at runtime.
The variable is now named `file_text`. Allowed under "minor behavior
fixes" — the bug was unreachable in tests because they always patched
SessionLocal.

Verified:
  - 240/240 pytest pass
  - OpenAPI 360/484 unchanged
  - mypy compliance/ -> Success on 134 source files
  - screening_routes.py 597 -> 233 LOC
  - Hard-cap violations: 11 -> 10

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-07 20:03:16 +02:00

385 lines
14 KiB
Python

# mypy: disable-error-code="arg-type,assignment,union-attr,no-any-return"
"""
System screening service — SBOM generation + OSV vulnerability scan.
Phase 1 Step 4: pure parsing/SBOM/OSV helpers extracted from
``compliance.api.screening_routes``. Persistence and the streaming scan
handler stay in the route module so existing test mocks
(``patch("compliance.api.screening_routes.SessionLocal", ...)``,
``patch("compliance.api.screening_routes.scan_vulnerabilities", ...)``)
keep working without test edits.
The screening_routes module re-exports these helpers so the legacy
import path ``from compliance.api.screening_routes import parse_package_lock``
continues to work.
"""
import json
import logging
import re
import uuid
from typing import Any, Optional
import httpx
from sqlalchemy import text
from sqlalchemy.orm import Session
from compliance.domain import NotFoundError
from compliance.schemas.screening import (
ScreeningListResponse,
ScreeningResponse,
SBOMComponentResponse,
SecurityIssueResponse,
)
# Module-level logger; query_osv uses it to report failed OSV lookups.
logger = logging.getLogger(__name__)
# Endpoint that query_osv POSTs single-package vulnerability queries to.
OSV_API_URL = "https://api.osv.dev/v1/query"
# ============================================================================
# Dependency parsing
# ============================================================================
def parse_package_lock(content: str) -> list[dict[str, Any]]:
    """Parse an npm ``package-lock.json`` document into component records.

    The ``packages`` mapping (keyed by install path such as
    ``node_modules/a/node_modules/b``) is read first; the package name is
    the text after the last ``node_modules/`` segment. If that produces no
    components, the older ``dependencies`` mapping is used as a fallback.

    Args:
        content: Raw text of the lockfile.

    Returns:
        A list of dicts with ``name``, ``version``, ``type``, ``ecosystem``
        and ``license`` keys. The list is empty when ``content`` is not
        valid JSON, is not a JSON object, or lists no usable packages.
    """
    try:
        data = json.loads(content)
    except json.JSONDecodeError:
        return []
    # Valid JSON is not necessarily an object ("[]", "null", "42", ...).
    # Such a document cannot be a lockfile, so treat it like bad input
    # instead of failing on ``.get``.
    if not isinstance(data, dict):
        return []

    components: list[dict[str, Any]] = []

    packages = data.get("packages")
    if isinstance(packages, dict):
        for path, info in packages.items():
            # The empty key is the root project; non-dict entries are
            # malformed and carry no version to report.
            if not path or not isinstance(info, dict):
                continue
            # split() returns [path] unchanged when the marker is absent,
            # so this also covers paths without a node_modules/ prefix.
            name = path.split("node_modules/")[-1]
            version = info.get("version", "unknown")
            if name and version != "unknown":
                components.append({
                    "name": name,
                    "version": version,
                    "type": "library",
                    "ecosystem": "npm",
                    "license": info.get("license", "unknown"),
                })

    if not components:
        # Fallback: v1 format (dependencies field)
        dependencies = data.get("dependencies")
        if isinstance(dependencies, dict):
            for name, info in dependencies.items():
                if isinstance(info, dict):
                    components.append({
                        "name": name,
                        "version": info.get("version", "unknown"),
                        "type": "library",
                        "ecosystem": "npm",
                        "license": "unknown",
                    })

    return components
def parse_requirements_txt(content: str) -> list[dict[str, Any]]:
    """Parse ``requirements.txt`` text into component records.

    Blank lines, comment lines (``#``) and option lines (anything starting
    with ``-``, e.g. ``-r base.txt``) are skipped. Inline comments and
    environment markers (the part after ``;``) are ignored, and an extras
    suffix such as ``[standard]`` is accepted but not part of the recorded
    name. Lines that are neither ``name <op> version`` nor a bare name
    (URLs, local paths, ...) are skipped.

    Args:
        content: Raw text of the requirements file.

    Returns:
        A list of dicts with ``name``, ``version``, ``type``, ``ecosystem``
        and ``license`` keys. ``version`` is the version that follows the
        first comparison operator (whatever that operator is), or
        ``"latest"`` for a requirement without a version specifier.
    """
    # name, optional [extras], comparison operator, version
    pinned_re = re.compile(
        r"^([a-zA-Z0-9_.-]+)\s*(?:\[[^\]]*\])?\s*([>=<~!]+)\s*([a-zA-Z0-9_.*-]+)"
    )
    # name with optional [extras] and nothing else
    bare_re = re.compile(r"^([a-zA-Z0-9_.-]+)\s*(?:\[[^\]]*\])?$")

    components: list[dict[str, Any]] = []
    for raw_line in content.splitlines():
        line = raw_line.strip()
        if not line or line.startswith(("#", "-")):
            continue
        # Whitespace followed by "#" starts an inline comment; a "#" with no
        # preceding whitespace (e.g. a URL fragment) is left alone.
        line = re.split(r"\s+#", line, maxsplit=1)[0]
        # Drop an environment marker such as `; python_version < "3.8"`.
        line = line.split(";", 1)[0].strip()

        pinned = pinned_re.match(line)
        if pinned:
            name, version = pinned.group(1), pinned.group(3)
        else:
            bare = bare_re.match(line)
            if not bare:
                continue
            name, version = bare.group(1), "latest"

        components.append({
            "name": name,
            "version": version,
            "type": "library",
            "ecosystem": "PyPI",
            "license": "unknown",
        })
    return components
def parse_yarn_lock(content: str) -> list[dict[str, Any]]:
    """Parse ``yarn.lock`` text into component records (basic).

    An entry is recognised by an unindented header line ending in ``:``
    whose first specifier has the form ``name@range``; the entry's version
    comes from the following ``version <value>`` line. Scoped names
    (``@scope/pkg``) and headers listing several comma-separated specifiers
    are supported; the name is taken from the first specifier.

    Args:
        content: Raw text of the lockfile.

    Returns:
        A list of dicts with ``name``, ``version``, ``type``, ``ecosystem``
        and ``license`` keys, one per entry that has a version line.
    """
    # Optional opening quote, then the package name. A leading "@" belongs
    # to the name (npm scope); the next "@" separates name from range.
    header_re = re.compile(r'^"?(@?[^@"\s,:]+)@')
    version_re = re.compile(r'^version\s+"?([^"]+)"?')

    components: list[dict[str, Any]] = []
    current_name: Optional[str] = None
    for line in content.split("\n"):
        stripped = line.strip()
        # Entry headers start in column 0; indented lines are properties of
        # the current entry and must never be mistaken for a header.
        is_header_shape = (
            bool(stripped)
            and not line[0].isspace()
            and not stripped.startswith("#")
            and stripped.endswith(":")
        )
        header = header_re.match(line) if is_header_shape else None
        if header:
            current_name = header.group(1)
        elif current_name and stripped.startswith("version "):
            version_match = version_re.match(stripped)
            if version_match:
                components.append({
                    "name": current_name,
                    "version": version_match.group(1),
                    "type": "library",
                    "ecosystem": "npm",
                    "license": "unknown",
                })
            # One version line per entry: wait for the next header.
            current_name = None
    return components
def detect_and_parse(filename: str, content: str) -> tuple[list[dict[str, Any]], str]:
    """Pick a dependency parser from the file name and run it on ``content``.

    Returns:
        ``(components, ecosystem)`` where ``ecosystem`` is ``"npm"``,
        ``"PyPI"`` or, when nothing could be parsed, ``"unknown"`` together
        with an empty component list.
    """
    lowered = filename.lower()

    # A name ending in "package-lock.json" necessarily contains
    # "package-lock", so the substring test alone covers both cases.
    if "package-lock" in lowered:
        return parse_package_lock(content), "npm"

    is_requirements = (
        lowered.endswith("/requirements.txt") or lowered == "requirements.txt"
    )
    if is_requirements:
        return parse_requirements_txt(content), "PyPI"

    if "yarn.lock" in lowered:
        return parse_yarn_lock(content), "npm"

    # Any other JSON file gets one attempt as a package-lock document.
    if lowered.endswith(".json"):
        npm_components = parse_package_lock(content)
        if npm_components:
            return npm_components, "npm"

    # Last resort: see whether the text reads as a requirements file.
    pypi_components = parse_requirements_txt(content)
    if not pypi_components:
        return [], "unknown"
    return pypi_components, "PyPI"
# ============================================================================
# SBOM generation (CycloneDX)
# ============================================================================
def generate_sbom(components: list[dict[str, Any]], ecosystem: str) -> dict[str, Any]:
    """Generate a CycloneDX 1.5 SBOM document from parsed components.

    Args:
        components: Component dicts; each must have ``name`` and
            ``version`` and may have ``license``.
        ecosystem: Ecosystem label; its lower-cased form is used as the
            package-URL type (``npm``, ``pypi``, ...).

    Returns:
        A dict with ``bomFormat``, ``specVersion``, ``version``,
        ``metadata`` (UTC timestamp and tool info) and ``components``.
        Every component entry has ``type``, ``name``, ``version``, ``purl``
        and ``licenses``; ``licenses`` is empty when the license is
        missing, empty or ``"unknown"``.

    Raises:
        KeyError: If a component lacks ``name`` or ``version``.
    """
    from datetime import datetime, timezone
    from urllib.parse import quote

    purl_type = ecosystem.lower()
    sbom_components = []
    for comp in components:
        name = comp["name"]
        version = comp["version"]
        # In a purl, "@" separates name from version, so an "@" inside the
        # name (npm scopes such as "@babel/core") must be percent-encoded.
        # "/" is kept because it separates namespace and name.
        purl = f"pkg:{purl_type}/{quote(str(name), safe='/')}@{version}"
        license_id = comp.get("license")
        sbom_components.append({
            "type": "library",
            "name": name,
            "version": version,
            "purl": purl,
            # A missing license is treated the same as "unknown".
            "licenses": (
                [license_id] if license_id and license_id != "unknown" else []
            ),
        })

    return {
        "bomFormat": "CycloneDX",
        "specVersion": "1.5",
        "version": 1,
        "metadata": {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "tools": [{"name": "breakpilot-screening", "version": "1.0.0"}],
        },
        "components": sbom_components,
    }
# ============================================================================
# OSV.dev vulnerability scanning
# ============================================================================
async def query_osv(name: str, version: str, ecosystem: str) -> list[dict[str, Any]]:
    """Query the OSV.dev API for vulnerabilities of a single package.

    This is a best-effort lookup: any failure (network error, timeout,
    non-200 status, undecodable body) is logged as a warning and reported
    to the caller as "no vulnerabilities".

    Args:
        name: Package name.
        version: Exact package version to check.
        ecosystem: Ecosystem name sent to OSV (e.g. ``"npm"``, ``"PyPI"``).

    Returns:
        The ``vulns`` list from the response, or ``[]`` when the response
        has none or the lookup failed.
    """
    payload = {
        "package": {"name": name, "ecosystem": ecosystem},
        "version": version,
    }
    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            response = await client.post(OSV_API_URL, json=payload)
        if response.status_code == 200:
            # `or []` also covers an explicit null value for "vulns".
            return response.json().get("vulns") or []
        # Make rejected or rate-limited lookups visible; without this a
        # failing lookup cannot be told apart from a clean package.
        logger.warning(
            "OSV query for %s@%s returned HTTP %s",
            name,
            version,
            response.status_code,
        )
    except Exception as exc:  # noqa: BLE001
        logger.warning("OSV query failed for %s@%s: %s", name, version, exc)
    return []
def map_osv_severity(vuln: dict[str, Any]) -> tuple[str, float]:
    """Derive a severity label and a representative score for an OSV record.

    The label is read from ``database_specific.severity`` (case-insensitive)
    when it is one of CRITICAL/HIGH/MEDIUM/LOW; anything else, or no value
    at all, falls back to MEDIUM. The score is a fixed number per label.

    Returns:
        ``(severity, score)``, e.g. ``("HIGH", 7.5)``.
    """
    score_by_level = {"CRITICAL": 9.5, "HIGH": 7.5, "MEDIUM": 5.0, "LOW": 2.5}

    level = "MEDIUM"
    extra = vuln.get("database_specific", {})
    if "severity" in extra:
        candidate = extra["severity"].upper()
        # Unrecognised labels keep the MEDIUM default.
        if candidate in score_by_level:
            level = candidate

    return level, score_by_level[level]
def extract_fix_version(vuln: dict[str, Any], package_name: str) -> Optional[str]:
    """Return the first ``fixed`` version OSV lists for ``package_name``.

    Only ``affected`` entries whose package name equals ``package_name``
    (compared case-insensitively) are considered; their ranges and events
    are scanned in document order.

    Returns:
        The value of the first ``fixed`` event found, or ``None``.
    """
    for entry in vuln.get("affected", []):
        entry_name = entry.get("package", {}).get("name", "")
        if entry_name.lower() != package_name.lower():
            continue
        # Flatten ranges -> events lazily, keeping document order.
        events = (
            event
            for span in entry.get("ranges", [])
            for event in span.get("events", [])
        )
        for event in events:
            if "fixed" in event:
                return event["fixed"]
    return None
async def scan_vulnerabilities(components: list[dict[str, Any]], ecosystem: str) -> list[dict[str, Any]]:
    """Look up the first 50 components on OSV.dev and collect issue records.

    Components whose version is ``latest``, ``unknown`` or ``*`` are not
    queried (they still count towards the 50-component window). Each
    vulnerability returned by ``query_osv`` becomes one issue dict with
    status ``OPEN``.

    Returns:
        A list of issue dicts, in component order then OSV result order.
    """
    findings: list[dict[str, Any]] = []
    window = min(len(components), 50)

    for component in components[:window]:
        pkg_version = component["version"]
        if pkg_version in ("latest", "unknown", "*"):
            continue
        pkg_name = component["name"]

        for advisory in await query_osv(pkg_name, pkg_version, ecosystem):
            advisory_id = advisory.get("id", f"OSV-{uuid.uuid4().hex[:8]}")

            # First alias that is a CVE identifier, if any.
            cve_id = None
            for alias in advisory.get("aliases", []):
                if alias.startswith("CVE-"):
                    cve_id = alias
                    break

            level, score = map_osv_severity(advisory)
            fix_version = extract_fix_version(advisory, pkg_name)
            if fix_version:
                advice = f"Upgrade {pkg_name} to {fix_version}"
            else:
                advice = f"Check {advisory_id} for remediation steps"

            findings.append({
                "id": str(uuid.uuid4()),
                "severity": level,
                "title": advisory.get("summary", advisory_id),
                # Stored description is capped at 500 characters.
                "description": advisory.get("details", "")[:500],
                "cve": cve_id,
                "cvss": score,
                "affected_component": pkg_name,
                "affected_version": pkg_version,
                "fixed_in": fix_version,
                "remediation": advice,
                "status": "OPEN",
            })

    return findings
# ============================================================================
# Service (lookup endpoints; scan persistence stays in the route module)
# ============================================================================
class ScreeningService:
    """Read-side queries for stored screenings and their security issues."""

    def __init__(self, db: Session) -> None:
        # Session through which every query of this service is executed.
        self.db = db

    def get_screening(self, screening_id: str) -> ScreeningResponse:
        """Load one screening together with its SBOM components and issues.

        Args:
            screening_id: Value matched against ``compliance_screenings.id``.

        Returns:
            The screening summary, its issues, and its SBOM components, each
            component annotated with the issues that name it.

        Raises:
            NotFoundError: If no screening row has this id.
        """
        screening_sql = text(
            "SELECT id, status, sbom_format, sbom_version, "
            "total_components, total_issues, critical_issues, high_issues, "
            "medium_issues, low_issues, sbom_data, started_at, completed_at "
            "FROM compliance_screenings WHERE id = :id"
        )
        screening = self.db.execute(screening_sql, {"id": screening_id}).fetchone()
        if not screening:
            raise NotFoundError("Screening not found")

        issues_sql = text(
            "SELECT id, severity, title, description, cve, cvss, "
            "affected_component, affected_version, fixed_in, remediation, status "
            "FROM compliance_security_issues WHERE screening_id = :id"
        )
        issue_rows = self.db.execute(issues_sql, {"id": screening_id}).fetchall()

        # Column positions follow the SELECT list of issues_sql.
        issues = []
        for rec in issue_rows:
            issues.append(
                SecurityIssueResponse(
                    id=str(rec[0]),
                    severity=rec[1],
                    title=rec[2],
                    description=rec[3],
                    cve=rec[4],
                    cvss=rec[5],
                    affected_component=rec[6],
                    affected_version=rec[7],
                    fixed_in=rec[8],
                    remediation=rec[9],
                    status=rec[10],
                )
            )

        # Group issue summaries by the component name they refer to.
        vulns_by_component: dict[str, list[dict[str, Any]]] = {}
        for found in issues:
            bucket = vulns_by_component.setdefault(found.affected_component, [])
            bucket.append({
                "id": found.cve or found.id,
                "cve": found.cve,
                "severity": found.severity,
                "title": found.title,
                "cvss": found.cvss,
                "fixedIn": found.fixed_in,
            })

        # Index 10 is sbom_data; a NULL/empty value means no components.
        stored_sbom = screening[10] or {}
        components = []
        for entry in stored_sbom.get("components", []):
            components.append(
                SBOMComponentResponse(
                    name=entry["name"],
                    version=entry["version"],
                    type=entry.get("type", "library"),
                    purl=entry.get("purl", ""),
                    licenses=entry.get("licenses", []),
                    vulnerabilities=vulns_by_component.get(entry["name"], []),
                )
            )

        started = str(screening[11]) if screening[11] else None
        completed = str(screening[12]) if screening[12] else None
        return ScreeningResponse(
            id=str(screening[0]),
            status=screening[1],
            sbom_format=screening[2] or "CycloneDX",
            sbom_version=screening[3] or "1.5",
            total_components=screening[4] or 0,
            total_issues=screening[5] or 0,
            critical_issues=screening[6] or 0,
            high_issues=screening[7] or 0,
            medium_issues=screening[8] or 0,
            low_issues=screening[9] or 0,
            components=components,
            issues=issues,
            started_at=started,
            completed_at=completed,
        )

    def list_screenings(self, tenant_id: str) -> ScreeningListResponse:
        """List a tenant's screenings, newest first.

        Args:
            tenant_id: Value matched against ``compliance_screenings.tenant_id``.

        Returns:
            One summary dict per screening plus the total count.
        """
        listing_sql = text(
            "SELECT id, status, total_components, total_issues, "
            "critical_issues, high_issues, medium_issues, low_issues, "
            "started_at, completed_at, created_at "
            "FROM compliance_screenings "
            "WHERE tenant_id = :tenant_id "
            "ORDER BY created_at DESC"
        )
        records = self.db.execute(listing_sql, {"tenant_id": tenant_id}).fetchall()

        # Column positions follow the SELECT list of listing_sql.
        screenings = []
        for rec in records:
            screenings.append({
                "id": str(rec[0]),
                "status": rec[1],
                "total_components": rec[2],
                "total_issues": rec[3],
                "critical_issues": rec[4],
                "high_issues": rec[5],
                "medium_issues": rec[6],
                "low_issues": rec[7],
                "started_at": str(rec[8]) if rec[8] else None,
                "completed_at": str(rec[9]) if rec[9] else None,
                "created_at": str(rec[10]),
            })

        return ScreeningListResponse(screenings=screenings, total=len(screenings))