compliance-scanner-agent/compliance-agent/src/pipeline/sbom/mod.rs

mod cargo_audit;
mod syft;

use std::path::Path;

use compliance_core::models::{SbomEntry, ScanType, VulnRef};
use compliance_core::traits::{ScanOutput, Scanner};
use compliance_core::CoreError;

/// Stateless scanner that produces SBOM entries for a repository.
///
/// Its [`Scanner`] implementation runs Syft for the dependency inventory and
/// cargo-audit for Rust advisories, merging both into one list of entries.
pub struct SbomScanner;

impl Scanner for SbomScanner {
    fn name(&self) -> &str {
        "sbom"
    }

    fn scan_type(&self) -> ScanType {
        ScanType::Sbom
    }

    #[tracing::instrument(skip_all)]
    async fn scan(&self, repo_path: &Path, repo_id: &str) -> Result<ScanOutput, CoreError> {
        let mut entries = Vec::new();

        // Generate missing lock files so Syft can resolve the full dependency tree
        generate_lockfiles(repo_path).await;

        // Run syft for SBOM generation
        match syft::run_syft(repo_path, repo_id).await {
            Ok(syft_entries) => entries.extend(syft_entries),
            Err(e) => tracing::warn!("syft failed: {e}"),
        }

        // Enrich Cargo entries with license info from cargo metadata
        enrich_cargo_licenses(repo_path, &mut entries).await;

        // Run cargo-audit for Rust-specific vulns
        match cargo_audit::run_cargo_audit(repo_path, repo_id).await {
            Ok(vulns) => merge_audit_vulns(&mut entries, vulns),
            Err(e) => tracing::warn!("cargo-audit skipped: {e}"),
        }

        Ok(ScanOutput {
            findings: Vec::new(),
            sbom_entries: entries,
        })
    }
}

/// Generate missing lock files so Syft can resolve the full dependency tree.
///
/// This handles repos that gitignore their lock files (common for Rust libraries).
/// Every step is best-effort: a missing tool or a failing command is logged as a
/// warning and the function carries on with the next ecosystem.
///
/// Steps, each run with `repo_path` as the working directory:
/// - Cargo: `cargo generate-lockfile` when `Cargo.toml` exists without `Cargo.lock`.
/// - pip: `pip-compile` writing `requirements.txt` when `pyproject.toml` exists and
///   none of `requirements.txt`, `requirements-lock.txt`, `poetry.lock` or
///   `Pipfile.lock` is present.
/// - npm: `npm install --package-lock-only --ignore-scripts` when `package.json`
///   exists without `package-lock.json`, `yarn.lock` or `pnpm-lock.yaml`.
#[tracing::instrument(skip_all)]
async fn generate_lockfiles(repo_path: &Path) {
    // Single place for the "is this file present in the repo root?" check.
    let exists = |file: &str| repo_path.join(file).exists();

    // Cargo: generate Cargo.lock if Cargo.toml exists without it
    if exists("Cargo.toml") && !exists("Cargo.lock") {
        tracing::info!("generating Cargo.lock for SBOM scan");
        let result = tokio::process::Command::new("cargo")
            .args(["generate-lockfile"])
            .current_dir(repo_path)
            // Blank out any configured rustc wrapper for this invocation.
            .env("RUSTC_WRAPPER", "")
            .output()
            .await;
        match result {
            Ok(o) if o.status.success() => tracing::info!("Cargo.lock generated"),
            Ok(o) => tracing::warn!(
                "cargo generate-lockfile failed: {}",
                String::from_utf8_lossy(&o.stderr)
            ),
            Err(e) => tracing::warn!("cargo generate-lockfile error: {e}"),
        }
    }

    // pip: compile a requirements.txt from pyproject.toml when no lock-like file
    // exists. Projects that only ship setup.py / setup.cfg are not compiled here
    // and are left for Syft to parse directly, so nothing is logged for them.
    // NOTE(review): pip-compile may invoke the project's build backend to read
    // metadata, which can run repo-provided code — confirm this is acceptable
    // for untrusted repositories.
    let has_pip_lock = [
        "requirements.txt",
        "requirements-lock.txt",
        "poetry.lock",
        "Pipfile.lock",
    ]
    .iter()
    .any(|&file| exists(file));
    if exists("pyproject.toml") && !has_pip_lock {
        tracing::info!("attempting to generate pip requirements for SBOM scan");
        let result = tokio::process::Command::new("pip-compile")
            .args([
                "--quiet",
                "--output-file",
                "requirements.txt",
                "pyproject.toml",
            ])
            .current_dir(repo_path)
            .output()
            .await;
        // Report spawn errors and non-zero exits separately (with details), the
        // same way the cargo and npm steps do.
        match result {
            Ok(o) if o.status.success() => {
                tracing::info!("requirements.txt generated via pip-compile")
            }
            Ok(o) => tracing::warn!(
                "pip-compile failed, Syft will parse pyproject.toml directly: {}",
                String::from_utf8_lossy(&o.stderr)
            ),
            Err(e) => tracing::warn!(
                "pip-compile could not be run, Syft will parse pyproject.toml directly: {e}"
            ),
        }
    }

    // npm: generate package-lock.json if package.json exists without any lock
    let has_npm_lock = ["package-lock.json", "yarn.lock", "pnpm-lock.yaml"]
        .iter()
        .any(|&file| exists(file));
    if exists("package.json") && !has_npm_lock {
        tracing::info!("generating package-lock.json for SBOM scan");
        let result = tokio::process::Command::new("npm")
            // --ignore-scripts keeps package lifecycle scripts from running.
            .args(["install", "--package-lock-only", "--ignore-scripts"])
            .current_dir(repo_path)
            .output()
            .await;
        match result {
            Ok(o) if o.status.success() => tracing::info!("package-lock.json generated"),
            Ok(o) => tracing::warn!(
                "npm install --package-lock-only failed: {}",
                String::from_utf8_lossy(&o.stderr)
            ),
            Err(e) => tracing::warn!("npm lock generation error: {e}"),
        }
    }
}

/// Fill in missing licenses on Cargo SBOM entries using `cargo metadata`.
///
/// Syft doesn't read license data from Cargo.lock, so the license of each
/// Cargo entry is looked up by `(name, version)` in the metadata output.
/// Entries that already carry a license, or that belong to another package
/// manager, are left untouched. Any failure (spawn error, non-zero exit,
/// unparsable JSON) is logged as a warning and the entries stay as they were.
#[tracing::instrument(skip_all)]
async fn enrich_cargo_licenses(repo_path: &Path, entries: &mut [SbomEntry]) {
    // Nothing to do without a manifest or without any Cargo entries.
    if !repo_path.join("Cargo.toml").exists()
        || !entries.iter().any(|e| e.package_manager == "cargo")
    {
        return;
    }

    let run = tokio::process::Command::new("cargo")
        .args(["metadata", "--format-version", "1"])
        .current_dir(repo_path)
        .env("RUSTC_WRAPPER", "")
        .output()
        .await;

    let stdout = match run {
        Err(e) => {
            tracing::warn!("cargo metadata error: {e}");
            return;
        }
        Ok(out) if !out.status.success() => {
            tracing::warn!(
                "cargo metadata failed: {}",
                String::from_utf8_lossy(&out.stderr)
            );
            return;
        }
        Ok(out) => out.stdout,
    };

    let meta = match serde_json::from_slice::<CargoMetadata>(&stdout) {
        Ok(parsed) => parsed,
        Err(e) => {
            tracing::warn!("failed to parse cargo metadata: {e}");
            return;
        }
    };

    // Lookup table (name, version) -> license, borrowing from `meta`.
    // Packages without a license are simply absent from the table.
    let mut licenses: std::collections::HashMap<(&str, &str), &str> =
        std::collections::HashMap::new();
    for pkg in &meta.packages {
        if let Some(license) = pkg.license.as_deref() {
            licenses.insert((pkg.name.as_str(), pkg.version.as_str()), license);
        }
    }

    let unlicensed_cargo = entries
        .iter_mut()
        .filter(|e| e.package_manager == "cargo" && e.license.is_none());
    for entry in unlicensed_cargo {
        let key = (entry.name.as_str(), entry.version.as_str());
        if let Some(license) = licenses.get(&key) {
            entry.license = Some(license.to_string());
        }
    }
}

/// Attach cargo-audit advisories to the matching SBOM entries.
///
/// Each advisory is added to the first entry whose `name` equals the
/// advisory's `package`; the comparison is by name only. Advisories with no
/// matching entry are dropped. The recorded reference has source
/// `"cargo-audit"` and no severity.
fn merge_audit_vulns(entries: &mut [SbomEntry], vulns: Vec<cargo_audit::AuditVuln>) {
    for vuln in vulns {
        let Some(target) = entries.iter_mut().find(|entry| entry.name == vuln.package) else {
            continue;
        };

        let reference = VulnRef {
            id: vuln.id,
            source: "cargo-audit".to_string(),
            severity: None,
            url: Some(vuln.url),
        };
        target.known_vulnerabilities.push(reference);
    }
}

// Cargo metadata types
/// Subset of the `cargo metadata --format-version 1` JSON output that this
/// module deserializes; only the package list is declared here.
#[derive(serde::Deserialize)]
struct CargoMetadata {
    /// Packages listed in the metadata output.
    packages: Vec<CargoPackage>,
}

/// One package from the `cargo metadata` output, reduced to the fields used
/// for the license lookup.
#[derive(serde::Deserialize)]
struct CargoPackage {
    /// Package name; first half of the `(name, version)` lookup key.
    name: String,
    /// Package version string; second half of the lookup key.
    version: String,
    /// License string, if the metadata provides one.
    license: Option<String>,
}