Files
compliance-scanner-agent/compliance-agent/src/pipeline/orchestrator.rs
Sharang Parnerkar a9d039dad3
All checks were successful
CI / Check (push) Has been skipped
CI / Detect Changes (push) Successful in 3s
CI / Deploy Agent (push) Successful in 2s
CI / Deploy Dashboard (push) Successful in 2s
CI / Deploy Docs (push) Has been skipped
CI / Deploy MCP (push) Has been skipped
fix: stop storing code review findings in dashboard (#22)
2026-03-18 15:18:07 +00:00

409 lines
14 KiB
Rust

use std::sync::Arc;
use mongodb::bson::doc;
use tracing::Instrument;
use compliance_core::models::*;
use compliance_core::traits::Scanner;
use compliance_core::AgentConfig;
use crate::database::Database;
use crate::error::AgentError;
use crate::llm::LlmClient;
use crate::pipeline::cve::CveScanner;
use crate::pipeline::git::GitOps;
use crate::pipeline::gitleaks::GitleaksScanner;
use crate::pipeline::lint::LintScanner;
use crate::pipeline::patterns::{GdprPatternScanner, OAuthPatternScanner};
use crate::pipeline::sbom::SbomScanner;
use crate::pipeline::semgrep::SemgrepScanner;
/// Context from graph analysis passed to LLM triage for enhanced filtering
#[derive(Debug)]
// NOTE(review): allow(dead_code) suggests some fields are not read yet and are
// kept for upcoming triage heuristics — confirm before removing any of them.
#[allow(dead_code)]
pub struct GraphContext {
/// Total number of nodes in the built code graph.
pub node_count: u32,
/// Total number of edges in the built code graph.
pub edge_count: u32,
/// Number of communities (clusters) detected in the graph.
pub community_count: u32,
/// Impact-analysis results derived from the graph.
pub impacts: Vec<compliance_core::models::graph::ImpactAnalysis>,
}
/// Drives the end-to-end compliance scan for a repository: change detection,
/// static scanners, SBOM/CVE collection, LLM triage, persistence of results,
/// and tracker-issue creation.
pub struct PipelineOrchestrator {
/// Agent-wide configuration (git clone base path, SearXNG URL, NVD key, …).
pub(super) config: AgentConfig,
/// Handle to the MongoDB collections (repositories, scan_runs, findings, …).
pub(super) db: Database,
/// Shared LLM client used by the triage stage.
pub(super) llm: Arc<LlmClient>,
/// HTTP client reused by the CVE scanner.
pub(super) http: reqwest::Client,
}
impl PipelineOrchestrator {
pub fn new(
config: AgentConfig,
db: Database,
llm: Arc<LlmClient>,
http: reqwest::Client,
) -> Self {
Self {
config,
db,
llm,
http,
}
}
/// Execute a full scan for `repo_id`, creating a `ScanRun` record up front
/// and stamping it `completed`/`failed` once the pipeline finishes.
///
/// Errors from repository lookup, scan-run bookkeeping, or the pipeline
/// itself are propagated to the caller (the pipeline error is recorded on
/// the scan run first).
#[tracing::instrument(skip_all, fields(repo_id = %repo_id, trigger = ?trigger))]
pub async fn run(&self, repo_id: &str, trigger: ScanTrigger) -> Result<(), AgentError> {
// Look up the repository; an unparseable or unknown id is a hard error.
let repo = self
.db
.repositories()
.find_one(doc! { "_id": mongodb::bson::oid::ObjectId::parse_str(repo_id).map_err(|e| AgentError::Other(e.to_string()))? })
.await?
.ok_or_else(|| AgentError::Other(format!("Repository {repo_id} not found")))?;
// Create scan run
let scan_run = ScanRun::new(repo_id.to_string(), trigger);
let insert = self.db.scan_runs().insert_one(&scan_run).await?;
// NOTE(review): if inserted_id were ever not an ObjectId this silently
// becomes "", which turns every later update_phase into a no-op — confirm
// the driver always returns an ObjectId here.
let scan_run_id = insert
.inserted_id
.as_object_id()
.map(|id| id.to_hex())
.unwrap_or_default();
let result = self.run_pipeline(&repo, &scan_run_id).await;
// Update scan run status to mirror the pipeline outcome.
match &result {
Ok(count) => {
self.db
.scan_runs()
.update_one(
doc! { "_id": &insert.inserted_id },
doc! {
"$set": {
"status": "completed",
"current_phase": "completed",
"new_findings_count": *count as i64,
"completed_at": mongodb::bson::DateTime::now(),
}
},
)
.await?;
}
Err(e) => {
tracing::error!(repo_id, error = %e, "Scan pipeline failed");
self.db
.scan_runs()
.update_one(
doc! { "_id": &insert.inserted_id },
doc! {
"$set": {
"status": "failed",
"error_message": e.to_string(),
"completed_at": mongodb::bson::DateTime::now(),
}
},
)
.await?;
}
}
// Drop the count; callers only care about success/failure.
result.map(|_| ())
}
/// Run every scan stage for `repo` and return the number of newly inserted
/// findings.
///
/// Scanners are best-effort: their failures are logged with `tracing::warn!`
/// and the pipeline continues. Git operations and database writes, by
/// contrast, abort the run with an error.
#[tracing::instrument(skip_all, fields(repo_id = repo.name.as_str()))]
async fn run_pipeline(
&self,
repo: &TrackedRepository,
scan_run_id: &str,
) -> Result<u32, AgentError> {
// Hex ObjectId string used for logging and as the scanners' repo identifier.
let repo_id = repo.id.as_ref().map(|id| id.to_hex()).unwrap_or_default();
// Stage 0: Change detection — clone/fetch, then bail out early when HEAD
// has not moved past the last scanned commit.
tracing::info!("[{repo_id}] Stage 0: Change detection");
let creds = GitOps::make_repo_credentials(&self.config, repo);
let git_ops = GitOps::new(&self.config.git_clone_base_path, creds);
let repo_path = git_ops.clone_or_fetch(&repo.git_url, &repo.name)?;
if !GitOps::has_new_commits(&repo_path, repo.last_scanned_commit.as_deref())? {
tracing::info!("[{repo_id}] No new commits, skipping scan");
return Ok(0);
}
let current_sha = GitOps::get_head_sha(&repo_path)?;
// Findings from all static scanners accumulate here before triage.
let mut all_findings: Vec<Finding> = Vec::new();
// Stage 1: Semgrep SAST
tracing::info!("[{repo_id}] Stage 1: Semgrep SAST");
self.update_phase(scan_run_id, "sast").await;
match async {
let semgrep = SemgrepScanner;
semgrep.scan(&repo_path, &repo_id).await
}
.instrument(tracing::info_span!("stage_sast"))
.await
{
Ok(output) => all_findings.extend(output.findings),
Err(e) => tracing::warn!("[{repo_id}] Semgrep failed: {e}"),
}
// Stage 2: SBOM Generation (mutable — the entries are handed mutably to
// the CVE scanner below)
tracing::info!("[{repo_id}] Stage 2: SBOM Generation");
self.update_phase(scan_run_id, "sbom_generation").await;
let mut sbom_entries = match async {
let sbom_scanner = SbomScanner;
sbom_scanner.scan(&repo_path, &repo_id).await
}
.instrument(tracing::info_span!("stage_sbom_generation"))
.await
{
Ok(output) => output.sbom_entries,
Err(e) => {
tracing::warn!("[{repo_id}] SBOM generation failed: {e}");
Vec::new()
}
};
// Stage 3: CVE Scanning
tracing::info!("[{repo_id}] Stage 3: CVE Scanning");
self.update_phase(scan_run_id, "cve_scanning").await;
let cve_scanner = CveScanner::new(
self.http.clone(),
self.config.searxng_url.clone(),
// The NVD key is only exposed from its secrecy wrapper at the point of use.
self.config.nvd_api_key.as_ref().map(|k| {
use secrecy::ExposeSecret;
k.expose_secret().to_string()
}),
);
let cve_alerts = match async {
cve_scanner
.scan_dependencies(&repo_id, &mut sbom_entries)
.await
}
.instrument(tracing::info_span!("stage_cve_scanning"))
.await
{
Ok(alerts) => alerts,
Err(e) => {
tracing::warn!("[{repo_id}] CVE scanning failed: {e}");
Vec::new()
}
};
// Stage 4: Pattern Scanning (GDPR + OAuth)
tracing::info!("[{repo_id}] Stage 4: Pattern Scanning");
self.update_phase(scan_run_id, "pattern_scanning").await;
{
let pattern_findings = async {
let mut findings = Vec::new();
let gdpr = GdprPatternScanner::new();
match gdpr.scan(&repo_path, &repo_id).await {
Ok(output) => findings.extend(output.findings),
Err(e) => tracing::warn!("[{repo_id}] GDPR pattern scan failed: {e}"),
}
let oauth = OAuthPatternScanner::new();
match oauth.scan(&repo_path, &repo_id).await {
Ok(output) => findings.extend(output.findings),
Err(e) => tracing::warn!("[{repo_id}] OAuth pattern scan failed: {e}"),
}
findings
}
.instrument(tracing::info_span!("stage_pattern_scanning"))
.await;
all_findings.extend(pattern_findings);
}
// Stage 4a: Secret Detection (Gitleaks)
tracing::info!("[{repo_id}] Stage 4a: Secret Detection");
self.update_phase(scan_run_id, "secret_detection").await;
match async {
let gitleaks = GitleaksScanner;
gitleaks.scan(&repo_path, &repo_id).await
}
.instrument(tracing::info_span!("stage_secret_detection"))
.await
{
Ok(output) => all_findings.extend(output.findings),
Err(e) => tracing::warn!("[{repo_id}] Gitleaks failed: {e}"),
}
// Stage 4b: Lint Scanning
tracing::info!("[{repo_id}] Stage 4b: Lint Scanning");
self.update_phase(scan_run_id, "lint_scanning").await;
match async {
let lint = LintScanner;
lint.scan(&repo_path, &repo_id).await
}
.instrument(tracing::info_span!("stage_lint_scanning"))
.await
{
Ok(output) => all_findings.extend(output.findings),
Err(e) => tracing::warn!("[{repo_id}] Lint scanning failed: {e}"),
}
// Stage 4.5: Graph Building — optional context that feeds LLM triage;
// a failure here degrades triage rather than aborting the scan.
tracing::info!("[{repo_id}] Stage 4.5: Graph Building");
self.update_phase(scan_run_id, "graph_building").await;
let graph_context = match async {
self.build_code_graph(&repo_path, &repo_id, &all_findings)
.await
}
.instrument(tracing::info_span!("stage_graph_building"))
.await
{
Ok(ctx) => Some(ctx),
Err(e) => {
tracing::warn!("[{repo_id}] Graph building failed: {e}");
None
}
};
// Stage 5: LLM Triage (enhanced with graph context); triage mutates
// all_findings in place and reports how many passed the threshold.
tracing::info!(
"[{repo_id}] Stage 5: LLM Triage ({} findings)",
all_findings.len()
);
self.update_phase(scan_run_id, "llm_triage").await;
let triaged = crate::llm::triage::triage_findings(
&self.llm,
&mut all_findings,
graph_context.as_ref(),
)
.await;
tracing::info!("[{repo_id}] Triaged: {triaged} findings passed confidence threshold");
// Dedup against existing findings and insert new ones.
// NOTE(review): find-then-insert is not atomic — two concurrent scans of
// the same repo could insert duplicate fingerprints unless the collection
// has a unique index on `fingerprint`; confirm one exists.
let mut new_count = 0u32;
let mut new_findings: Vec<Finding> = Vec::new();
for mut finding in all_findings {
finding.scan_run_id = Some(scan_run_id.to_string());
// Check if fingerprint already exists
let existing = self
.db
.findings()
.find_one(doc! { "fingerprint": &finding.fingerprint })
.await?;
if existing.is_none() {
let result = self.db.findings().insert_one(&finding).await?;
finding.id = result.inserted_id.as_object_id();
new_findings.push(finding);
new_count += 1;
}
}
// Remove stale SBOM entries for this repo before reinserting.
// NOTE(review): this filter uses the ObjectId `repo.id`, while the upserts
// below key on `entry.repo_id` — confirm both store the same BSON type,
// otherwise stale entries are never actually removed.
if !sbom_entries.is_empty() {
self.db
.sbom_entries()
.delete_many(doc! { "repo_id": &repo.id })
.await?;
}
// Persist SBOM entries (upsert keyed on repo_id + name + version).
for entry in &sbom_entries {
let filter = doc! {
"repo_id": &entry.repo_id,
"name": &entry.name,
"version": &entry.version,
};
// A serialization failure silently degrades to an empty `$set`, i.e. an
// upsert containing only the filter fields.
let update = mongodb::bson::to_document(entry)
.map(|d| doc! { "$set": d })
.unwrap_or_else(|_| doc! {});
self.db
.sbom_entries()
.update_one(filter, update)
.upsert(true)
.await?;
}
// Persist CVE alerts (upsert by cve_id + repo_id)
for alert in &cve_alerts {
let filter = doc! {
"cve_id": &alert.cve_id,
"repo_id": &alert.repo_id,
};
let update = mongodb::bson::to_document(alert)
.map(|d| doc! { "$set": d })
.unwrap_or_else(|_| doc! {});
self.db
.cve_alerts()
.update_one(filter, update)
.upsert(true)
.await?;
}
// Stage 6: Issue Creation — only for findings first seen in this run.
tracing::info!("[{repo_id}] Stage 6: Issue Creation");
self.update_phase(scan_run_id, "issue_creation").await;
if let Err(e) = self
.create_tracker_issues(repo, &repo_id, &new_findings)
.await
{
tracing::warn!("[{repo_id}] Issue creation failed: {e}");
}
// Stage 7: Update repository bookkeeping (last scanned commit + counter).
self.db
.repositories()
.update_one(
doc! { "_id": repo.id },
doc! {
"$set": {
"last_scanned_commit": &current_sha,
"updated_at": mongodb::bson::DateTime::now(),
},
"$inc": { "findings_count": new_count as i64 },
},
)
.await?;
// Stage 8: DAST (async, optional — only if a DastTarget is configured)
tracing::info!("[{repo_id}] Stage 8: Checking for DAST targets");
self.update_phase(scan_run_id, "dast_scanning").await;
self.maybe_trigger_dast(&repo_id, scan_run_id).await;
tracing::info!("[{repo_id}] Scan complete: {new_count} new findings");
Ok(new_count)
}
pub(super) async fn update_phase(&self, scan_run_id: &str, phase: &str) {
if let Ok(oid) = mongodb::bson::oid::ObjectId::parse_str(scan_run_id) {
let _ = self
.db
.scan_runs()
.update_one(
doc! { "_id": oid },
doc! {
"$set": { "current_phase": phase },
"$push": { "phases_completed": phase },
},
)
.await;
}
}
}
/// Extract the scheme + host portion of a git remote URL.
///
/// HTTP(S) remotes keep their scheme; SSH and SCP-style remotes are mapped
/// to an `https://` base. Returns `None` for unrecognized forms.
///
/// e.g. "https://gitea.example.com/owner/repo.git" -> "https://gitea.example.com"
/// e.g. "ssh://git@gitea.example.com:22/owner/repo.git" -> "https://gitea.example.com"
pub(super) fn extract_base_url(git_url: &str) -> Option<String> {
    // Plain HTTP(S): host is everything up to the first path separator.
    for scheme in ["https://", "http://"] {
        if let Some(rest) = git_url.strip_prefix(scheme) {
            return rest.split('/').next().map(|host| format!("{scheme}{host}"));
        }
    }
    if let Some(rest) = git_url.strip_prefix("ssh://") {
        // ssh://[user@]host[:port]/path — drop credentials, then cut at port/path.
        let host_part = rest.split_once('@').map_or(rest, |(_, after)| after);
        let host = host_part.split(|c| c == ':' || c == '/').next()?;
        return Some(format!("https://{host}"));
    }
    // SCP-style: git@host:owner/repo.git — anything without an '@' is unrecognized.
    let (_, after_at) = git_url.split_once('@')?;
    let host = after_at.split(':').next()?;
    Some(format!("https://{host}"))
}