feat: add new scanners, enhanced triage, findings refinement, and deployment tooling
Some checks failed
CI / Deploy Agent (push) Has been skipped
CI / Deploy Dashboard (push) Has been skipped
CI / Deploy Docs (push) Has been skipped
CI / Deploy MCP (push) Has been skipped
CI / Deploy Agent (pull_request) Has been skipped
CI / Deploy Dashboard (pull_request) Has been skipped
CI / Deploy Docs (pull_request) Has been skipped
CI / Format (push) Failing after 3s
CI / Clippy (push) Failing after 2m44s
CI / Security Audit (push) Has been skipped
CI / Tests (push) Has been skipped
CI / Format (pull_request) Failing after 3s
CI / Clippy (pull_request) Failing after 2m51s
CI / Security Audit (pull_request) Has been skipped
CI / Tests (pull_request) Has been skipped
CI / Detect Changes (push) Has been skipped
CI / Detect Changes (pull_request) Has been skipped
CI / Deploy MCP (pull_request) Has been skipped

- Add gitleaks secret detection, lint scanning (clippy/eslint/ruff), and LLM code review scanners
- Enhance LLM triage with multi-action support (confirm/downgrade/upgrade/dismiss),
  surrounding code context, and file-path classification confidence adjustment
- Add text search, column sorting, and bulk status update to findings dashboard
- Fix finding detail page status refresh and add developer feedback field
- Fix BSON DateTime deserialization across all models with shared serde helpers
- Add scan progress spinner with polling to repositories page
- Batch OSV.dev queries to avoid "Too many queries" errors
- Add gitleaks, semgrep, and ruff to Dockerfile.agent for deployment

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Sharang Parnerkar
2026-03-09 11:05:31 +01:00
parent 32e5fc21e7
commit 23ba52276b
31 changed files with 1602 additions and 95 deletions

View File

@@ -41,6 +41,12 @@ pub struct FindingsFilter {
pub scan_type: Option<String>,
#[serde(default)]
pub status: Option<String>,
#[serde(default)]
pub q: Option<String>,
#[serde(default)]
pub sort_by: Option<String>,
#[serde(default)]
pub sort_order: Option<String>,
#[serde(default = "default_page")]
pub page: u64,
#[serde(default = "default_limit")]
@@ -91,6 +97,17 @@ pub struct UpdateStatusRequest {
pub status: String,
}
#[derive(Deserialize)]
pub struct BulkUpdateStatusRequest {
pub ids: Vec<String>,
pub status: String,
}
#[derive(Deserialize)]
pub struct UpdateFeedbackRequest {
pub feedback: String,
}
#[derive(Deserialize)]
pub struct SbomFilter {
#[serde(default)]
@@ -367,6 +384,29 @@ pub async fn list_findings(
if let Some(status) = &filter.status {
query.insert("status", status);
}
// Text search across title, description, file_path, rule_id
if let Some(q) = &filter.q {
if !q.is_empty() {
let regex = doc! { "$regex": q, "$options": "i" };
query.insert(
"$or",
mongodb::bson::bson!([
{ "title": regex.clone() },
{ "description": regex.clone() },
{ "file_path": regex.clone() },
{ "rule_id": regex },
]),
);
}
}
// Dynamic sort
let sort_field = filter.sort_by.as_deref().unwrap_or("created_at");
let sort_dir: i32 = match filter.sort_order.as_deref() {
Some("asc") => 1,
_ => -1,
};
let sort_doc = doc! { sort_field: sort_dir };
let skip = (filter.page.saturating_sub(1)) * filter.limit as u64;
let total = db
@@ -378,7 +418,7 @@ pub async fn list_findings(
let findings = match db
.findings()
.find(query)
.sort(doc! { "created_at": -1 })
.sort(sort_doc)
.skip(skip)
.limit(filter.limit)
.await
@@ -434,6 +474,55 @@ pub async fn update_finding_status(
Ok(Json(serde_json::json!({ "status": "updated" })))
}
pub async fn bulk_update_finding_status(
Extension(agent): AgentExt,
Json(req): Json<BulkUpdateStatusRequest>,
) -> Result<Json<serde_json::Value>, StatusCode> {
let oids: Vec<mongodb::bson::oid::ObjectId> = req
.ids
.iter()
.filter_map(|id| mongodb::bson::oid::ObjectId::parse_str(id).ok())
.collect();
if oids.is_empty() {
return Err(StatusCode::BAD_REQUEST);
}
let result = agent
.db
.findings()
.update_many(
doc! { "_id": { "$in": oids } },
doc! { "$set": { "status": &req.status, "updated_at": mongodb::bson::DateTime::now() } },
)
.await
.map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
Ok(Json(
serde_json::json!({ "status": "updated", "modified_count": result.modified_count }),
))
}
pub async fn update_finding_feedback(
Extension(agent): AgentExt,
Path(id): Path<String>,
Json(req): Json<UpdateFeedbackRequest>,
) -> Result<Json<serde_json::Value>, StatusCode> {
let oid = mongodb::bson::oid::ObjectId::parse_str(&id).map_err(|_| StatusCode::BAD_REQUEST)?;
agent
.db
.findings()
.update_one(
doc! { "_id": oid },
doc! { "$set": { "developer_feedback": &req.feedback, "updated_at": mongodb::bson::DateTime::now() } },
)
.await
.map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
Ok(Json(serde_json::json!({ "status": "updated" })))
}
pub async fn list_sbom(
Extension(agent): AgentExt,
Query(filter): Query<SbomFilter>,

View File

@@ -23,6 +23,14 @@ pub fn build_router() -> Router {
"/api/v1/findings/{id}/status",
patch(handlers::update_finding_status),
)
.route(
"/api/v1/findings/bulk-status",
patch(handlers::bulk_update_finding_status),
)
.route(
"/api/v1/findings/{id}/feedback",
patch(handlers::update_finding_feedback),
)
.route("/api/v1/sbom", get(handlers::list_sbom))
.route("/api/v1/sbom/export", get(handlers::export_sbom))
.route("/api/v1/sbom/licenses", get(handlers::license_summary))

View File

@@ -5,6 +5,7 @@ pub mod descriptions;
pub mod fixes;
#[allow(dead_code)]
pub mod pr_review;
pub mod review_prompts;
pub mod triage;
pub use client::LlmClient;

View File

@@ -0,0 +1,77 @@
// System prompts for multi-pass LLM code review.
// Each pass focuses on a different aspect to avoid overloading a single prompt.
pub const LOGIC_REVIEW_PROMPT: &str = r#"You are a senior software engineer reviewing code changes. Focus ONLY on logic and correctness issues.
Look for:
- Off-by-one errors, wrong comparisons, missing edge cases
- Incorrect control flow (unreachable code, missing returns, wrong loop conditions)
- Race conditions or concurrency bugs
- Resource leaks (unclosed handles, missing cleanup)
- Wrong variable used (copy-paste errors)
- Incorrect error handling (swallowed errors, wrong error type)
Ignore: style, naming, formatting, documentation, minor improvements.
For each issue found, respond with a JSON array:
[{"title": "...", "description": "...", "severity": "high|medium|low", "file": "...", "line": N, "suggestion": "..."}]
If no issues found, respond with: []"#;
pub const SECURITY_REVIEW_PROMPT: &str = r#"You are a security engineer reviewing code changes. Focus ONLY on security vulnerabilities.
Look for:
- Injection vulnerabilities (SQL, command, XSS, template injection)
- Authentication/authorization bypasses
- Sensitive data exposure (logging secrets, hardcoded credentials)
- Insecure cryptography (weak algorithms, predictable randomness)
- Path traversal, SSRF, open redirects
- Unsafe deserialization
- Missing input validation at trust boundaries
Ignore: code style, performance, general quality.
For each issue found, respond with a JSON array:
[{"title": "...", "description": "...", "severity": "critical|high|medium", "file": "...", "line": N, "cwe": "CWE-XXX", "suggestion": "..."}]
If no issues found, respond with: []"#;
pub const CONVENTION_REVIEW_PROMPT: &str = r#"You are a code reviewer checking adherence to project conventions. Focus ONLY on patterns that indicate likely bugs or maintenance problems.
Look for:
- Inconsistent error handling patterns within the same module
- Public API that doesn't follow the project's established patterns
- Missing or incorrect type annotations that could cause runtime issues
- Anti-patterns specific to the language (e.g. unwrap in Rust library code, any in TypeScript)
Do NOT report: minor style preferences, documentation gaps, formatting.
Only report issues with HIGH confidence that they deviate from the visible codebase conventions.
For each issue found, respond with a JSON array:
[{"title": "...", "description": "...", "severity": "medium|low", "file": "...", "line": N, "suggestion": "..."}]
If no issues found, respond with: []"#;
pub const COMPLEXITY_REVIEW_PROMPT: &str = r#"You are reviewing code changes for excessive complexity that could lead to bugs.
Look for:
- Functions over 50 lines that should be decomposed
- Deeply nested control flow (4+ levels)
- Complex boolean expressions that are hard to reason about
- Functions with 5+ parameters
- Code duplication within the changed files
Only report complexity issues that are HIGH risk for future bugs. Ignore acceptable complexity in configuration, CLI argument parsing, or generated code.
For each issue found, respond with a JSON array:
[{"title": "...", "description": "...", "severity": "medium|low", "file": "...", "line": N, "suggestion": "..."}]
If no issues found, respond with: []"#;
/// All review types with their prompts
pub const REVIEW_PASSES: &[(&str, &str)] = &[
("logic", LOGIC_REVIEW_PROMPT),
("security", SECURITY_REVIEW_PROMPT),
("convention", CONVENTION_REVIEW_PROMPT),
("complexity", COMPLEXITY_REVIEW_PROMPT),
];

View File

@@ -5,13 +5,22 @@ use compliance_core::models::{Finding, FindingStatus};
use crate::llm::LlmClient;
use crate::pipeline::orchestrator::GraphContext;
const TRIAGE_SYSTEM_PROMPT: &str = r#"You are a security finding triage expert. Analyze the following security finding and determine:
1. Is this a true positive? (yes/no)
2. Confidence score (0-10, where 10 is highest confidence this is a real issue)
3. Brief remediation suggestion (1-2 sentences)
const TRIAGE_SYSTEM_PROMPT: &str = r#"You are a security finding triage expert. Analyze the following security finding with its code context and determine the appropriate action.
Actions:
- "confirm": The finding is a true positive at the reported severity. Keep as-is.
- "downgrade": The finding is real but over-reported. Lower severity recommended.
- "upgrade": The finding is under-reported. Higher severity recommended.
- "dismiss": The finding is a false positive. Should be removed.
Consider:
- Is the code in a test, example, or generated file? (lower confidence for test code)
- Does the surrounding code context confirm or refute the finding?
- Is the finding actionable by a developer?
- Would a real attacker be able to exploit this?
Respond in JSON format:
{"true_positive": true/false, "confidence": N, "remediation": "..."}"#;
{"action": "confirm|downgrade|upgrade|dismiss", "confidence": 0-10, "rationale": "brief explanation", "remediation": "optional fix suggestion"}"#;
pub async fn triage_findings(
llm: &Arc<LlmClient>,
@@ -21,8 +30,10 @@ pub async fn triage_findings(
let mut passed = 0;
for finding in findings.iter_mut() {
let file_classification = classify_file_path(finding.file_path.as_deref());
let mut user_prompt = format!(
"Scanner: {}\nRule: {}\nSeverity: {}\nTitle: {}\nDescription: {}\nFile: {}\nLine: {}\nCode: {}",
"Scanner: {}\nRule: {}\nSeverity: {}\nTitle: {}\nDescription: {}\nFile: {}\nLine: {}\nCode: {}\nFile classification: {}",
finding.scanner,
finding.rule_id.as_deref().unwrap_or("N/A"),
finding.severity,
@@ -31,8 +42,14 @@ pub async fn triage_findings(
finding.file_path.as_deref().unwrap_or("N/A"),
finding.line_number.map(|n| n.to_string()).unwrap_or_else(|| "N/A".to_string()),
finding.code_snippet.as_deref().unwrap_or("N/A"),
file_classification,
);
// Enrich with surrounding code context if possible
if let Some(context) = read_surrounding_context(finding) {
user_prompt.push_str(&format!("\n\n--- Surrounding Code (50 lines) ---\n{context}"));
}
// Enrich with graph context if available
if let Some(ctx) = graph_context {
if let Some(impact) = ctx
@@ -69,32 +86,54 @@ pub async fn triage_findings(
.await
{
Ok(response) => {
// Strip markdown code fences if present (e.g. ```json ... ```)
let cleaned = response.trim();
let cleaned = if cleaned.starts_with("```") {
let inner = cleaned
cleaned
.trim_start_matches("```json")
.trim_start_matches("```")
.trim_end_matches("```")
.trim();
inner
.trim()
} else {
cleaned
};
if let Ok(result) = serde_json::from_str::<TriageResult>(cleaned) {
finding.confidence = Some(result.confidence);
// Apply file-path confidence adjustment
let adjusted_confidence = adjust_confidence(result.confidence, &file_classification);
finding.confidence = Some(adjusted_confidence);
finding.triage_action = Some(result.action.clone());
finding.triage_rationale = Some(result.rationale);
if let Some(remediation) = result.remediation {
finding.remediation = Some(remediation);
}
if result.confidence >= 3.0 {
finding.status = FindingStatus::Triaged;
passed += 1;
} else {
finding.status = FindingStatus::FalsePositive;
match result.action.as_str() {
"dismiss" => {
finding.status = FindingStatus::FalsePositive;
}
"downgrade" => {
// Downgrade severity by one level
finding.severity = downgrade_severity(&finding.severity);
finding.status = FindingStatus::Triaged;
passed += 1;
}
"upgrade" => {
finding.severity = upgrade_severity(&finding.severity);
finding.status = FindingStatus::Triaged;
passed += 1;
}
_ => {
// "confirm" or unknown — keep as-is
if adjusted_confidence >= 3.0 {
finding.status = FindingStatus::Triaged;
passed += 1;
} else {
finding.status = FindingStatus::FalsePositive;
}
}
}
} else {
// If LLM response doesn't parse, keep the finding
// Parse failure — keep the finding
finding.status = FindingStatus::Triaged;
passed += 1;
tracing::warn!(
@@ -117,12 +156,118 @@ pub async fn triage_findings(
passed
}
/// Read ~50 lines of surrounding code from the file at the finding's location
fn read_surrounding_context(finding: &Finding) -> Option<String> {
let file_path = finding.file_path.as_deref()?;
let line = finding.line_number? as usize;
// Try to read the file — this works because the repo is cloned locally
let content = std::fs::read_to_string(file_path).ok()?;
let lines: Vec<&str> = content.lines().collect();
let start = line.saturating_sub(25);
let end = (line + 25).min(lines.len());
Some(
lines[start..end]
.iter()
.enumerate()
.map(|(i, l)| format!("{:>4} | {}", start + i + 1, l))
.collect::<Vec<_>>()
.join("\n"),
)
}
/// Classify a file path to inform triage confidence adjustment
fn classify_file_path(path: Option<&str>) -> String {
let path = match path {
Some(p) => p.to_lowercase(),
None => return "unknown".to_string(),
};
if path.contains("/test/")
|| path.contains("/tests/")
|| path.contains("_test.")
|| path.contains(".test.")
|| path.contains(".spec.")
|| path.contains("/fixtures/")
|| path.contains("/testdata/")
{
return "test".to_string();
}
if path.contains("/example")
|| path.contains("/examples/")
|| path.contains("/demo/")
|| path.contains("/sample")
{
return "example".to_string();
}
if path.contains("/generated/")
|| path.contains("/gen/")
|| path.contains(".generated.")
|| path.contains(".pb.go")
|| path.contains("_generated.rs")
{
return "generated".to_string();
}
if path.contains("/vendor/")
|| path.contains("/node_modules/")
|| path.contains("/third_party/")
{
return "vendored".to_string();
}
"production".to_string()
}
/// Adjust confidence based on file classification
fn adjust_confidence(raw_confidence: f64, classification: &str) -> f64 {
let multiplier = match classification {
"test" => 0.5,
"example" => 0.6,
"generated" => 0.3,
"vendored" => 0.4,
_ => 1.0,
};
raw_confidence * multiplier
}
fn downgrade_severity(severity: &compliance_core::models::Severity) -> compliance_core::models::Severity {
use compliance_core::models::Severity;
match severity {
Severity::Critical => Severity::High,
Severity::High => Severity::Medium,
Severity::Medium => Severity::Low,
Severity::Low => Severity::Info,
Severity::Info => Severity::Info,
}
}
fn upgrade_severity(severity: &compliance_core::models::Severity) -> compliance_core::models::Severity {
use compliance_core::models::Severity;
match severity {
Severity::Info => Severity::Low,
Severity::Low => Severity::Medium,
Severity::Medium => Severity::High,
Severity::High => Severity::Critical,
Severity::Critical => Severity::Critical,
}
}
#[derive(serde::Deserialize)]
struct TriageResult {
#[serde(default)]
#[allow(dead_code)]
true_positive: bool,
#[serde(default = "default_action")]
action: String,
#[serde(default)]
confidence: f64,
#[serde(default)]
rationale: String,
remediation: Option<String>,
}
fn default_action() -> String {
"confirm".to_string()
}

View File

@@ -0,0 +1,186 @@
use std::path::Path;
use std::sync::Arc;
use compliance_core::models::{Finding, ScanType, Severity};
use compliance_core::traits::ScanOutput;
use crate::llm::review_prompts::REVIEW_PASSES;
use crate::llm::LlmClient;
use crate::pipeline::dedup;
use crate::pipeline::git::{DiffFile, GitOps};
pub struct CodeReviewScanner {
llm: Arc<LlmClient>,
}
impl CodeReviewScanner {
pub fn new(llm: Arc<LlmClient>) -> Self {
Self { llm }
}
/// Run multi-pass LLM code review on the diff between old and new commits.
pub async fn review_diff(
&self,
repo_path: &Path,
repo_id: &str,
old_sha: &str,
new_sha: &str,
) -> ScanOutput {
let diff_files = match GitOps::get_diff_content(repo_path, old_sha, new_sha) {
Ok(files) => files,
Err(e) => {
tracing::warn!("Failed to extract diff for code review: {e}");
return ScanOutput::default();
}
};
if diff_files.is_empty() {
return ScanOutput::default();
}
let mut all_findings = Vec::new();
// Chunk diff files into groups to avoid exceeding context limits
let chunks = chunk_diff_files(&diff_files, 8000);
for (pass_name, system_prompt) in REVIEW_PASSES {
for chunk in &chunks {
let user_prompt = format!(
"Review the following code changes:\n\n{}",
chunk
.iter()
.map(|f| format!("--- {} ---\n{}", f.path, f.hunks))
.collect::<Vec<_>>()
.join("\n\n")
);
match self.llm.chat(system_prompt, &user_prompt, Some(0.1)).await {
Ok(response) => {
let parsed = parse_review_response(&response, pass_name, repo_id, chunk);
all_findings.extend(parsed);
}
Err(e) => {
tracing::warn!("Code review pass '{pass_name}' failed: {e}");
}
}
}
}
ScanOutput {
findings: all_findings,
sbom_entries: Vec::new(),
}
}
}
/// Group diff files into chunks that fit within a token budget (rough char estimate)
fn chunk_diff_files(files: &[DiffFile], max_chars: usize) -> Vec<Vec<&DiffFile>> {
let mut chunks: Vec<Vec<&DiffFile>> = Vec::new();
let mut current_chunk: Vec<&DiffFile> = Vec::new();
let mut current_size = 0;
for file in files {
if current_size + file.hunks.len() > max_chars && !current_chunk.is_empty() {
chunks.push(std::mem::take(&mut current_chunk));
current_size = 0;
}
current_chunk.push(file);
current_size += file.hunks.len();
}
if !current_chunk.is_empty() {
chunks.push(current_chunk);
}
chunks
}
fn parse_review_response(
response: &str,
pass_name: &str,
repo_id: &str,
chunk: &[&DiffFile],
) -> Vec<Finding> {
let cleaned = response.trim();
let cleaned = if cleaned.starts_with("```") {
cleaned
.trim_start_matches("```json")
.trim_start_matches("```")
.trim_end_matches("```")
.trim()
} else {
cleaned
};
let issues: Vec<ReviewIssue> = match serde_json::from_str(cleaned) {
Ok(v) => v,
Err(_) => {
if cleaned != "[]" {
tracing::debug!("Failed to parse {pass_name} review response: {cleaned}");
}
return Vec::new();
}
};
issues
.into_iter()
.filter(|issue| {
// Verify the file exists in the diff chunk
chunk.iter().any(|f| f.path == issue.file)
})
.map(|issue| {
let severity = match issue.severity.as_str() {
"critical" => Severity::Critical,
"high" => Severity::High,
"medium" => Severity::Medium,
"low" => Severity::Low,
_ => Severity::Info,
};
let fingerprint = dedup::compute_fingerprint(&[
repo_id,
"code-review",
pass_name,
&issue.file,
&issue.line.to_string(),
&issue.title,
]);
let description = if let Some(suggestion) = &issue.suggestion {
format!("{}\n\nSuggested fix: {}", issue.description, suggestion)
} else {
issue.description.clone()
};
let mut finding = Finding::new(
repo_id.to_string(),
fingerprint,
format!("code-review/{pass_name}"),
ScanType::CodeReview,
issue.title,
description,
severity,
);
finding.rule_id = Some(format!("review/{pass_name}"));
finding.file_path = Some(issue.file);
finding.line_number = Some(issue.line);
finding.cwe = issue.cwe;
finding.suggested_fix = issue.suggestion;
finding
})
.collect()
}
#[derive(serde::Deserialize)]
struct ReviewIssue {
title: String,
description: String,
severity: String,
file: String,
#[serde(default)]
line: u32,
#[serde(default)]
cwe: Option<String>,
#[serde(default)]
suggestion: Option<String>,
}

View File

@@ -64,6 +64,8 @@ impl CveScanner {
}
async fn query_osv_batch(&self, entries: &[SbomEntry]) -> Result<Vec<Vec<OsvVuln>>, CoreError> {
const OSV_BATCH_SIZE: usize = 500;
let queries: Vec<_> = entries
.iter()
.filter_map(|e| {
@@ -79,47 +81,54 @@ impl CveScanner {
return Ok(Vec::new());
}
let body = serde_json::json!({ "queries": queries });
let mut all_vulns: Vec<Vec<OsvVuln>> = Vec::with_capacity(queries.len());
let resp = self
.http
.post("https://api.osv.dev/v1/querybatch")
.json(&body)
.send()
.await
.map_err(|e| CoreError::Http(format!("OSV.dev request failed: {e}")))?;
for chunk in queries.chunks(OSV_BATCH_SIZE) {
let body = serde_json::json!({ "queries": chunk });
if !resp.status().is_success() {
let status = resp.status();
let body = resp.text().await.unwrap_or_default();
tracing::warn!("OSV.dev returned {status}: {body}");
return Ok(Vec::new());
let resp = self
.http
.post("https://api.osv.dev/v1/querybatch")
.json(&body)
.send()
.await
.map_err(|e| CoreError::Http(format!("OSV.dev request failed: {e}")))?;
if !resp.status().is_success() {
let status = resp.status();
let body = resp.text().await.unwrap_or_default();
tracing::warn!("OSV.dev returned {status}: {body}");
// Push empty results for this chunk so indices stay aligned
all_vulns.extend(std::iter::repeat_with(Vec::new).take(chunk.len()));
continue;
}
let result: OsvBatchResponse = resp
.json()
.await
.map_err(|e| CoreError::Http(format!("Failed to parse OSV.dev response: {e}")))?;
let chunk_vulns = result
.results
.into_iter()
.map(|r| {
r.vulns
.unwrap_or_default()
.into_iter()
.map(|v| OsvVuln {
id: v.id,
summary: v.summary,
severity: v.database_specific.and_then(|d| {
d.get("severity").and_then(|s| s.as_str()).map(String::from)
}),
})
.collect()
});
all_vulns.extend(chunk_vulns);
}
let result: OsvBatchResponse = resp
.json()
.await
.map_err(|e| CoreError::Http(format!("Failed to parse OSV.dev response: {e}")))?;
let vulns = result
.results
.into_iter()
.map(|r| {
r.vulns
.unwrap_or_default()
.into_iter()
.map(|v| OsvVuln {
id: v.id,
summary: v.summary,
severity: v.database_specific.and_then(|d| {
d.get("severity").and_then(|s| s.as_str()).map(String::from)
}),
})
.collect()
})
.collect();
Ok(vulns)
Ok(all_vulns)
}
async fn query_nvd(&self, cve_id: &str) -> Result<Option<f64>, CoreError> {

View File

@@ -63,6 +63,62 @@ impl GitOps {
}
}
/// Extract structured diff content between two commits
pub fn get_diff_content(
repo_path: &Path,
old_sha: &str,
new_sha: &str,
) -> Result<Vec<DiffFile>, AgentError> {
let repo = Repository::open(repo_path)?;
let old_commit = repo.find_commit(git2::Oid::from_str(old_sha)?)?;
let new_commit = repo.find_commit(git2::Oid::from_str(new_sha)?)?;
let old_tree = old_commit.tree()?;
let new_tree = new_commit.tree()?;
let diff = repo.diff_tree_to_tree(Some(&old_tree), Some(&new_tree), None)?;
let mut diff_files: Vec<DiffFile> = Vec::new();
diff.print(git2::DiffFormat::Patch, |delta, _hunk, line| {
let file_path = delta
.new_file()
.path()
.map(|p| p.to_string_lossy().to_string())
.unwrap_or_default();
// Find or create the DiffFile entry
let idx = if let Some(pos) = diff_files.iter().position(|f| f.path == file_path) {
pos
} else {
diff_files.push(DiffFile {
path: file_path,
hunks: String::new(),
});
diff_files.len() - 1
};
let diff_file = &mut diff_files[idx];
let prefix = match line.origin() {
'+' => "+",
'-' => "-",
' ' => " ",
_ => "",
};
let content = std::str::from_utf8(line.content()).unwrap_or("");
diff_file.hunks.push_str(prefix);
diff_file.hunks.push_str(content);
true
})?;
// Filter out binary files and very large diffs
diff_files.retain(|f| !f.hunks.is_empty() && f.hunks.len() < 50_000);
Ok(diff_files)
}
#[allow(dead_code)]
pub fn get_changed_files(
repo_path: &Path,
@@ -94,3 +150,10 @@ impl GitOps {
Ok(files)
}
}
/// A file changed between two commits with its diff content
#[derive(Debug, Clone)]
pub struct DiffFile {
pub path: String,
pub hunks: String,
}

View File

@@ -0,0 +1,117 @@
use std::path::Path;
use compliance_core::models::{Finding, ScanType, Severity};
use compliance_core::traits::{ScanOutput, Scanner};
use compliance_core::CoreError;
use crate::pipeline::dedup;
pub struct GitleaksScanner;
impl Scanner for GitleaksScanner {
fn name(&self) -> &str {
"gitleaks"
}
fn scan_type(&self) -> ScanType {
ScanType::SecretDetection
}
async fn scan(&self, repo_path: &Path, repo_id: &str) -> Result<ScanOutput, CoreError> {
let output = tokio::process::Command::new("gitleaks")
.args(["detect", "--source", ".", "--report-format", "json", "--report-path", "/dev/stdout", "--no-banner", "--exit-code", "0"])
.current_dir(repo_path)
.output()
.await
.map_err(|e| CoreError::Scanner {
scanner: "gitleaks".to_string(),
source: Box::new(e),
})?;
if output.stdout.is_empty() {
return Ok(ScanOutput::default());
}
let results: Vec<GitleaksResult> = serde_json::from_slice(&output.stdout)
.unwrap_or_default();
let findings = results
.into_iter()
.filter(|r| !is_allowlisted(&r.file))
.map(|r| {
let severity = match r.rule_id.as_str() {
s if s.contains("private-key") => Severity::Critical,
s if s.contains("token") || s.contains("password") || s.contains("secret") => Severity::High,
s if s.contains("api-key") => Severity::High,
_ => Severity::Medium,
};
let fingerprint = dedup::compute_fingerprint(&[
repo_id,
&r.rule_id,
&r.file,
&r.start_line.to_string(),
]);
let title = format!("Secret detected: {}", r.description);
let description = format!(
"Potential secret ({}) found in {}:{}. Match: {}",
r.rule_id,
r.file,
r.start_line,
r.r#match.chars().take(80).collect::<String>(),
);
let mut finding = Finding::new(
repo_id.to_string(),
fingerprint,
"gitleaks".to_string(),
ScanType::SecretDetection,
title,
description,
severity,
);
finding.rule_id = Some(r.rule_id);
finding.file_path = Some(r.file);
finding.line_number = Some(r.start_line);
finding.code_snippet = Some(r.r#match);
finding
})
.collect();
Ok(ScanOutput {
findings,
sbom_entries: Vec::new(),
})
}
}
/// Skip files that commonly contain example/placeholder secrets
fn is_allowlisted(file_path: &str) -> bool {
let lower = file_path.to_lowercase();
lower.ends_with(".env.example")
|| lower.ends_with(".env.sample")
|| lower.ends_with(".env.template")
|| lower.contains("/test/")
|| lower.contains("/tests/")
|| lower.contains("/fixtures/")
|| lower.contains("/testdata/")
|| lower.contains("mock")
|| lower.ends_with("_test.go")
|| lower.ends_with(".test.ts")
|| lower.ends_with(".test.js")
|| lower.ends_with(".spec.ts")
|| lower.ends_with(".spec.js")
}
#[derive(serde::Deserialize)]
#[serde(rename_all = "PascalCase")]
struct GitleaksResult {
description: String,
#[serde(rename = "RuleID")]
rule_id: String,
file: String,
start_line: u32,
#[serde(rename = "Match")]
r#match: String,
}

View File

@@ -0,0 +1,361 @@
use std::path::Path;
use std::time::Duration;
use compliance_core::models::{Finding, ScanType, Severity};
use compliance_core::traits::{ScanOutput, Scanner};
use compliance_core::CoreError;
use tokio::process::Command;
use crate::pipeline::dedup;
/// Timeout for each individual lint command
const LINT_TIMEOUT: Duration = Duration::from_secs(120);
pub struct LintScanner;
impl Scanner for LintScanner {
fn name(&self) -> &str {
"lint"
}
fn scan_type(&self) -> ScanType {
ScanType::Lint
}
async fn scan(&self, repo_path: &Path, repo_id: &str) -> Result<ScanOutput, CoreError> {
let mut all_findings = Vec::new();
// Detect which languages are present and run appropriate linters
if has_rust_project(repo_path) {
match run_clippy(repo_path, repo_id).await {
Ok(findings) => all_findings.extend(findings),
Err(e) => tracing::warn!("Clippy failed: {e}"),
}
}
if has_js_project(repo_path) {
match run_eslint(repo_path, repo_id).await {
Ok(findings) => all_findings.extend(findings),
Err(e) => tracing::warn!("ESLint failed: {e}"),
}
}
if has_python_project(repo_path) {
match run_ruff(repo_path, repo_id).await {
Ok(findings) => all_findings.extend(findings),
Err(e) => tracing::warn!("Ruff failed: {e}"),
}
}
Ok(ScanOutput {
findings: all_findings,
sbom_entries: Vec::new(),
})
}
}
fn has_rust_project(repo_path: &Path) -> bool {
repo_path.join("Cargo.toml").exists()
}
fn has_js_project(repo_path: &Path) -> bool {
// Only run if eslint is actually installed in the project
repo_path.join("package.json").exists()
&& repo_path.join("node_modules/.bin/eslint").exists()
}
fn has_python_project(repo_path: &Path) -> bool {
repo_path.join("pyproject.toml").exists()
|| repo_path.join("setup.py").exists()
|| repo_path.join("requirements.txt").exists()
}
/// Run a command with a timeout, returning its output or an error
async fn run_with_timeout(
child: tokio::process::Child,
scanner_name: &str,
) -> Result<std::process::Output, CoreError> {
let result = tokio::time::timeout(LINT_TIMEOUT, child.wait_with_output()).await;
match result {
Ok(Ok(output)) => Ok(output),
Ok(Err(e)) => Err(CoreError::Scanner {
scanner: scanner_name.to_string(),
source: Box::new(e),
}),
Err(_) => {
// Process is dropped here which sends SIGKILL on Unix
Err(CoreError::Scanner {
scanner: scanner_name.to_string(),
source: Box::new(std::io::Error::new(
std::io::ErrorKind::TimedOut,
format!("{scanner_name} timed out after {}s", LINT_TIMEOUT.as_secs()),
)),
})
}
}
}
// ── Clippy ──────────────────────────────────────────────
async fn run_clippy(repo_path: &Path, repo_id: &str) -> Result<Vec<Finding>, CoreError> {
let child = Command::new("cargo")
.args(["clippy", "--message-format=json", "--quiet", "--", "-W", "clippy::all"])
.current_dir(repo_path)
.stdout(std::process::Stdio::piped())
.stderr(std::process::Stdio::piped())
.spawn()
.map_err(|e| CoreError::Scanner {
scanner: "clippy".to_string(),
source: Box::new(e),
})?;
let output = run_with_timeout(child, "clippy").await?;
let stdout = String::from_utf8_lossy(&output.stdout);
let mut findings = Vec::new();
for line in stdout.lines() {
let msg: serde_json::Value = match serde_json::from_str(line) {
Ok(v) => v,
Err(_) => continue,
};
if msg.get("reason").and_then(|v| v.as_str()) != Some("compiler-message") {
continue;
}
let message = match msg.get("message") {
Some(m) => m,
None => continue,
};
let level = message
.get("level")
.and_then(|v| v.as_str())
.unwrap_or("");
if level != "warning" && level != "error" {
continue;
}
let text = message
.get("message")
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string();
let code = message
.get("code")
.and_then(|v| v.get("code"))
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string();
if text.starts_with("aborting due to") || code.is_empty() {
continue;
}
let (file_path, line_number) = extract_primary_span(message);
let severity = if level == "error" {
Severity::High
} else {
Severity::Low
};
let fingerprint =
dedup::compute_fingerprint(&[repo_id, "clippy", &code, &file_path, &line_number.to_string()]);
let mut finding = Finding::new(
repo_id.to_string(),
fingerprint,
"clippy".to_string(),
ScanType::Lint,
format!("[clippy] {text}"),
text,
severity,
);
finding.rule_id = Some(code);
if !file_path.is_empty() {
finding.file_path = Some(file_path);
}
if line_number > 0 {
finding.line_number = Some(line_number);
}
findings.push(finding);
}
Ok(findings)
}
fn extract_primary_span(message: &serde_json::Value) -> (String, u32) {
let spans = match message.get("spans").and_then(|v| v.as_array()) {
Some(s) => s,
None => return (String::new(), 0),
};
for span in spans {
if span.get("is_primary").and_then(|v| v.as_bool()) == Some(true) {
let file = span
.get("file_name")
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string();
let line = span
.get("line_start")
.and_then(|v| v.as_u64())
.unwrap_or(0) as u32;
return (file, line);
}
}
(String::new(), 0)
}
// ── ESLint ──────────────────────────────────────────────
async fn run_eslint(repo_path: &Path, repo_id: &str) -> Result<Vec<Finding>, CoreError> {
// Use the project-local eslint binary directly, not npx (which can hang downloading)
let eslint_bin = repo_path.join("node_modules/.bin/eslint");
let child = Command::new(eslint_bin)
.args([".", "--format", "json", "--no-error-on-unmatched-pattern"])
.current_dir(repo_path)
.stdout(std::process::Stdio::piped())
.stderr(std::process::Stdio::piped())
.spawn()
.map_err(|e| CoreError::Scanner {
scanner: "eslint".to_string(),
source: Box::new(e),
})?;
let output = run_with_timeout(child, "eslint").await?;
if output.stdout.is_empty() {
return Ok(Vec::new());
}
let results: Vec<EslintFileResult> =
serde_json::from_slice(&output.stdout).unwrap_or_default();
let mut findings = Vec::new();
for file_result in results {
for msg in file_result.messages {
let severity = match msg.severity {
2 => Severity::Medium,
_ => Severity::Low,
};
let rule_id = msg.rule_id.unwrap_or_default();
let fingerprint = dedup::compute_fingerprint(&[
repo_id,
"eslint",
&rule_id,
&file_result.file_path,
&msg.line.to_string(),
]);
let mut finding = Finding::new(
repo_id.to_string(),
fingerprint,
"eslint".to_string(),
ScanType::Lint,
format!("[eslint] {}", msg.message),
msg.message,
severity,
);
finding.rule_id = Some(rule_id);
finding.file_path = Some(file_result.file_path.clone());
finding.line_number = Some(msg.line);
findings.push(finding);
}
}
Ok(findings)
}
#[derive(serde::Deserialize)]
struct EslintFileResult {
#[serde(rename = "filePath")]
file_path: String,
messages: Vec<EslintMessage>,
}
#[derive(serde::Deserialize)]
struct EslintMessage {
#[serde(rename = "ruleId")]
rule_id: Option<String>,
severity: u8,
message: String,
line: u32,
}
// ── Ruff ────────────────────────────────────────────────
async fn run_ruff(repo_path: &Path, repo_id: &str) -> Result<Vec<Finding>, CoreError> {
let child = Command::new("ruff")
.args(["check", ".", "--output-format", "json", "--exit-zero"])
.current_dir(repo_path)
.stdout(std::process::Stdio::piped())
.stderr(std::process::Stdio::piped())
.spawn()
.map_err(|e| CoreError::Scanner {
scanner: "ruff".to_string(),
source: Box::new(e),
})?;
let output = run_with_timeout(child, "ruff").await?;
if output.stdout.is_empty() {
return Ok(Vec::new());
}
let results: Vec<RuffResult> =
serde_json::from_slice(&output.stdout).unwrap_or_default();
let findings = results
.into_iter()
.map(|r| {
let severity = if r.code.starts_with('E') || r.code.starts_with('F') {
Severity::Medium
} else {
Severity::Low
};
let fingerprint = dedup::compute_fingerprint(&[
repo_id,
"ruff",
&r.code,
&r.filename,
&r.location.row.to_string(),
]);
let mut finding = Finding::new(
repo_id.to_string(),
fingerprint,
"ruff".to_string(),
ScanType::Lint,
format!("[ruff] {}: {}", r.code, r.message),
r.message,
severity,
);
finding.rule_id = Some(r.code);
finding.file_path = Some(r.filename);
finding.line_number = Some(r.location.row);
finding
})
.collect();
Ok(findings)
}
#[derive(serde::Deserialize)]
struct RuffResult {
code: String,
message: String,
filename: String,
location: RuffLocation,
}
#[derive(serde::Deserialize)]
struct RuffLocation {
row: u32,
}

View File

@@ -1,6 +1,9 @@
pub mod code_review;
pub mod cve;
pub mod dedup;
pub mod git;
pub mod gitleaks;
pub mod lint;
pub mod orchestrator;
pub mod patterns;
pub mod sbom;

View File

@@ -9,8 +9,11 @@ use compliance_core::AgentConfig;
use crate::database::Database;
use crate::error::AgentError;
use crate::llm::LlmClient;
use crate::pipeline::code_review::CodeReviewScanner;
use crate::pipeline::cve::CveScanner;
use crate::pipeline::git::GitOps;
use crate::pipeline::gitleaks::GitleaksScanner;
use crate::pipeline::lint::LintScanner;
use crate::pipeline::patterns::{GdprPatternScanner, OAuthPatternScanner};
use crate::pipeline::sbom::SbomScanner;
use crate::pipeline::semgrep::SemgrepScanner;
@@ -182,6 +185,35 @@ impl PipelineOrchestrator {
Err(e) => tracing::warn!("[{repo_id}] OAuth pattern scan failed: {e}"),
}
// Stage 4a: Secret Detection (Gitleaks)
tracing::info!("[{repo_id}] Stage 4a: Secret Detection");
self.update_phase(scan_run_id, "secret_detection").await;
let gitleaks = GitleaksScanner;
match gitleaks.scan(&repo_path, &repo_id).await {
Ok(output) => all_findings.extend(output.findings),
Err(e) => tracing::warn!("[{repo_id}] Gitleaks failed: {e}"),
}
// Stage 4b: Lint Scanning
tracing::info!("[{repo_id}] Stage 4b: Lint Scanning");
self.update_phase(scan_run_id, "lint_scanning").await;
let lint = LintScanner;
match lint.scan(&repo_path, &repo_id).await {
Ok(output) => all_findings.extend(output.findings),
Err(e) => tracing::warn!("[{repo_id}] Lint scanning failed: {e}"),
}
// Stage 4c: LLM Code Review (only on incremental scans)
if let Some(old_sha) = &repo.last_scanned_commit {
tracing::info!("[{repo_id}] Stage 4c: LLM Code Review");
self.update_phase(scan_run_id, "code_review").await;
let reviewer = CodeReviewScanner::new(self.llm.clone());
let review_output = reviewer
.review_diff(&repo_path, &repo_id, old_sha, &current_sha)
.await;
all_findings.extend(review_output.findings);
}
// Stage 4.5: Graph Building
tracing::info!("[{repo_id}] Stage 4.5: Graph Building");
self.update_phase(scan_run_id, "graph_building").await;