//! Multi-pass LLM code review scanner: diffs two commits, chunks the diff to
//! fit model context limits, runs each configured review pass over every
//! chunk, and deduplicates the resulting findings across passes.
use std::path::Path;
|
|
use std::sync::Arc;
|
|
|
|
use compliance_core::models::{Finding, ScanType, Severity};
|
|
use compliance_core::traits::ScanOutput;
|
|
|
|
use crate::llm::review_prompts::REVIEW_PASSES;
|
|
use crate::llm::LlmClient;
|
|
use crate::pipeline::dedup;
|
|
use crate::pipeline::git::{DiffFile, GitOps};
|
|
|
|
/// Scanner that performs multi-pass LLM code review over git diffs.
pub struct CodeReviewScanner {
    /// Shared LLM client used to execute each review pass.
    llm: Arc<LlmClient>,
}
|
|
|
|
impl CodeReviewScanner {
|
|
pub fn new(llm: Arc<LlmClient>) -> Self {
|
|
Self { llm }
|
|
}
|
|
|
|
/// Run multi-pass LLM code review on the diff between old and new commits.
|
|
pub async fn review_diff(
|
|
&self,
|
|
repo_path: &Path,
|
|
repo_id: &str,
|
|
old_sha: &str,
|
|
new_sha: &str,
|
|
) -> ScanOutput {
|
|
let diff_files = match GitOps::get_diff_content(repo_path, old_sha, new_sha) {
|
|
Ok(files) => files,
|
|
Err(e) => {
|
|
tracing::warn!("Failed to extract diff for code review: {e}");
|
|
return ScanOutput::default();
|
|
}
|
|
};
|
|
|
|
if diff_files.is_empty() {
|
|
return ScanOutput::default();
|
|
}
|
|
|
|
let mut all_findings = Vec::new();
|
|
|
|
// Chunk diff files into groups to avoid exceeding context limits
|
|
let chunks = chunk_diff_files(&diff_files, 8000);
|
|
|
|
for (pass_name, system_prompt) in REVIEW_PASSES {
|
|
for chunk in &chunks {
|
|
let user_prompt = format!(
|
|
"Review the following code changes:\n\n{}",
|
|
chunk
|
|
.iter()
|
|
.map(|f| format!("--- {} ---\n{}", f.path, f.hunks))
|
|
.collect::<Vec<_>>()
|
|
.join("\n\n")
|
|
);
|
|
|
|
match self.llm.chat(system_prompt, &user_prompt, Some(0.1)).await {
|
|
Ok(response) => {
|
|
let parsed = parse_review_response(&response, pass_name, repo_id, chunk);
|
|
all_findings.extend(parsed);
|
|
}
|
|
Err(e) => {
|
|
tracing::warn!("Code review pass '{pass_name}' failed: {e}");
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
let deduped = dedup_cross_pass(all_findings);
|
|
|
|
ScanOutput {
|
|
findings: deduped,
|
|
sbom_entries: Vec::new(),
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Group diff files into chunks that fit within a token budget (rough char estimate)
|
|
/// Group diff files into chunks that fit within a token budget (rough char
/// estimate based on hunk length).
///
/// Files are packed greedily in order: a new chunk begins whenever adding the
/// next file would push the running size past `max_chars`. A single file
/// larger than the budget still occupies a chunk of its own — files are never
/// split.
fn chunk_diff_files(files: &[DiffFile], max_chars: usize) -> Vec<Vec<&DiffFile>> {
    let mut result: Vec<Vec<&DiffFile>> = Vec::new();
    let mut pending: Vec<&DiffFile> = Vec::new();
    let mut pending_chars = 0usize;

    for file in files {
        let would_overflow = pending_chars + file.hunks.len() > max_chars;
        if would_overflow && !pending.is_empty() {
            result.push(std::mem::take(&mut pending));
            pending_chars = 0;
        }
        pending_chars += file.hunks.len();
        pending.push(file);
    }

    if !pending.is_empty() {
        result.push(pending);
    }

    result
}
|
|
|
|
fn parse_review_response(
|
|
response: &str,
|
|
pass_name: &str,
|
|
repo_id: &str,
|
|
chunk: &[&DiffFile],
|
|
) -> Vec<Finding> {
|
|
let cleaned = response.trim();
|
|
let cleaned = if cleaned.starts_with("```") {
|
|
cleaned
|
|
.trim_start_matches("```json")
|
|
.trim_start_matches("```")
|
|
.trim_end_matches("```")
|
|
.trim()
|
|
} else {
|
|
cleaned
|
|
};
|
|
|
|
let issues: Vec<ReviewIssue> = match serde_json::from_str(cleaned) {
|
|
Ok(v) => v,
|
|
Err(_) => {
|
|
if cleaned != "[]" {
|
|
tracing::debug!("Failed to parse {pass_name} review response: {cleaned}");
|
|
}
|
|
return Vec::new();
|
|
}
|
|
};
|
|
|
|
issues
|
|
.into_iter()
|
|
.filter(|issue| {
|
|
// Verify the file exists in the diff chunk
|
|
chunk.iter().any(|f| f.path == issue.file)
|
|
})
|
|
.map(|issue| {
|
|
let severity = match issue.severity.as_str() {
|
|
"critical" => Severity::Critical,
|
|
"high" => Severity::High,
|
|
"medium" => Severity::Medium,
|
|
"low" => Severity::Low,
|
|
_ => Severity::Info,
|
|
};
|
|
|
|
let fingerprint = dedup::compute_fingerprint(&[
|
|
repo_id,
|
|
"code-review",
|
|
pass_name,
|
|
&issue.file,
|
|
&issue.line.to_string(),
|
|
&issue.title,
|
|
]);
|
|
|
|
let description = if let Some(suggestion) = &issue.suggestion {
|
|
format!("{}\n\nSuggested fix: {}", issue.description, suggestion)
|
|
} else {
|
|
issue.description.clone()
|
|
};
|
|
|
|
let mut finding = Finding::new(
|
|
repo_id.to_string(),
|
|
fingerprint,
|
|
format!("code-review/{pass_name}"),
|
|
ScanType::CodeReview,
|
|
issue.title,
|
|
description,
|
|
severity,
|
|
);
|
|
finding.rule_id = Some(format!("review/{pass_name}"));
|
|
finding.file_path = Some(issue.file);
|
|
finding.line_number = Some(issue.line);
|
|
finding.cwe = issue.cwe;
|
|
finding.suggested_fix = issue.suggestion;
|
|
finding
|
|
})
|
|
.collect()
|
|
}
|
|
|
|
/// One issue as deserialized from the LLM's JSON array response.
///
/// NOTE(review): field names presumably match the JSON schema requested by
/// the `REVIEW_PASSES` prompts — confirm against `review_prompts`.
#[derive(serde::Deserialize)]
struct ReviewIssue {
    /// Short summary of the issue; becomes the finding title.
    title: String,
    /// Detailed explanation; becomes the finding description.
    description: String,
    /// Severity label; anything other than critical/high/medium/low maps to Info.
    severity: String,
    /// Path of the file the issue refers to; must exist in the diff chunk
    /// or the issue is discarded.
    file: String,
    /// Line number reported by the model; serde defaults to 0 when omitted.
    #[serde(default)]
    line: u32,
    /// Optional CWE identifier.
    #[serde(default)]
    cwe: Option<String>,
    /// Optional suggested fix; appended to the description when present.
    #[serde(default)]
    suggestion: Option<String>,
}
|
|
|
|
/// Deduplicate findings across review passes.
|
|
///
|
|
/// Multiple passes often flag the same issue (e.g. SQL injection reported by
|
|
/// logic, security, and convention passes). We group by file + nearby line +
|
|
/// normalized title keywords and keep the highest-severity finding.
|
|
fn dedup_cross_pass(findings: Vec<Finding>) -> Vec<Finding> {
|
|
use std::collections::HashMap;
|
|
|
|
// Build a dedup key: (file, line bucket, normalized title words)
|
|
fn dedup_key(f: &Finding) -> String {
|
|
let file = f.file_path.as_deref().unwrap_or("");
|
|
// Group lines within 3 of each other
|
|
let line_bucket = f.line_number.unwrap_or(0) / 4;
|
|
// Normalize: lowercase, keep only alphanumeric, sort words for order-independence
|
|
let title_lower = f.title.to_lowercase();
|
|
let mut words: Vec<&str> = title_lower
|
|
.split(|c: char| !c.is_alphanumeric())
|
|
.filter(|w| w.len() > 2)
|
|
.collect();
|
|
words.sort();
|
|
format!("{file}:{line_bucket}:{}", words.join(","))
|
|
}
|
|
|
|
let mut groups: HashMap<String, Finding> = HashMap::new();
|
|
|
|
for finding in findings {
|
|
let key = dedup_key(&finding);
|
|
groups
|
|
.entry(key)
|
|
.and_modify(|existing| {
|
|
// Keep the higher severity; on tie, keep the one with more detail
|
|
if finding.severity > existing.severity
|
|
|| (finding.severity == existing.severity
|
|
&& finding.description.len() > existing.description.len())
|
|
{
|
|
*existing = finding.clone();
|
|
}
|
|
// Merge CWE if the existing one is missing it
|
|
if existing.cwe.is_none() {
|
|
existing.cwe = finding.cwe.clone();
|
|
}
|
|
})
|
|
.or_insert(finding);
|
|
}
|
|
|
|
groups.into_values().collect()
|
|
}
|