use std::path::Path; use std::sync::Arc; use compliance_core::models::{Finding, ScanType, Severity}; use compliance_core::traits::ScanOutput; use crate::llm::review_prompts::REVIEW_PASSES; use crate::llm::LlmClient; use crate::pipeline::dedup; use crate::pipeline::git::{DiffFile, GitOps}; pub struct CodeReviewScanner { llm: Arc, } impl CodeReviewScanner { pub fn new(llm: Arc) -> Self { Self { llm } } /// Run multi-pass LLM code review on the diff between old and new commits. pub async fn review_diff( &self, repo_path: &Path, repo_id: &str, old_sha: &str, new_sha: &str, ) -> ScanOutput { let diff_files = match GitOps::get_diff_content(repo_path, old_sha, new_sha) { Ok(files) => files, Err(e) => { tracing::warn!("Failed to extract diff for code review: {e}"); return ScanOutput::default(); } }; if diff_files.is_empty() { return ScanOutput::default(); } let mut all_findings = Vec::new(); // Chunk diff files into groups to avoid exceeding context limits let chunks = chunk_diff_files(&diff_files, 8000); for (pass_name, system_prompt) in REVIEW_PASSES { for chunk in &chunks { let user_prompt = format!( "Review the following code changes:\n\n{}", chunk .iter() .map(|f| format!("--- {} ---\n{}", f.path, f.hunks)) .collect::>() .join("\n\n") ); match self.llm.chat(system_prompt, &user_prompt, Some(0.1)).await { Ok(response) => { let parsed = parse_review_response(&response, pass_name, repo_id, chunk); all_findings.extend(parsed); } Err(e) => { tracing::warn!("Code review pass '{pass_name}' failed: {e}"); } } } } let deduped = dedup_cross_pass(all_findings); ScanOutput { findings: deduped, sbom_entries: Vec::new(), } } } /// Group diff files into chunks that fit within a token budget (rough char estimate) fn chunk_diff_files(files: &[DiffFile], max_chars: usize) -> Vec> { let mut chunks: Vec> = Vec::new(); let mut current_chunk: Vec<&DiffFile> = Vec::new(); let mut current_size = 0; for file in files { if current_size + file.hunks.len() > max_chars && !current_chunk.is_empty() { chunks.push(std::mem::take(&mut current_chunk)); current_size = 0; } current_chunk.push(file); current_size += file.hunks.len(); } if !current_chunk.is_empty() { chunks.push(current_chunk); } chunks } fn parse_review_response( response: &str, pass_name: &str, repo_id: &str, chunk: &[&DiffFile], ) -> Vec { let cleaned = response.trim(); let cleaned = if cleaned.starts_with("```") { cleaned .trim_start_matches("```json") .trim_start_matches("```") .trim_end_matches("```") .trim() } else { cleaned }; let issues: Vec = match serde_json::from_str(cleaned) { Ok(v) => v, Err(_) => { if cleaned != "[]" { tracing::debug!("Failed to parse {pass_name} review response: {cleaned}"); } return Vec::new(); } }; issues .into_iter() .filter(|issue| { // Verify the file exists in the diff chunk chunk.iter().any(|f| f.path == issue.file) }) .map(|issue| { let severity = match issue.severity.as_str() { "critical" => Severity::Critical, "high" => Severity::High, "medium" => Severity::Medium, "low" => Severity::Low, _ => Severity::Info, }; let fingerprint = dedup::compute_fingerprint(&[ repo_id, "code-review", pass_name, &issue.file, &issue.line.to_string(), &issue.title, ]); let description = if let Some(suggestion) = &issue.suggestion { format!("{}\n\nSuggested fix: {}", issue.description, suggestion) } else { issue.description.clone() }; let mut finding = Finding::new( repo_id.to_string(), fingerprint, format!("code-review/{pass_name}"), ScanType::CodeReview, issue.title, description, severity, ); finding.rule_id = Some(format!("review/{pass_name}")); finding.file_path = Some(issue.file); finding.line_number = Some(issue.line); finding.cwe = issue.cwe; finding.suggested_fix = issue.suggestion; finding }) .collect() } #[derive(serde::Deserialize)] struct ReviewIssue { title: String, description: String, severity: String, file: String, #[serde(default)] line: u32, #[serde(default)] cwe: Option, #[serde(default)] suggestion: Option, } /// Deduplicate findings across review passes. /// /// Multiple passes often flag the same issue (e.g. SQL injection reported by /// logic, security, and convention passes). We group by file + nearby line + /// normalized title keywords and keep the highest-severity finding. fn dedup_cross_pass(findings: Vec) -> Vec { use std::collections::HashMap; // Build a dedup key: (file, line bucket, normalized title words) fn dedup_key(f: &Finding) -> String { let file = f.file_path.as_deref().unwrap_or(""); // Group lines within 3 of each other let line_bucket = f.line_number.unwrap_or(0) / 4; // Normalize: lowercase, keep only alphanumeric, sort words for order-independence let title_lower = f.title.to_lowercase(); let mut words: Vec<&str> = title_lower .split(|c: char| !c.is_alphanumeric()) .filter(|w| w.len() > 2) .collect(); words.sort(); format!("{file}:{line_bucket}:{}", words.join(",")) } let mut groups: HashMap = HashMap::new(); for finding in findings { let key = dedup_key(&finding); groups .entry(key) .and_modify(|existing| { // Keep the higher severity; on tie, keep the one with more detail if finding.severity > existing.severity || (finding.severity == existing.severity && finding.description.len() > existing.description.len()) { *existing = finding.clone(); } // Merge CWE if the existing one is missing it if existing.cwe.is_none() { existing.cwe = finding.cwe.clone(); } }) .or_insert(finding); } groups.into_values().collect() }