use std::path::Path; use std::sync::Arc; use compliance_core::models::{Finding, ScanType, Severity}; use compliance_core::traits::ScanOutput; use crate::llm::review_prompts::REVIEW_PASSES; use crate::llm::LlmClient; use crate::pipeline::dedup; use crate::pipeline::git::{DiffFile, GitOps}; pub struct CodeReviewScanner { llm: Arc, } impl CodeReviewScanner { pub fn new(llm: Arc) -> Self { Self { llm } } /// Run multi-pass LLM code review on the diff between old and new commits. pub async fn review_diff( &self, repo_path: &Path, repo_id: &str, old_sha: &str, new_sha: &str, ) -> ScanOutput { let diff_files = match GitOps::get_diff_content(repo_path, old_sha, new_sha) { Ok(files) => files, Err(e) => { tracing::warn!("Failed to extract diff for code review: {e}"); return ScanOutput::default(); } }; if diff_files.is_empty() { return ScanOutput::default(); } let mut all_findings = Vec::new(); // Chunk diff files into groups to avoid exceeding context limits let chunks = chunk_diff_files(&diff_files, 8000); for (pass_name, system_prompt) in REVIEW_PASSES { for chunk in &chunks { let user_prompt = format!( "Review the following code changes:\n\n{}", chunk .iter() .map(|f| format!("--- {} ---\n{}", f.path, f.hunks)) .collect::>() .join("\n\n") ); match self.llm.chat(system_prompt, &user_prompt, Some(0.1)).await { Ok(response) => { let parsed = parse_review_response(&response, pass_name, repo_id, chunk); all_findings.extend(parsed); } Err(e) => { tracing::warn!("Code review pass '{pass_name}' failed: {e}"); } } } } ScanOutput { findings: all_findings, sbom_entries: Vec::new(), } } } /// Group diff files into chunks that fit within a token budget (rough char estimate) fn chunk_diff_files(files: &[DiffFile], max_chars: usize) -> Vec> { let mut chunks: Vec> = Vec::new(); let mut current_chunk: Vec<&DiffFile> = Vec::new(); let mut current_size = 0; for file in files { if current_size + file.hunks.len() > max_chars && !current_chunk.is_empty() { chunks.push(std::mem::take(&mut current_chunk)); current_size = 0; } current_chunk.push(file); current_size += file.hunks.len(); } if !current_chunk.is_empty() { chunks.push(current_chunk); } chunks } fn parse_review_response( response: &str, pass_name: &str, repo_id: &str, chunk: &[&DiffFile], ) -> Vec { let cleaned = response.trim(); let cleaned = if cleaned.starts_with("```") { cleaned .trim_start_matches("```json") .trim_start_matches("```") .trim_end_matches("```") .trim() } else { cleaned }; let issues: Vec = match serde_json::from_str(cleaned) { Ok(v) => v, Err(_) => { if cleaned != "[]" { tracing::debug!("Failed to parse {pass_name} review response: {cleaned}"); } return Vec::new(); } }; issues .into_iter() .filter(|issue| { // Verify the file exists in the diff chunk chunk.iter().any(|f| f.path == issue.file) }) .map(|issue| { let severity = match issue.severity.as_str() { "critical" => Severity::Critical, "high" => Severity::High, "medium" => Severity::Medium, "low" => Severity::Low, _ => Severity::Info, }; let fingerprint = dedup::compute_fingerprint(&[ repo_id, "code-review", pass_name, &issue.file, &issue.line.to_string(), &issue.title, ]); let description = if let Some(suggestion) = &issue.suggestion { format!("{}\n\nSuggested fix: {}", issue.description, suggestion) } else { issue.description.clone() }; let mut finding = Finding::new( repo_id.to_string(), fingerprint, format!("code-review/{pass_name}"), ScanType::CodeReview, issue.title, description, severity, ); finding.rule_id = Some(format!("review/{pass_name}")); finding.file_path = Some(issue.file); finding.line_number = Some(issue.line); finding.cwe = issue.cwe; finding.suggested_fix = issue.suggestion; finding }) .collect() } #[derive(serde::Deserialize)] struct ReviewIssue { title: String, description: String, severity: String, file: String, #[serde(default)] line: u32, #[serde(default)] cwe: Option, #[serde(default)] suggestion: Option, }