//! Multi-pass LLM code review scanner: diffs two commits, chunks the diff to
//! fit model context limits, runs each configured review pass over every
//! chunk, and deduplicates the resulting findings across passes.
use std::path::Path;
|
|
use std::sync::Arc;
|
|
|
|
use compliance_core::models::{Finding, ScanType, Severity};
|
|
use compliance_core::traits::ScanOutput;
|
|
|
|
use crate::llm::review_prompts::REVIEW_PASSES;
|
|
use crate::llm::LlmClient;
|
|
use crate::pipeline::dedup;
|
|
use crate::pipeline::git::{DiffFile, GitOps};
|
|
|
|
/// Scanner that performs multi-pass LLM code review over git diffs.
pub struct CodeReviewScanner {
    /// Shared LLM client used to execute each review pass.
    llm: Arc<LlmClient>,
}
|
|
|
|
impl CodeReviewScanner {
|
|
pub fn new(llm: Arc<LlmClient>) -> Self {
|
|
Self { llm }
|
|
}
|
|
|
|
/// Run multi-pass LLM code review on the diff between old and new commits.
|
|
pub async fn review_diff(
|
|
&self,
|
|
repo_path: &Path,
|
|
repo_id: &str,
|
|
old_sha: &str,
|
|
new_sha: &str,
|
|
) -> ScanOutput {
|
|
let diff_files = match GitOps::get_diff_content(repo_path, old_sha, new_sha) {
|
|
Ok(files) => files,
|
|
Err(e) => {
|
|
tracing::warn!("Failed to extract diff for code review: {e}");
|
|
return ScanOutput::default();
|
|
}
|
|
};
|
|
|
|
if diff_files.is_empty() {
|
|
return ScanOutput::default();
|
|
}
|
|
|
|
let mut all_findings = Vec::new();
|
|
|
|
// Chunk diff files into groups to avoid exceeding context limits
|
|
let chunks = chunk_diff_files(&diff_files, 8000);
|
|
|
|
for (pass_name, system_prompt) in REVIEW_PASSES {
|
|
for chunk in &chunks {
|
|
let user_prompt = format!(
|
|
"Review the following code changes:\n\n{}",
|
|
chunk
|
|
.iter()
|
|
.map(|f| format!("--- {} ---\n{}", f.path, f.hunks))
|
|
.collect::<Vec<_>>()
|
|
.join("\n\n")
|
|
);
|
|
|
|
match self.llm.chat(system_prompt, &user_prompt, Some(0.1)).await {
|
|
Ok(response) => {
|
|
let parsed = parse_review_response(&response, pass_name, repo_id, chunk);
|
|
all_findings.extend(parsed);
|
|
}
|
|
Err(e) => {
|
|
tracing::warn!("Code review pass '{pass_name}' failed: {e}");
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
let deduped = dedup_cross_pass(all_findings);
|
|
|
|
ScanOutput {
|
|
findings: deduped,
|
|
sbom_entries: Vec::new(),
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Group diff files into chunks that fit within a token budget (rough char estimate)
|
|
/// Group diff files into chunks that fit within a token budget (rough char
/// estimate based on hunk length).
///
/// Files are packed greedily in order: a new chunk begins whenever adding the
/// next file would push the running size past `max_chars`. A single file
/// larger than the budget still occupies a chunk of its own — files are never
/// split.
fn chunk_diff_files(files: &[DiffFile], max_chars: usize) -> Vec<Vec<&DiffFile>> {
    let mut result: Vec<Vec<&DiffFile>> = Vec::new();
    let mut pending: Vec<&DiffFile> = Vec::new();
    let mut pending_chars = 0usize;

    for file in files {
        let would_overflow = pending_chars + file.hunks.len() > max_chars;
        if would_overflow && !pending.is_empty() {
            result.push(std::mem::take(&mut pending));
            pending_chars = 0;
        }
        pending_chars += file.hunks.len();
        pending.push(file);
    }

    if !pending.is_empty() {
        result.push(pending);
    }

    result
}
|
|
|
|
fn parse_review_response(
|
|
response: &str,
|
|
pass_name: &str,
|
|
repo_id: &str,
|
|
chunk: &[&DiffFile],
|
|
) -> Vec<Finding> {
|
|
let cleaned = response.trim();
|
|
let cleaned = if cleaned.starts_with("```") {
|
|
cleaned
|
|
.trim_start_matches("```json")
|
|
.trim_start_matches("```")
|
|
.trim_end_matches("```")
|
|
.trim()
|
|
} else {
|
|
cleaned
|
|
};
|
|
|
|
let issues: Vec<ReviewIssue> = match serde_json::from_str(cleaned) {
|
|
Ok(v) => v,
|
|
Err(_) => {
|
|
if cleaned != "[]" {
|
|
tracing::debug!("Failed to parse {pass_name} review response: {cleaned}");
|
|
}
|
|
return Vec::new();
|
|
}
|
|
};
|
|
|
|
issues
|
|
.into_iter()
|
|
.filter(|issue| {
|
|
// Verify the file exists in the diff chunk
|
|
chunk.iter().any(|f| f.path == issue.file)
|
|
})
|
|
.map(|issue| {
|
|
let severity = match issue.severity.as_str() {
|
|
"critical" => Severity::Critical,
|
|
"high" => Severity::High,
|
|
"medium" => Severity::Medium,
|
|
"low" => Severity::Low,
|
|
_ => Severity::Info,
|
|
};
|
|
|
|
let fingerprint = dedup::compute_fingerprint(&[
|
|
repo_id,
|
|
"code-review",
|
|
pass_name,
|
|
&issue.file,
|
|
&issue.line.to_string(),
|
|
&issue.title,
|
|
]);
|
|
|
|
let description = if let Some(suggestion) = &issue.suggestion {
|
|
format!("{}\n\nSuggested fix: {}", issue.description, suggestion)
|
|
} else {
|
|
issue.description.clone()
|
|
};
|
|
|
|
let mut finding = Finding::new(
|
|
repo_id.to_string(),
|
|
fingerprint,
|
|
format!("code-review/{pass_name}"),
|
|
ScanType::CodeReview,
|
|
issue.title,
|
|
description,
|
|
severity,
|
|
);
|
|
finding.rule_id = Some(format!("review/{pass_name}"));
|
|
finding.file_path = Some(issue.file);
|
|
finding.line_number = Some(issue.line);
|
|
finding.cwe = issue.cwe;
|
|
finding.suggested_fix = issue.suggestion;
|
|
finding
|
|
})
|
|
.collect()
|
|
}
|
|
|
|
/// One issue as deserialized from the LLM's JSON array response.
///
/// NOTE(review): field names presumably match the JSON schema requested by
/// the `REVIEW_PASSES` prompts — confirm against `review_prompts`.
#[derive(serde::Deserialize)]
struct ReviewIssue {
    /// Short summary of the issue; becomes the finding title.
    title: String,
    /// Detailed explanation; becomes the finding description.
    description: String,
    /// Severity label; anything other than critical/high/medium/low maps to Info.
    severity: String,
    /// Path of the file the issue refers to; must exist in the diff chunk
    /// or the issue is discarded.
    file: String,
    /// Line number reported by the model; serde defaults to 0 when omitted.
    #[serde(default)]
    line: u32,
    /// Optional CWE identifier.
    #[serde(default)]
    cwe: Option<String>,
    /// Optional suggested fix; appended to the description when present.
    #[serde(default)]
    suggestion: Option<String>,
}
|
|
|
|
/// Deduplicate findings across review passes.
|
|
///
|
|
/// Multiple passes often flag the same issue (e.g. SQL injection reported by
|
|
/// logic, security, and convention passes). We group by file + nearby line +
|
|
/// normalized title keywords and keep the highest-severity finding.
|
|
fn dedup_cross_pass(findings: Vec<Finding>) -> Vec<Finding> {
|
|
use std::collections::HashMap;
|
|
|
|
// Build a dedup key: (file, line bucket, normalized title words)
|
|
fn dedup_key(f: &Finding) -> String {
|
|
let file = f.file_path.as_deref().unwrap_or("");
|
|
// Group lines within 3 of each other
|
|
let line_bucket = f.line_number.unwrap_or(0) / 4;
|
|
// Normalize: lowercase, keep only alphanumeric, sort words for order-independence
|
|
let title_lower = f.title.to_lowercase();
|
|
let mut words: Vec<&str> = title_lower
|
|
.split(|c: char| !c.is_alphanumeric())
|
|
.filter(|w| w.len() > 2)
|
|
.collect();
|
|
words.sort();
|
|
format!("{file}:{line_bucket}:{}", words.join(","))
|
|
}
|
|
|
|
let mut groups: HashMap<String, Finding> = HashMap::new();
|
|
|
|
for finding in findings {
|
|
let key = dedup_key(&finding);
|
|
groups
|
|
.entry(key)
|
|
.and_modify(|existing| {
|
|
// Keep the higher severity; on tie, keep the one with more detail
|
|
if finding.severity > existing.severity
|
|
|| (finding.severity == existing.severity
|
|
&& finding.description.len() > existing.description.len())
|
|
{
|
|
*existing = finding.clone();
|
|
}
|
|
// Merge CWE if the existing one is missing it
|
|
if existing.cwe.is_none() {
|
|
existing.cwe = finding.cwe.clone();
|
|
}
|
|
})
|
|
.or_insert(finding);
|
|
}
|
|
|
|
groups.into_values().collect()
|
|
}
|