feat: add new scanners, enhanced triage, findings refinement, and deployment tooling

- Add gitleaks secret detection, lint scanning (clippy/eslint/ruff), and LLM code review scanners
- Enhance LLM triage with multi-action support (confirm/downgrade/upgrade/dismiss), surrounding code context, and file-path classification confidence adjustment
- Add text search, column sorting, and bulk status update to findings dashboard
- Fix finding detail page status refresh and add developer feedback field
- Fix BSON DateTime deserialization across all models with shared serde helpers
- Add scan progress spinner with polling to repositories page
- Batch OSV.dev queries to avoid "Too many queries" errors
- Add gitleaks, semgrep, and ruff to Dockerfile.agent for deployment

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 11:05:31 +01:00
parent 32e5fc21e7
commit 23ba52276b
31 changed files with 1602 additions and 95 deletions
--- a/compliance-agent/src/api/handlers/mod.rs
+++ b/compliance-agent/src/api/handlers/mod.rs
@@ -41,6 +41,12 @@ pub struct FindingsFilter {
    pub scan_type: Option<String>,
    #[serde(default)]
    pub status: Option<String>,
+    #[serde(default)]
+    pub q: Option<String>,
+    #[serde(default)]
+    pub sort_by: Option<String>,
+    #[serde(default)]
+    pub sort_order: Option<String>,
    #[serde(default = "default_page")]
    pub page: u64,
    #[serde(default = "default_limit")]
@@ -91,6 +97,17 @@ pub struct UpdateStatusRequest {
    pub status: String,
 }

+/// JSON request body for the bulk finding status update endpoint.
+#[derive(Deserialize)]
+pub struct BulkUpdateStatusRequest {
+    /// Finding ids as strings; the handler parses each one as a BSON `ObjectId`.
+    pub ids: Vec<String>,
+    /// Status value written to the `status` field of every matched finding.
+    pub status: String,
+}
+
+/// JSON request body for updating the developer feedback on a single finding.
+#[derive(Deserialize)]
+pub struct UpdateFeedbackRequest {
+    /// Text stored in the finding's `developer_feedback` field.
+    pub feedback: String,
+}
+
 #[derive(Deserialize)]
 pub struct SbomFilter {
    #[serde(default)]
@@ -367,6 +384,29 @@ pub async fn list_findings(
    if let Some(status) = &filter.status {
        query.insert("status", status);
    }
+    // Text search across title, description, file_path, rule_id
+    if let Some(q) = &filter.q {
+        if !q.is_empty() {
+            let regex = doc! { "$regex": q, "$options": "i" };
+            query.insert(
+                "$or",
+                mongodb::bson::bson!([
+                    { "title": regex.clone() },
+                    { "description": regex.clone() },
+                    { "file_path": regex.clone() },
+                    { "rule_id": regex },
+                ]),
+            );
+        }
+    }
+
+    // Dynamic sort
+    let sort_field = filter.sort_by.as_deref().unwrap_or("created_at");
+    let sort_dir: i32 = match filter.sort_order.as_deref() {
+        Some("asc") => 1,
+        _ => -1,
+    };
+    let sort_doc = doc! { sort_field: sort_dir };

    let skip = (filter.page.saturating_sub(1)) * filter.limit as u64;
    let total = db
@@ -378,7 +418,7 @@ pub async fn list_findings(
    let findings = match db
        .findings()
        .find(query)
-        .sort(doc! { "created_at": -1 })
+        .sort(sort_doc)
        .skip(skip)
        .limit(filter.limit)
        .await
@@ -434,6 +474,55 @@ pub async fn update_finding_status(
    Ok(Json(serde_json::json!({ "status": "updated" })))
 }

+/// Set the same status on several findings in one request.
+///
+/// Ids that do not parse as BSON `ObjectId`s are skipped; if none parse, the
+/// request is rejected with `400 Bad Request`. A database error maps to
+/// `500 Internal Server Error`. On success the response carries the number of
+/// documents the update modified.
+pub async fn bulk_update_finding_status(
+    Extension(agent): AgentExt,
+    Json(req): Json<BulkUpdateStatusRequest>,
+) -> Result<Json<serde_json::Value>, StatusCode> {
+    let mut oids: Vec<mongodb::bson::oid::ObjectId> = Vec::new();
+    for raw_id in &req.ids {
+        if let Ok(oid) = mongodb::bson::oid::ObjectId::parse_str(raw_id) {
+            oids.push(oid);
+        }
+    }
+    if oids.is_empty() {
+        return Err(StatusCode::BAD_REQUEST);
+    }
+
+    let filter = doc! { "_id": { "$in": oids } };
+    let update =
+        doc! { "$set": { "status": &req.status, "updated_at": mongodb::bson::DateTime::now() } };
+
+    match agent.db.findings().update_many(filter, update).await {
+        Ok(result) => Ok(Json(
+            serde_json::json!({ "status": "updated", "modified_count": result.modified_count }),
+        )),
+        Err(_) => Err(StatusCode::INTERNAL_SERVER_ERROR),
+    }
+}
+
+/// Store developer feedback text on a single finding.
+///
+/// Responds `400 Bad Request` when `id` is not a valid BSON `ObjectId` and
+/// `500 Internal Server Error` when the database update fails.
+pub async fn update_finding_feedback(
+    Extension(agent): AgentExt,
+    Path(id): Path<String>,
+    Json(req): Json<UpdateFeedbackRequest>,
+) -> Result<Json<serde_json::Value>, StatusCode> {
+    let oid = match mongodb::bson::oid::ObjectId::parse_str(&id) {
+        Ok(oid) => oid,
+        Err(_) => return Err(StatusCode::BAD_REQUEST),
+    };
+
+    let filter = doc! { "_id": oid };
+    let update = doc! { "$set": { "developer_feedback": &req.feedback, "updated_at": mongodb::bson::DateTime::now() } };
+
+    if agent.db.findings().update_one(filter, update).await.is_err() {
+        return Err(StatusCode::INTERNAL_SERVER_ERROR);
+    }
+
+    Ok(Json(serde_json::json!({ "status": "updated" })))
+}
+
 pub async fn list_sbom(
    Extension(agent): AgentExt,
    Query(filter): Query<SbomFilter>,
--- a/compliance-agent/src/api/routes.rs
+++ b/compliance-agent/src/api/routes.rs
@@ -23,6 +23,14 @@ pub fn build_router() -> Router {
            "/api/v1/findings/{id}/status",
            patch(handlers::update_finding_status),
        )
+        .route(
+            "/api/v1/findings/bulk-status",
+            patch(handlers::bulk_update_finding_status),
+        )
+        .route(
+            "/api/v1/findings/{id}/feedback",
+            patch(handlers::update_finding_feedback),
+        )
        .route("/api/v1/sbom", get(handlers::list_sbom))
        .route("/api/v1/sbom/export", get(handlers::export_sbom))
        .route("/api/v1/sbom/licenses", get(handlers::license_summary))
--- a/compliance-agent/src/llm/mod.rs
+++ b/compliance-agent/src/llm/mod.rs
@@ -5,6 +5,7 @@ pub mod descriptions;
 pub mod fixes;
 #[allow(dead_code)]
 pub mod pr_review;
+pub mod review_prompts;
 pub mod triage;

 pub use client::LlmClient;
--- a/compliance-agent/src/llm/review_prompts.rs
+++ b/compliance-agent/src/llm/review_prompts.rs
@@ -0,0 +1,77 @@
+// System prompts for multi-pass LLM code review.
+// Each pass focuses on a different aspect to avoid overloading a single prompt.
+
+/// System prompt for the "logic" review pass: asks only for logic and
+/// correctness issues, answered as a JSON array (or `[]` when there are none).
+pub const LOGIC_REVIEW_PROMPT: &str = r#"You are a senior software engineer reviewing code changes. Focus ONLY on logic and correctness issues.
+
+Look for:
+- Off-by-one errors, wrong comparisons, missing edge cases
+- Incorrect control flow (unreachable code, missing returns, wrong loop conditions)
+- Race conditions or concurrency bugs
+- Resource leaks (unclosed handles, missing cleanup)
+- Wrong variable used (copy-paste errors)
+- Incorrect error handling (swallowed errors, wrong error type)
+
+Ignore: style, naming, formatting, documentation, minor improvements.
+
+For each issue found, respond with a JSON array:
+[{"title": "...", "description": "...", "severity": "high|medium|low", "file": "...", "line": N, "suggestion": "..."}]
+
+If no issues found, respond with: []"#;
+
+/// System prompt for the "security" review pass: asks only for security
+/// vulnerabilities, answered as a JSON array whose items may carry a `cwe` id.
+pub const SECURITY_REVIEW_PROMPT: &str = r#"You are a security engineer reviewing code changes. Focus ONLY on security vulnerabilities.
+
+Look for:
+- Injection vulnerabilities (SQL, command, XSS, template injection)
+- Authentication/authorization bypasses
+- Sensitive data exposure (logging secrets, hardcoded credentials)
+- Insecure cryptography (weak algorithms, predictable randomness)
+- Path traversal, SSRF, open redirects
+- Unsafe deserialization
+- Missing input validation at trust boundaries
+
+Ignore: code style, performance, general quality.
+
+For each issue found, respond with a JSON array:
+[{"title": "...", "description": "...", "severity": "critical|high|medium", "file": "...", "line": N, "cwe": "CWE-XXX", "suggestion": "..."}]
+
+If no issues found, respond with: []"#;
+
+/// System prompt for the "convention" review pass: asks only for
+/// high-confidence deviations from the visible codebase conventions, answered
+/// as a JSON array.
+pub const CONVENTION_REVIEW_PROMPT: &str = r#"You are a code reviewer checking adherence to project conventions. Focus ONLY on patterns that indicate likely bugs or maintenance problems.
+
+Look for:
+- Inconsistent error handling patterns within the same module
+- Public API that doesn't follow the project's established patterns
+- Missing or incorrect type annotations that could cause runtime issues
+- Anti-patterns specific to the language (e.g. unwrap in Rust library code, any in TypeScript)
+
+Do NOT report: minor style preferences, documentation gaps, formatting.
+Only report issues with HIGH confidence that they deviate from the visible codebase conventions.
+
+For each issue found, respond with a JSON array:
+[{"title": "...", "description": "...", "severity": "medium|low", "file": "...", "line": N, "suggestion": "..."}]
+
+If no issues found, respond with: []"#;
+
+/// System prompt for the "complexity" review pass: asks only for complexity
+/// that is a high risk for future bugs, answered as a JSON array.
+pub const COMPLEXITY_REVIEW_PROMPT: &str = r#"You are reviewing code changes for excessive complexity that could lead to bugs.
+
+Look for:
+- Functions over 50 lines that should be decomposed
+- Deeply nested control flow (4+ levels)
+- Complex boolean expressions that are hard to reason about
+- Functions with 5+ parameters
+- Code duplication within the changed files
+
+Only report complexity issues that are HIGH risk for future bugs. Ignore acceptable complexity in configuration, CLI argument parsing, or generated code.
+
+For each issue found, respond with a JSON array:
+[{"title": "...", "description": "...", "severity": "medium|low", "file": "...", "line": N, "suggestion": "..."}]
+
+If no issues found, respond with: []"#;
+
+/// All review passes as `(pass name, system prompt)` pairs.
+///
+/// The pass name is a short identifier ("logic", "security", "convention",
+/// "complexity"); the prompt is the matching `*_REVIEW_PROMPT` constant.
+pub const REVIEW_PASSES: &[(&str, &str)] = &[
+    ("logic", LOGIC_REVIEW_PROMPT),
+    ("security", SECURITY_REVIEW_PROMPT),
+    ("convention", CONVENTION_REVIEW_PROMPT),
+    ("complexity", COMPLEXITY_REVIEW_PROMPT),
+];
--- a/compliance-agent/src/llm/triage.rs
+++ b/compliance-agent/src/llm/triage.rs
@@ -5,13 +5,22 @@ use compliance_core::models::{Finding, FindingStatus};
 use crate::llm::LlmClient;
 use crate::pipeline::orchestrator::GraphContext;

-const TRIAGE_SYSTEM_PROMPT: &str = r#"You are a security finding triage expert. Analyze the following security finding and determine:
-1. Is this a true positive? (yes/no)
-2. Confidence score (0-10, where 10 is highest confidence this is a real issue)
-3. Brief remediation suggestion (1-2 sentences)
+const TRIAGE_SYSTEM_PROMPT: &str = r#"You are a security finding triage expert. Analyze the following security finding with its code context and determine the appropriate action.
+
+Actions:
+- "confirm": The finding is a true positive at the reported severity. Keep as-is.
+- "downgrade": The finding is real but over-reported. Lower severity recommended.
+- "upgrade": The finding is under-reported. Higher severity recommended.
+- "dismiss": The finding is a false positive. Should be removed.
+
+Consider:
+- Is the code in a test, example, or generated file? (lower confidence for test code)
+- Does the surrounding code context confirm or refute the finding?
+- Is the finding actionable by a developer?
+- Would a real attacker be able to exploit this?

 Respond in JSON format:
-{"true_positive": true/false, "confidence": N, "remediation": "..."}"#;
+{"action": "confirm|downgrade|upgrade|dismiss", "confidence": 0-10, "rationale": "brief explanation", "remediation": "optional fix suggestion"}"#;

 pub async fn triage_findings(
    llm: &Arc<LlmClient>,
@@ -21,8 +30,10 @@ pub async fn triage_findings(
    let mut passed = 0;

    for finding in findings.iter_mut() {
+        let file_classification = classify_file_path(finding.file_path.as_deref());
+
        let mut user_prompt = format!(
-            "Scanner: {}\nRule: {}\nSeverity: {}\nTitle: {}\nDescription: {}\nFile: {}\nLine: {}\nCode: {}",
+            "Scanner: {}\nRule: {}\nSeverity: {}\nTitle: {}\nDescription: {}\nFile: {}\nLine: {}\nCode: {}\nFile classification: {}",
            finding.scanner,
            finding.rule_id.as_deref().unwrap_or("N/A"),
            finding.severity,
@@ -31,8 +42,14 @@ pub async fn triage_findings(
            finding.file_path.as_deref().unwrap_or("N/A"),
            finding.line_number.map(|n| n.to_string()).unwrap_or_else(|| "N/A".to_string()),
            finding.code_snippet.as_deref().unwrap_or("N/A"),
+            file_classification,
        );

+        // Enrich with surrounding code context if possible
+        if let Some(context) = read_surrounding_context(finding) {
+            user_prompt.push_str(&format!("\n\n--- Surrounding Code (50 lines) ---\n{context}"));
+        }
+
        // Enrich with graph context if available
        if let Some(ctx) = graph_context {
            if let Some(impact) = ctx
@@ -69,32 +86,54 @@ pub async fn triage_findings(
            .await
        {
            Ok(response) => {
-                // Strip markdown code fences if present (e.g. ```json ... ```)
                let cleaned = response.trim();
                let cleaned = if cleaned.starts_with("```") {
-                    let inner = cleaned
+                    cleaned
                        .trim_start_matches("```json")
                        .trim_start_matches("```")
                        .trim_end_matches("```")
-                        .trim();
-                    inner
+                        .trim()
                } else {
                    cleaned
                };
                if let Ok(result) = serde_json::from_str::<TriageResult>(cleaned) {
-                    finding.confidence = Some(result.confidence);
+                    // Apply file-path confidence adjustment
+                    let adjusted_confidence = adjust_confidence(result.confidence, &file_classification);
+                    finding.confidence = Some(adjusted_confidence);
+                    finding.triage_action = Some(result.action.clone());
+                    finding.triage_rationale = Some(result.rationale);
+
                    if let Some(remediation) = result.remediation {
                        finding.remediation = Some(remediation);
                    }

-                    if result.confidence >= 3.0 {
-                        finding.status = FindingStatus::Triaged;
-                        passed += 1;
-                    } else {
-                        finding.status = FindingStatus::FalsePositive;
+                    match result.action.as_str() {
+                        "dismiss" => {
+                            finding.status = FindingStatus::FalsePositive;
+                        }
+                        "downgrade" => {
+                            // Downgrade severity by one level
+                            finding.severity = downgrade_severity(&finding.severity);
+                            finding.status = FindingStatus::Triaged;
+                            passed += 1;
+                        }
+                        "upgrade" => {
+                            finding.severity = upgrade_severity(&finding.severity);
+                            finding.status = FindingStatus::Triaged;
+                            passed += 1;
+                        }
+                        _ => {
+                            // "confirm" or unknown — keep as-is
+                            if adjusted_confidence >= 3.0 {
+                                finding.status = FindingStatus::Triaged;
+                                passed += 1;
+                            } else {
+                                finding.status = FindingStatus::FalsePositive;
+                            }
+                        }
                    }
                } else {
-                    // If LLM response doesn't parse, keep the finding
+                    // Parse failure — keep the finding
                    finding.status = FindingStatus::Triaged;
                    passed += 1;
                    tracing::warn!(
@@ -117,12 +156,118 @@ pub async fn triage_findings(
    passed
 }

+/// Read up to ~50 lines of code around the finding's reported line.
+///
+/// Returns `None` when the finding has no file path or line number, when the
+/// file cannot be read as UTF-8 text, or when the reported line lies so far
+/// past the end of the file that no lines fall inside the window.
+///
+/// Each returned line is prefixed with its 1-based line number.
+///
+/// NOTE(review): `file_path` is opened exactly as stored on the finding, so a
+/// relative path resolves against the process working directory — confirm the
+/// scanners store paths that are reachable from there.
+fn read_surrounding_context(finding: &Finding) -> Option<String> {
+    let file_path = finding.file_path.as_deref()?;
+    let line = finding.line_number? as usize;
+
+    let content = std::fs::read_to_string(file_path).ok()?;
+    let lines: Vec<&str> = content.lines().collect();
+
+    let start = line.saturating_sub(25);
+    let end = line.saturating_add(25).min(lines.len());
+
+    // A stale or bogus line number can put `start` past `end`; indexing
+    // `lines[start..end]` would panic then, so use the checked accessor and
+    // treat an empty window as "no context".
+    let window = lines.get(start..end).filter(|w| !w.is_empty())?;
+
+    Some(
+        window
+            .iter()
+            .enumerate()
+            .map(|(i, l)| format!("{:>4} | {}", start + i + 1, l))
+            .collect::<Vec<_>>()
+            .join("\n"),
+    )
+}
+
+/// Classify a file path to inform triage confidence adjustment.
+///
+/// Returns one of `"test"`, `"example"`, `"generated"`, `"vendored"` or
+/// `"production"`, or `"unknown"` when no path is given. Matching is
+/// case-insensitive and the categories are checked in that order, so a path
+/// matching several categories gets the first one.
+///
+/// The path is normalised before matching: backslashes become `/` and a
+/// leading `/` is prepended, so repo-relative paths such as `tests/foo.rs` or
+/// `vendor/lib.go` hit the directory markers as well.
+fn classify_file_path(path: Option<&str>) -> String {
+    const TEST_MARKERS: &[&str] = &[
+        "/test/",
+        "/tests/",
+        "_test.",
+        ".test.",
+        ".spec.",
+        "/fixtures/",
+        "/testdata/",
+    ];
+    const EXAMPLE_MARKERS: &[&str] = &["/example", "/examples/", "/demo/", "/sample"];
+    const GENERATED_MARKERS: &[&str] = &[
+        "/generated/",
+        "/gen/",
+        ".generated.",
+        ".pb.go",
+        "_generated.rs",
+    ];
+    const VENDORED_MARKERS: &[&str] = &["/vendor/", "/node_modules/", "/third_party/"];
+
+    // Order matters: the first category with a matching marker wins.
+    const CATEGORIES: [(&str, &[&str]); 4] = [
+        ("test", TEST_MARKERS),
+        ("example", EXAMPLE_MARKERS),
+        ("generated", GENERATED_MARKERS),
+        ("vendored", VENDORED_MARKERS),
+    ];
+
+    let raw = match path {
+        Some(p) => p,
+        None => return "unknown".to_string(),
+    };
+
+    // Scanners run from the repo root report relative paths, which have no
+    // leading separator in front of a top-level directory.
+    let normalized = format!("/{}", raw.to_lowercase().replace('\\', "/"));
+
+    CATEGORIES
+        .iter()
+        .find(|(_, markers)| markers.iter().any(|m| normalized.contains(*m)))
+        .map_or("production", |(label, _)| *label)
+        .to_string()
+}
+
+/// Scale the raw confidence by a factor chosen from the file classification.
+///
+/// Factors: `"test"` 0.5, `"example"` 0.6, `"generated"` 0.3, `"vendored"` 0.4.
+/// Any other classification leaves the confidence unchanged.
+fn adjust_confidence(raw_confidence: f64, classification: &str) -> f64 {
+    const FACTORS: [(&str, f64); 4] = [
+        ("test", 0.5),
+        ("example", 0.6),
+        ("generated", 0.3),
+        ("vendored", 0.4),
+    ];
+
+    let factor = FACTORS
+        .iter()
+        .find(|(label, _)| *label == classification)
+        .map_or(1.0, |&(_, f)| f);
+
+    raw_confidence * factor
+}
+
+/// Lower a severity by one level; `Info` is already the lowest and stays `Info`.
+fn downgrade_severity(severity: &compliance_core::models::Severity) -> compliance_core::models::Severity {
+    use compliance_core::models::Severity as S;
+    match severity {
+        S::Critical => S::High,
+        S::High => S::Medium,
+        S::Medium => S::Low,
+        S::Low | S::Info => S::Info,
+    }
+}
+
+/// Raise a severity by one level; `Critical` is already the highest and stays `Critical`.
+fn upgrade_severity(severity: &compliance_core::models::Severity) -> compliance_core::models::Severity {
+    use compliance_core::models::Severity as S;
+    match severity {
+        S::Info => S::Low,
+        S::Low => S::Medium,
+        S::Medium => S::High,
+        S::High | S::Critical => S::Critical,
+    }
+}
+
 #[derive(serde::Deserialize)]
 struct TriageResult {
-    #[serde(default)]
-    #[allow(dead_code)]
-    true_positive: bool,
+    #[serde(default = "default_action")]
+    action: String,
    #[serde(default)]
    confidence: f64,
+    #[serde(default)]
+    rationale: String,
    remediation: Option<String>,
 }
+
+/// Serde default for the triage `action` field: a missing action means "confirm".
+fn default_action() -> String {
+    String::from("confirm")
+}
--- a/compliance-agent/src/pipeline/code_review.rs
+++ b/compliance-agent/src/pipeline/code_review.rs
@@ -0,0 +1,186 @@
+use std::path::Path;
+use std::sync::Arc;
+
+use compliance_core::models::{Finding, ScanType, Severity};
+use compliance_core::traits::ScanOutput;
+
+use crate::llm::review_prompts::REVIEW_PASSES;
+use crate::llm::LlmClient;
+use crate::pipeline::dedup;
+use crate::pipeline::git::{DiffFile, GitOps};
+
+/// Scanner that runs multi-pass LLM code review over the diff between two commits.
+pub struct CodeReviewScanner {
+    /// Shared LLM client used for every review pass.
+    llm: Arc<LlmClient>,
+}
+
+impl CodeReviewScanner {
+    /// Create a scanner that sends its review prompts through `llm`.
+    pub fn new(llm: Arc<LlmClient>) -> Self {
+        Self { llm }
+    }
+
+    /// Run multi-pass LLM code review on the diff between old and new commits.
+    ///
+    /// The diff is split into chunks of roughly 8000 characters, and every
+    /// pass in `REVIEW_PASSES` is run against every chunk. If the diff cannot
+    /// be extracted (logged as a warning) or is empty, an empty `ScanOutput`
+    /// is returned. A failed LLM call is logged and skipped; the remaining
+    /// passes and chunks still run. `sbom_entries` is always empty.
+    pub async fn review_diff(
+        &self,
+        repo_path: &Path,
+        repo_id: &str,
+        old_sha: &str,
+        new_sha: &str,
+    ) -> ScanOutput {
+        let diff_files = match GitOps::get_diff_content(repo_path, old_sha, new_sha) {
+            Ok(files) if files.is_empty() => return ScanOutput::default(),
+            Ok(files) => files,
+            Err(e) => {
+                tracing::warn!("Failed to extract diff for code review: {e}");
+                return ScanOutput::default();
+            }
+        };
+
+        // Chunk diff files into groups to avoid exceeding context limits
+        let chunks = chunk_diff_files(&diff_files, 8000);
+
+        // The user prompt depends only on the chunk, so render each one once
+        // and reuse it for every pass.
+        let prompts: Vec<String> = chunks
+            .iter()
+            .map(|chunk| {
+                let rendered = chunk
+                    .iter()
+                    .map(|f| format!("--- {} ---\n{}", f.path, f.hunks))
+                    .collect::<Vec<_>>()
+                    .join("\n\n");
+                format!("Review the following code changes:\n\n{}", rendered)
+            })
+            .collect();
+
+        let mut findings = Vec::new();
+
+        for (pass_name, system_prompt) in REVIEW_PASSES {
+            for (chunk, user_prompt) in chunks.iter().zip(&prompts) {
+                let response = match self.llm.chat(system_prompt, user_prompt, Some(0.1)).await {
+                    Ok(response) => response,
+                    Err(e) => {
+                        tracing::warn!("Code review pass '{pass_name}' failed: {e}");
+                        continue;
+                    }
+                };
+                findings.extend(parse_review_response(&response, pass_name, repo_id, chunk));
+            }
+        }
+
+        ScanOutput {
+            findings,
+            sbom_entries: Vec::new(),
+        }
+    }
+}
+
+/// Group diff files into chunks whose combined hunk size stays within `max_chars`.
+///
+/// Size is the byte length of each file's `hunks` string, used as a rough
+/// stand-in for a token budget. Files keep their input order. A file that on
+/// its own exceeds `max_chars` still gets a chunk to itself.
+fn chunk_diff_files(files: &[DiffFile], max_chars: usize) -> Vec<Vec<&DiffFile>> {
+    let mut chunks: Vec<Vec<&DiffFile>> = Vec::new();
+    // Total hunk size of the chunk currently at the end of `chunks`.
+    let mut open_size = 0usize;
+
+    for file in files {
+        let size = file.hunks.len();
+        match chunks.last_mut() {
+            // Still fits: append to the chunk that is currently open.
+            Some(open) if open_size + size <= max_chars => {
+                open.push(file);
+                open_size += size;
+            }
+            // No chunk yet, or the open one would overflow: start a new one.
+            _ => {
+                chunks.push(vec![file]);
+                open_size = size;
+            }
+        }
+    }
+
+    chunks
+}
+
+/// Parse one LLM review response into findings for the given review pass.
+///
+/// A surrounding markdown code fence is stripped before the text is parsed as
+/// a JSON array of issues. If parsing fails, the response is logged at debug
+/// level and an empty list is returned. Issues whose `file` is not the path of
+/// a file in `chunk` are dropped.
+///
+/// Severity strings are matched case-insensitively after trimming; anything
+/// other than critical/high/medium/low maps to `Severity::Info`. When an issue
+/// carries a suggestion, it is both appended to the description and stored as
+/// the finding's `suggested_fix`.
+fn parse_review_response(
+    response: &str,
+    pass_name: &str,
+    repo_id: &str,
+    chunk: &[&DiffFile],
+) -> Vec<Finding> {
+    let cleaned = response.trim();
+    let cleaned = if cleaned.starts_with("```") {
+        cleaned
+            .trim_start_matches("```json")
+            .trim_start_matches("```")
+            .trim_end_matches("```")
+            .trim()
+    } else {
+        cleaned
+    };
+
+    let issues: Vec<ReviewIssue> = match serde_json::from_str(cleaned) {
+        Ok(v) => v,
+        Err(_) => {
+            if cleaned != "[]" {
+                tracing::debug!("Failed to parse {pass_name} review response: {cleaned}");
+            }
+            return Vec::new();
+        }
+    };
+
+    issues
+        .into_iter()
+        .filter(|issue| {
+            // Verify the file exists in the diff chunk
+            chunk.iter().any(|f| f.path == issue.file)
+        })
+        .map(|issue| {
+            // LLMs do not reliably keep the requested lowercase spelling, so
+            // normalise first; otherwise "High" would silently become Info.
+            let severity = match issue.severity.trim().to_ascii_lowercase().as_str() {
+                "critical" => Severity::Critical,
+                "high" => Severity::High,
+                "medium" => Severity::Medium,
+                "low" => Severity::Low,
+                _ => Severity::Info,
+            };
+
+            let fingerprint = dedup::compute_fingerprint(&[
+                repo_id,
+                "code-review",
+                pass_name,
+                &issue.file,
+                &issue.line.to_string(),
+                &issue.title,
+            ]);
+
+            let description = if let Some(suggestion) = &issue.suggestion {
+                format!("{}\n\nSuggested fix: {}", issue.description, suggestion)
+            } else {
+                issue.description.clone()
+            };
+
+            let mut finding = Finding::new(
+                repo_id.to_string(),
+                fingerprint,
+                format!("code-review/{pass_name}"),
+                ScanType::CodeReview,
+                issue.title,
+                description,
+                severity,
+            );
+            finding.rule_id = Some(format!("review/{pass_name}"));
+            finding.file_path = Some(issue.file);
+            finding.line_number = Some(issue.line);
+            finding.cwe = issue.cwe;
+            finding.suggested_fix = issue.suggestion;
+            finding
+        })
+        .collect()
+}
+
+/// One issue object from the JSON array returned by a review pass.
+#[derive(serde::Deserialize)]
+struct ReviewIssue {
+    /// Short issue title; becomes the finding title.
+    title: String,
+    /// Issue explanation; becomes (part of) the finding description.
+    description: String,
+    /// Severity label as written by the LLM (e.g. "high").
+    severity: String,
+    /// Path of the file the issue refers to; must match a path in the reviewed chunk.
+    file: String,
+    /// Reported line number; defaults to 0 when the LLM omits it.
+    #[serde(default)]
+    line: u32,
+    /// Optional CWE identifier (requested by the security pass prompt).
+    #[serde(default)]
+    cwe: Option<String>,
+    /// Optional fix suggestion.
+    #[serde(default)]
+    suggestion: Option<String>,
+}
--- a/compliance-agent/src/pipeline/cve.rs
+++ b/compliance-agent/src/pipeline/cve.rs
@@ -64,6 +64,8 @@ impl CveScanner {
    }

    async fn query_osv_batch(&self, entries: &[SbomEntry]) -> Result<Vec<Vec<OsvVuln>>, CoreError> {
+        const OSV_BATCH_SIZE: usize = 500;
+
        let queries: Vec<_> = entries
            .iter()
            .filter_map(|e| {
@@ -79,47 +81,54 @@ impl CveScanner {
            return Ok(Vec::new());
        }

-        let body = serde_json::json!({ "queries": queries });
+        let mut all_vulns: Vec<Vec<OsvVuln>> = Vec::with_capacity(queries.len());

-        let resp = self
-            .http
-            .post("https://api.osv.dev/v1/querybatch")
-            .json(&body)
-            .send()
-            .await
-            .map_err(|e| CoreError::Http(format!("OSV.dev request failed: {e}")))?;
+        for chunk in queries.chunks(OSV_BATCH_SIZE) {
+            let body = serde_json::json!({ "queries": chunk });

-        if !resp.status().is_success() {
-            let status = resp.status();
-            let body = resp.text().await.unwrap_or_default();
-            tracing::warn!("OSV.dev returned {status}: {body}");
-            return Ok(Vec::new());
+            let resp = self
+                .http
+                .post("https://api.osv.dev/v1/querybatch")
+                .json(&body)
+                .send()
+                .await
+                .map_err(|e| CoreError::Http(format!("OSV.dev request failed: {e}")))?;
+
+            if !resp.status().is_success() {
+                let status = resp.status();
+                let body = resp.text().await.unwrap_or_default();
+                tracing::warn!("OSV.dev returned {status}: {body}");
+                // Push empty results for this chunk so indices stay aligned
+                all_vulns.extend(std::iter::repeat_with(Vec::new).take(chunk.len()));
+                continue;
+            }
+
+            let result: OsvBatchResponse = resp
+                .json()
+                .await
+                .map_err(|e| CoreError::Http(format!("Failed to parse OSV.dev response: {e}")))?;
+
+            let chunk_vulns = result
+                .results
+                .into_iter()
+                .map(|r| {
+                    r.vulns
+                        .unwrap_or_default()
+                        .into_iter()
+                        .map(|v| OsvVuln {
+                            id: v.id,
+                            summary: v.summary,
+                            severity: v.database_specific.and_then(|d| {
+                                d.get("severity").and_then(|s| s.as_str()).map(String::from)
+                            }),
+                        })
+                        .collect()
+                });
+
+            all_vulns.extend(chunk_vulns);
        }

-        let result: OsvBatchResponse = resp
-            .json()
-            .await
-            .map_err(|e| CoreError::Http(format!("Failed to parse OSV.dev response: {e}")))?;
-
-        let vulns = result
-            .results
-            .into_iter()
-            .map(|r| {
-                r.vulns
-                    .unwrap_or_default()
-                    .into_iter()
-                    .map(|v| OsvVuln {
-                        id: v.id,
-                        summary: v.summary,
-                        severity: v.database_specific.and_then(|d| {
-                            d.get("severity").and_then(|s| s.as_str()).map(String::from)
-                        }),
-                    })
-                    .collect()
-            })
-            .collect();
-
-        Ok(vulns)
+        Ok(all_vulns)
    }

    async fn query_nvd(&self, cve_id: &str) -> Result<Option<f64>, CoreError> {
--- a/compliance-agent/src/pipeline/git.rs
+++ b/compliance-agent/src/pipeline/git.rs
@@ -63,6 +63,62 @@ impl GitOps {
        }
    }

+    /// Extract structured diff content between two commits.
+    ///
+    /// Diffs the tree of `old_sha` against the tree of `new_sha` and collects
+    /// the patch text per file, keyed by the path on the new side of each
+    /// delta. Added, removed and context lines keep their `+`, `-` or space
+    /// prefix; other patch lines are appended without a prefix. Line content
+    /// that is not valid UTF-8 is appended as an empty string.
+    ///
+    /// Files whose collected patch text is empty or 50,000 bytes or longer
+    /// are dropped from the result.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error when the repository cannot be opened, a SHA cannot be
+    /// parsed or resolved to a commit, or the diff cannot be computed or
+    /// printed.
+    pub fn get_diff_content(
+        repo_path: &Path,
+        old_sha: &str,
+        new_sha: &str,
+    ) -> Result<Vec<DiffFile>, AgentError> {
+        let repo = Repository::open(repo_path)?;
+        let old_commit = repo.find_commit(git2::Oid::from_str(old_sha)?)?;
+        let new_commit = repo.find_commit(git2::Oid::from_str(new_sha)?)?;
+
+        let old_tree = old_commit.tree()?;
+        let new_tree = new_commit.tree()?;
+
+        let diff = repo.diff_tree_to_tree(Some(&old_tree), Some(&new_tree), None)?;
+
+        let mut diff_files: Vec<DiffFile> = Vec::new();
+
+        diff.print(git2::DiffFormat::Patch, |delta, _hunk, line| {
+            let file_path = delta
+                .new_file()
+                .path()
+                .map(|p| p.to_string_lossy().to_string())
+                .unwrap_or_default();
+
+            // Patch lines arrive grouped by file, so the matching entry is
+            // almost always the most recently added one. Check it first and
+            // only fall back to a full scan (or a new entry) when it differs,
+            // instead of scanning every file for every line.
+            let last_matches = diff_files
+                .last()
+                .map_or(false, |f| f.path == file_path);
+            let idx = if last_matches {
+                diff_files.len() - 1
+            } else if let Some(pos) = diff_files.iter().position(|f| f.path == file_path) {
+                pos
+            } else {
+                diff_files.push(DiffFile {
+                    path: file_path,
+                    hunks: String::new(),
+                });
+                diff_files.len() - 1
+            };
+            let diff_file = &mut diff_files[idx];
+
+            let prefix = match line.origin() {
+                '+' => "+",
+                '-' => "-",
+                ' ' => " ",
+                _ => "",
+            };
+
+            let content = std::str::from_utf8(line.content()).unwrap_or("");
+            diff_file.hunks.push_str(prefix);
+            diff_file.hunks.push_str(content);
+
+            true
+        })?;
+
+        // Filter out binary files and very large diffs
+        diff_files.retain(|f| !f.hunks.is_empty() && f.hunks.len() < 50_000);
+
+        Ok(diff_files)
+    }
+
    #[allow(dead_code)]
    pub fn get_changed_files(
        repo_path: &Path,
@@ -94,3 +150,10 @@ impl GitOps {
        Ok(files)
    }
 }
+
+/// A file changed between two commits with its diff content
+#[derive(Debug, Clone)]
+pub struct DiffFile {
+    /// Path of the file on the new side of the diff.
+    pub path: String,
+    /// Patch text collected for this file, one diff line after another.
+    pub hunks: String,
+}
--- a/compliance-agent/src/pipeline/gitleaks.rs
+++ b/compliance-agent/src/pipeline/gitleaks.rs
@@ -0,0 +1,117 @@
+use std::path::Path;
+
+use compliance_core::models::{Finding, ScanType, Severity};
+use compliance_core::traits::{ScanOutput, Scanner};
+use compliance_core::CoreError;
+
+use crate::pipeline::dedup;
+
+/// Secret-detection scanner backed by the external `gitleaks` CLI.
+pub struct GitleaksScanner;
+
+impl Scanner for GitleaksScanner {
+    fn name(&self) -> &str {
+        "gitleaks"
+    }
+
+    fn scan_type(&self) -> ScanType {
+        ScanType::SecretDetection
+    }
+
+    /// Runs `gitleaks detect` inside `repo_path` and converts every reported
+    /// leak (except those in allowlisted example/test files) into a
+    /// [`Finding`] attributed to `repo_id`.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`CoreError::Scanner`] when the process cannot be started,
+    /// when gitleaks fails without producing a report, or when the report is
+    /// not valid JSON. Previously the last two cases were silently reported
+    /// as "no secrets found".
+    async fn scan(&self, repo_path: &Path, repo_id: &str) -> Result<ScanOutput, CoreError> {
+        let output = tokio::process::Command::new("gitleaks")
+            .args(["detect", "--source", ".", "--report-format", "json", "--report-path", "/dev/stdout", "--no-banner", "--exit-code", "0"])
+            .current_dir(repo_path)
+            .output()
+            .await
+            .map_err(|e| CoreError::Scanner {
+                scanner: "gitleaks".to_string(),
+                source: Box::new(e),
+            })?;
+
+        // `--exit-code 0` only covers the "leaks were found" case, so a
+        // non-zero status means gitleaks itself ran into trouble.
+        if !output.status.success() {
+            let stderr = String::from_utf8_lossy(&output.stderr);
+            if output.stdout.is_empty() {
+                return Err(CoreError::Scanner {
+                    scanner: "gitleaks".to_string(),
+                    source: Box::new(std::io::Error::new(
+                        std::io::ErrorKind::Other,
+                        format!("gitleaks exited with {}: {}", output.status, stderr.trim()),
+                    )),
+                });
+            }
+            // A report was still written (e.g. a partial scan): keep what was
+            // found, but make the failure visible.
+            tracing::warn!(
+                "gitleaks exited with {}; using the report it produced: {}",
+                output.status,
+                stderr.trim()
+            );
+        }
+
+        if output.stdout.is_empty() {
+            return Ok(ScanOutput::default());
+        }
+
+        // An unparsable report must not be mistaken for a clean scan.
+        let results: Vec<GitleaksResult> =
+            serde_json::from_slice(&output.stdout).map_err(|e| CoreError::Scanner {
+                scanner: "gitleaks".to_string(),
+                source: Box::new(e),
+            })?;
+
+        let findings = results
+            .into_iter()
+            .filter(|r| !is_allowlisted(&r.file))
+            .map(|r| {
+                // Severity is derived from keywords in the gitleaks rule id.
+                let severity = match r.rule_id.as_str() {
+                    s if s.contains("private-key") => Severity::Critical,
+                    s if s.contains("token") || s.contains("password") || s.contains("secret") => Severity::High,
+                    s if s.contains("api-key") => Severity::High,
+                    _ => Severity::Medium,
+                };
+
+                let fingerprint = dedup::compute_fingerprint(&[
+                    repo_id,
+                    &r.rule_id,
+                    &r.file,
+                    &r.start_line.to_string(),
+                ]);
+
+                let title = format!("Secret detected: {}", r.description);
+                // NOTE(review): the matched text (i.e. the secret itself) ends
+                // up in the description and code snippet; consider redacting
+                // it before it is persisted.
+                let description = format!(
+                    "Potential secret ({}) found in {}:{}. Match: {}",
+                    r.rule_id,
+                    r.file,
+                    r.start_line,
+                    r.r#match.chars().take(80).collect::<String>(),
+                );
+
+                let mut finding = Finding::new(
+                    repo_id.to_string(),
+                    fingerprint,
+                    "gitleaks".to_string(),
+                    ScanType::SecretDetection,
+                    title,
+                    description,
+                    severity,
+                );
+                finding.rule_id = Some(r.rule_id);
+                finding.file_path = Some(r.file);
+                finding.line_number = Some(r.start_line);
+                finding.code_snippet = Some(r.r#match);
+                finding
+            })
+            .collect();
+
+        Ok(ScanOutput {
+            findings,
+            sbom_entries: Vec::new(),
+        })
+    }
+}
+
+/// Skip files that commonly contain example/placeholder secrets
+///
+/// Matching is case-insensitive. Directory markers are checked against the
+/// path with a `/` put in front of it, so a repo-relative path such as
+/// `tests/key.pem` is recognised just like `src/tests/key.pem`; without the
+/// anchor, top-level test directories were never allowlisted.
+fn is_allowlisted(file_path: &str) -> bool {
+    const FILE_SUFFIXES: [&str; 8] = [
+        ".env.example",
+        ".env.sample",
+        ".env.template",
+        "_test.go",
+        ".test.ts",
+        ".test.js",
+        ".spec.ts",
+        ".spec.js",
+    ];
+    const DIR_MARKERS: [&str; 4] = ["/test/", "/tests/", "/fixtures/", "/testdata/"];
+
+    let lower = file_path.to_lowercase();
+    // Anchor the path so a leading directory also has a `/` in front of it.
+    let anchored = format!("/{lower}");
+
+    FILE_SUFFIXES.iter().any(|&suffix| lower.ends_with(suffix))
+        || DIR_MARKERS.iter().any(|&dir| anchored.contains(dir))
+        // Deliberately broad: any path mentioning "mock" is skipped.
+        || lower.contains("mock")
+}
+
+/// One entry of the JSON report array parsed from gitleaks' output.
+///
+/// Keys are read in PascalCase (`Description`, `File`, `StartLine`, ...);
+/// `RuleID` and `Match` are renamed explicitly. Only the fields used to
+/// build a finding are declared here.
+#[derive(serde::Deserialize)]
+#[serde(rename_all = "PascalCase")]
+struct GitleaksResult {
+    /// Rule description, used for the finding title.
+    description: String,
+    /// Identifier of the rule that matched; also drives severity.
+    #[serde(rename = "RuleID")]
+    rule_id: String,
+    /// Path of the file containing the match.
+    file: String,
+    /// Line on which the match starts.
+    start_line: u32,
+    /// The matched text itself.
+    #[serde(rename = "Match")]
+    r#match: String,
+}
--- a/compliance-agent/src/pipeline/lint.rs
+++ b/compliance-agent/src/pipeline/lint.rs
@@ -0,0 +1,361 @@
+use std::path::Path;
+use std::time::Duration;
+
+use compliance_core::models::{Finding, ScanType, Severity};
+use compliance_core::traits::{ScanOutput, Scanner};
+use compliance_core::CoreError;
+use tokio::process::Command;
+
+use crate::pipeline::dedup;
+
+/// Timeout for each individual lint command
+///
+/// The limit applies to one linter process at a time (it bounds the wait in
+/// `run_with_timeout`), not to the lint scan as a whole.
+const LINT_TIMEOUT: Duration = Duration::from_secs(120);
+
+/// Scanner that runs whichever linters match the project files found at the
+/// repository root (clippy, ESLint, ruff).
+pub struct LintScanner;
+
+impl Scanner for LintScanner {
+    fn name(&self) -> &str {
+        "lint"
+    }
+
+    fn scan_type(&self) -> ScanType {
+        ScanType::Lint
+    }
+
+    /// Runs every applicable linter and merges their findings.
+    ///
+    /// A linter that fails is logged with a warning and contributes no
+    /// findings; it never fails the scan as a whole.
+    async fn scan(&self, repo_path: &Path, repo_id: &str) -> Result<ScanOutput, CoreError> {
+        let mut collected = Vec::new();
+
+        // Rust
+        if has_rust_project(repo_path) {
+            let outcome = run_clippy(repo_path, repo_id).await;
+            if let Err(e) = &outcome {
+                tracing::warn!("Clippy failed: {e}");
+            }
+            collected.extend(outcome.unwrap_or_default());
+        }
+
+        // JavaScript / TypeScript
+        if has_js_project(repo_path) {
+            let outcome = run_eslint(repo_path, repo_id).await;
+            if let Err(e) = &outcome {
+                tracing::warn!("ESLint failed: {e}");
+            }
+            collected.extend(outcome.unwrap_or_default());
+        }
+
+        // Python
+        if has_python_project(repo_path) {
+            let outcome = run_ruff(repo_path, repo_id).await;
+            if let Err(e) = &outcome {
+                tracing::warn!("Ruff failed: {e}");
+            }
+            collected.extend(outcome.unwrap_or_default());
+        }
+
+        Ok(ScanOutput {
+            findings: collected,
+            sbom_entries: Vec::new(),
+        })
+    }
+}
+
+/// Returns `true` when a `Cargo.toml` exists directly under `repo_path`.
+fn has_rust_project(repo_path: &Path) -> bool {
+    let manifest = repo_path.join("Cargo.toml");
+    manifest.exists()
+}
+
+/// Returns `true` when `repo_path` holds a `package.json` and a locally
+/// installed ESLint binary.
+fn has_js_project(repo_path: &Path) -> bool {
+    // Only run if eslint is actually installed in the project
+    ["package.json", "node_modules/.bin/eslint"]
+        .iter()
+        .all(|required| repo_path.join(required).exists())
+}
+
+/// Returns `true` when any of the usual Python project files exists directly
+/// under `repo_path`.
+fn has_python_project(repo_path: &Path) -> bool {
+    ["pyproject.toml", "setup.py", "requirements.txt"]
+        .iter()
+        .any(|marker| repo_path.join(marker).exists())
+}
+
+/// Waits for `child` to finish and returns its collected output, giving up
+/// after `LINT_TIMEOUT`.
+///
+/// Both a failed wait and an expired timeout are reported as
+/// `CoreError::Scanner` tagged with `scanner_name`; the timeout case wraps an
+/// `io::Error` of kind `TimedOut`.
+async fn run_with_timeout(
+    child: tokio::process::Child,
+    scanner_name: &str,
+) -> Result<std::process::Output, CoreError> {
+    let scanner_error = |source: std::io::Error| CoreError::Scanner {
+        scanner: scanner_name.to_string(),
+        source: Box::new(source),
+    };
+
+    let waited = tokio::time::timeout(LINT_TIMEOUT, child.wait_with_output()).await;
+    match waited {
+        Ok(finished) => finished.map_err(scanner_error),
+        // The pending wait (which owns the child) has been dropped by now.
+        // NOTE(review): tokio only kills a dropped child when its `Command`
+        // was configured with `kill_on_drop(true)`; otherwise the process
+        // keeps running in the background.
+        Err(_elapsed) => Err(scanner_error(std::io::Error::new(
+            std::io::ErrorKind::TimedOut,
+            format!("{scanner_name} timed out after {}s", LINT_TIMEOUT.as_secs()),
+        ))),
+    }
+}
+
+// ── Clippy ──────────────────────────────────────────────
+
+/// Runs `cargo clippy` in `repo_path` and converts every coded warning or
+/// error from its JSON message stream into a [`Finding`] for `repo_id`.
+///
+/// Compiler errors map to `Severity::High`, warnings to `Severity::Low`.
+/// Messages without a lint/error code (summaries such as "aborting due
+/// to ...") are skipped.
+///
+/// # Errors
+///
+/// Returns [`CoreError::Scanner`] when cargo cannot be spawned, waiting for
+/// it fails, or it exceeds the lint timeout.
+async fn run_clippy(repo_path: &Path, repo_id: &str) -> Result<Vec<Finding>, CoreError> {
+    let child = Command::new("cargo")
+        .args(["clippy", "--message-format=json", "--quiet", "--", "-W", "clippy::all"])
+        .current_dir(repo_path)
+        .stdout(std::process::Stdio::piped())
+        .stderr(std::process::Stdio::piped())
+        // tokio does not kill a child on drop by default; without this a
+        // timed-out clippy run would keep compiling in the background.
+        .kill_on_drop(true)
+        .spawn()
+        .map_err(|e| CoreError::Scanner {
+            scanner: "clippy".to_string(),
+            source: Box::new(e),
+        })?;
+
+    let output = run_with_timeout(child, "clippy").await?;
+    let stdout = String::from_utf8_lossy(&output.stdout);
+    let mut findings = Vec::new();
+
+    // Cargo prints one JSON object per line; anything that is not JSON is
+    // ignored.
+    for line in stdout.lines() {
+        let msg: serde_json::Value = match serde_json::from_str(line) {
+            Ok(v) => v,
+            Err(_) => continue,
+        };
+
+        if msg.get("reason").and_then(|v| v.as_str()) != Some("compiler-message") {
+            continue;
+        }
+
+        let message = match msg.get("message") {
+            Some(m) => m,
+            None => continue,
+        };
+
+        let level = message
+            .get("level")
+            .and_then(|v| v.as_str())
+            .unwrap_or("");
+
+        if level != "warning" && level != "error" {
+            continue;
+        }
+
+        let text = message
+            .get("message")
+            .and_then(|v| v.as_str())
+            .unwrap_or("")
+            .to_string();
+
+        let code = message
+            .get("code")
+            .and_then(|v| v.get("code"))
+            .and_then(|v| v.as_str())
+            .unwrap_or("")
+            .to_string();
+
+        // Summary lines carry no code and are not findings of their own.
+        if text.starts_with("aborting due to") || code.is_empty() {
+            continue;
+        }
+
+        let (file_path, line_number) = extract_primary_span(message);
+
+        let severity = if level == "error" {
+            Severity::High
+        } else {
+            Severity::Low
+        };
+
+        let fingerprint =
+            dedup::compute_fingerprint(&[repo_id, "clippy", &code, &file_path, &line_number.to_string()]);
+
+        let mut finding = Finding::new(
+            repo_id.to_string(),
+            fingerprint,
+            "clippy".to_string(),
+            ScanType::Lint,
+            format!("[clippy] {text}"),
+            text,
+            severity,
+        );
+        finding.rule_id = Some(code);
+        // An empty path / line 0 means "no primary span"; leave those unset.
+        if !file_path.is_empty() {
+            finding.file_path = Some(file_path);
+        }
+        if line_number > 0 {
+            finding.line_number = Some(line_number);
+        }
+        findings.push(finding);
+    }
+
+    Ok(findings)
+}
+
+/// Returns the file name and starting line of the first span flagged
+/// `is_primary` in a compiler message.
+///
+/// Yields `("", 0)` when the message has no `spans` array or none of the
+/// spans is primary; a missing `file_name` or `line_start` on the primary
+/// span falls back to `""` and `0` respectively.
+fn extract_primary_span(message: &serde_json::Value) -> (String, u32) {
+    let primary = message
+        .get("spans")
+        .and_then(|v| v.as_array())
+        .and_then(|spans| {
+            spans
+                .iter()
+                .find(|span| span.get("is_primary").and_then(|v| v.as_bool()) == Some(true))
+        });
+
+    match primary {
+        Some(span) => {
+            let file_name = span.get("file_name").and_then(|v| v.as_str()).unwrap_or("");
+            let line_start = span.get("line_start").and_then(|v| v.as_u64()).unwrap_or(0);
+            (file_name.to_string(), line_start as u32)
+        }
+        None => (String::new(), 0),
+    }
+}
+
+// ── ESLint ──────────────────────────────────────────────
+
+/// Runs the project-local ESLint binary over `repo_path` and converts every
+/// reported message into a [`Finding`] for `repo_id`.
+///
+/// ESLint severity `2` (error) maps to `Severity::Medium`, everything else
+/// to `Severity::Low`.
+///
+/// # Errors
+///
+/// Returns [`CoreError::Scanner`] when ESLint cannot be spawned, waiting for
+/// it fails, it exceeds the lint timeout, or its output is not the expected
+/// JSON (previously an unparsable report was silently treated as "no
+/// findings").
+async fn run_eslint(repo_path: &Path, repo_id: &str) -> Result<Vec<Finding>, CoreError> {
+    // Use the project-local eslint binary directly, not npx (which can hang downloading)
+    let eslint_bin = repo_path.join("node_modules/.bin/eslint");
+    let child = Command::new(eslint_bin)
+        .args([".", "--format", "json", "--no-error-on-unmatched-pattern"])
+        .current_dir(repo_path)
+        .stdout(std::process::Stdio::piped())
+        .stderr(std::process::Stdio::piped())
+        // tokio does not kill a child on drop by default; without this a
+        // timed-out ESLint run would keep running in the background.
+        .kill_on_drop(true)
+        .spawn()
+        .map_err(|e| CoreError::Scanner {
+            scanner: "eslint".to_string(),
+            source: Box::new(e),
+        })?;
+
+    let output = run_with_timeout(child, "eslint").await?;
+
+    if output.stdout.is_empty() {
+        return Ok(Vec::new());
+    }
+
+    let results: Vec<EslintFileResult> =
+        serde_json::from_slice(&output.stdout).map_err(|e| CoreError::Scanner {
+            scanner: "eslint".to_string(),
+            source: Box::new(e),
+        })?;
+
+    let mut findings = Vec::new();
+    for file_result in results {
+        for msg in file_result.messages {
+            let severity = match msg.severity {
+                2 => Severity::Medium,
+                _ => Severity::Low,
+            };
+
+            let rule_id = msg.rule_id.unwrap_or_default();
+            // NOTE(review): `filePath` is used verbatim here; if ESLint
+            // reports absolute paths the fingerprint depends on where the
+            // repository was checked out — confirm that location is stable.
+            let fingerprint = dedup::compute_fingerprint(&[
+                repo_id,
+                "eslint",
+                &rule_id,
+                &file_result.file_path,
+                &msg.line.to_string(),
+            ]);
+
+            let mut finding = Finding::new(
+                repo_id.to_string(),
+                fingerprint,
+                "eslint".to_string(),
+                ScanType::Lint,
+                format!("[eslint] {}", msg.message),
+                msg.message,
+                severity,
+            );
+            finding.rule_id = Some(rule_id);
+            finding.file_path = Some(file_result.file_path.clone());
+            // Same convention as the clippy findings: line 0 means the
+            // message has no position, so leave the line number unset.
+            if msg.line > 0 {
+                finding.line_number = Some(msg.line);
+            }
+            findings.push(finding);
+        }
+    }
+
+    Ok(findings)
+}
+
+/// Per-file entry of the array parsed from ESLint's `--format json` output.
+#[derive(serde::Deserialize)]
+struct EslintFileResult {
+    /// Path of the linted file, read from the `filePath` key.
+    #[serde(rename = "filePath")]
+    file_path: String,
+    /// Messages reported for this file.
+    messages: Vec<EslintMessage>,
+}
+
+/// A single message reported by ESLint for one file.
+#[derive(serde::Deserialize)]
+struct EslintMessage {
+    /// Rule that produced the message; `None` when `ruleId` is null.
+    #[serde(rename = "ruleId")]
+    rule_id: Option<String>,
+    /// ESLint severity level; callers treat `2` as an error.
+    severity: u8,
+    /// Human-readable description of the problem.
+    message: String,
+    /// Line of the problem. Defaults to 0 when the key is absent, so a
+    /// message without a position (e.g. a file-level warning) no longer makes
+    /// deserialization of the entire report fail.
+    #[serde(default)]
+    line: u32,
+}
+
+// ── Ruff ────────────────────────────────────────────────
+
+/// Runs `ruff check` over `repo_path` and converts every reported diagnostic
+/// into a [`Finding`] for `repo_id`.
+///
+/// Diagnostics whose code starts with `E` or `F`, and diagnostics without a
+/// code (syntax errors), map to `Severity::Medium`; everything else maps to
+/// `Severity::Low`.
+///
+/// # Errors
+///
+/// Returns [`CoreError::Scanner`] when ruff cannot be spawned, waiting for it
+/// fails, it exceeds the lint timeout, or its output is not the expected
+/// JSON (previously an unparsable report was silently treated as "no
+/// findings").
+async fn run_ruff(repo_path: &Path, repo_id: &str) -> Result<Vec<Finding>, CoreError> {
+    let child = Command::new("ruff")
+        .args(["check", ".", "--output-format", "json", "--exit-zero"])
+        .current_dir(repo_path)
+        .stdout(std::process::Stdio::piped())
+        .stderr(std::process::Stdio::piped())
+        // tokio does not kill a child on drop by default; without this a
+        // timed-out ruff run would keep running in the background.
+        .kill_on_drop(true)
+        .spawn()
+        .map_err(|e| CoreError::Scanner {
+            scanner: "ruff".to_string(),
+            source: Box::new(e),
+        })?;
+
+    let output = run_with_timeout(child, "ruff").await?;
+
+    if output.stdout.is_empty() {
+        return Ok(Vec::new());
+    }
+
+    let results: Vec<RuffResult> =
+        serde_json::from_slice(&output.stdout).map_err(|e| CoreError::Scanner {
+            scanner: "ruff".to_string(),
+            source: Box::new(e),
+        })?;
+
+    let findings = results
+        .into_iter()
+        .map(|r| {
+            // A diagnostic without a rule code is a syntax error.
+            let label = r.code.as_deref().unwrap_or("syntax-error");
+
+            let severity = match r.code.as_deref() {
+                Some(code) if !(code.starts_with('E') || code.starts_with('F')) => Severity::Low,
+                _ => Severity::Medium,
+            };
+
+            let fingerprint = dedup::compute_fingerprint(&[
+                repo_id,
+                "ruff",
+                label,
+                &r.filename,
+                &r.location.row.to_string(),
+            ]);
+
+            let title = format!("[ruff] {}: {}", label, r.message);
+
+            let mut finding = Finding::new(
+                repo_id.to_string(),
+                fingerprint,
+                "ruff".to_string(),
+                ScanType::Lint,
+                title,
+                r.message,
+                severity,
+            );
+            finding.rule_id = r.code;
+            finding.file_path = Some(r.filename);
+            finding.line_number = Some(r.location.row);
+            finding
+        })
+        .collect();
+
+    Ok(findings)
+}
+
+/// One diagnostic from the array parsed from ruff's JSON output.
+#[derive(serde::Deserialize)]
+struct RuffResult {
+    /// Rule code such as `E501`. Optional because a `null` code (emitted for
+    /// syntax errors) would otherwise make the whole report fail to
+    /// deserialize.
+    code: Option<String>,
+    /// Human-readable description of the problem.
+    message: String,
+    /// Path of the file the diagnostic refers to.
+    filename: String,
+    /// Position of the diagnostic within the file.
+    location: RuffLocation,
+}
+
+/// Position of a ruff diagnostic; only the row is used.
+#[derive(serde::Deserialize)]
+struct RuffLocation {
+    row: u32,
+}
--- a/compliance-agent/src/pipeline/mod.rs
+++ b/compliance-agent/src/pipeline/mod.rs
@@ -1,6 +1,9 @@
+pub mod code_review;
 pub mod cve;
 pub mod dedup;
 pub mod git;
+pub mod gitleaks;
+pub mod lint;
 pub mod orchestrator;
 pub mod patterns;
 pub mod sbom;
--- a/compliance-agent/src/pipeline/orchestrator.rs
+++ b/compliance-agent/src/pipeline/orchestrator.rs
@@ -9,8 +9,11 @@ use compliance_core::AgentConfig;
 use crate::database::Database;
 use crate::error::AgentError;
 use crate::llm::LlmClient;
+use crate::pipeline::code_review::CodeReviewScanner;
 use crate::pipeline::cve::CveScanner;
 use crate::pipeline::git::GitOps;
+use crate::pipeline::gitleaks::GitleaksScanner;
+use crate::pipeline::lint::LintScanner;
 use crate::pipeline::patterns::{GdprPatternScanner, OAuthPatternScanner};
 use crate::pipeline::sbom::SbomScanner;
 use crate::pipeline::semgrep::SemgrepScanner;
@@ -182,6 +185,35 @@ impl PipelineOrchestrator {
            Err(e) => tracing::warn!("[{repo_id}] OAuth pattern scan failed: {e}"),
        }

+        // Stage 4a: Secret Detection (Gitleaks)
+        tracing::info!("[{repo_id}] Stage 4a: Secret Detection");
+        self.update_phase(scan_run_id, "secret_detection").await;
+        let gitleaks = GitleaksScanner;
+        match gitleaks.scan(&repo_path, &repo_id).await {
+            Ok(output) => all_findings.extend(output.findings),
+            Err(e) => tracing::warn!("[{repo_id}] Gitleaks failed: {e}"),
+        }
+
+        // Stage 4b: Lint Scanning
+        tracing::info!("[{repo_id}] Stage 4b: Lint Scanning");
+        self.update_phase(scan_run_id, "lint_scanning").await;
+        let lint = LintScanner;
+        match lint.scan(&repo_path, &repo_id).await {
+            Ok(output) => all_findings.extend(output.findings),
+            Err(e) => tracing::warn!("[{repo_id}] Lint scanning failed: {e}"),
+        }
+
+        // Stage 4c: LLM Code Review (only on incremental scans)
+        if let Some(old_sha) = &repo.last_scanned_commit {
+            tracing::info!("[{repo_id}] Stage 4c: LLM Code Review");
+            self.update_phase(scan_run_id, "code_review").await;
+            let reviewer = CodeReviewScanner::new(self.llm.clone());
+            let review_output = reviewer
+                .review_diff(&repo_path, &repo_id, old_sha, &current_sha)
+                .await;
+            all_findings.extend(review_output.findings);
+        }
+
        // Stage 4.5: Graph Building
        tracing::info!("[{repo_id}] Stage 4.5: Graph Building");
        self.update_phase(scan_run_id, "graph_building").await;