compliance-scanner-agent/compliance-agent/src/pipeline/gitleaks.rs

use std::path::Path;

use compliance_core::models::{Finding, ScanType, Severity};
use compliance_core::traits::{ScanOutput, Scanner};
use compliance_core::CoreError;

use crate::pipeline::dedup;

/// Secret-detection scanner that runs the external `gitleaks` executable and
/// converts its JSON report into findings.
///
/// The type carries no state; all behaviour lives in its `Scanner` implementation.
pub struct GitleaksScanner;

impl Scanner for GitleaksScanner {
    fn name(&self) -> &str {
        "gitleaks"
    }

    fn scan_type(&self) -> ScanType {
        ScanType::SecretDetection
    }

    #[tracing::instrument(skip_all)]
    async fn scan(&self, repo_path: &Path, repo_id: &str) -> Result<ScanOutput, CoreError> {
        let output = tokio::time::timeout(
            std::time::Duration::from_secs(300),
            tokio::process::Command::new("gitleaks")
                .args([
                    "detect",
                    "--source",
                    ".",
                    "--report-format",
                    "json",
                    "--report-path",
                    "/dev/stdout",
                    "--no-banner",
                    "--exit-code",
                    "0",
                ])
                .current_dir(repo_path)
                .output(),
        )
        .await
        .map_err(|_| CoreError::Scanner {
            scanner: "gitleaks".to_string(),
            source: "timed out after 5 minutes".into(),
        })?
        .map_err(|e| CoreError::Scanner {
            scanner: "gitleaks".to_string(),
            source: Box::new(e),
        })?;

        if output.stdout.is_empty() {
            return Ok(ScanOutput::default());
        }

        let results: Vec<GitleaksResult> =
            serde_json::from_slice(&output.stdout).unwrap_or_default();

        let findings = results
            .into_iter()
            .filter(|r| !is_allowlisted(&r.file))
            .map(|r| {
                let severity = match r.rule_id.as_str() {
                    s if s.contains("private-key") => Severity::Critical,
                    s if s.contains("token") || s.contains("password") || s.contains("secret") => {
                        Severity::High
                    }
                    s if s.contains("api-key") => Severity::High,
                    _ => Severity::Medium,
                };

                let fingerprint = dedup::compute_fingerprint(&[
                    repo_id,
                    &r.rule_id,
                    &r.file,
                    &r.start_line.to_string(),
                ]);

                let title = format!("Secret detected: {}", r.description);
                let description = format!(
                    "Potential secret ({}) found in {}:{}. Match: {}",
                    r.rule_id,
                    r.file,
                    r.start_line,
                    r.r#match.chars().take(80).collect::<String>(),
                );

                let mut finding = Finding::new(
                    repo_id.to_string(),
                    fingerprint,
                    "gitleaks".to_string(),
                    ScanType::SecretDetection,
                    title,
                    description,
                    severity,
                );
                finding.rule_id = Some(r.rule_id);
                finding.file_path = Some(r.file);
                finding.line_number = Some(r.start_line);
                finding.code_snippet = Some(r.r#match);
                finding
            })
            .collect();

        Ok(ScanOutput {
            findings,
            sbom_entries: Vec::new(),
        })
    }
}

/// Returns `true` for files that commonly contain example/placeholder secrets
/// and should therefore be skipped.
///
/// Matching is case-insensitive. A path is allowlisted when any of these hold:
/// - one of its directory components (everything before the final `/`-separated
///   segment) is `test`, `tests`, `fixtures` or `testdata`, including a
///   directory at the very start of a relative path such as `tests/key.pem`;
/// - the path contains `mock` anywhere;
/// - the path ends with a known example-env or test-file suffix.
fn is_allowlisted(file_path: &str) -> bool {
    const TEST_DIRS: [&str; 4] = ["test", "tests", "fixtures", "testdata"];
    const ALLOWED_SUFFIXES: [&str; 8] = [
        ".env.example",
        ".env.sample",
        ".env.template",
        "_test.go",
        ".test.ts",
        ".test.js",
        ".spec.ts",
        ".spec.js",
    ];

    let lower = file_path.to_lowercase();

    // Compare whole directory components rather than searching for "/tests/",
    // so a leading directory without a preceding slash also matches while
    // names like "contest" still do not. The final segment is the file name
    // and is excluded from the directory check.
    let mut components = lower.split('/');
    let _file_name = components.next_back();
    let in_test_dir = components.any(|dir| TEST_DIRS.contains(&dir));

    in_test_dir
        || lower.contains("mock")
        || ALLOWED_SUFFIXES
            .iter()
            .any(|suffix| lower.ends_with(*suffix))
}

/// One entry of the JSON report that `scan` deserializes from gitleaks' output.
///
/// Only the fields used when building a finding are declared. JSON keys are
/// PascalCase (`Description`, `File`, `StartLine`, ...) unless renamed below.
#[derive(serde::Deserialize)]
#[serde(rename_all = "PascalCase")]
struct GitleaksResult {
    /// Description text of the result; used in the finding title.
    description: String,
    /// Identifier of the rule that matched. Renamed explicitly because the JSON
    /// key is `RuleID`, not the `RuleId` that PascalCase renaming would produce.
    #[serde(rename = "RuleID")]
    rule_id: String,
    /// Path of the file in which the match was reported.
    file: String,
    /// Line number at which the match starts.
    start_line: u32,
    /// The matched text. `match` is a Rust keyword, hence the raw identifier;
    /// the JSON key is `Match`.
    #[serde(rename = "Match")]
    r#match: String,
}

/// Unit tests for the path allowlist and for deserializing gitleaks results.
/// None of these tests spawn the `gitleaks` process.
#[cfg(test)]
mod tests {
    use super::*;

    // --- is_allowlisted tests ---

    /// Example env files are allowlisted regardless of directory or letter case.
    #[test]
    fn allowlisted_env_example_files() {
        assert!(is_allowlisted(".env.example"));
        assert!(is_allowlisted("config/.env.sample"));
        assert!(is_allowlisted("deploy/.ENV.TEMPLATE"));
    }

    /// Files below test/fixture directories are allowlisted.
    #[test]
    fn allowlisted_test_directories() {
        assert!(is_allowlisted("src/test/config.json"));
        assert!(is_allowlisted("src/tests/fixtures.rs"));
        assert!(is_allowlisted("data/fixtures/secret.txt"));
        assert!(is_allowlisted("pkg/testdata/key.pem"));
    }

    /// Any path containing "mock" (case-insensitively) is allowlisted.
    #[test]
    fn allowlisted_mock_files() {
        assert!(is_allowlisted("src/mock_service.py"));
        assert!(is_allowlisted("lib/MockAuth.java"));
    }

    /// Common Go/TypeScript/JavaScript test-file suffixes are allowlisted.
    #[test]
    fn allowlisted_test_suffixes() {
        assert!(is_allowlisted("auth_test.go"));
        assert!(is_allowlisted("auth.test.ts"));
        assert!(is_allowlisted("auth.test.js"));
        assert!(is_allowlisted("auth.spec.ts"));
        assert!(is_allowlisted("auth.spec.js"));
    }

    /// Ordinary source and configuration files must still be scanned.
    #[test]
    fn not_allowlisted_regular_files() {
        assert!(!is_allowlisted("src/main.rs"));
        assert!(!is_allowlisted("config/.env"));
        assert!(!is_allowlisted("lib/auth.ts"));
        assert!(!is_allowlisted("deploy/secrets.yaml"));
    }

    /// Names that merely contain "test" are not treated as test directories.
    #[test]
    fn not_allowlisted_partial_matches() {
        // "test" as substring in a non-directory context should not match
        assert!(!is_allowlisted("src/attestation.rs"));
        assert!(!is_allowlisted("src/contest/data.json"));
    }

    // --- GitleaksResult deserialization tests ---

    /// A single result object maps its PascalCase keys onto the struct fields.
    #[test]
    fn deserialize_gitleaks_result() {
        let json = r#"{
            "Description": "AWS Access Key",
            "RuleID": "aws-access-key",
            "File": "src/config.rs",
            "StartLine": 10,
            "Match": "AKIAIOSFODNN7EXAMPLE"
        }"#;
        let result: GitleaksResult = serde_json::from_str(json).unwrap();
        assert_eq!(result.description, "AWS Access Key");
        assert_eq!(result.rule_id, "aws-access-key");
        assert_eq!(result.file, "src/config.rs");
        assert_eq!(result.start_line, 10);
        assert_eq!(result.r#match, "AKIAIOSFODNN7EXAMPLE");
    }

    /// The report is a JSON array of result objects; this is the shape `scan` parses.
    #[test]
    fn deserialize_gitleaks_result_array() {
        let json = r#"[
            {
                "Description": "Generic Secret",
                "RuleID": "generic-secret",
                "File": "app.py",
                "StartLine": 5,
                "Match": "password=hunter2"
            }
        ]"#;
        let results: Vec<GitleaksResult> = serde_json::from_str(json).unwrap();
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].rule_id, "generic-secret");
    }

    // --- severity keyword tests ---
    //
    // NOTE(review): the two tests below only call `str::contains` on locally
    // built strings. They never run the severity `match` inside `scan`, so they
    // would keep passing even if that mapping changed.

    #[test]
    fn severity_mapping_private_key() {
        // Verify the severity logic from the scan method
        let rule_id = "some-private-key-rule";
        assert!(rule_id.contains("private-key"));
    }

    #[test]
    fn severity_mapping_token_password_secret() {
        for keyword in &["token", "password", "secret"] {
            let rule_id = format!("some-{}-rule", keyword);
            assert!(
                rule_id.contains("token")
                    || rule_id.contains("password")
                    || rule_id.contains("secret"),
                "Expected '{rule_id}' to match token/password/secret"
            );
        }
    }
}