use std::path::Path; use compliance_core::models::{Finding, ScanType, Severity}; use compliance_core::traits::{ScanOutput, Scanner}; use compliance_core::CoreError; use regex::Regex; use crate::pipeline::dedup; fn compile_regex(pattern: &str) -> Regex { Regex::new(pattern).unwrap_or_else(|e| { tracing::warn!("Invalid regex pattern '{pattern}': {e}, using empty fallback"); // SAFETY: "^$" is a known-valid regex that matches only empty strings #[allow(clippy::unwrap_used)] Regex::new("^$").unwrap() }) } pub struct GdprPatternScanner { patterns: Vec, } pub struct OAuthPatternScanner { patterns: Vec, } struct PatternRule { id: String, title: String, description: String, pattern: Regex, severity: Severity, file_extensions: Vec, } impl GdprPatternScanner { pub fn new() -> Self { let patterns = vec![ PatternRule { id: "gdpr-pii-logging".to_string(), title: "PII data potentially logged".to_string(), description: "Logging statements that may contain personally identifiable information (email, SSN, phone, IP address).".to_string(), pattern: compile_regex(r#"(?i)(log|print|console\.|logger\.|tracing::)\s*[\.(].*\b(email|ssn|social.?security|phone.?number|ip.?addr|passport|date.?of.?birth|credit.?card)\b"#), severity: Severity::High, file_extensions: vec!["rs", "py", "js", "ts", "java", "go", "rb"].into_iter().map(String::from).collect(), }, PatternRule { id: "gdpr-no-consent".to_string(), title: "Data collection without apparent consent mechanism".to_string(), description: "Data collection endpoint that doesn't reference consent or opt-in mechanisms.".to_string(), pattern: compile_regex(r#"(?i)(collect|store|save|persist|record).*\b(personal|user.?data|pii|biometric)\b"#), severity: Severity::Medium, file_extensions: vec!["rs", "py", "js", "ts", "java", "go"].into_iter().map(String::from).collect(), }, PatternRule { id: "gdpr-no-delete-endpoint".to_string(), title: "Missing data deletion capability".to_string(), description: "User data models or controllers without corresponding deletion endpoints (right to erasure).".to_string(), pattern: compile_regex(r#"(?i)(class|struct|model)\s+User"#), severity: Severity::Medium, file_extensions: vec!["rs", "py", "js", "ts", "java", "go", "rb"].into_iter().map(String::from).collect(), }, PatternRule { id: "gdpr-hardcoded-retention".to_string(), title: "Hardcoded data retention period".to_string(), description: "Data retention periods should be configurable for GDPR compliance.".to_string(), pattern: compile_regex(r#"(?i)(retention|ttl|expire|keep.?for)\s*[=:]\s*\d+"#), severity: Severity::Low, file_extensions: vec!["rs", "py", "js", "ts", "java", "go", "yaml", "yml", "toml", "json"].into_iter().map(String::from).collect(), }, ]; Self { patterns } } } impl Scanner for GdprPatternScanner { fn name(&self) -> &str { "gdpr-patterns" } fn scan_type(&self) -> ScanType { ScanType::Gdpr } #[tracing::instrument(skip_all)] async fn scan(&self, repo_path: &Path, repo_id: &str) -> Result { let findings = scan_with_patterns( repo_path, repo_id, &self.patterns, ScanType::Gdpr, "gdpr-patterns", )?; Ok(ScanOutput { findings, sbom_entries: Vec::new(), }) } } impl OAuthPatternScanner { pub fn new() -> Self { let patterns = vec![ PatternRule { id: "oauth-implicit-grant".to_string(), title: "OAuth implicit grant flow detected".to_string(), description: "Implicit grant flow is deprecated and insecure. Use authorization code flow with PKCE instead.".to_string(), pattern: compile_regex(r#"(?i)(response_type\s*[=:]\s*["']?token|grant_type\s*[=:]\s*["']?implicit)"#), severity: Severity::High, file_extensions: vec!["rs", "py", "js", "ts", "java", "go", "yaml", "yml", "json"].into_iter().map(String::from).collect(), }, PatternRule { id: "oauth-missing-pkce".to_string(), title: "OAuth flow without PKCE".to_string(), description: "Authorization code flow should use PKCE (code_challenge/code_verifier) for public clients.".to_string(), pattern: compile_regex(r#"(?i)authorization.?code"#), severity: Severity::Medium, file_extensions: vec!["rs", "py", "js", "ts", "java", "go"].into_iter().map(String::from).collect(), }, PatternRule { id: "oauth-token-localstorage".to_string(), title: "Token stored in localStorage".to_string(), description: "Storing tokens in localStorage is vulnerable to XSS. Use httpOnly cookies or secure session storage.".to_string(), pattern: compile_regex(r#"(?i)localStorage\.(set|get)Item\s*\(\s*["'].*token"#), severity: Severity::High, file_extensions: vec!["js", "ts", "jsx", "tsx"].into_iter().map(String::from).collect(), }, PatternRule { id: "oauth-token-url".to_string(), title: "Token passed in URL parameters".to_string(), description: "Tokens in URLs can leak via referrer headers, server logs, and browser history.".to_string(), pattern: compile_regex(r#"(?i)(access_token|bearer)\s*[=]\s*.*\b(url|query|param|href)\b"#), severity: Severity::High, file_extensions: vec!["rs", "py", "js", "ts", "java", "go"].into_iter().map(String::from).collect(), }, ]; Self { patterns } } } impl Scanner for OAuthPatternScanner { fn name(&self) -> &str { "oauth-patterns" } fn scan_type(&self) -> ScanType { ScanType::OAuth } #[tracing::instrument(skip_all)] async fn scan(&self, repo_path: &Path, repo_id: &str) -> Result { let findings = scan_with_patterns( repo_path, repo_id, &self.patterns, ScanType::OAuth, "oauth-patterns", )?; Ok(ScanOutput { findings, sbom_entries: Vec::new(), }) } } fn scan_with_patterns( repo_path: &Path, repo_id: &str, patterns: &[PatternRule], scan_type: ScanType, scanner_name: &str, ) -> Result, CoreError> { let mut findings = Vec::new(); for entry in walkdir(repo_path)? { let path = entry.path(); if !path.is_file() { continue; } let ext = path .extension() .and_then(|e| e.to_str()) .unwrap_or("") .to_string(); let content = match std::fs::read_to_string(path) { Ok(c) => c, Err(_) => continue, // skip binary files }; let relative_path = path .strip_prefix(repo_path) .unwrap_or(path) .to_string_lossy() .to_string(); for pattern in patterns { if !pattern.file_extensions.contains(&ext) { continue; } for (line_num, line) in content.lines().enumerate() { if pattern.pattern.is_match(line) { let fingerprint = dedup::compute_fingerprint(&[ repo_id, &pattern.id, &relative_path, &(line_num + 1).to_string(), ]); let mut finding = Finding::new( repo_id.to_string(), fingerprint, scanner_name.to_string(), scan_type.clone(), pattern.title.clone(), pattern.description.clone(), pattern.severity.clone(), ); finding.rule_id = Some(pattern.id.clone()); finding.file_path = Some(relative_path.clone()); finding.line_number = Some((line_num + 1) as u32); finding.code_snippet = Some(line.to_string()); findings.push(finding); } } } } Ok(findings) } fn walkdir(path: &Path) -> Result, CoreError> { // Simple recursive file walk, skipping hidden dirs and common non-source dirs let skip_dirs = [ ".git", "node_modules", "target", "vendor", ".venv", "__pycache__", "dist", "build", ]; let entries: Vec<_> = walkdir::WalkDir::new(path) .into_iter() .filter_entry(|e| { let name = e.file_name().to_string_lossy(); !skip_dirs.contains(&name.as_ref()) }) .filter_map(|e| e.ok()) .collect(); Ok(entries) } #[cfg(test)] mod tests { use super::*; // --- compile_regex tests --- #[test] fn compile_regex_valid_pattern() { let re = compile_regex(r"\bfoo\b"); assert!(re.is_match("hello foo bar")); assert!(!re.is_match("foobar")); } #[test] fn compile_regex_invalid_pattern_returns_fallback() { // An invalid regex should return the fallback "^$" that only matches empty strings let re = compile_regex(r"[invalid"); assert!(re.is_match("")); assert!(!re.is_match("anything")); } // --- GDPR pattern tests --- #[test] fn gdpr_pii_logging_matches() { let scanner = GdprPatternScanner::new(); let pattern = &scanner.patterns[0]; // gdpr-pii-logging // Regex: (log|print|console\.|logger\.|tracing::)\s*[\.(].*\b(pii_keyword)\b assert!(pattern.pattern.is_match("console.log(email)")); assert!(pattern.pattern.is_match("console.log(user.ssn)")); assert!(pattern.pattern.is_match("print(phone_number)")); assert!(pattern.pattern.is_match("tracing::(ip_addr)")); assert!(pattern.pattern.is_match("log.debug(credit_card)")); } #[test] fn gdpr_pii_logging_no_false_positive() { let scanner = GdprPatternScanner::new(); let pattern = &scanner.patterns[0]; // Regular logging without PII fields should not match assert!(!pattern .pattern .is_match("logger.info(\"request completed\")")); assert!(!pattern.pattern.is_match("let email = user.email;")); } #[test] fn gdpr_no_consent_matches() { let scanner = GdprPatternScanner::new(); let pattern = &scanner.patterns[1]; // gdpr-no-consent assert!(pattern.pattern.is_match("collect personal data")); assert!(pattern.pattern.is_match("store user_data in db")); assert!(pattern.pattern.is_match("save pii to disk")); } #[test] fn gdpr_user_model_matches() { let scanner = GdprPatternScanner::new(); let pattern = &scanner.patterns[2]; // gdpr-no-delete-endpoint assert!(pattern.pattern.is_match("struct User {")); assert!(pattern.pattern.is_match("class User(Model):")); } #[test] fn gdpr_hardcoded_retention_matches() { let scanner = GdprPatternScanner::new(); let pattern = &scanner.patterns[3]; // gdpr-hardcoded-retention assert!(pattern.pattern.is_match("retention = 30")); assert!(pattern.pattern.is_match("ttl: 3600")); assert!(pattern.pattern.is_match("expire = 86400")); } // --- OAuth pattern tests --- #[test] fn oauth_implicit_grant_matches() { let scanner = OAuthPatternScanner::new(); let pattern = &scanner.patterns[0]; // oauth-implicit-grant assert!(pattern.pattern.is_match("response_type = \"token\"")); assert!(pattern.pattern.is_match("grant_type: implicit")); assert!(pattern.pattern.is_match("response_type='token'")); } #[test] fn oauth_implicit_grant_no_false_positive() { let scanner = OAuthPatternScanner::new(); let pattern = &scanner.patterns[0]; assert!(!pattern.pattern.is_match("response_type = \"code\"")); assert!(!pattern.pattern.is_match("grant_type: authorization_code")); } #[test] fn oauth_authorization_code_matches() { let scanner = OAuthPatternScanner::new(); let pattern = &scanner.patterns[1]; // oauth-missing-pkce assert!(pattern.pattern.is_match("uses authorization_code flow")); assert!(pattern.pattern.is_match("authorization code grant")); } #[test] fn oauth_token_localstorage_matches() { let scanner = OAuthPatternScanner::new(); let pattern = &scanner.patterns[2]; // oauth-token-localstorage assert!(pattern .pattern .is_match("localStorage.setItem('access_token', tok)")); assert!(pattern .pattern .is_match("localStorage.getItem(\"refresh_token\")")); } #[test] fn oauth_token_localstorage_no_false_positive() { let scanner = OAuthPatternScanner::new(); let pattern = &scanner.patterns[2]; assert!(!pattern .pattern .is_match("localStorage.setItem('theme', 'dark')")); assert!(!pattern .pattern .is_match("sessionStorage.setItem('token', t)")); } #[test] fn oauth_token_url_matches() { let scanner = OAuthPatternScanner::new(); let pattern = &scanner.patterns[3]; // oauth-token-url assert!(pattern.pattern.is_match("access_token = build_url(query)")); assert!(pattern.pattern.is_match("bearer = url.param")); } // --- Pattern rule file extension filtering --- #[test] fn gdpr_patterns_cover_common_languages() { let scanner = GdprPatternScanner::new(); for pattern in &scanner.patterns { assert!( pattern.file_extensions.contains(&"rs".to_string()), "Pattern {} should cover .rs files", pattern.id ); } } #[test] fn oauth_localstorage_only_js_ts() { let scanner = OAuthPatternScanner::new(); let pattern = &scanner.patterns[2]; // oauth-token-localstorage assert!(pattern.file_extensions.contains(&"js".to_string())); assert!(pattern.file_extensions.contains(&"ts".to_string())); assert!(!pattern.file_extensions.contains(&"rs".to_string())); assert!(!pattern.file_extensions.contains(&"py".to_string())); } }