use sha2::{Digest, Sha256}; use compliance_core::models::dast::DastFinding; pub fn compute_fingerprint(parts: &[&str]) -> String { let mut hasher = Sha256::new(); for part in parts { hasher.update(part.as_bytes()); hasher.update(b"|"); } hex::encode(hasher.finalize()) } /// Compute a dedup fingerprint for a DAST finding. /// /// The key is derived from the *canonicalized* title (lowercased, domain names /// stripped, known synonyms resolved), endpoint, and HTTP method. This lets us /// detect both exact duplicates (same tool reporting twice across passes) and /// semantic duplicates (e.g., `security_header_missing` "Missing HSTS header" /// vs `tls_misconfiguration` "Missing strict-transport-security header"). pub fn compute_dast_fingerprint(f: &DastFinding) -> String { let canon = canonicalize_dast_title(&f.title); let endpoint = f.endpoint.to_lowercase().trim_end_matches('/').to_string(); let method = f.method.to_uppercase(); let param = f.parameter.as_deref().unwrap_or(""); compute_fingerprint(&[&canon, &endpoint, &method, param]) } /// Canonicalize a DAST finding title for dedup purposes. /// /// 1. Lowercase /// 2. Strip domain names / URLs (e.g. "for comp-dev.meghsakha.com") /// 3. Resolve known header synonyms (hsts ↔ strict-transport-security, etc.) /// 4. 
/// Canonicalize a DAST finding title for dedup purposes.
///
/// 1. Lowercase
/// 2. Strip domain names / URLs (e.g. "for comp-dev.meghsakha.com")
/// 3. Resolve known header synonyms (hsts ↔ strict-transport-security, etc.)
/// 4. Strip extra whitespace
fn canonicalize_dast_title(title: &str) -> String {
    let mut s = title.to_lowercase();

    // Strip a trailing "for <domain/url>" clause — but only when what follows
    // actually looks like a host (contains a dot) or a URL, so titles such as
    // "check for errors" survive intact.
    if let Some(idx) = s.find(" for ") {
        let rest = &s[idx + 5..];
        if rest.contains('.') || rest.starts_with("http") {
            s.truncate(idx);
        }
    }
    // Same treatment for "on <url/path>".
    if let Some(idx) = s.find(" on ") {
        let rest = &s[idx + 4..];
        if rest.contains("http") || rest.contains('/') {
            s.truncate(idx);
        }
    }

    // Resolve known header synonyms so different tools' wordings collide.
    let synonyms: &[(&str, &str)] = &[
        ("hsts", "strict-transport-security"),
        ("csp", "content-security-policy"),
        ("cors", "cross-origin-resource-sharing"),
        ("xfo", "x-frame-options"),
    ];
    for &(short, canonical) in synonyms {
        // Only replace whole words — check ASCII-alphanumeric boundaries on
        // both sides. Only the first occurrence of each synonym is rewritten.
        if let Some(pos) = s.find(short) {
            let before_ok = pos == 0 || !s.as_bytes()[pos - 1].is_ascii_alphanumeric();
            let after_ok = pos + short.len() >= s.len()
                || !s.as_bytes()[pos + short.len()].is_ascii_alphanumeric();
            if before_ok && after_ok {
                s = format!("{}{}{}", &s[..pos], canonical, &s[pos + short.len()..]);
            }
        }
    }

    // Collapse runs of whitespace into single spaces.
    s.split_whitespace().collect::<Vec<_>>().join(" ")
}
pub fn dedup_dast_findings(findings: Vec) -> Vec { use std::collections::HashMap; if findings.len() <= 1 { return findings; } // Phase 1: exact fingerprint dedup let mut seen: HashMap = HashMap::new(); let mut deduped: Vec = Vec::new(); for finding in findings { let fp = compute_dast_fingerprint(&finding); if let Some(&idx) = seen.get(&fp) { // Merge into existing merge_dast_finding(&mut deduped[idx], &finding); } else { seen.insert(fp, deduped.len()); deduped.push(finding); } } let before = deduped.len(); // Phase 2: CWE-based related dedup // Group by (cwe, endpoint_normalized, method) — only when CWE is present let mut cwe_groups: HashMap> = HashMap::new(); for (i, f) in deduped.iter().enumerate() { if let Some(ref cwe) = f.cwe { let key = format!( "{}|{}|{}", cwe, f.endpoint.to_lowercase().trim_end_matches('/'), f.method.to_uppercase(), ); cwe_groups.entry(key).or_default().push(i); } } // For each CWE group with multiple findings, keep the one with highest severity // and most evidence, merge the rest into it let mut merge_map: HashMap> = HashMap::new(); let mut remove_indices: Vec = Vec::new(); for indices in cwe_groups.values() { if indices.len() <= 1 { continue; } // Find the "primary" finding: highest severity, then most evidence, then longest description let Some(&primary_idx) = indices.iter().max_by(|&&a, &&b| { deduped[a] .severity .cmp(&deduped[b].severity) .then_with(|| deduped[a].evidence.len().cmp(&deduped[b].evidence.len())) .then_with(|| { deduped[a] .description .len() .cmp(&deduped[b].description.len()) }) }) else { continue; }; for &idx in indices { if idx != primary_idx { remove_indices.push(idx); merge_map.entry(primary_idx).or_default().push(idx); } } } if !remove_indices.is_empty() { remove_indices.sort_unstable(); remove_indices.dedup(); // Merge evidence for (&primary, secondaries) in &merge_map { let extra_evidence: Vec<_> = secondaries .iter() .flat_map(|&i| deduped[i].evidence.clone()) .collect(); let any_exploitable = 
secondaries.iter().any(|&i| deduped[i].exploitable); deduped[primary].evidence.extend(extra_evidence); if any_exploitable { deduped[primary].exploitable = true; } } // Remove merged findings (iterate in reverse to preserve indices) for &idx in remove_indices.iter().rev() { deduped.remove(idx); } } let after = deduped.len(); if before != after { tracing::debug!( "DAST CWE-based dedup: {before} → {after} findings ({} merged)", before - after ); } deduped } /// Merge a duplicate DAST finding into a primary one. fn merge_dast_finding(primary: &mut DastFinding, duplicate: &DastFinding) { primary.evidence.extend(duplicate.evidence.clone()); if duplicate.severity > primary.severity { primary.severity = duplicate.severity.clone(); } if duplicate.exploitable { primary.exploitable = true; } // Keep the longer/better description if duplicate.description.len() > primary.description.len() { primary.description.clone_from(&duplicate.description); } // Keep remediation if primary doesn't have one if primary.remediation.is_none() && duplicate.remediation.is_some() { primary.remediation.clone_from(&duplicate.remediation); } } #[cfg(test)] mod tests { use super::*; use compliance_core::models::dast::DastVulnType; use compliance_core::models::finding::Severity; #[test] fn fingerprint_is_deterministic() { let a = compute_fingerprint(&["repo1", "rule-x", "src/main.rs", "42"]); let b = compute_fingerprint(&["repo1", "rule-x", "src/main.rs", "42"]); assert_eq!(a, b); } #[test] fn fingerprint_changes_with_different_input() { let a = compute_fingerprint(&["repo1", "rule-x", "src/main.rs", "42"]); let b = compute_fingerprint(&["repo1", "rule-x", "src/main.rs", "43"]); assert_ne!(a, b); } #[test] fn fingerprint_is_valid_hex_sha256() { let fp = compute_fingerprint(&["hello"]); assert_eq!(fp.len(), 64, "SHA-256 hex should be 64 chars"); assert!(fp.chars().all(|c| c.is_ascii_hexdigit())); } #[test] fn fingerprint_empty_parts() { let fp = compute_fingerprint(&[]); // Should still produce a valid 
hash (of empty input) assert_eq!(fp.len(), 64); } #[test] fn fingerprint_order_matters() { let a = compute_fingerprint(&["a", "b"]); let b = compute_fingerprint(&["b", "a"]); assert_ne!(a, b); } #[test] fn fingerprint_separator_prevents_collision() { // "ab" + "c" vs "a" + "bc" should differ because of the "|" separator let a = compute_fingerprint(&["ab", "c"]); let b = compute_fingerprint(&["a", "bc"]); assert_ne!(a, b); } fn make_dast(title: &str, endpoint: &str, vuln_type: DastVulnType) -> DastFinding { let mut f = DastFinding::new( "run1".into(), "target1".into(), vuln_type, title.into(), format!("Description for {title}"), Severity::Medium, endpoint.into(), "GET".into(), ); f.cwe = Some("CWE-319".into()); f } #[test] fn canonicalize_strips_domain_suffix() { let canon = canonicalize_dast_title("Missing HSTS header for comp-dev.meghsakha.com"); assert!(!canon.contains("meghsakha"), "domain should be stripped"); assert!( canon.contains("strict-transport-security"), "hsts should be resolved: {canon}" ); } #[test] fn canonicalize_resolves_synonyms() { let a = canonicalize_dast_title("Missing HSTS header"); let b = canonicalize_dast_title("Missing strict-transport-security header"); assert_eq!(a, b); } #[test] fn exact_dedup_merges_identical_findings() { let f1 = make_dast( "Missing strict-transport-security header", "https://example.com", DastVulnType::SecurityHeaderMissing, ); let f2 = make_dast( "Missing strict-transport-security header", "https://example.com", DastVulnType::SecurityHeaderMissing, ); let result = dedup_dast_findings(vec![f1, f2]); assert_eq!(result.len(), 1, "exact duplicates should be merged"); } #[test] fn synonym_dedup_merges_hsts_variants() { let f1 = make_dast( "Missing strict-transport-security header", "https://example.com", DastVulnType::SecurityHeaderMissing, ); let f2 = make_dast( "Missing HSTS header for example.com", "https://example.com", DastVulnType::TlsMisconfiguration, ); let result = dedup_dast_findings(vec![f1, f2]); 
assert_eq!( result.len(), 1, "HSTS synonym variants should merge to 1 finding" ); } #[test] fn different_headers_not_merged() { let mut f1 = make_dast( "Missing x-content-type-options header", "https://example.com", DastVulnType::SecurityHeaderMissing, ); f1.cwe = Some("CWE-16".into()); let mut f2 = make_dast( "Missing permissions-policy header", "https://example.com", DastVulnType::SecurityHeaderMissing, ); f2.cwe = Some("CWE-16".into()); // These share CWE-16 but are different headers — phase 2 will merge them // since they share the same CWE+endpoint. This is acceptable because they // have the same root cause (missing security headers configuration). let result = dedup_dast_findings(vec![f1, f2]); // CWE-based dedup will merge these into 1 assert!( result.len() <= 2, "same CWE+endpoint findings may be merged" ); } #[test] fn different_endpoints_not_merged() { let f1 = make_dast( "Missing strict-transport-security header", "https://example.com", DastVulnType::SecurityHeaderMissing, ); let f2 = make_dast( "Missing strict-transport-security header", "https://other.com", DastVulnType::SecurityHeaderMissing, ); let result = dedup_dast_findings(vec![f1, f2]); assert_eq!(result.len(), 2, "different endpoints should not merge"); } #[test] fn dedup_preserves_highest_severity() { let f1 = make_dast( "Missing strict-transport-security header", "https://example.com", DastVulnType::SecurityHeaderMissing, ); let mut f2 = make_dast( "Missing strict-transport-security header", "https://example.com", DastVulnType::SecurityHeaderMissing, ); f2.severity = Severity::High; let result = dedup_dast_findings(vec![f1, f2]); assert_eq!(result.len(), 1); assert_eq!(result[0].severity, Severity::High); } #[test] fn dedup_merges_evidence() { let mut f1 = make_dast( "Missing strict-transport-security header", "https://example.com", DastVulnType::SecurityHeaderMissing, ); f1.evidence .push(compliance_core::models::dast::DastEvidence { request_method: "GET".into(), request_url: 
"https://example.com".into(), request_headers: None, request_body: None, response_status: 200, response_headers: None, response_snippet: Some("pass 1".into()), screenshot_path: None, payload: None, response_time_ms: None, }); let mut f2 = f1.clone(); f2.evidence[0].response_snippet = Some("pass 2".into()); let result = dedup_dast_findings(vec![f1, f2]); assert_eq!(result.len(), 1); assert_eq!(result[0].evidence.len(), 2, "evidence should be merged"); } }