compliance-scanner-agent/compliance-agent/src/pipeline/dedup.rs

use sha2::{Digest, Sha256};

use compliance_core::models::dast::DastFinding;

/// Hash an ordered list of string parts into a hex-encoded SHA-256 digest.
///
/// Each part is fed to the hasher followed by a single `|` byte, so the
/// position of the part boundaries influences the result (`["ab", "c"]` and
/// `["a", "bc"]` hash differently). Parts are not escaped: a `|` inside a part
/// is indistinguishable from a separator.
pub fn compute_fingerprint(parts: &[&str]) -> String {
    let digest = parts
        .iter()
        .fold(Sha256::new(), |mut state, segment| {
            state.update(segment.as_bytes());
            state.update(b"|");
            state
        })
        .finalize();
    hex::encode(digest)
}

/// Build the dedup fingerprint of a single DAST finding.
///
/// Four components are hashed, in this order:
/// 1. the title after `canonicalize_dast_title`,
/// 2. the endpoint, lowercased and with trailing `/` characters removed,
/// 3. the HTTP method, uppercased,
/// 4. the parameter name, or the empty string when the finding has none.
///
/// Because the title is canonicalized first, two findings that differ only
/// in wording the canonicalizer normalizes (e.g. "Missing HSTS header" vs
/// "Missing strict-transport-security header") share a fingerprint.
pub fn compute_dast_fingerprint(f: &DastFinding) -> String {
    let title_key = canonicalize_dast_title(&f.title);
    let lowered_endpoint = f.endpoint.to_lowercase();
    let endpoint_key = lowered_endpoint.trim_end_matches('/');
    let method_key = f.method.to_uppercase();
    let param_key = f.parameter.as_deref().unwrap_or("");
    compute_fingerprint(&[
        title_key.as_str(),
        endpoint_key,
        method_key.as_str(),
        param_key,
    ])
}

/// Canonicalize a DAST finding title for dedup purposes.
///
/// Steps, applied in order:
/// 1. Lowercase the title.
/// 2. Strip a trailing target reference (e.g. "for comp-dev.meghsakha.com"):
///    everything from the first `" for "` onward is dropped when the remainder
///    contains a `.` or starts with `http`; then everything from the first
///    `" on "` onward is dropped when the remainder contains `http` or a `/`.
/// 3. Expand known header abbreviations (`hsts`, `csp`, `cors`, `xfo`) to their
///    full header names. Every whole-word occurrence is expanded; occurrences
///    embedded in a longer alphanumeric word are left alone.
/// 4. Collapse runs of whitespace into single spaces and trim both ends.
fn canonicalize_dast_title(title: &str) -> String {
    let mut s = title.to_lowercase();

    // Strip "for <domain>" or "on <domain>" suffixes
    // Pattern: "for <word.word...>" or "on <method> <url>"
    if let Some(idx) = s.find(" for ") {
        // Check if what follows looks like a domain or URL
        let rest = &s[idx + 5..];
        if rest.contains('.') || rest.starts_with("http") {
            s.truncate(idx);
        }
    }
    if let Some(idx) = s.find(" on ") {
        let rest = &s[idx + 4..];
        if rest.contains("http") || rest.contains('/') {
            s.truncate(idx);
        }
    }

    // Resolve known header synonyms (abbreviation -> canonical header name).
    let synonyms: &[(&str, &str)] = &[
        ("hsts", "strict-transport-security"),
        ("csp", "content-security-policy"),
        ("cors", "cross-origin-resource-sharing"),
        ("xfo", "x-frame-options"),
    ];
    for &(short, canonical) in synonyms {
        // Skip the rebuild entirely when the abbreviation does not appear.
        if s.contains(short) {
            s = replace_whole_word(&s, short, canonical);
        }
    }

    // Collapse whitespace
    s.split_whitespace().collect::<Vec<_>>().join(" ")
}

/// Replace every whole-word occurrence of `word` in `text` with `replacement`.
///
/// An occurrence counts as a whole word when the bytes immediately before and
/// after it are not ASCII alphanumeric (or the occurrence touches the start or
/// end of `text`). Occurrences that fail the boundary check are copied through
/// unchanged, and the scan continues behind them so later matches are still
/// found.
fn replace_whole_word(text: &str, word: &str, replacement: &str) -> String {
    if word.is_empty() {
        return text.to_owned();
    }

    let bytes = text.as_bytes();
    let mut out = String::with_capacity(text.len());
    // `copied`: everything before this offset is already in `out`.
    // `search`: where the next substring search starts.
    let mut copied = 0;
    let mut search = 0;

    while let Some(rel) = text[search..].find(word) {
        let start = search + rel;
        let end = start + word.len();
        let before_ok = start == 0 || !bytes[start - 1].is_ascii_alphanumeric();
        let after_ok = end >= bytes.len() || !bytes[end].is_ascii_alphanumeric();

        if before_ok && after_ok {
            out.push_str(&text[copied..start]);
            out.push_str(replacement);
            copied = end;
        }
        // Resume after this hit either way. The abbreviations passed in are
        // purely alphanumeric, so a hit that starts inside a rejected one
        // would fail the boundary check too.
        search = end;
    }

    out.push_str(&text[copied..]);
    out
}

/// Deduplicate a list of DAST findings, merging evidence from duplicates.
///
/// Inputs with zero or one finding are returned unchanged. Otherwise two
/// phases run:
///
/// 1. **Fingerprint dedup** — findings with the same
///    `compute_dast_fingerprint` (canonicalized title, endpoint, method,
///    parameter) are folded into the first one seen via `merge_dast_finding`.
/// 2. **CWE-based dedup** — among the survivors that carry a CWE, all findings
///    sharing the same `(cwe, endpoint, method)` are collapsed into one. Title
///    and parameter are *not* compared in this phase. The survivor ("primary")
///    is the finding with the highest severity, then the most evidence, then
///    the longest description; it absorbs the evidence of the others and
///    becomes exploitable if any of them was. Findings without a CWE skip
///    this phase.
///
/// The relative order of the surviving findings matches their first
/// appearance in the input.
pub fn dedup_dast_findings(findings: Vec<DastFinding>) -> Vec<DastFinding> {
    use std::collections::hash_map::Entry;
    use std::collections::HashMap;

    if findings.len() <= 1 {
        return findings;
    }

    // Phase 1: exact fingerprint dedup. `seen` maps fingerprint -> index of
    // the first finding with that fingerprint inside `deduped`.
    let mut seen: HashMap<String, usize> = HashMap::new();
    let mut deduped: Vec<DastFinding> = Vec::new();

    for finding in findings {
        match seen.entry(compute_dast_fingerprint(&finding)) {
            Entry::Occupied(slot) => merge_dast_finding(&mut deduped[*slot.get()], &finding),
            Entry::Vacant(slot) => {
                slot.insert(deduped.len());
                deduped.push(finding);
            }
        }
    }

    let before = deduped.len();

    // Phase 2: CWE-based related dedup
    // Group by (cwe, endpoint_normalized, method) — only when CWE is present
    let mut cwe_groups: HashMap<String, Vec<usize>> = HashMap::new();
    for (i, f) in deduped.iter().enumerate() {
        if let Some(ref cwe) = f.cwe {
            let key = format!(
                "{}|{}|{}",
                cwe,
                f.endpoint.to_lowercase().trim_end_matches('/'),
                f.method.to_uppercase(),
            );
            cwe_groups.entry(key).or_default().push(i);
        }
    }

    // `absorbed[i]` is set once finding `i` has been merged into its group's
    // primary and must be dropped from the output.
    let mut absorbed = vec![false; deduped.len()];

    // Every index belongs to at most one group, so merging one group cannot
    // influence the primary selection of another; the (unspecified) iteration
    // order of the map therefore does not affect the result.
    for indices in cwe_groups.values() {
        if indices.len() <= 1 {
            continue;
        }
        // Find the "primary" finding: highest severity, then most evidence, then longest description
        let Some(&primary_idx) = indices.iter().max_by(|&&a, &&b| {
            deduped[a]
                .severity
                .cmp(&deduped[b].severity)
                .then_with(|| deduped[a].evidence.len().cmp(&deduped[b].evidence.len()))
                .then_with(|| {
                    deduped[a]
                        .description
                        .len()
                        .cmp(&deduped[b].description.len())
                })
        }) else {
            continue;
        };

        for &idx in indices {
            if idx == primary_idx {
                continue;
            }
            // The absorbed finding is discarded below, so its evidence can be
            // moved out instead of cloned.
            let evidence = std::mem::take(&mut deduped[idx].evidence);
            let exploitable = deduped[idx].exploitable;

            let primary = &mut deduped[primary_idx];
            primary.evidence.extend(evidence);
            if exploitable {
                primary.exploitable = true;
            }
            absorbed[idx] = true;
        }
    }

    // Drop absorbed findings in a single pass. `retain` visits the elements
    // exactly once in their original order, so the flag iterator stays in
    // lock-step with the element positions.
    let mut absorbed_flags = absorbed.into_iter();
    deduped.retain(|_| !absorbed_flags.next().unwrap_or(false));

    let after = deduped.len();
    if before != after {
        tracing::debug!(
            "DAST CWE-based dedup: {before} → {after} findings ({} merged)",
            before - after
        );
    }

    deduped
}

/// Fold `duplicate` into `primary`; `primary` is the record that survives.
///
/// - evidence of `duplicate` is appended to `primary`'s evidence,
/// - the higher of the two severities is kept,
/// - `exploitable` becomes true if either finding is exploitable,
/// - the longer of the two descriptions is kept,
/// - `duplicate`'s remediation is adopted only when `primary` has none.
fn merge_dast_finding(primary: &mut DastFinding, duplicate: &DastFinding) {
    primary.evidence.extend(duplicate.evidence.iter().cloned());

    if primary.severity < duplicate.severity {
        primary.severity = duplicate.severity.clone();
    }

    if duplicate.exploitable {
        primary.exploitable = true;
    }

    // Description length is used as a proxy for the more informative text.
    if primary.description.len() < duplicate.description.len() {
        primary.description.clone_from(&duplicate.description);
    }

    // Never overwrite an existing remediation; only fill a gap.
    if primary.remediation.is_none() {
        primary.remediation.clone_from(&duplicate.remediation);
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use compliance_core::models::dast::DastVulnType;
    use compliance_core::models::finding::Severity;

    /// Hashing the same parts twice yields the same fingerprint.
    #[test]
    fn fingerprint_is_deterministic() {
        let a = compute_fingerprint(&["repo1", "rule-x", "src/main.rs", "42"]);
        let b = compute_fingerprint(&["repo1", "rule-x", "src/main.rs", "42"]);
        assert_eq!(a, b);
    }

    /// Changing a single part changes the fingerprint.
    #[test]
    fn fingerprint_changes_with_different_input() {
        let a = compute_fingerprint(&["repo1", "rule-x", "src/main.rs", "42"]);
        let b = compute_fingerprint(&["repo1", "rule-x", "src/main.rs", "43"]);
        assert_ne!(a, b);
    }

    /// The fingerprint is a 64-character hex string.
    #[test]
    fn fingerprint_is_valid_hex_sha256() {
        let fp = compute_fingerprint(&["hello"]);
        assert_eq!(fp.len(), 64, "SHA-256 hex should be 64 chars");
        assert!(fp.chars().all(|c| c.is_ascii_hexdigit()));
    }

    /// An empty part list is accepted.
    #[test]
    fn fingerprint_empty_parts() {
        let fp = compute_fingerprint(&[]);
        // Should still produce a valid hash (of empty input)
        assert_eq!(fp.len(), 64);
    }

    /// Swapping two parts changes the fingerprint.
    #[test]
    fn fingerprint_order_matters() {
        let a = compute_fingerprint(&["a", "b"]);
        let b = compute_fingerprint(&["b", "a"]);
        assert_ne!(a, b);
    }

    /// Moving a character across a part boundary changes the fingerprint.
    #[test]
    fn fingerprint_separator_prevents_collision() {
        // "ab" + "c" vs "a" + "bc" should differ because of the "|" separator
        let a = compute_fingerprint(&["ab", "c"]);
        let b = compute_fingerprint(&["a", "bc"]);
        assert_ne!(a, b);
    }

    /// Build a medium-severity `GET` finding with CWE-319 for the given title,
    /// endpoint and vulnerability type.
    fn make_dast(title: &str, endpoint: &str, vuln_type: DastVulnType) -> DastFinding {
        let mut f = DastFinding::new(
            "run1".into(),
            "target1".into(),
            vuln_type,
            title.into(),
            format!("Description for {title}"),
            Severity::Medium,
            endpoint.into(),
            "GET".into(),
        );
        f.cwe = Some("CWE-319".into());
        f
    }

    /// A trailing "for <domain>" is removed and the abbreviation is expanded.
    #[test]
    fn canonicalize_strips_domain_suffix() {
        let canon = canonicalize_dast_title("Missing HSTS header for comp-dev.meghsakha.com");
        assert!(!canon.contains("meghsakha"), "domain should be stripped");
        assert!(
            canon.contains("strict-transport-security"),
            "hsts should be resolved: {canon}"
        );
    }

    /// The abbreviated and the spelled-out header name canonicalize identically.
    #[test]
    fn canonicalize_resolves_synonyms() {
        let a = canonicalize_dast_title("Missing HSTS header");
        let b = canonicalize_dast_title("Missing strict-transport-security header");
        assert_eq!(a, b);
    }

    /// Two identical findings collapse into one.
    #[test]
    fn exact_dedup_merges_identical_findings() {
        let f1 = make_dast(
            "Missing strict-transport-security header",
            "https://example.com",
            DastVulnType::SecurityHeaderMissing,
        );
        let f2 = make_dast(
            "Missing strict-transport-security header",
            "https://example.com",
            DastVulnType::SecurityHeaderMissing,
        );
        let result = dedup_dast_findings(vec![f1, f2]);
        assert_eq!(result.len(), 1, "exact duplicates should be merged");
    }

    /// Findings that differ only in HSTS wording and vulnerability type merge.
    #[test]
    fn synonym_dedup_merges_hsts_variants() {
        let f1 = make_dast(
            "Missing strict-transport-security header",
            "https://example.com",
            DastVulnType::SecurityHeaderMissing,
        );
        let f2 = make_dast(
            "Missing HSTS header for example.com",
            "https://example.com",
            DastVulnType::TlsMisconfiguration,
        );
        let result = dedup_dast_findings(vec![f1, f2]);
        assert_eq!(
            result.len(),
            1,
            "HSTS synonym variants should merge to 1 finding"
        );
    }

    /// Different headers that share CWE, endpoint and method are collapsed by
    /// the CWE-based phase even though their titles differ.
    #[test]
    fn same_cwe_and_endpoint_headers_are_merged() {
        let mut f1 = make_dast(
            "Missing x-content-type-options header",
            "https://example.com",
            DastVulnType::SecurityHeaderMissing,
        );
        f1.cwe = Some("CWE-16".into());
        let mut f2 = make_dast(
            "Missing permissions-policy header",
            "https://example.com",
            DastVulnType::SecurityHeaderMissing,
        );
        f2.cwe = Some("CWE-16".into());
        // The titles differ, so phase 1 keeps both. Phase 2 then merges them
        // because they share the same CWE+endpoint+method. This is acceptable
        // because they have the same root cause (missing security headers
        // configuration).
        let result = dedup_dast_findings(vec![f1, f2]);
        assert_eq!(
            result.len(),
            1,
            "same CWE+endpoint+method findings should be merged"
        );
    }

    /// The same finding on two different endpoints stays as two findings.
    #[test]
    fn different_endpoints_not_merged() {
        let f1 = make_dast(
            "Missing strict-transport-security header",
            "https://example.com",
            DastVulnType::SecurityHeaderMissing,
        );
        let f2 = make_dast(
            "Missing strict-transport-security header",
            "https://other.com",
            DastVulnType::SecurityHeaderMissing,
        );
        let result = dedup_dast_findings(vec![f1, f2]);
        assert_eq!(result.len(), 2, "different endpoints should not merge");
    }

    /// The merged finding carries the higher of the two severities.
    #[test]
    fn dedup_preserves_highest_severity() {
        let f1 = make_dast(
            "Missing strict-transport-security header",
            "https://example.com",
            DastVulnType::SecurityHeaderMissing,
        );
        let mut f2 = make_dast(
            "Missing strict-transport-security header",
            "https://example.com",
            DastVulnType::SecurityHeaderMissing,
        );
        f2.severity = Severity::High;
        let result = dedup_dast_findings(vec![f1, f2]);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].severity, Severity::High);
    }

    /// Evidence from both duplicates ends up on the merged finding.
    #[test]
    fn dedup_merges_evidence() {
        let mut f1 = make_dast(
            "Missing strict-transport-security header",
            "https://example.com",
            DastVulnType::SecurityHeaderMissing,
        );
        f1.evidence
            .push(compliance_core::models::dast::DastEvidence {
                request_method: "GET".into(),
                request_url: "https://example.com".into(),
                request_headers: None,
                request_body: None,
                response_status: 200,
                response_headers: None,
                response_snippet: Some("pass 1".into()),
                screenshot_path: None,
                payload: None,
                response_time_ms: None,
            });
        let mut f2 = f1.clone();
        f2.evidence[0].response_snippet = Some("pass 2".into());

        let result = dedup_dast_findings(vec![f1, f2]);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].evidence.len(), 2, "evidence should be merged");
    }
}