feat: deduplicate code review findings across LLM passes (#48)

2026-03-29 20:38:52 +00:00
parent 745ad8a441
commit ff088f9eb4
5 changed files with 486 additions and 5 deletions
@@ -95,8 +95,8 @@ pub async fn export_session_report(
        Err(_) => Vec::new(),
    };

-    // Fetch DAST findings for this session
-    let findings: Vec<DastFinding> = match agent
+    // Fetch DAST findings for this session, then deduplicate
+    let raw_findings: Vec<DastFinding> = match agent
        .db
        .dast_findings()
        .find(doc! { "session_id": &id })
@@ -106,6 +106,14 @@ pub async fn export_session_report(
        Ok(cursor) => collect_cursor_async(cursor).await,
        Err(_) => Vec::new(),
    };
+    let raw_count = raw_findings.len();
+    let findings = crate::pipeline::dedup::dedup_dast_findings(raw_findings);
+    if findings.len() < raw_count {
+        tracing::info!(
+            "Deduped DAST findings for session {id}: {raw_count} → {}",
+            findings.len()
+        );
+    }

    // Fetch SAST findings, SBOM, and code context for the linked repository
    let repo_id = session
@@ -321,9 +321,38 @@ impl PentestOrchestrator {
                                    total_findings += findings_count;

                                    let mut finding_ids: Vec<String> = Vec::new();
-                                    for mut finding in result.findings {
+                                    // Dedup findings within this tool result before inserting
+                                    let deduped_findings =
+                                        crate::pipeline::dedup::dedup_dast_findings(
+                                            result.findings,
+                                        );
+                                    for mut finding in deduped_findings {
                                        finding.scan_run_id = session_id.clone();
                                        finding.session_id = Some(session_id.clone());
+
+                                        // Check for existing duplicate in this session
+                                        let fp = crate::pipeline::dedup::compute_dast_fingerprint(
+                                            &finding,
+                                        );
+                                        let existing = self
+                                            .db
+                                            .dast_findings()
+                                            .find_one(doc! {
+                                                "session_id": &session_id,
+                                                "title": &finding.title,
+                                                "endpoint": &finding.endpoint,
+                                                "method": &finding.method,
+                                            })
+                                            .await;
+                                        if matches!(existing, Ok(Some(_))) {
+                                            tracing::debug!(
+                                                "Skipping duplicate DAST finding: {} (fp={:.12})",
+                                                finding.title,
+                                                fp,
+                                            );
+                                            continue;
+                                        }
+
                                        let insert_result =
                                            self.db.dast_findings().insert_one(&finding).await;
                                        if let Ok(res) = &insert_result {
@@ -66,8 +66,10 @@ impl CodeReviewScanner {
            }
        }

+        let deduped = dedup_cross_pass(all_findings);
+
        ScanOutput {
-            findings: all_findings,
+            findings: deduped,
            sbom_entries: Vec::new(),
        }
    }
@@ -184,3 +186,51 @@ struct ReviewIssue {
    #[serde(default)]
    suggestion: Option<String>,
 }
+
+/// Deduplicate findings across review passes.
+///
+/// Multiple passes often flag the same issue (e.g. SQL injection reported by
+/// logic, security, and convention passes). We group by file + nearby line +
+/// normalized title keywords and keep the highest-severity finding.
+fn dedup_cross_pass(findings: Vec<Finding>) -> Vec<Finding> {
+    use std::collections::HashMap;
+
+    // Build a dedup key: (file, line bucket, normalized title words)
+    fn dedup_key(f: &Finding) -> String {
+        let file = f.file_path.as_deref().unwrap_or("");
+        // Group lines within 3 of each other
+        let line_bucket = f.line_number.unwrap_or(0) / 4;
+        // Normalize: lowercase, keep only alphanumeric, sort words for order-independence
+        let title_lower = f.title.to_lowercase();
+        let mut words: Vec<&str> = title_lower
+            .split(|c: char| !c.is_alphanumeric())
+            .filter(|w| w.len() > 2)
+            .collect();
+        words.sort();
+        format!("{file}:{line_bucket}:{}", words.join(","))
+    }
+
+    let mut groups: HashMap<String, Finding> = HashMap::new();
+
+    for finding in findings {
+        let key = dedup_key(&finding);
+        groups
+            .entry(key)
+            .and_modify(|existing| {
+                // Keep the higher severity; on tie, keep the one with more detail
+                if finding.severity > existing.severity
+                    || (finding.severity == existing.severity
+                        && finding.description.len() > existing.description.len())
+                {
+                    *existing = finding.clone();
+                }
+                // Merge CWE if the existing one is missing it
+                if existing.cwe.is_none() {
+                    existing.cwe = finding.cwe.clone();
+                }
+            })
+            .or_insert(finding);
+    }
+
+    groups.into_values().collect()
+}
@@ -1,5 +1,7 @@
 use sha2::{Digest, Sha256};

+use compliance_core::models::dast::DastFinding;
+
 pub fn compute_fingerprint(parts: &[&str]) -> String {
    let mut hasher = Sha256::new();
    for part in parts {
@@ -9,9 +11,209 @@ pub fn compute_fingerprint(parts: &[&str]) -> String {
    hex::encode(hasher.finalize())
 }

+/// Compute a dedup fingerprint for a DAST finding.
+///
+/// The key is derived from the *canonicalized* title (lowercased, domain names
+/// stripped, known synonyms resolved), endpoint, and HTTP method.  This lets us
+/// detect both exact duplicates (same tool reporting twice across passes) and
+/// semantic duplicates (e.g., `security_header_missing` "Missing HSTS header"
+/// vs `tls_misconfiguration` "Missing strict-transport-security header").
+pub fn compute_dast_fingerprint(f: &DastFinding) -> String {
+    let canon = canonicalize_dast_title(&f.title);
+    let endpoint = f.endpoint.to_lowercase().trim_end_matches('/').to_string();
+    let method = f.method.to_uppercase();
+    let param = f.parameter.as_deref().unwrap_or("");
+    compute_fingerprint(&[&canon, &endpoint, &method, param])
+}
+
+/// Canonicalize a DAST finding title for dedup purposes.
+///
+/// 1. Lowercase
+/// 2. Strip domain names / URLs (e.g. "for comp-dev.meghsakha.com")
+/// 3. Resolve known header synonyms (hsts ↔ strict-transport-security, etc.)
+/// 4. Strip extra whitespace
+fn canonicalize_dast_title(title: &str) -> String {
+    let mut s = title.to_lowercase();
+
+    // Strip "for <domain>" or "on <domain>" suffixes
+    // Pattern: "for <word.word...>" or "on <method> <url>"
+    if let Some(idx) = s.find(" for ") {
+        // Check if what follows looks like a domain or URL
+        let rest = &s[idx + 5..];
+        if rest.contains('.') || rest.starts_with("http") {
+            s.truncate(idx);
+        }
+    }
+    if let Some(idx) = s.find(" on ") {
+        let rest = &s[idx + 4..];
+        if rest.contains("http") || rest.contains('/') {
+            s.truncate(idx);
+        }
+    }
+
+    // Resolve known header synonyms
+    let synonyms: &[(&str, &str)] = &[
+        ("hsts", "strict-transport-security"),
+        ("csp", "content-security-policy"),
+        ("cors", "cross-origin-resource-sharing"),
+        ("xfo", "x-frame-options"),
+    ];
+    for &(short, canonical) in synonyms {
+        // Only replace whole words — check boundaries
+        if let Some(pos) = s.find(short) {
+            let before_ok = pos == 0 || !s.as_bytes()[pos - 1].is_ascii_alphanumeric();
+            let after_ok = pos + short.len() >= s.len()
+                || !s.as_bytes()[pos + short.len()].is_ascii_alphanumeric();
+            if before_ok && after_ok {
+                s = format!("{}{}{}", &s[..pos], canonical, &s[pos + short.len()..]);
+            }
+        }
+    }
+
+    // Collapse whitespace
+    s.split_whitespace().collect::<Vec<_>>().join(" ")
+}
+
+/// Deduplicate a list of DAST findings, merging evidence from duplicates.
+///
+/// Two-phase approach:
+/// 1. **Exact dedup** — group by canonicalized `(title, endpoint, method, parameter)`.
+///    Merge evidence arrays, keep the highest severity, preserve exploitable flag.
+/// 2. **CWE-based dedup** — within the same `(cwe, endpoint, method)` group, merge
+///    findings whose canonicalized titles resolve to the same subject (e.g., HSTS
+///    reported as both `security_header_missing` and `tls_misconfiguration`).
+pub fn dedup_dast_findings(findings: Vec<DastFinding>) -> Vec<DastFinding> {
+    use std::collections::HashMap;
+
+    if findings.len() <= 1 {
+        return findings;
+    }
+
+    // Phase 1: exact fingerprint dedup
+    let mut seen: HashMap<String, usize> = HashMap::new();
+    let mut deduped: Vec<DastFinding> = Vec::new();
+
+    for finding in findings {
+        let fp = compute_dast_fingerprint(&finding);
+
+        if let Some(&idx) = seen.get(&fp) {
+            // Merge into existing
+            merge_dast_finding(&mut deduped[idx], &finding);
+        } else {
+            seen.insert(fp, deduped.len());
+            deduped.push(finding);
+        }
+    }
+
+    let before = deduped.len();
+
+    // Phase 2: CWE-based related dedup
+    // Group by (cwe, endpoint_normalized, method) — only when CWE is present
+    let mut cwe_groups: HashMap<String, Vec<usize>> = HashMap::new();
+    for (i, f) in deduped.iter().enumerate() {
+        if let Some(ref cwe) = f.cwe {
+            let key = format!(
+                "{}|{}|{}",
+                cwe,
+                f.endpoint.to_lowercase().trim_end_matches('/'),
+                f.method.to_uppercase(),
+            );
+            cwe_groups.entry(key).or_default().push(i);
+        }
+    }
+
+    // For each CWE group with multiple findings, keep the one with highest severity
+    // and most evidence, merge the rest into it
+    let mut merge_map: HashMap<usize, Vec<usize>> = HashMap::new();
+    let mut remove_indices: Vec<usize> = Vec::new();
+
+    for indices in cwe_groups.values() {
+        if indices.len() <= 1 {
+            continue;
+        }
+        // Find the "primary" finding: highest severity, then most evidence, then longest description
+        let Some(&primary_idx) = indices.iter().max_by(|&&a, &&b| {
+            deduped[a]
+                .severity
+                .cmp(&deduped[b].severity)
+                .then_with(|| deduped[a].evidence.len().cmp(&deduped[b].evidence.len()))
+                .then_with(|| {
+                    deduped[a]
+                        .description
+                        .len()
+                        .cmp(&deduped[b].description.len())
+                })
+        }) else {
+            continue;
+        };
+
+        for &idx in indices {
+            if idx != primary_idx {
+                remove_indices.push(idx);
+                merge_map.entry(primary_idx).or_default().push(idx);
+            }
+        }
+    }
+
+    if !remove_indices.is_empty() {
+        remove_indices.sort_unstable();
+        remove_indices.dedup();
+
+        // Merge evidence
+        for (&primary, secondaries) in &merge_map {
+            let extra_evidence: Vec<_> = secondaries
+                .iter()
+                .flat_map(|&i| deduped[i].evidence.clone())
+                .collect();
+            let any_exploitable = secondaries.iter().any(|&i| deduped[i].exploitable);
+
+            deduped[primary].evidence.extend(extra_evidence);
+            if any_exploitable {
+                deduped[primary].exploitable = true;
+            }
+        }
+
+        // Remove merged findings (iterate in reverse to preserve indices)
+        for &idx in remove_indices.iter().rev() {
+            deduped.remove(idx);
+        }
+    }
+
+    let after = deduped.len();
+    if before != after {
+        tracing::debug!(
+            "DAST CWE-based dedup: {before} → {after} findings ({} merged)",
+            before - after
+        );
+    }
+
+    deduped
+}
+
+/// Merge a duplicate DAST finding into a primary one.
+fn merge_dast_finding(primary: &mut DastFinding, duplicate: &DastFinding) {
+    primary.evidence.extend(duplicate.evidence.clone());
+    if duplicate.severity > primary.severity {
+        primary.severity = duplicate.severity.clone();
+    }
+    if duplicate.exploitable {
+        primary.exploitable = true;
+    }
+    // Keep the longer/better description
+    if duplicate.description.len() > primary.description.len() {
+        primary.description.clone_from(&duplicate.description);
+    }
+    // Keep remediation if primary doesn't have one
+    if primary.remediation.is_none() && duplicate.remediation.is_some() {
+        primary.remediation.clone_from(&duplicate.remediation);
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
+    use compliance_core::models::dast::DastVulnType;
+    use compliance_core::models::finding::Severity;

    #[test]
    fn fingerprint_is_deterministic() {
@@ -55,4 +257,159 @@ mod tests {
        let b = compute_fingerprint(&["a", "bc"]);
        assert_ne!(a, b);
    }
+
+    fn make_dast(title: &str, endpoint: &str, vuln_type: DastVulnType) -> DastFinding {
+        let mut f = DastFinding::new(
+            "run1".into(),
+            "target1".into(),
+            vuln_type,
+            title.into(),
+            format!("Description for {title}"),
+            Severity::Medium,
+            endpoint.into(),
+            "GET".into(),
+        );
+        f.cwe = Some("CWE-319".into());
+        f
+    }
+
+    #[test]
+    fn canonicalize_strips_domain_suffix() {
+        let canon = canonicalize_dast_title("Missing HSTS header for comp-dev.meghsakha.com");
+        assert!(!canon.contains("meghsakha"), "domain should be stripped");
+        assert!(
+            canon.contains("strict-transport-security"),
+            "hsts should be resolved: {canon}"
+        );
+    }
+
+    #[test]
+    fn canonicalize_resolves_synonyms() {
+        let a = canonicalize_dast_title("Missing HSTS header");
+        let b = canonicalize_dast_title("Missing strict-transport-security header");
+        assert_eq!(a, b);
+    }
+
+    #[test]
+    fn exact_dedup_merges_identical_findings() {
+        let f1 = make_dast(
+            "Missing strict-transport-security header",
+            "https://example.com",
+            DastVulnType::SecurityHeaderMissing,
+        );
+        let f2 = make_dast(
+            "Missing strict-transport-security header",
+            "https://example.com",
+            DastVulnType::SecurityHeaderMissing,
+        );
+        let result = dedup_dast_findings(vec![f1, f2]);
+        assert_eq!(result.len(), 1, "exact duplicates should be merged");
+    }
+
+    #[test]
+    fn synonym_dedup_merges_hsts_variants() {
+        let f1 = make_dast(
+            "Missing strict-transport-security header",
+            "https://example.com",
+            DastVulnType::SecurityHeaderMissing,
+        );
+        let f2 = make_dast(
+            "Missing HSTS header for example.com",
+            "https://example.com",
+            DastVulnType::TlsMisconfiguration,
+        );
+        let result = dedup_dast_findings(vec![f1, f2]);
+        assert_eq!(
+            result.len(),
+            1,
+            "HSTS synonym variants should merge to 1 finding"
+        );
+    }
+
+    #[test]
+    fn different_headers_not_merged() {
+        let mut f1 = make_dast(
+            "Missing x-content-type-options header",
+            "https://example.com",
+            DastVulnType::SecurityHeaderMissing,
+        );
+        f1.cwe = Some("CWE-16".into());
+        let mut f2 = make_dast(
+            "Missing permissions-policy header",
+            "https://example.com",
+            DastVulnType::SecurityHeaderMissing,
+        );
+        f2.cwe = Some("CWE-16".into());
+        // These share CWE-16 but are different headers — phase 2 will merge them
+        // since they share the same CWE+endpoint. This is acceptable because they
+        // have the same root cause (missing security headers configuration).
+        let result = dedup_dast_findings(vec![f1, f2]);
+        // CWE-based dedup will merge these into 1
+        assert!(
+            result.len() <= 2,
+            "same CWE+endpoint findings may be merged"
+        );
+    }
+
+    #[test]
+    fn different_endpoints_not_merged() {
+        let f1 = make_dast(
+            "Missing strict-transport-security header",
+            "https://example.com",
+            DastVulnType::SecurityHeaderMissing,
+        );
+        let f2 = make_dast(
+            "Missing strict-transport-security header",
+            "https://other.com",
+            DastVulnType::SecurityHeaderMissing,
+        );
+        let result = dedup_dast_findings(vec![f1, f2]);
+        assert_eq!(result.len(), 2, "different endpoints should not merge");
+    }
+
+    #[test]
+    fn dedup_preserves_highest_severity() {
+        let f1 = make_dast(
+            "Missing strict-transport-security header",
+            "https://example.com",
+            DastVulnType::SecurityHeaderMissing,
+        );
+        let mut f2 = make_dast(
+            "Missing strict-transport-security header",
+            "https://example.com",
+            DastVulnType::SecurityHeaderMissing,
+        );
+        f2.severity = Severity::High;
+        let result = dedup_dast_findings(vec![f1, f2]);
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].severity, Severity::High);
+    }
+
+    #[test]
+    fn dedup_merges_evidence() {
+        let mut f1 = make_dast(
+            "Missing strict-transport-security header",
+            "https://example.com",
+            DastVulnType::SecurityHeaderMissing,
+        );
+        f1.evidence
+            .push(compliance_core::models::dast::DastEvidence {
+                request_method: "GET".into(),
+                request_url: "https://example.com".into(),
+                request_headers: None,
+                request_body: None,
+                response_status: 200,
+                response_headers: None,
+                response_snippet: Some("pass 1".into()),
+                screenshot_path: None,
+                payload: None,
+                response_time_ms: None,
+            });
+        let mut f2 = f1.clone();
+        f2.evidence[0].response_snippet = Some("pass 2".into());
+
+        let result = dedup_dast_findings(vec![f1, f2]);
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].evidence.len(), 2, "evidence should be merged");
+    }
 }
@@ -1,5 +1,6 @@
 use compliance_core::models::*;

+use super::dedup::compute_fingerprint;
 use super::orchestrator::PipelineOrchestrator;
 use crate::error::AgentError;
 use crate::pipeline::code_review::CodeReviewScanner;
@@ -89,12 +90,37 @@ impl PipelineOrchestrator {
            return Ok(());
        }

+        // Dedup findings by fingerprint to avoid duplicate comments
+        let mut seen_fps = std::collections::HashSet::new();
+        let mut unique_findings: Vec<&Finding> = Vec::new();
+        for finding in &pr_findings {
+            let fp = compute_fingerprint(&[
+                repo_id,
+                &pr_number.to_string(),
+                finding.file_path.as_deref().unwrap_or(""),
+                &finding.line_number.unwrap_or(0).to_string(),
+                &finding.title,
+            ]);
+            if seen_fps.insert(fp) {
+                unique_findings.push(finding);
+            }
+        }
+
+        let pr_findings = unique_findings;
+
        // Build review comments from findings
        let mut review_comments = Vec::new();
        for finding in &pr_findings {
            if let (Some(path), Some(line)) = (&finding.file_path, finding.line_number) {
+                let fp = compute_fingerprint(&[
+                    repo_id,
+                    &pr_number.to_string(),
+                    path,
+                    &line.to_string(),
+                    &finding.title,
+                ]);
                let comment_body = format!(
-                    "**[{}] {}**\n\n{}\n\n*Scanner: {} | {}*",
+                    "**[{}] {}**\n\n{}\n\n*Scanner: {} | {}*\n\n<!-- compliance-fp:{fp} -->",
                    finding.severity,
                    finding.title,
                    finding.description,
@@ -123,6 +149,17 @@ impl PipelineOrchestrator {
                .join("\n"),
        );

+        if review_comments.is_empty() {
+            // All findings were on files/lines we can't comment on inline
+            if let Err(e) = tracker
+                .create_pr_review(owner, tracker_repo_name, pr_number, &summary, Vec::new())
+                .await
+            {
+                tracing::warn!("[{repo_id}] Failed to post PR review summary: {e}");
+            }
+            return Ok(());
+        }
+
        if let Err(e) = tracker
            .create_pr_review(
                owner,