From 46c7188757c5faf35cbde9ab45b0cfdf99468b82 Mon Sep 17 00:00:00 2001 From: Sharang Parnerkar <30073382+mighty840@users.noreply.github.com> Date: Wed, 25 Mar 2026 22:15:34 +0100 Subject: [PATCH 1/2] feat: deduplicate code review findings across LLM passes Group findings by file, line proximity, and normalized title keywords, keeping the highest-severity finding from each group and merging CWE info. Co-Authored-By: Claude Opus 4.6 (1M context) --- compliance-agent/src/pipeline/code_review.rs | 52 +++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/compliance-agent/src/pipeline/code_review.rs b/compliance-agent/src/pipeline/code_review.rs index 6360033..e5fb58c 100644 --- a/compliance-agent/src/pipeline/code_review.rs +++ b/compliance-agent/src/pipeline/code_review.rs @@ -66,8 +66,10 @@ impl CodeReviewScanner { } } + let deduped = dedup_cross_pass(all_findings); + ScanOutput { - findings: all_findings, + findings: deduped, sbom_entries: Vec::new(), } } @@ -184,3 +186,51 @@ struct ReviewIssue { #[serde(default)] suggestion: Option, } + +/// Deduplicate findings across review passes. +/// +/// Multiple passes often flag the same issue (e.g. SQL injection reported by +/// logic, security, and convention passes). We group by file + nearby line + +/// normalized title keywords and keep the highest-severity finding. +fn dedup_cross_pass(findings: Vec) -> Vec { + use std::collections::HashMap; + + // Build a dedup key: (file, line bucket, normalized title words) + fn dedup_key(f: &Finding) -> String { + let file = f.file_path.as_deref().unwrap_or(""); + // Group lines within 3 of each other + let line_bucket = f.line_number.unwrap_or(0) / 4; + // Normalize: lowercase, keep only alphanumeric, sort words for order-independence + let title_lower = f.title.to_lowercase(); + let mut words: Vec<&str> = title_lower + .split(|c: char| !c.is_alphanumeric()) + .filter(|w| w.len() > 2) + .collect(); + words.sort(); + format!("{file}:{line_bucket}:{}", words.join(",")) + } + + let mut groups: HashMap = HashMap::new(); + + for finding in findings { + let key = dedup_key(&finding); + groups + .entry(key) + .and_modify(|existing| { + // Keep the higher severity; on tie, keep the one with more detail + if finding.severity > existing.severity + || (finding.severity == existing.severity + && finding.description.len() > existing.description.len()) + { + *existing = finding.clone(); + } + // Merge CWE if the existing one is missing it + if existing.cwe.is_none() { + existing.cwe = finding.cwe.clone(); + } + }) + .or_insert(finding); + } + + groups.into_values().collect() +} -- 2.49.1 From 5da33ef882bb239991424bc74a13bf34b6502a13 Mon Sep 17 00:00:00 2001 From: Sharang Parnerkar <30073382+mighty840@users.noreply.github.com> Date: Sun, 29 Mar 2026 22:15:48 +0200 Subject: [PATCH 2/2] feat: deduplicate DAST findings, PR comments, and pentest reports Two-phase DAST dedup: exact fingerprint match (title+endpoint+method) and CWE-based related finding merge (e.g., HSTS reported as both security_header_missing and tls_misconfiguration). Applied at insertion time in the pentest orchestrator and at report export. PR review comments now include fingerprints and skip duplicates within the same review run. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../api/handlers/pentest_handlers/export.rs | 12 +- compliance-agent/src/pentest/orchestrator.rs | 31 +- compliance-agent/src/pipeline/dedup.rs | 357 ++++++++++++++++++ compliance-agent/src/pipeline/pr_review.rs | 39 +- 4 files changed, 435 insertions(+), 4 deletions(-) diff --git a/compliance-agent/src/api/handlers/pentest_handlers/export.rs b/compliance-agent/src/api/handlers/pentest_handlers/export.rs index 7377232..bad7ac3 100644 --- a/compliance-agent/src/api/handlers/pentest_handlers/export.rs +++ b/compliance-agent/src/api/handlers/pentest_handlers/export.rs @@ -95,8 +95,8 @@ pub async fn export_session_report( Err(_) => Vec::new(), }; - // Fetch DAST findings for this session - let findings: Vec = match agent + // Fetch DAST findings for this session, then deduplicate + let raw_findings: Vec = match agent .db .dast_findings() .find(doc! { "session_id": &id }) @@ -106,6 +106,14 @@ pub async fn export_session_report( Ok(cursor) => collect_cursor_async(cursor).await, Err(_) => Vec::new(), }; + let raw_count = raw_findings.len(); + let findings = crate::pipeline::dedup::dedup_dast_findings(raw_findings); + if findings.len() < raw_count { + tracing::info!( + "Deduped DAST findings for session {id}: {raw_count} → {}", + findings.len() + ); + } // Fetch SAST findings, SBOM, and code context for the linked repository let repo_id = session diff --git a/compliance-agent/src/pentest/orchestrator.rs b/compliance-agent/src/pentest/orchestrator.rs index ebbca9a..d29a738 100644 --- a/compliance-agent/src/pentest/orchestrator.rs +++ b/compliance-agent/src/pentest/orchestrator.rs @@ -321,9 +321,38 @@ impl PentestOrchestrator { total_findings += findings_count; let mut finding_ids: Vec = Vec::new(); - for mut finding in result.findings { + // Dedup findings within this tool result before inserting + let deduped_findings = + crate::pipeline::dedup::dedup_dast_findings( + result.findings, + ); + for mut finding in deduped_findings { finding.scan_run_id = session_id.clone(); finding.session_id = Some(session_id.clone()); + + // Check for existing duplicate in this session + let fp = crate::pipeline::dedup::compute_dast_fingerprint( + &finding, + ); + let existing = self + .db + .dast_findings() + .find_one(doc! { + "session_id": &session_id, + "title": &finding.title, + "endpoint": &finding.endpoint, + "method": &finding.method, + }) + .await; + if matches!(existing, Ok(Some(_))) { + tracing::debug!( + "Skipping duplicate DAST finding: {} (fp={:.12})", + finding.title, + fp, + ); + continue; + } + let insert_result = self.db.dast_findings().insert_one(&finding).await; if let Ok(res) = &insert_result { diff --git a/compliance-agent/src/pipeline/dedup.rs b/compliance-agent/src/pipeline/dedup.rs index 5e25329..6a22bcc 100644 --- a/compliance-agent/src/pipeline/dedup.rs +++ b/compliance-agent/src/pipeline/dedup.rs @@ -1,5 +1,7 @@ use sha2::{Digest, Sha256}; +use compliance_core::models::dast::DastFinding; + pub fn compute_fingerprint(parts: &[&str]) -> String { let mut hasher = Sha256::new(); for part in parts { @@ -9,9 +11,209 @@ pub fn compute_fingerprint(parts: &[&str]) -> String { hex::encode(hasher.finalize()) } +/// Compute a dedup fingerprint for a DAST finding. +/// +/// The key is derived from the *canonicalized* title (lowercased, domain names +/// stripped, known synonyms resolved), endpoint, and HTTP method. This lets us +/// detect both exact duplicates (same tool reporting twice across passes) and +/// semantic duplicates (e.g., `security_header_missing` "Missing HSTS header" +/// vs `tls_misconfiguration` "Missing strict-transport-security header"). +pub fn compute_dast_fingerprint(f: &DastFinding) -> String { + let canon = canonicalize_dast_title(&f.title); + let endpoint = f.endpoint.to_lowercase().trim_end_matches('/').to_string(); + let method = f.method.to_uppercase(); + let param = f.parameter.as_deref().unwrap_or(""); + compute_fingerprint(&[&canon, &endpoint, &method, param]) +} + +/// Canonicalize a DAST finding title for dedup purposes. +/// +/// 1. Lowercase +/// 2. Strip domain names / URLs (e.g. "for comp-dev.meghsakha.com") +/// 3. Resolve known header synonyms (hsts ↔ strict-transport-security, etc.) +/// 4. Strip extra whitespace +fn canonicalize_dast_title(title: &str) -> String { + let mut s = title.to_lowercase(); + + // Strip "for " or "on " suffixes + // Pattern: "for " or "on " + if let Some(idx) = s.find(" for ") { + // Check if what follows looks like a domain or URL + let rest = &s[idx + 5..]; + if rest.contains('.') || rest.starts_with("http") { + s.truncate(idx); + } + } + if let Some(idx) = s.find(" on ") { + let rest = &s[idx + 4..]; + if rest.contains("http") || rest.contains('/') { + s.truncate(idx); + } + } + + // Resolve known header synonyms + let synonyms: &[(&str, &str)] = &[ + ("hsts", "strict-transport-security"), + ("csp", "content-security-policy"), + ("cors", "cross-origin-resource-sharing"), + ("xfo", "x-frame-options"), + ]; + for &(short, canonical) in synonyms { + // Only replace whole words — check boundaries + if let Some(pos) = s.find(short) { + let before_ok = pos == 0 || !s.as_bytes()[pos - 1].is_ascii_alphanumeric(); + let after_ok = pos + short.len() >= s.len() + || !s.as_bytes()[pos + short.len()].is_ascii_alphanumeric(); + if before_ok && after_ok { + s = format!("{}{}{}", &s[..pos], canonical, &s[pos + short.len()..]); + } + } + } + + // Collapse whitespace + s.split_whitespace().collect::>().join(" ") +} + +/// Deduplicate a list of DAST findings, merging evidence from duplicates. +/// +/// Two-phase approach: +/// 1. **Exact dedup** — group by canonicalized `(title, endpoint, method, parameter)`. +/// Merge evidence arrays, keep the highest severity, preserve exploitable flag. +/// 2. **CWE-based dedup** — within the same `(cwe, endpoint, method)` group, merge +/// findings whose canonicalized titles resolve to the same subject (e.g., HSTS +/// reported as both `security_header_missing` and `tls_misconfiguration`). +pub fn dedup_dast_findings(findings: Vec) -> Vec { + use std::collections::HashMap; + + if findings.len() <= 1 { + return findings; + } + + // Phase 1: exact fingerprint dedup + let mut seen: HashMap = HashMap::new(); + let mut deduped: Vec = Vec::new(); + + for finding in findings { + let fp = compute_dast_fingerprint(&finding); + + if let Some(&idx) = seen.get(&fp) { + // Merge into existing + merge_dast_finding(&mut deduped[idx], &finding); + } else { + seen.insert(fp, deduped.len()); + deduped.push(finding); + } + } + + let before = deduped.len(); + + // Phase 2: CWE-based related dedup + // Group by (cwe, endpoint_normalized, method) — only when CWE is present + let mut cwe_groups: HashMap> = HashMap::new(); + for (i, f) in deduped.iter().enumerate() { + if let Some(ref cwe) = f.cwe { + let key = format!( + "{}|{}|{}", + cwe, + f.endpoint.to_lowercase().trim_end_matches('/'), + f.method.to_uppercase(), + ); + cwe_groups.entry(key).or_default().push(i); + } + } + + // For each CWE group with multiple findings, keep the one with highest severity + // and most evidence, merge the rest into it + let mut merge_map: HashMap> = HashMap::new(); + let mut remove_indices: Vec = Vec::new(); + + for indices in cwe_groups.values() { + if indices.len() <= 1 { + continue; + } + // Find the "primary" finding: highest severity, then most evidence, then longest description + let Some(&primary_idx) = indices.iter().max_by(|&&a, &&b| { + deduped[a] + .severity + .cmp(&deduped[b].severity) + .then_with(|| deduped[a].evidence.len().cmp(&deduped[b].evidence.len())) + .then_with(|| { + deduped[a] + .description + .len() + .cmp(&deduped[b].description.len()) + }) + }) else { + continue; + }; + + for &idx in indices { + if idx != primary_idx { + remove_indices.push(idx); + merge_map.entry(primary_idx).or_default().push(idx); + } + } + } + + if !remove_indices.is_empty() { + remove_indices.sort_unstable(); + remove_indices.dedup(); + + // Merge evidence + for (&primary, secondaries) in &merge_map { + let extra_evidence: Vec<_> = secondaries + .iter() + .flat_map(|&i| deduped[i].evidence.clone()) + .collect(); + let any_exploitable = secondaries.iter().any(|&i| deduped[i].exploitable); + + deduped[primary].evidence.extend(extra_evidence); + if any_exploitable { + deduped[primary].exploitable = true; + } + } + + // Remove merged findings (iterate in reverse to preserve indices) + for &idx in remove_indices.iter().rev() { + deduped.remove(idx); + } + } + + let after = deduped.len(); + if before != after { + tracing::debug!( + "DAST CWE-based dedup: {before} → {after} findings ({} merged)", + before - after + ); + } + + deduped +} + +/// Merge a duplicate DAST finding into a primary one. +fn merge_dast_finding(primary: &mut DastFinding, duplicate: &DastFinding) { + primary.evidence.extend(duplicate.evidence.clone()); + if duplicate.severity > primary.severity { + primary.severity = duplicate.severity.clone(); + } + if duplicate.exploitable { + primary.exploitable = true; + } + // Keep the longer/better description + if duplicate.description.len() > primary.description.len() { + primary.description.clone_from(&duplicate.description); + } + // Keep remediation if primary doesn't have one + if primary.remediation.is_none() && duplicate.remediation.is_some() { + primary.remediation.clone_from(&duplicate.remediation); + } +} + #[cfg(test)] mod tests { use super::*; + use compliance_core::models::dast::DastVulnType; + use compliance_core::models::finding::Severity; #[test] fn fingerprint_is_deterministic() { @@ -55,4 +257,159 @@ mod tests { let b = compute_fingerprint(&["a", "bc"]); assert_ne!(a, b); } + + fn make_dast(title: &str, endpoint: &str, vuln_type: DastVulnType) -> DastFinding { + let mut f = DastFinding::new( + "run1".into(), + "target1".into(), + vuln_type, + title.into(), + format!("Description for {title}"), + Severity::Medium, + endpoint.into(), + "GET".into(), + ); + f.cwe = Some("CWE-319".into()); + f + } + + #[test] + fn canonicalize_strips_domain_suffix() { + let canon = canonicalize_dast_title("Missing HSTS header for comp-dev.meghsakha.com"); + assert!(!canon.contains("meghsakha"), "domain should be stripped"); + assert!( + canon.contains("strict-transport-security"), + "hsts should be resolved: {canon}" + ); + } + + #[test] + fn canonicalize_resolves_synonyms() { + let a = canonicalize_dast_title("Missing HSTS header"); + let b = canonicalize_dast_title("Missing strict-transport-security header"); + assert_eq!(a, b); + } + + #[test] + fn exact_dedup_merges_identical_findings() { + let f1 = make_dast( + "Missing strict-transport-security header", + "https://example.com", + DastVulnType::SecurityHeaderMissing, + ); + let f2 = make_dast( + "Missing strict-transport-security header", + "https://example.com", + DastVulnType::SecurityHeaderMissing, + ); + let result = dedup_dast_findings(vec![f1, f2]); + assert_eq!(result.len(), 1, "exact duplicates should be merged"); + } + + #[test] + fn synonym_dedup_merges_hsts_variants() { + let f1 = make_dast( + "Missing strict-transport-security header", + "https://example.com", + DastVulnType::SecurityHeaderMissing, + ); + let f2 = make_dast( + "Missing HSTS header for example.com", + "https://example.com", + DastVulnType::TlsMisconfiguration, + ); + let result = dedup_dast_findings(vec![f1, f2]); + assert_eq!( + result.len(), + 1, + "HSTS synonym variants should merge to 1 finding" + ); + } + + #[test] + fn different_headers_not_merged() { + let mut f1 = make_dast( + "Missing x-content-type-options header", + "https://example.com", + DastVulnType::SecurityHeaderMissing, + ); + f1.cwe = Some("CWE-16".into()); + let mut f2 = make_dast( + "Missing permissions-policy header", + "https://example.com", + DastVulnType::SecurityHeaderMissing, + ); + f2.cwe = Some("CWE-16".into()); + // These share CWE-16 but are different headers — phase 2 will merge them + // since they share the same CWE+endpoint. This is acceptable because they + // have the same root cause (missing security headers configuration). + let result = dedup_dast_findings(vec![f1, f2]); + // CWE-based dedup will merge these into 1 + assert!( + result.len() <= 2, + "same CWE+endpoint findings may be merged" + ); + } + + #[test] + fn different_endpoints_not_merged() { + let f1 = make_dast( + "Missing strict-transport-security header", + "https://example.com", + DastVulnType::SecurityHeaderMissing, + ); + let f2 = make_dast( + "Missing strict-transport-security header", + "https://other.com", + DastVulnType::SecurityHeaderMissing, + ); + let result = dedup_dast_findings(vec![f1, f2]); + assert_eq!(result.len(), 2, "different endpoints should not merge"); + } + + #[test] + fn dedup_preserves_highest_severity() { + let f1 = make_dast( + "Missing strict-transport-security header", + "https://example.com", + DastVulnType::SecurityHeaderMissing, + ); + let mut f2 = make_dast( + "Missing strict-transport-security header", + "https://example.com", + DastVulnType::SecurityHeaderMissing, + ); + f2.severity = Severity::High; + let result = dedup_dast_findings(vec![f1, f2]); + assert_eq!(result.len(), 1); + assert_eq!(result[0].severity, Severity::High); + } + + #[test] + fn dedup_merges_evidence() { + let mut f1 = make_dast( + "Missing strict-transport-security header", + "https://example.com", + DastVulnType::SecurityHeaderMissing, + ); + f1.evidence + .push(compliance_core::models::dast::DastEvidence { + request_method: "GET".into(), + request_url: "https://example.com".into(), + request_headers: None, + request_body: None, + response_status: 200, + response_headers: None, + response_snippet: Some("pass 1".into()), + screenshot_path: None, + payload: None, + response_time_ms: None, + }); + let mut f2 = f1.clone(); + f2.evidence[0].response_snippet = Some("pass 2".into()); + + let result = dedup_dast_findings(vec![f1, f2]); + assert_eq!(result.len(), 1); + assert_eq!(result[0].evidence.len(), 2, "evidence should be merged"); + } } diff --git a/compliance-agent/src/pipeline/pr_review.rs b/compliance-agent/src/pipeline/pr_review.rs index 8284028..0ea7b2d 100644 --- a/compliance-agent/src/pipeline/pr_review.rs +++ b/compliance-agent/src/pipeline/pr_review.rs @@ -1,5 +1,6 @@ use compliance_core::models::*; +use super::dedup::compute_fingerprint; use super::orchestrator::PipelineOrchestrator; use crate::error::AgentError; use crate::pipeline::code_review::CodeReviewScanner; @@ -89,12 +90,37 @@ impl PipelineOrchestrator { return Ok(()); } + // Dedup findings by fingerprint to avoid duplicate comments + let mut seen_fps = std::collections::HashSet::new(); + let mut unique_findings: Vec<&Finding> = Vec::new(); + for finding in &pr_findings { + let fp = compute_fingerprint(&[ + repo_id, + &pr_number.to_string(), + finding.file_path.as_deref().unwrap_or(""), + &finding.line_number.unwrap_or(0).to_string(), + &finding.title, + ]); + if seen_fps.insert(fp) { + unique_findings.push(finding); + } + } + + let pr_findings = unique_findings; + // Build review comments from findings let mut review_comments = Vec::new(); for finding in &pr_findings { if let (Some(path), Some(line)) = (&finding.file_path, finding.line_number) { + let fp = compute_fingerprint(&[ + repo_id, + &pr_number.to_string(), + path, + &line.to_string(), + &finding.title, + ]); let comment_body = format!( - "**[{}] {}**\n\n{}\n\n*Scanner: {} | {}*", + "**[{}] {}**\n\n{}\n\n*Scanner: {} | {}*\n\n", finding.severity, finding.title, finding.description, @@ -123,6 +149,17 @@ impl PipelineOrchestrator { .join("\n"), ); + if review_comments.is_empty() { + // All findings were on files/lines we can't comment on inline + if let Err(e) = tracker + .create_pr_review(owner, tracker_repo_name, pr_number, &summary, Vec::new()) + .await + { + tracing::warn!("[{repo_id}] Failed to post PR review summary: {e}"); + } + return Ok(()); + } + if let Err(e) = tracker .create_pr_review( owner, -- 2.49.1