feat: deduplicate code review findings across LLM passes (#48)
All checks were successful
CI / Check (push) Has been skipped
CI / Detect Changes (push) Successful in 7s
CI / Deploy Agent (push) Successful in 2s
CI / Deploy Dashboard (push) Has been skipped
CI / Deploy Docs (push) Has been skipped
CI / Deploy MCP (push) Has been skipped

This commit was merged in pull request #48.
This commit is contained in:
2026-03-29 20:38:52 +00:00
parent 745ad8a441
commit ff088f9eb4
5 changed files with 486 additions and 5 deletions

View File

@@ -95,8 +95,8 @@ pub async fn export_session_report(
Err(_) => Vec::new(),
};
// Fetch DAST findings for this session
let findings: Vec<DastFinding> = match agent
// Fetch DAST findings for this session, then deduplicate
let raw_findings: Vec<DastFinding> = match agent
.db
.dast_findings()
.find(doc! { "session_id": &id })
@@ -106,6 +106,14 @@ pub async fn export_session_report(
Ok(cursor) => collect_cursor_async(cursor).await,
Err(_) => Vec::new(),
};
let raw_count = raw_findings.len();
let findings = crate::pipeline::dedup::dedup_dast_findings(raw_findings);
if findings.len() < raw_count {
tracing::info!(
"Deduped DAST findings for session {id}: {raw_count} → {}",
findings.len()
);
}
// Fetch SAST findings, SBOM, and code context for the linked repository
let repo_id = session

View File

@@ -321,9 +321,38 @@ impl PentestOrchestrator {
total_findings += findings_count;
let mut finding_ids: Vec<String> = Vec::new();
for mut finding in result.findings {
// Dedup findings within this tool result before inserting
let deduped_findings =
crate::pipeline::dedup::dedup_dast_findings(
result.findings,
);
for mut finding in deduped_findings {
finding.scan_run_id = session_id.clone();
finding.session_id = Some(session_id.clone());
// Check for existing duplicate in this session
let fp = crate::pipeline::dedup::compute_dast_fingerprint(
&finding,
);
let existing = self
.db
.dast_findings()
.find_one(doc! {
"session_id": &session_id,
"title": &finding.title,
"endpoint": &finding.endpoint,
"method": &finding.method,
})
.await;
if matches!(existing, Ok(Some(_))) {
tracing::debug!(
"Skipping duplicate DAST finding: {} (fp={:.12})",
finding.title,
fp,
);
continue;
}
let insert_result =
self.db.dast_findings().insert_one(&finding).await;
if let Ok(res) = &insert_result {

View File

@@ -66,8 +66,10 @@ impl CodeReviewScanner {
}
}
let deduped = dedup_cross_pass(all_findings);
ScanOutput {
findings: all_findings,
findings: deduped,
sbom_entries: Vec::new(),
}
}
@@ -184,3 +186,51 @@ struct ReviewIssue {
#[serde(default)]
suggestion: Option<String>,
}
/// Deduplicate findings across review passes.
///
/// Multiple passes often flag the same issue (e.g. SQL injection reported by
/// logic, security, and convention passes). We group by file + nearby line +
/// normalized title keywords and keep the highest-severity finding.
fn dedup_cross_pass(findings: Vec<Finding>) -> Vec<Finding> {
use std::collections::HashMap;
// Build a dedup key: (file, line bucket, normalized title words)
fn dedup_key(f: &Finding) -> String {
let file = f.file_path.as_deref().unwrap_or("");
// Group lines within 3 of each other
let line_bucket = f.line_number.unwrap_or(0) / 4;
// Normalize: lowercase, keep only alphanumeric, sort words for order-independence
let title_lower = f.title.to_lowercase();
let mut words: Vec<&str> = title_lower
.split(|c: char| !c.is_alphanumeric())
.filter(|w| w.len() > 2)
.collect();
words.sort();
format!("{file}:{line_bucket}:{}", words.join(","))
}
let mut groups: HashMap<String, Finding> = HashMap::new();
for finding in findings {
let key = dedup_key(&finding);
groups
.entry(key)
.and_modify(|existing| {
// Keep the higher severity; on tie, keep the one with more detail
if finding.severity > existing.severity
|| (finding.severity == existing.severity
&& finding.description.len() > existing.description.len())
{
*existing = finding.clone();
}
// Merge CWE if the existing one is missing it
if existing.cwe.is_none() {
existing.cwe = finding.cwe.clone();
}
})
.or_insert(finding);
}
groups.into_values().collect()
}

View File

@@ -1,5 +1,7 @@
use sha2::{Digest, Sha256};
use compliance_core::models::dast::DastFinding;
pub fn compute_fingerprint(parts: &[&str]) -> String {
let mut hasher = Sha256::new();
for part in parts {
@@ -9,9 +11,209 @@ pub fn compute_fingerprint(parts: &[&str]) -> String {
hex::encode(hasher.finalize())
}
/// Compute a dedup fingerprint for a DAST finding.
///
/// The key is derived from the *canonicalized* title (lowercased, domain names
/// stripped, known synonyms resolved), endpoint, and HTTP method. This lets us
/// detect both exact duplicates (same tool reporting twice across passes) and
/// semantic duplicates (e.g., `security_header_missing` "Missing HSTS header"
/// vs `tls_misconfiguration` "Missing strict-transport-security header").
pub fn compute_dast_fingerprint(f: &DastFinding) -> String {
let canon = canonicalize_dast_title(&f.title);
let endpoint = f.endpoint.to_lowercase().trim_end_matches('/').to_string();
let method = f.method.to_uppercase();
let param = f.parameter.as_deref().unwrap_or("");
compute_fingerprint(&[&canon, &endpoint, &method, param])
}
/// Canonicalize a DAST finding title for dedup purposes.
///
/// 1. Lowercase
/// 2. Strip domain names / URLs (e.g. "for comp-dev.meghsakha.com")
/// 3. Resolve known header synonyms (hsts ↔ strict-transport-security, etc.)
/// 4. Strip extra whitespace
fn canonicalize_dast_title(title: &str) -> String {
let mut s = title.to_lowercase();
// Strip "for <domain>" or "on <domain>" suffixes
// Pattern: "for <word.word...>" or "on <method> <url>"
if let Some(idx) = s.find(" for ") {
// Check if what follows looks like a domain or URL
let rest = &s[idx + 5..];
if rest.contains('.') || rest.starts_with("http") {
s.truncate(idx);
}
}
if let Some(idx) = s.find(" on ") {
let rest = &s[idx + 4..];
if rest.contains("http") || rest.contains('/') {
s.truncate(idx);
}
}
// Resolve known header synonyms
let synonyms: &[(&str, &str)] = &[
("hsts", "strict-transport-security"),
("csp", "content-security-policy"),
("cors", "cross-origin-resource-sharing"),
("xfo", "x-frame-options"),
];
for &(short, canonical) in synonyms {
// Only replace whole words — check boundaries
if let Some(pos) = s.find(short) {
let before_ok = pos == 0 || !s.as_bytes()[pos - 1].is_ascii_alphanumeric();
let after_ok = pos + short.len() >= s.len()
|| !s.as_bytes()[pos + short.len()].is_ascii_alphanumeric();
if before_ok && after_ok {
s = format!("{}{}{}", &s[..pos], canonical, &s[pos + short.len()..]);
}
}
}
// Collapse whitespace
s.split_whitespace().collect::<Vec<_>>().join(" ")
}
/// Deduplicate a list of DAST findings, merging evidence from duplicates.
///
/// Two-phase approach:
/// 1. **Exact dedup** — group by canonicalized `(title, endpoint, method, parameter)`.
/// Merge evidence arrays, keep the highest severity, preserve exploitable flag.
/// 2. **CWE-based dedup** — within the same `(cwe, endpoint, method)` group, merge
/// findings whose canonicalized titles resolve to the same subject (e.g., HSTS
/// reported as both `security_header_missing` and `tls_misconfiguration`).
pub fn dedup_dast_findings(findings: Vec<DastFinding>) -> Vec<DastFinding> {
use std::collections::HashMap;
if findings.len() <= 1 {
return findings;
}
// Phase 1: exact fingerprint dedup
let mut seen: HashMap<String, usize> = HashMap::new();
let mut deduped: Vec<DastFinding> = Vec::new();
for finding in findings {
let fp = compute_dast_fingerprint(&finding);
if let Some(&idx) = seen.get(&fp) {
// Merge into existing
merge_dast_finding(&mut deduped[idx], &finding);
} else {
seen.insert(fp, deduped.len());
deduped.push(finding);
}
}
let before = deduped.len();
// Phase 2: CWE-based related dedup
// Group by (cwe, endpoint_normalized, method) — only when CWE is present
let mut cwe_groups: HashMap<String, Vec<usize>> = HashMap::new();
for (i, f) in deduped.iter().enumerate() {
if let Some(ref cwe) = f.cwe {
let key = format!(
"{}|{}|{}",
cwe,
f.endpoint.to_lowercase().trim_end_matches('/'),
f.method.to_uppercase(),
);
cwe_groups.entry(key).or_default().push(i);
}
}
// For each CWE group with multiple findings, keep the one with highest severity
// and most evidence, merge the rest into it
let mut merge_map: HashMap<usize, Vec<usize>> = HashMap::new();
let mut remove_indices: Vec<usize> = Vec::new();
for indices in cwe_groups.values() {
if indices.len() <= 1 {
continue;
}
// Find the "primary" finding: highest severity, then most evidence, then longest description
let Some(&primary_idx) = indices.iter().max_by(|&&a, &&b| {
deduped[a]
.severity
.cmp(&deduped[b].severity)
.then_with(|| deduped[a].evidence.len().cmp(&deduped[b].evidence.len()))
.then_with(|| {
deduped[a]
.description
.len()
.cmp(&deduped[b].description.len())
})
}) else {
continue;
};
for &idx in indices {
if idx != primary_idx {
remove_indices.push(idx);
merge_map.entry(primary_idx).or_default().push(idx);
}
}
}
if !remove_indices.is_empty() {
remove_indices.sort_unstable();
remove_indices.dedup();
// Merge evidence
for (&primary, secondaries) in &merge_map {
let extra_evidence: Vec<_> = secondaries
.iter()
.flat_map(|&i| deduped[i].evidence.clone())
.collect();
let any_exploitable = secondaries.iter().any(|&i| deduped[i].exploitable);
deduped[primary].evidence.extend(extra_evidence);
if any_exploitable {
deduped[primary].exploitable = true;
}
}
// Remove merged findings (iterate in reverse to preserve indices)
for &idx in remove_indices.iter().rev() {
deduped.remove(idx);
}
}
let after = deduped.len();
if before != after {
tracing::debug!(
"DAST CWE-based dedup: {before} → {after} findings ({} merged)",
before - after
);
}
deduped
}
/// Merge a duplicate DAST finding into a primary one.
fn merge_dast_finding(primary: &mut DastFinding, duplicate: &DastFinding) {
primary.evidence.extend(duplicate.evidence.clone());
if duplicate.severity > primary.severity {
primary.severity = duplicate.severity.clone();
}
if duplicate.exploitable {
primary.exploitable = true;
}
// Keep the longer/better description
if duplicate.description.len() > primary.description.len() {
primary.description.clone_from(&duplicate.description);
}
// Keep remediation if primary doesn't have one
if primary.remediation.is_none() && duplicate.remediation.is_some() {
primary.remediation.clone_from(&duplicate.remediation);
}
}
#[cfg(test)]
mod tests {
use super::*;
use compliance_core::models::dast::DastVulnType;
use compliance_core::models::finding::Severity;
#[test]
fn fingerprint_is_deterministic() {
@@ -55,4 +257,159 @@ mod tests {
let b = compute_fingerprint(&["a", "bc"]);
assert_ne!(a, b);
}
fn make_dast(title: &str, endpoint: &str, vuln_type: DastVulnType) -> DastFinding {
let mut f = DastFinding::new(
"run1".into(),
"target1".into(),
vuln_type,
title.into(),
format!("Description for {title}"),
Severity::Medium,
endpoint.into(),
"GET".into(),
);
f.cwe = Some("CWE-319".into());
f
}
#[test]
fn canonicalize_strips_domain_suffix() {
let canon = canonicalize_dast_title("Missing HSTS header for comp-dev.meghsakha.com");
assert!(!canon.contains("meghsakha"), "domain should be stripped");
assert!(
canon.contains("strict-transport-security"),
"hsts should be resolved: {canon}"
);
}
#[test]
fn canonicalize_resolves_synonyms() {
let a = canonicalize_dast_title("Missing HSTS header");
let b = canonicalize_dast_title("Missing strict-transport-security header");
assert_eq!(a, b);
}
#[test]
fn exact_dedup_merges_identical_findings() {
let f1 = make_dast(
"Missing strict-transport-security header",
"https://example.com",
DastVulnType::SecurityHeaderMissing,
);
let f2 = make_dast(
"Missing strict-transport-security header",
"https://example.com",
DastVulnType::SecurityHeaderMissing,
);
let result = dedup_dast_findings(vec![f1, f2]);
assert_eq!(result.len(), 1, "exact duplicates should be merged");
}
#[test]
fn synonym_dedup_merges_hsts_variants() {
let f1 = make_dast(
"Missing strict-transport-security header",
"https://example.com",
DastVulnType::SecurityHeaderMissing,
);
let f2 = make_dast(
"Missing HSTS header for example.com",
"https://example.com",
DastVulnType::TlsMisconfiguration,
);
let result = dedup_dast_findings(vec![f1, f2]);
assert_eq!(
result.len(),
1,
"HSTS synonym variants should merge to 1 finding"
);
}
#[test]
fn different_headers_not_merged() {
let mut f1 = make_dast(
"Missing x-content-type-options header",
"https://example.com",
DastVulnType::SecurityHeaderMissing,
);
f1.cwe = Some("CWE-16".into());
let mut f2 = make_dast(
"Missing permissions-policy header",
"https://example.com",
DastVulnType::SecurityHeaderMissing,
);
f2.cwe = Some("CWE-16".into());
// These share CWE-16 but are different headers — phase 2 will merge them
// since they share the same CWE+endpoint. This is acceptable because they
// have the same root cause (missing security headers configuration).
let result = dedup_dast_findings(vec![f1, f2]);
// CWE-based dedup will merge these into 1
assert!(
result.len() <= 2,
"same CWE+endpoint findings may be merged"
);
}
#[test]
fn different_endpoints_not_merged() {
let f1 = make_dast(
"Missing strict-transport-security header",
"https://example.com",
DastVulnType::SecurityHeaderMissing,
);
let f2 = make_dast(
"Missing strict-transport-security header",
"https://other.com",
DastVulnType::SecurityHeaderMissing,
);
let result = dedup_dast_findings(vec![f1, f2]);
assert_eq!(result.len(), 2, "different endpoints should not merge");
}
#[test]
fn dedup_preserves_highest_severity() {
let f1 = make_dast(
"Missing strict-transport-security header",
"https://example.com",
DastVulnType::SecurityHeaderMissing,
);
let mut f2 = make_dast(
"Missing strict-transport-security header",
"https://example.com",
DastVulnType::SecurityHeaderMissing,
);
f2.severity = Severity::High;
let result = dedup_dast_findings(vec![f1, f2]);
assert_eq!(result.len(), 1);
assert_eq!(result[0].severity, Severity::High);
}
#[test]
fn dedup_merges_evidence() {
let mut f1 = make_dast(
"Missing strict-transport-security header",
"https://example.com",
DastVulnType::SecurityHeaderMissing,
);
f1.evidence
.push(compliance_core::models::dast::DastEvidence {
request_method: "GET".into(),
request_url: "https://example.com".into(),
request_headers: None,
request_body: None,
response_status: 200,
response_headers: None,
response_snippet: Some("pass 1".into()),
screenshot_path: None,
payload: None,
response_time_ms: None,
});
let mut f2 = f1.clone();
f2.evidence[0].response_snippet = Some("pass 2".into());
let result = dedup_dast_findings(vec![f1, f2]);
assert_eq!(result.len(), 1);
assert_eq!(result[0].evidence.len(), 2, "evidence should be merged");
}
}

View File

@@ -1,5 +1,6 @@
use compliance_core::models::*;
use super::dedup::compute_fingerprint;
use super::orchestrator::PipelineOrchestrator;
use crate::error::AgentError;
use crate::pipeline::code_review::CodeReviewScanner;
@@ -89,12 +90,37 @@ impl PipelineOrchestrator {
return Ok(());
}
// Dedup findings by fingerprint to avoid duplicate comments
let mut seen_fps = std::collections::HashSet::new();
let mut unique_findings: Vec<&Finding> = Vec::new();
for finding in &pr_findings {
let fp = compute_fingerprint(&[
repo_id,
&pr_number.to_string(),
finding.file_path.as_deref().unwrap_or(""),
&finding.line_number.unwrap_or(0).to_string(),
&finding.title,
]);
if seen_fps.insert(fp) {
unique_findings.push(finding);
}
}
let pr_findings = unique_findings;
// Build review comments from findings
let mut review_comments = Vec::new();
for finding in &pr_findings {
if let (Some(path), Some(line)) = (&finding.file_path, finding.line_number) {
let fp = compute_fingerprint(&[
repo_id,
&pr_number.to_string(),
path,
&line.to_string(),
&finding.title,
]);
let comment_body = format!(
"**[{}] {}**\n\n{}\n\n*Scanner: {} | {}*",
"**[{}] {}**\n\n{}\n\n*Scanner: {} | {}*\n\n<!-- compliance-fp:{fp} -->",
finding.severity,
finding.title,
finding.description,
@@ -123,6 +149,17 @@ impl PipelineOrchestrator {
.join("\n"),
);
if review_comments.is_empty() {
// All findings were on files/lines we can't comment on inline
if let Err(e) = tracker
.create_pr_review(owner, tracker_repo_name, pr_number, &summary, Vec::new())
.await
{
tracing::warn!("[{repo_id}] Failed to post PR review summary: {e}");
}
return Ok(());
}
if let Err(e) = tracker
.create_pr_review(
owner,