From 46c7188757c5faf35cbde9ab45b0cfdf99468b82 Mon Sep 17 00:00:00 2001 From: Sharang Parnerkar <30073382+mighty840@users.noreply.github.com> Date: Wed, 25 Mar 2026 22:15:34 +0100 Subject: [PATCH] feat: deduplicate code review findings across LLM passes Group findings by file, line proximity, and normalized title keywords, keeping the highest-severity finding from each group and merging CWE info. Co-Authored-By: Claude Opus 4.6 (1M context) --- compliance-agent/src/pipeline/code_review.rs | 52 +++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/compliance-agent/src/pipeline/code_review.rs b/compliance-agent/src/pipeline/code_review.rs index 6360033..e5fb58c 100644 --- a/compliance-agent/src/pipeline/code_review.rs +++ b/compliance-agent/src/pipeline/code_review.rs @@ -66,8 +66,10 @@ impl CodeReviewScanner { } } + let deduped = dedup_cross_pass(all_findings); + ScanOutput { - findings: all_findings, + findings: deduped, sbom_entries: Vec::new(), } } @@ -184,3 +186,51 @@ struct ReviewIssue { #[serde(default)] suggestion: Option, } + +/// Deduplicate findings across review passes. +/// +/// Multiple passes often flag the same issue (e.g. SQL injection reported by +/// logic, security, and convention passes). We group by file + nearby line + +/// normalized title keywords and keep the highest-severity finding. +fn dedup_cross_pass(findings: Vec) -> Vec { + use std::collections::HashMap; + + // Build a dedup key: (file, line bucket, normalized title words) + fn dedup_key(f: &Finding) -> String { + let file = f.file_path.as_deref().unwrap_or(""); + // Group lines within 3 of each other + let line_bucket = f.line_number.unwrap_or(0) / 4; + // Normalize: lowercase, keep only alphanumeric, sort words for order-independence + let title_lower = f.title.to_lowercase(); + let mut words: Vec<&str> = title_lower + .split(|c: char| !c.is_alphanumeric()) + .filter(|w| w.len() > 2) + .collect(); + words.sort(); + format!("{file}:{line_bucket}:{}", words.join(",")) + } + + let mut groups: HashMap = HashMap::new(); + + for finding in findings { + let key = dedup_key(&finding); + groups + .entry(key) + .and_modify(|existing| { + // Keep the higher severity; on tie, keep the one with more detail + if finding.severity > existing.severity + || (finding.severity == existing.severity + && finding.description.len() > existing.description.len()) + { + *existing = finding.clone(); + } + // Merge CWE if the existing one is missing it + if existing.cwe.is_none() { + existing.cwe = finding.cwe.clone(); + } + }) + .or_insert(finding); + } + + groups.into_values().collect() +}