feat: deduplicate code review findings across LLM passes
All checks were successful
CI / Check (pull_request) Successful in 12m46s
CI / Detect Changes (pull_request) Has been skipped
CI / Deploy Agent (pull_request) Has been skipped
CI / Deploy Dashboard (pull_request) Has been skipped
CI / Deploy Docs (pull_request) Has been skipped
CI / Deploy MCP (pull_request) Has been skipped

Group findings by file, line proximity, and normalized title keywords,
keeping the highest-severity finding from each group and merging CWE info.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Sharang Parnerkar
2026-03-25 22:15:34 +01:00
parent 745ad8a441
commit 46c7188757

View File

@@ -66,8 +66,10 @@ impl CodeReviewScanner {
}
}
let deduped = dedup_cross_pass(all_findings);
ScanOutput {
findings: all_findings,
findings: deduped,
sbom_entries: Vec::new(),
}
}
@@ -184,3 +186,51 @@ struct ReviewIssue {
#[serde(default)]
suggestion: Option<String>,
}
/// Deduplicate findings across review passes.
///
/// Multiple passes often flag the same issue (e.g. SQL injection reported by
/// logic, security, and convention passes). We group by file + nearby line +
/// normalized title keywords and keep the highest-severity finding.
fn dedup_cross_pass(findings: Vec<Finding>) -> Vec<Finding> {
use std::collections::HashMap;
// Build a dedup key: (file, line bucket, normalized title words)
fn dedup_key(f: &Finding) -> String {
let file = f.file_path.as_deref().unwrap_or("");
// Group lines within 3 of each other
let line_bucket = f.line_number.unwrap_or(0) / 4;
// Normalize: lowercase, keep only alphanumeric, sort words for order-independence
let title_lower = f.title.to_lowercase();
let mut words: Vec<&str> = title_lower
.split(|c: char| !c.is_alphanumeric())
.filter(|w| w.len() > 2)
.collect();
words.sort();
format!("{file}:{line_bucket}:{}", words.join(","))
}
let mut groups: HashMap<String, Finding> = HashMap::new();
for finding in findings {
let key = dedup_key(&finding);
groups
.entry(key)
.and_modify(|existing| {
// Keep the higher severity; on tie, keep the one with more detail
if finding.severity > existing.severity
|| (finding.severity == existing.severity
&& finding.description.len() > existing.description.len())
{
*existing = finding.clone();
}
// Merge CWE if the existing one is missing it
if existing.cwe.is_none() {
existing.cwe = finding.cwe.clone();
}
})
.or_insert(finding);
}
groups.into_values().collect()
}