feat: deduplicate code review findings across LLM passes (#48)

2026-03-29 20:38:52 +00:00
parent 745ad8a441
commit ff088f9eb4
5 changed files with 486 additions and 5 deletions
@@ -66,8 +66,10 @@ impl CodeReviewScanner {
            }
        }

+        let deduped = dedup_cross_pass(all_findings);
+
        ScanOutput {
-            findings: all_findings,
+            findings: deduped,
            sbom_entries: Vec::new(),
        }
    }
@@ -184,3 +186,51 @@ struct ReviewIssue {
    #[serde(default)]
    suggestion: Option<String>,
 }
+
+/// Deduplicate findings across review passes.
+///
+/// Multiple passes often flag the same issue (e.g. SQL injection reported by
+/// logic, security, and convention passes). Findings are grouped by file,
+/// 4-line bucket, and normalized title keywords; each group is collapsed into
+/// a single finding:
+///
+/// * the finding with the highest severity wins; on a severity tie the one
+///   with the longer description wins, and on a full tie the earlier one is
+///   kept;
+/// * if the winner carries no CWE, it inherits the CWE of a duplicate that
+///   was merged into it.
+///
+/// The result is deterministic: groups are returned in the order in which
+/// their first member appeared in `findings`.
+fn dedup_cross_pass(findings: Vec<Finding>) -> Vec<Finding> {
+    use std::collections::hash_map::Entry;
+    use std::collections::HashMap;
+
+    // Build a dedup key: (file, line bucket, normalized title words)
+    fn dedup_key(f: &Finding) -> String {
+        let file = f.file_path.as_deref().unwrap_or("");
+        // Fixed 4-line buckets (0-3, 4-7, ...). Note that adjacent lines which
+        // straddle a bucket boundary (e.g. 3 and 4) are NOT grouped together.
+        // A missing line number is treated as line 0.
+        let line_bucket = f.line_number.unwrap_or(0) / 4;
+        // Normalize: lowercase, split on non-alphanumerics, drop very short
+        // words (2 bytes or fewer), sort words for order-independence.
+        let title_lower = f.title.to_lowercase();
+        let mut words: Vec<&str> = title_lower
+            .split(|c: char| !c.is_alphanumeric())
+            .filter(|w| w.len() > 2)
+            .collect();
+        words.sort_unstable();
+        format!("{file}:{line_bucket}:{}", words.join(","))
+    }
+
+    // `merged` holds one representative per group in first-seen order;
+    // `index_by_key` maps a dedup key to that representative's position.
+    let mut index_by_key: HashMap<String, usize> = HashMap::new();
+    let mut merged: Vec<Finding> = Vec::new();
+
+    for mut finding in findings {
+        match index_by_key.entry(dedup_key(&finding)) {
+            Entry::Vacant(slot) => {
+                slot.insert(merged.len());
+                merged.push(finding);
+            }
+            Entry::Occupied(slot) => {
+                // Invariant: every stored index was `merged.len()` at the
+                // moment of the matching push, so it is always in bounds.
+                let existing = &mut merged[*slot.get()];
+
+                // Keep the higher severity; on tie, keep the one with more detail
+                let incoming_wins = finding.severity > existing.severity
+                    || (finding.severity == existing.severity
+                        && finding.description.len() > existing.description.len());
+                if incoming_wins {
+                    // After the swap `existing` is the winner and `finding`
+                    // is the discarded duplicate; no clone is needed.
+                    std::mem::swap(existing, &mut finding);
+                }
+
+                // Merge CWE from the discarded duplicate if the winner lacks
+                // one, regardless of which of the two won.
+                if existing.cwe.is_none() {
+                    existing.cwe = finding.cwe.take();
+                }
+            }
+        }
+    }
+
+    merged
+}