feat: deduplicate code review findings across LLM passes

Group findings by file, nearby line number, and normalized title keywords, keeping the highest-severity finding from each group and merging CWE info.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-25 22:15:34 +01:00
parent 745ad8a441
commit 46c7188757
1 changed files with 51 additions and 1 deletions
--- a/compliance-agent/src/pipeline/code_review.rs
+++ b/compliance-agent/src/pipeline/code_review.rs
@@ -66,8 +66,10 @@ impl CodeReviewScanner {
            }
        }
        let deduped = dedup_cross_pass(all_findings);
        ScanOutput {
-            findings: all_findings,
+            findings: deduped,
            sbom_entries: Vec::new(),
        }
    }
@@ -184,3 +186,51 @@ struct ReviewIssue {
    #[serde(default)]
    suggestion: Option<String>,
 }
 /// Deduplicate findings across review passes.
 ///
 /// Multiple passes often flag the same issue (e.g. SQL injection reported by
 /// logic, security, and convention passes). Findings are grouped by
 /// file + line bucket + normalized title keywords, and one representative is
 /// kept per group:
 ///
 /// * the finding with the highest severity wins;
 /// * on a severity tie, the finding with the longer description wins;
 /// * if the winner has no CWE, the CWE of a discarded duplicate (when one has
 ///   it) is carried over, regardless of the order the duplicates arrived in.
 ///
 /// The returned vector is ordered by the first appearance of each group in
 /// `findings`, so the output is stable from run to run.
 fn dedup_cross_pass(findings: Vec<Finding>) -> Vec<Finding> {
    use std::collections::hash_map::Entry;
    use std::collections::HashMap;

    // Build a dedup key: (file, line bucket, normalized title words)
    fn dedup_key(f: &Finding) -> String {
        let file = f.file_path.as_deref().unwrap_or("");
        // Fixed buckets of 4 consecutive line numbers (0-3, 4-7, ...). Lines in
        // the same bucket are at most 3 apart; adjacent lines that straddle a
        // bucket boundary (e.g. 3 and 4) are NOT grouped.
        // NOTE(review): confirm bucket-boundary misses are acceptable.
        let line_bucket = f.line_number.unwrap_or(0) / 4;
        // Normalize: lowercase, split on non-alphanumerics, drop very short
        // words, and sort so that word order does not affect the key.
        let title_lower = f.title.to_lowercase();
        let mut words: Vec<&str> = title_lower
            .split(|c: char| !c.is_alphanumeric())
            .filter(|w| w.len() > 2)
            .collect();
        words.sort_unstable();
        format!("{file}:{line_bucket}:{}", words.join(","))
    }

    // `kept` holds one finding per group in first-seen order; the map only
    // remembers where each group's representative lives in `kept`.
    let mut slot_by_key: HashMap<String, usize> = HashMap::with_capacity(findings.len());
    let mut kept: Vec<Finding> = Vec::with_capacity(findings.len());

    for mut finding in findings {
        match slot_by_key.entry(dedup_key(&finding)) {
            Entry::Vacant(vacant) => {
                vacant.insert(kept.len());
                kept.push(finding);
            }
            Entry::Occupied(occupied) => {
                let existing = &mut kept[*occupied.get()];
                // Keep the higher severity; on tie, keep the one with more detail
                let incoming_wins = finding.severity > existing.severity
                    || (finding.severity == existing.severity
                        && finding.description.len() > existing.description.len());
                if incoming_wins {
                    // After the swap `existing` is the winner and `finding`
                    // is the discarded duplicate; no clone required.
                    std::mem::swap(existing, &mut finding);
                }
                // Merge CWE from the discarded duplicate if the winner lacks it.
                if existing.cwe.is_none() {
                    existing.cwe = finding.cwe.take();
                }
            }
        }
    }
    kept
 }