Add RAG embedding and AI chat feature
Some checks failed
CI / Security Audit (push) Has been cancelled
CI / Tests (push) Has been cancelled
CI / Clippy (push) Has been cancelled
CI / Format (push) Failing after 3s

Implement end-to-end RAG pipeline: AST-aware code chunking, LiteLLM
embedding generation, MongoDB vector storage with brute-force cosine
similarity fallback for self-hosted instances, and a chat API with
RAG-augmented responses. Add dedicated /chat/:repo_id dashboard page
with embedding build controls, message history, and source reference
cards.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Sharang Parnerkar
2026-03-04 23:29:40 +01:00
parent db454867f3
commit 89c30a62dd
25 changed files with 1692 additions and 25 deletions

View File

@@ -8,6 +8,7 @@ pub struct AgentConfig {
pub litellm_url: String,
pub litellm_api_key: SecretString,
pub litellm_model: String,
pub litellm_embed_model: String,
pub github_token: Option<SecretString>,
pub github_webhook_secret: Option<SecretString>,
pub gitlab_url: Option<String>,

View File

@@ -0,0 +1,35 @@
use serde::{Deserialize, Serialize};
/// A single message in the chat history.
///
/// Both fields are plain strings; no validation is performed by this type.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChatMessage {
/// Who authored the message.
/// NOTE(review): presumably values such as "user" / "assistant" — confirm against the chat handler.
pub role: String,
/// The text of the message.
pub content: String,
}
/// Request body for the chat endpoint.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChatRequest {
/// The new message being sent.
pub message: String,
/// Earlier messages of the conversation.
/// Because of `#[serde(default)]`, a request that omits this field deserializes with an empty list.
#[serde(default)]
pub history: Vec<ChatMessage>,
}
/// A source reference from the RAG retrieval.
///
/// Identifies a span of code (file, qualified name, line range) together with a
/// snippet of its text and a relevance score.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SourceReference {
/// Path of the file the referenced code lives in.
pub file_path: String,
/// Qualified name of the referenced code item.
pub qualified_name: String,
/// First line of the referenced span.
/// NOTE(review): whether line numbers are 0- or 1-based is not visible here — confirm.
pub start_line: u32,
/// Last line of the referenced span.
pub end_line: u32,
/// Language of the referenced code.
pub language: String,
/// Excerpt of the referenced code.
pub snippet: String,
/// Relevance score of this reference.
/// NOTE(review): presumably the similarity between query and chunk — confirm range and direction.
pub score: f64,
}
/// Response from the chat endpoint.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChatResponse {
/// The reply text.
pub message: String,
/// Source references returned alongside the reply.
pub sources: Vec<SourceReference>,
}

View File

@@ -0,0 +1,100 @@
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
/// Status of an embedding build operation.
///
/// Serialized in `snake_case` (e.g. `Running` becomes `"running"`).
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum EmbeddingBuildStatus {
/// The build is in progress.
Running,
/// The build finished successfully.
Completed,
/// The build ended with an error.
Failed,
}
/// A code embedding stored in MongoDB Atlas Vector Search.
///
/// Holds one chunk of code, its location metadata, and the embedding vector
/// computed for it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CodeEmbedding {
/// Document id, serialized as `_id`; omitted from the serialized form while `None`.
#[serde(rename = "_id", skip_serializing_if = "Option::is_none")]
pub id: Option<bson::oid::ObjectId>,
/// Identifier of the repository this chunk belongs to.
pub repo_id: String,
/// Identifier of the associated graph build.
/// NOTE(review): presumably the graph build the chunk was derived from — confirm.
pub graph_build_id: String,
/// Qualified name of the code item.
pub qualified_name: String,
/// Kind of the code item, as a string.
/// NOTE(review): the set of valid values is not visible here — confirm.
pub kind: String,
/// Path of the file containing the chunk.
pub file_path: String,
/// First line of the chunk.
pub start_line: u32,
/// Last line of the chunk.
pub end_line: u32,
/// Language of the chunk.
pub language: String,
/// Text of the chunk.
pub content: String,
/// Header text giving context for the chunk.
/// NOTE(review): how this is combined with `content` for embedding is not visible here.
pub context_header: String,
/// The embedding vector.
pub embedding: Vec<f64>,
/// Estimated token count for the chunk.
pub token_estimate: u32,
/// Creation timestamp, (de)serialized as a BSON DateTime.
#[serde(with = "bson::serde_helpers::chrono_datetime_as_bson_datetime")]
pub created_at: DateTime<Utc>,
}
/// Tracks an embedding build operation for a repository.
///
/// Records the status, progress counters, model name, and start/completion
/// timestamps of one build.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EmbeddingBuildRun {
/// Document id, serialized as `_id`; omitted from the serialized form while `None`.
#[serde(rename = "_id", skip_serializing_if = "Option::is_none")]
pub id: Option<bson::oid::ObjectId>,
/// Identifier of the repository being embedded.
pub repo_id: String,
/// Identifier of the associated graph build.
pub graph_build_id: String,
/// Current status of the build.
pub status: EmbeddingBuildStatus,
/// Total number of chunks in this build.
pub total_chunks: u32,
/// Number of chunks embedded so far.
pub embedded_chunks: u32,
/// Name of the embedding model used.
pub embedding_model: String,
/// Error description, if any.
/// NOTE(review): presumably set only when `status` is `Failed` — confirm against the writer.
pub error_message: Option<String>,
/// When the build started, (de)serialized as a BSON DateTime.
#[serde(with = "bson::serde_helpers::chrono_datetime_as_bson_datetime")]
pub started_at: DateTime<Utc>,
/// When the build completed. Omitted from the serialized form while `None`,
/// and read back as `None` when the field is absent (`default`).
#[serde(
default,
skip_serializing_if = "Option::is_none",
with = "opt_chrono_as_bson"
)]
pub completed_at: Option<DateTime<Utc>>,
}
impl EmbeddingBuildRun {
    /// Creates the record for a build that is starting now.
    ///
    /// The returned run has status [`EmbeddingBuildStatus::Running`], both chunk
    /// counters at zero, no error message, no completion time, and `started_at`
    /// set to the current UTC time. `id` is left as `None`.
    pub fn new(repo_id: String, graph_build_id: String, embedding_model: String) -> Self {
        let started_at = Utc::now();
        let status = EmbeddingBuildStatus::Running;

        Self {
            // Caller-supplied identity of the run.
            repo_id,
            graph_build_id,
            embedding_model,
            // Initial lifecycle state.
            status,
            started_at,
            total_chunks: 0,
            embedded_chunks: 0,
            // Not yet known at creation time.
            id: None,
            error_message: None,
            completed_at: None,
        }
    }
}
/// Serde helper for `Option<DateTime<Utc>>` as an optional BSON DateTime.
///
/// Meant to be used through `#[serde(with = "opt_chrono_as_bson")]`. `Some` values
/// are written via `bson::serde_helpers::chrono_datetime_as_bson_datetime`; `None`
/// is written as the format's "none" value.
mod opt_chrono_as_bson {
    use chrono::{DateTime, Utc};
    use serde::{Deserialize, Deserializer, Serialize, Serializer};

    /// Newtype that routes a `DateTime<Utc>` through the BSON DateTime helper so it
    /// can be nested inside an `Option`.
    #[derive(Serialize, Deserialize)]
    struct BsonDt(
        #[serde(with = "bson::serde_helpers::chrono_datetime_as_bson_datetime")] DateTime<Utc>,
    );

    /// Serializes `value` as an optional BSON DateTime.
    ///
    /// The `Some` case goes through `serialize_some` so the output mirrors what
    /// `deserialize` reads back via `Option::deserialize`. Serializers that forward
    /// `serialize_some` to the inner value produce the same output as writing the
    /// value directly.
    pub fn serialize<S>(value: &Option<DateTime<Utc>>, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        match value {
            Some(dt) => serializer.serialize_some(&BsonDt(*dt)),
            None => serializer.serialize_none(),
        }
    }

    /// Deserializes an optional BSON DateTime into `Option<DateTime<Utc>>`.
    pub fn deserialize<'de, D>(deserializer: D) -> Result<Option<DateTime<Utc>>, D::Error>
    where
        D: Deserializer<'de>,
    {
        let wrapped: Option<BsonDt> = Option::deserialize(deserializer)?;
        Ok(wrapped.map(|BsonDt(dt)| dt))
    }
}

View File

@@ -1,5 +1,7 @@
pub mod chat;
pub mod cve;
pub mod dast;
pub mod embedding;
pub mod finding;
pub mod graph;
pub mod issue;
@@ -7,15 +9,16 @@ pub mod repository;
pub mod sbom;
pub mod scan;
pub use chat::{ChatMessage, ChatRequest, ChatResponse, SourceReference};
pub use cve::{CveAlert, CveSource};
pub use dast::{
DastAuthConfig, DastEvidence, DastFinding, DastScanPhase, DastScanRun, DastScanStatus,
DastTarget, DastTargetType, DastVulnType,
};
pub use embedding::{CodeEmbedding, EmbeddingBuildRun, EmbeddingBuildStatus};
pub use finding::{Finding, FindingStatus, Severity};
pub use graph::{
CodeEdge, CodeEdgeKind, CodeNode, CodeNodeKind, GraphBuildRun, GraphBuildStatus,
ImpactAnalysis,
CodeEdge, CodeEdgeKind, CodeNode, CodeNodeKind, GraphBuildRun, GraphBuildStatus, ImpactAnalysis,
};
pub use issue::{IssueStatus, TrackerIssue, TrackerType};
pub use repository::{ScanTrigger, TrackedRepository};