Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| b96dda11fb | |||
| e67a13535a | |||
| df0063abc0 |
@@ -0,0 +1,10 @@
|
|||||||
|
[advisories]
|
||||||
|
ignore = [
|
||||||
|
# hickory-proto 0.25.x pulled in transitively via mongodb → hickory-resolver.
|
||||||
|
# MongoDB 3.x has not yet released with hickory-resolver 0.26.x, so we cannot
|
||||||
|
# upgrade past this without a mongodb release. Both are DNS-layer DoS vectors
|
||||||
|
# requiring a MITM/controlled DNS server against MongoDB's hostname resolution —
|
||||||
|
# not a realistic attack surface here. Revisit when mongodb bumps hickory.
|
||||||
|
"RUSTSEC-2026-0118", # NSEC3 loop, no fix available upstream
|
||||||
|
"RUSTSEC-2026-0119", # O(n²) name compression, fixed in hickory-proto >=0.26.1
|
||||||
|
]
|
||||||
Generated
+6
-6
@@ -3524,9 +3524,9 @@ checksum = "224484c5d09285a7b8cb0a0c117e847ebd14cb6e4470ecf68cdb89c503b0edb9"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "mongodb"
|
name = "mongodb"
|
||||||
version = "3.5.1"
|
version = "3.6.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "803dd859e8afa084c255a8effd8000ff86f7c8076a50cd6d8c99e8f3496f75c2"
|
checksum = "1ef2c933617431ad0246fb5b43c425ebdae18c7f7259c87de0726d93b0e7e91b"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"base64",
|
"base64",
|
||||||
"bitflags",
|
"bitflags",
|
||||||
@@ -3570,9 +3570,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "mongodb-internal-macros"
|
name = "mongodb-internal-macros"
|
||||||
version = "3.5.1"
|
version = "3.6.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "a973ef3dd3dbc6f6e65bbdecfd9ec5e781b9e7493b0f369a7c62e35d8e5ae2c8"
|
checksum = "9e5758dc828eb2d02ec30563cba365609d56ddd833190b192beaee2b475a7bb3"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"macro_magic",
|
"macro_magic",
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
@@ -4699,9 +4699,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "rustls-webpki"
|
name = "rustls-webpki"
|
||||||
version = "0.103.10"
|
version = "0.103.13"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "df33b2b81ac578cabaf06b89b0631153a3f416b0a886e8a7a1707fb51abbd1ef"
|
checksum = "61c429a8649f110dddef65e2a5ad240f747e85f7758a6bccc7e5777bd33f756e"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"ring",
|
"ring",
|
||||||
"rustls-pki-types",
|
"rustls-pki-types",
|
||||||
|
|||||||
@@ -35,11 +35,16 @@ impl ComplianceAgent {
|
|||||||
config.litellm_model.clone(),
|
config.litellm_model.clone(),
|
||||||
config.litellm_embed_model.clone(),
|
config.litellm_embed_model.clone(),
|
||||||
));
|
));
|
||||||
|
let http = reqwest::Client::builder()
|
||||||
|
.timeout(std::time::Duration::from_secs(30))
|
||||||
|
.connect_timeout(std::time::Duration::from_secs(10))
|
||||||
|
.build()
|
||||||
|
.unwrap_or_default();
|
||||||
Self {
|
Self {
|
||||||
config,
|
config,
|
||||||
db,
|
db,
|
||||||
llm,
|
llm,
|
||||||
http: reqwest::Client::new(),
|
http,
|
||||||
session_streams: Arc::new(DashMap::new()),
|
session_streams: Arc::new(DashMap::new()),
|
||||||
session_pause: Arc::new(DashMap::new()),
|
session_pause: Arc::new(DashMap::new()),
|
||||||
session_semaphore: Arc::new(Semaphore::new(DEFAULT_MAX_CONCURRENT_SESSIONS)),
|
session_semaphore: Arc::new(Semaphore::new(DEFAULT_MAX_CONCURRENT_SESSIONS)),
|
||||||
|
|||||||
@@ -19,12 +19,17 @@ impl LlmClient {
|
|||||||
model: String,
|
model: String,
|
||||||
embed_model: String,
|
embed_model: String,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
|
let http = reqwest::Client::builder()
|
||||||
|
.timeout(std::time::Duration::from_secs(300))
|
||||||
|
.connect_timeout(std::time::Duration::from_secs(10))
|
||||||
|
.build()
|
||||||
|
.unwrap_or_default();
|
||||||
Self {
|
Self {
|
||||||
base_url,
|
base_url,
|
||||||
api_key,
|
api_key,
|
||||||
model,
|
model,
|
||||||
embed_model,
|
embed_model,
|
||||||
http: reqwest::Client::new(),
|
http,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -19,26 +19,33 @@ impl Scanner for GitleaksScanner {
|
|||||||
|
|
||||||
#[tracing::instrument(skip_all)]
|
#[tracing::instrument(skip_all)]
|
||||||
async fn scan(&self, repo_path: &Path, repo_id: &str) -> Result<ScanOutput, CoreError> {
|
async fn scan(&self, repo_path: &Path, repo_id: &str) -> Result<ScanOutput, CoreError> {
|
||||||
let output = tokio::process::Command::new("gitleaks")
|
let output = tokio::time::timeout(
|
||||||
.args([
|
std::time::Duration::from_secs(300),
|
||||||
"detect",
|
tokio::process::Command::new("gitleaks")
|
||||||
"--source",
|
.args([
|
||||||
".",
|
"detect",
|
||||||
"--report-format",
|
"--source",
|
||||||
"json",
|
".",
|
||||||
"--report-path",
|
"--report-format",
|
||||||
"/dev/stdout",
|
"json",
|
||||||
"--no-banner",
|
"--report-path",
|
||||||
"--exit-code",
|
"/dev/stdout",
|
||||||
"0",
|
"--no-banner",
|
||||||
])
|
"--exit-code",
|
||||||
.current_dir(repo_path)
|
"0",
|
||||||
.output()
|
])
|
||||||
.await
|
.current_dir(repo_path)
|
||||||
.map_err(|e| CoreError::Scanner {
|
.output(),
|
||||||
scanner: "gitleaks".to_string(),
|
)
|
||||||
source: Box::new(e),
|
.await
|
||||||
})?;
|
.map_err(|_| CoreError::Scanner {
|
||||||
|
scanner: "gitleaks".to_string(),
|
||||||
|
source: "timed out after 5 minutes".into(),
|
||||||
|
})?
|
||||||
|
.map_err(|e| CoreError::Scanner {
|
||||||
|
scanner: "gitleaks".to_string(),
|
||||||
|
source: Box::new(e),
|
||||||
|
})?;
|
||||||
|
|
||||||
if output.stdout.is_empty() {
|
if output.stdout.is_empty() {
|
||||||
return Ok(ScanOutput::default());
|
return Ok(ScanOutput::default());
|
||||||
|
|||||||
@@ -174,19 +174,26 @@ impl PipelineOrchestrator {
|
|||||||
k.expose_secret().to_string()
|
k.expose_secret().to_string()
|
||||||
}),
|
}),
|
||||||
);
|
);
|
||||||
let cve_alerts = match async {
|
let cve_alerts = match tokio::time::timeout(
|
||||||
cve_scanner
|
std::time::Duration::from_secs(600),
|
||||||
.scan_dependencies(&repo_id, &mut sbom_entries)
|
async {
|
||||||
.await
|
cve_scanner
|
||||||
}
|
.scan_dependencies(&repo_id, &mut sbom_entries)
|
||||||
.instrument(tracing::info_span!("stage_cve_scanning"))
|
.await
|
||||||
|
}
|
||||||
|
.instrument(tracing::info_span!("stage_cve_scanning")),
|
||||||
|
)
|
||||||
.await
|
.await
|
||||||
{
|
{
|
||||||
Ok(alerts) => alerts,
|
Ok(Ok(alerts)) => alerts,
|
||||||
Err(e) => {
|
Ok(Err(e)) => {
|
||||||
tracing::warn!("[{repo_id}] CVE scanning failed: {e}");
|
tracing::warn!("[{repo_id}] CVE scanning failed: {e}");
|
||||||
Vec::new()
|
Vec::new()
|
||||||
}
|
}
|
||||||
|
Err(_) => {
|
||||||
|
tracing::warn!("[{repo_id}] CVE scanning timed out after 10 minutes");
|
||||||
|
Vec::new()
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// Stage 4: Pattern Scanning (GDPR + OAuth)
|
// Stage 4: Pattern Scanning (GDPR + OAuth)
|
||||||
|
|||||||
@@ -5,20 +5,26 @@ use compliance_core::CoreError;
|
|||||||
|
|
||||||
#[tracing::instrument(skip_all, fields(repo_id = %repo_id))]
|
#[tracing::instrument(skip_all, fields(repo_id = %repo_id))]
|
||||||
pub(super) async fn run_syft(repo_path: &Path, repo_id: &str) -> Result<Vec<SbomEntry>, CoreError> {
|
pub(super) async fn run_syft(repo_path: &Path, repo_id: &str) -> Result<Vec<SbomEntry>, CoreError> {
|
||||||
let output = tokio::process::Command::new("syft")
|
let output = tokio::time::timeout(
|
||||||
.arg(repo_path)
|
std::time::Duration::from_secs(300),
|
||||||
.args(["-o", "cyclonedx-json"])
|
tokio::process::Command::new("syft")
|
||||||
// Enable remote license lookups for all ecosystems
|
.arg(repo_path)
|
||||||
.env("SYFT_GOLANG_SEARCH_REMOTE_LICENSES", "true")
|
.args(["-o", "cyclonedx-json"])
|
||||||
.env("SYFT_JAVASCRIPT_SEARCH_REMOTE_LICENSES", "true")
|
.env("SYFT_GOLANG_SEARCH_REMOTE_LICENSES", "true")
|
||||||
.env("SYFT_PYTHON_SEARCH_REMOTE_LICENSES", "true")
|
.env("SYFT_JAVASCRIPT_SEARCH_REMOTE_LICENSES", "true")
|
||||||
.env("SYFT_JAVA_USE_NETWORK", "true")
|
.env("SYFT_PYTHON_SEARCH_REMOTE_LICENSES", "true")
|
||||||
.output()
|
.env("SYFT_JAVA_USE_NETWORK", "true")
|
||||||
.await
|
.output(),
|
||||||
.map_err(|e| CoreError::Scanner {
|
)
|
||||||
scanner: "syft".to_string(),
|
.await
|
||||||
source: Box::new(e),
|
.map_err(|_| CoreError::Scanner {
|
||||||
})?;
|
scanner: "syft".to_string(),
|
||||||
|
source: "timed out after 5 minutes".into(),
|
||||||
|
})?
|
||||||
|
.map_err(|e| CoreError::Scanner {
|
||||||
|
scanner: "syft".to_string(),
|
||||||
|
source: Box::new(e),
|
||||||
|
})?;
|
||||||
|
|
||||||
if !output.status.success() {
|
if !output.status.success() {
|
||||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||||
|
|||||||
@@ -19,15 +19,30 @@ impl Scanner for SemgrepScanner {
|
|||||||
|
|
||||||
#[tracing::instrument(skip_all)]
|
#[tracing::instrument(skip_all)]
|
||||||
async fn scan(&self, repo_path: &Path, repo_id: &str) -> Result<ScanOutput, CoreError> {
|
async fn scan(&self, repo_path: &Path, repo_id: &str) -> Result<ScanOutput, CoreError> {
|
||||||
let output = tokio::process::Command::new("semgrep")
|
let output = tokio::time::timeout(
|
||||||
.args(["--config=auto", "--json", "--quiet"])
|
std::time::Duration::from_secs(600),
|
||||||
.arg(repo_path)
|
tokio::process::Command::new("semgrep")
|
||||||
.output()
|
.args([
|
||||||
.await
|
"--config=auto",
|
||||||
.map_err(|e| CoreError::Scanner {
|
"--json",
|
||||||
scanner: "semgrep".to_string(),
|
"--quiet",
|
||||||
source: Box::new(e),
|
"--max-memory",
|
||||||
})?;
|
"500",
|
||||||
|
"--jobs",
|
||||||
|
"1",
|
||||||
|
])
|
||||||
|
.arg(repo_path)
|
||||||
|
.output(),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.map_err(|_| CoreError::Scanner {
|
||||||
|
scanner: "semgrep".to_string(),
|
||||||
|
source: "timed out after 10 minutes".into(),
|
||||||
|
})?
|
||||||
|
.map_err(|e| CoreError::Scanner {
|
||||||
|
scanner: "semgrep".to_string(),
|
||||||
|
source: Box::new(e),
|
||||||
|
})?;
|
||||||
|
|
||||||
if !output.status.success() && output.stdout.is_empty() {
|
if !output.status.success() && output.stdout.is_empty() {
|
||||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||||
|
|||||||
@@ -6,11 +6,16 @@ use compliance_core::models::embedding::{CodeEmbedding, EmbeddingBuildRun, Embed
|
|||||||
use compliance_core::models::graph::CodeNode;
|
use compliance_core::models::graph::CodeNode;
|
||||||
use compliance_graph::graph::chunking::extract_chunks;
|
use compliance_graph::graph::chunking::extract_chunks;
|
||||||
use compliance_graph::graph::embedding_store::EmbeddingStore;
|
use compliance_graph::graph::embedding_store::EmbeddingStore;
|
||||||
|
use futures_util::stream::{FuturesUnordered, StreamExt};
|
||||||
use tracing::{error, info};
|
use tracing::{error, info};
|
||||||
|
|
||||||
use crate::error::AgentError;
|
use crate::error::AgentError;
|
||||||
use crate::llm::LlmClient;
|
use crate::llm::LlmClient;
|
||||||
|
|
||||||
|
const EMBED_BATCH_SIZE: usize = 20;
|
||||||
|
const EMBED_CONCURRENCY: usize = 4;
|
||||||
|
const EMBED_FLUSH_EVERY: usize = 200;
|
||||||
|
|
||||||
/// RAG pipeline for building embeddings and performing retrieval
|
/// RAG pipeline for building embeddings and performing retrieval
|
||||||
pub struct RagPipeline {
|
pub struct RagPipeline {
|
||||||
llm: Arc<LlmClient>,
|
llm: Arc<LlmClient>,
|
||||||
@@ -77,25 +82,33 @@ impl RagPipeline {
|
|||||||
.await
|
.await
|
||||||
.map_err(|e| AgentError::Other(format!("Failed to delete old embeddings: {e}")))?;
|
.map_err(|e| AgentError::Other(format!("Failed to delete old embeddings: {e}")))?;
|
||||||
|
|
||||||
// Step 3: Batch embed (small batches to stay within model limits)
|
// Step 3: Batch embed with bounded concurrency. Flush to Mongo and
|
||||||
let batch_size = 20;
|
// update progress periodically so the dashboard can show live status.
|
||||||
let mut all_embeddings = Vec::new();
|
let mut pending = Vec::with_capacity(EMBED_FLUSH_EVERY);
|
||||||
let mut embedded_count = 0u32;
|
let mut embedded_count = 0u32;
|
||||||
|
|
||||||
for batch_start in (0..chunks.len()).step_by(batch_size) {
|
// Build the list of batch indices to process.
|
||||||
let batch_end = (batch_start + batch_size).min(chunks.len());
|
let batches: Vec<(usize, usize)> = (0..chunks.len())
|
||||||
let batch_chunks = &chunks[batch_start..batch_end];
|
.step_by(EMBED_BATCH_SIZE)
|
||||||
|
.map(|start| (start, (start + EMBED_BATCH_SIZE).min(chunks.len())))
|
||||||
|
.collect();
|
||||||
|
|
||||||
// Prepare texts: context_header + content
|
let mut batch_iter = batches.into_iter();
|
||||||
let texts: Vec<String> = batch_chunks
|
let mut in_flight = FuturesUnordered::new();
|
||||||
.iter()
|
|
||||||
.map(|c| format!("{}\n{}", c.context_header, c.content))
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
match self.llm.embed(texts).await {
|
// Prime up to EMBED_CONCURRENCY batches.
|
||||||
Ok(vectors) => {
|
for _ in 0..EMBED_CONCURRENCY {
|
||||||
|
if let Some((start, end)) = batch_iter.next() {
|
||||||
|
in_flight.push(self.embed_batch(&chunks[start..end], start, end));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
while let Some(result) = in_flight.next().await {
|
||||||
|
match result {
|
||||||
|
Ok((start, end, vectors)) => {
|
||||||
|
let batch_chunks = &chunks[start..end];
|
||||||
for (chunk, embedding) in batch_chunks.iter().zip(vectors) {
|
for (chunk, embedding) in batch_chunks.iter().zip(vectors) {
|
||||||
all_embeddings.push(CodeEmbedding {
|
pending.push(CodeEmbedding {
|
||||||
id: None,
|
id: None,
|
||||||
repo_id: repo_id.to_string(),
|
repo_id: repo_id.to_string(),
|
||||||
graph_build_id: graph_build_id.to_string(),
|
graph_build_id: graph_build_id.to_string(),
|
||||||
@@ -113,9 +126,45 @@ impl RagPipeline {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
embedded_count += batch_chunks.len() as u32;
|
embedded_count += batch_chunks.len() as u32;
|
||||||
|
|
||||||
|
// Flush pending embeddings to Mongo periodically and update progress.
|
||||||
|
if pending.len() >= EMBED_FLUSH_EVERY {
|
||||||
|
self.embedding_store
|
||||||
|
.store_embeddings(&pending)
|
||||||
|
.await
|
||||||
|
.map_err(|e| {
|
||||||
|
AgentError::Other(format!("Failed to store embeddings: {e}"))
|
||||||
|
})?;
|
||||||
|
pending.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Always update the progress counter on the build doc — even if
|
||||||
|
// we haven't flushed embeddings yet — so the UI shows movement.
|
||||||
|
if let Err(e) = self
|
||||||
|
.embedding_store
|
||||||
|
.update_build(
|
||||||
|
repo_id,
|
||||||
|
graph_build_id,
|
||||||
|
EmbeddingBuildStatus::Running,
|
||||||
|
embedded_count,
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
error!("[{repo_id}] Failed to update build progress: {e}");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Queue the next batch to keep concurrency saturated.
|
||||||
|
if let Some((s, e)) = batch_iter.next() {
|
||||||
|
in_flight.push(self.embed_batch(&chunks[s..e], s, e));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
error!("[{repo_id}] Embedding batch failed: {e}");
|
error!("[{repo_id}] Embedding batch failed: {e}");
|
||||||
|
// Flush whatever we have so partial progress isn't lost.
|
||||||
|
if !pending.is_empty() {
|
||||||
|
let _ = self.embedding_store.store_embeddings(&pending).await;
|
||||||
|
}
|
||||||
build.status = EmbeddingBuildStatus::Failed;
|
build.status = EmbeddingBuildStatus::Failed;
|
||||||
build.error_message = Some(e.to_string());
|
build.error_message = Some(e.to_string());
|
||||||
build.completed_at = Some(Utc::now());
|
build.completed_at = Some(Utc::now());
|
||||||
@@ -134,11 +183,13 @@ impl RagPipeline {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Step 4: Store all embeddings
|
// Step 4: Flush any remaining embeddings
|
||||||
self.embedding_store
|
if !pending.is_empty() {
|
||||||
.store_embeddings(&all_embeddings)
|
self.embedding_store
|
||||||
.await
|
.store_embeddings(&pending)
|
||||||
.map_err(|e| AgentError::Other(format!("Failed to store embeddings: {e}")))?;
|
.await
|
||||||
|
.map_err(|e| AgentError::Other(format!("Failed to store embeddings: {e}")))?;
|
||||||
|
}
|
||||||
|
|
||||||
// Step 5: Update build status
|
// Step 5: Update build status
|
||||||
build.status = EmbeddingBuildStatus::Completed;
|
build.status = EmbeddingBuildStatus::Completed;
|
||||||
@@ -161,4 +212,21 @@ impl RagPipeline {
|
|||||||
);
|
);
|
||||||
Ok(build)
|
Ok(build)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Embed one batch of chunks. Returns the (start, end, vectors) tuple so
|
||||||
|
/// out-of-order completion from `FuturesUnordered` can still be reconciled
|
||||||
|
/// against the original chunk slice.
|
||||||
|
async fn embed_batch(
|
||||||
|
&self,
|
||||||
|
batch_chunks: &[compliance_graph::graph::chunking::CodeChunk],
|
||||||
|
start: usize,
|
||||||
|
end: usize,
|
||||||
|
) -> Result<(usize, usize, Vec<Vec<f64>>), AgentError> {
|
||||||
|
let texts: Vec<String> = batch_chunks
|
||||||
|
.iter()
|
||||||
|
.map(|c| format!("{}\n{}", c.context_header, c.content))
|
||||||
|
.collect();
|
||||||
|
let vectors = self.llm.embed(texts).await?;
|
||||||
|
Ok((start, end, vectors))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -32,7 +32,7 @@ pub fn AppShell() -> Element {
|
|||||||
// Not authenticated — redirect to Keycloak login
|
// Not authenticated — redirect to Keycloak login
|
||||||
rsx! {
|
rsx! {
|
||||||
document::Script {
|
document::Script {
|
||||||
dangerous_inner_html: "window.location.href = '/auth';"
|
"window.location.href = '/auth';"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user