Compare commits

...

3 Commits

Author SHA1 Message Date
Sharang Parnerkar b96dda11fb fix: live progress + concurrency for embedding builds
CI / Check (pull_request) Successful in 10m34s
CI / Detect Changes (pull_request) Has been skipped
CI / Deploy Agent (pull_request) Has been skipped
CI / Deploy Dashboard (pull_request) Has been skipped
CI / Deploy Docs (pull_request) Has been skipped
CI / Deploy MCP (pull_request) Has been skipped
The embedding build progress was only written to MongoDB after every
batch completed (and the final flush + status update only happened at
the very end), so the dashboard would show "0/N chunks (0%)" for the
entire run, then jump straight to "complete." For a repo with 2k+
chunks this looked like the build was stuck.

Three fixes:
- pipeline: call update_build(Running, embedded_count, ...) after each
  batch so /api/v1/chat/:repo_id/status reflects real progress, and
  flush embeddings to Mongo every 200 records so a partial failure does
  not lose everything.
- pipeline: drive batches with FuturesUnordered at concurrency=4 so
  litellm requests overlap instead of going strictly serial (112
  sequential requests for a 2221-chunk repo were the wall-time floor).
- llm client: give the reqwest client a 300s request timeout and 10s
  connect timeout. Previously LlmClient used reqwest::Client::new()
  with no timeout, so a hung embedding call would block the build
  indefinitely.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-13 11:39:37 +02:00
sharang e67a13535a fix: add HTTP timeout to reqwest client and CVE stage timeout (#79)
CI / Check (push) Has been skipped
CI / Detect Changes (push) Successful in 5s
CI / Deploy Agent (push) Successful in 8m26s
CI / Deploy Dashboard (push) Has been skipped
CI / Deploy Docs (push) Has been skipped
CI / Deploy MCP (push) Has been skipped
2026-05-13 07:30:26 +00:00
sharang df0063abc0 fix: scanner timeouts, semgrep memory cap, syft remote lookups, Script error (#78)
CI / Check (push) Has been skipped
CI / Detect Changes (push) Successful in 5s
CI / Deploy Agent (push) Successful in 9m41s
CI / Deploy Dashboard (push) Successful in 15m19s
CI / Deploy Docs (push) Has been skipped
CI / Deploy MCP (push) Successful in 3m7s
## Summary

- **Scan produces no results in Orca** — semgrep (`--config=auto`, unbounded memory) and syft (remote license network calls) were getting OOM-killed or hanging in resource-constrained Orca containers. Scan would "complete" with 0 findings/SBOMs silently because each scanner failure is caught and logged as a warning.
- **Dashboard Script error spam** — `document::Script` in Dioxus 0.7 needs a single text node child for inline scripts; `dangerous_inner_html` was invalid and spammed the error log on every unauthenticated page load.

## Changes

| File | Change |
|------|--------|
| `semgrep.rs` | Add `--max-memory 500 --jobs 1`; 10-minute timeout |
| `syft.rs` | Remove remote license lookup env vars; 5-minute timeout |
| `gitleaks.rs` | 5-minute timeout |
| `app_shell.rs` | Fix `dangerous_inner_html` → text child in `document::Script` |

## Test plan

- [ ] Trigger a scan on a repo in Orca — findings and SBOM entries should now appear
- [ ] Agent logs should show timeout/error warnings rather than silent empty results when tools are killed
- [ ] Navigate to dashboard unauthenticated — Script error gone from logs
- [ ] Verify scans work end-to-end with `docker compose up`

---------

Co-authored-by: Sharang Parnerkar <30073382+mighty840@users.noreply.github.com>
Reviewed-on: #78
2026-05-12 11:27:24 +00:00
10 changed files with 202 additions and 79 deletions
+10
View File
@@ -0,0 +1,10 @@
[advisories]
ignore = [
# hickory-proto 0.25.x pulled in transitively via mongodb → hickory-resolver.
# MongoDB 3.x has not yet released with hickory-resolver 0.26.x, so we cannot
# upgrade past this without a mongodb release. Both are DNS-layer DoS vectors
# requiring a MITM/controlled DNS server against MongoDB's hostname resolution —
# not a realistic attack surface here. Revisit when mongodb bumps hickory.
"RUSTSEC-2026-0118", # NSEC3 loop, no fix available upstream
"RUSTSEC-2026-0119", # O(n²) name compression, fixed in hickory-proto >=0.26.1
]
Generated
+6 -6
View File
@@ -3524,9 +3524,9 @@ checksum = "224484c5d09285a7b8cb0a0c117e847ebd14cb6e4470ecf68cdb89c503b0edb9"
[[package]] [[package]]
name = "mongodb" name = "mongodb"
version = "3.5.1" version = "3.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "803dd859e8afa084c255a8effd8000ff86f7c8076a50cd6d8c99e8f3496f75c2" checksum = "1ef2c933617431ad0246fb5b43c425ebdae18c7f7259c87de0726d93b0e7e91b"
dependencies = [ dependencies = [
"base64", "base64",
"bitflags", "bitflags",
@@ -3570,9 +3570,9 @@ dependencies = [
[[package]] [[package]]
name = "mongodb-internal-macros" name = "mongodb-internal-macros"
version = "3.5.1" version = "3.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a973ef3dd3dbc6f6e65bbdecfd9ec5e781b9e7493b0f369a7c62e35d8e5ae2c8" checksum = "9e5758dc828eb2d02ec30563cba365609d56ddd833190b192beaee2b475a7bb3"
dependencies = [ dependencies = [
"macro_magic", "macro_magic",
"proc-macro2", "proc-macro2",
@@ -4699,9 +4699,9 @@ dependencies = [
[[package]] [[package]]
name = "rustls-webpki" name = "rustls-webpki"
version = "0.103.10" version = "0.103.13"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df33b2b81ac578cabaf06b89b0631153a3f416b0a886e8a7a1707fb51abbd1ef" checksum = "61c429a8649f110dddef65e2a5ad240f747e85f7758a6bccc7e5777bd33f756e"
dependencies = [ dependencies = [
"ring", "ring",
"rustls-pki-types", "rustls-pki-types",
+6 -1
View File
@@ -35,11 +35,16 @@ impl ComplianceAgent {
config.litellm_model.clone(), config.litellm_model.clone(),
config.litellm_embed_model.clone(), config.litellm_embed_model.clone(),
)); ));
let http = reqwest::Client::builder()
.timeout(std::time::Duration::from_secs(30))
.connect_timeout(std::time::Duration::from_secs(10))
.build()
.unwrap_or_default();
Self { Self {
config, config,
db, db,
llm, llm,
http: reqwest::Client::new(), http,
session_streams: Arc::new(DashMap::new()), session_streams: Arc::new(DashMap::new()),
session_pause: Arc::new(DashMap::new()), session_pause: Arc::new(DashMap::new()),
session_semaphore: Arc::new(Semaphore::new(DEFAULT_MAX_CONCURRENT_SESSIONS)), session_semaphore: Arc::new(Semaphore::new(DEFAULT_MAX_CONCURRENT_SESSIONS)),
+6 -1
View File
@@ -19,12 +19,17 @@ impl LlmClient {
model: String, model: String,
embed_model: String, embed_model: String,
) -> Self { ) -> Self {
let http = reqwest::Client::builder()
.timeout(std::time::Duration::from_secs(300))
.connect_timeout(std::time::Duration::from_secs(10))
.build()
.unwrap_or_default();
Self { Self {
base_url, base_url,
api_key, api_key,
model, model,
embed_model, embed_model,
http: reqwest::Client::new(), http,
} }
} }
+27 -20
View File
@@ -19,26 +19,33 @@ impl Scanner for GitleaksScanner {
#[tracing::instrument(skip_all)] #[tracing::instrument(skip_all)]
async fn scan(&self, repo_path: &Path, repo_id: &str) -> Result<ScanOutput, CoreError> { async fn scan(&self, repo_path: &Path, repo_id: &str) -> Result<ScanOutput, CoreError> {
let output = tokio::process::Command::new("gitleaks") let output = tokio::time::timeout(
.args([ std::time::Duration::from_secs(300),
"detect", tokio::process::Command::new("gitleaks")
"--source", .args([
".", "detect",
"--report-format", "--source",
"json", ".",
"--report-path", "--report-format",
"/dev/stdout", "json",
"--no-banner", "--report-path",
"--exit-code", "/dev/stdout",
"0", "--no-banner",
]) "--exit-code",
.current_dir(repo_path) "0",
.output() ])
.await .current_dir(repo_path)
.map_err(|e| CoreError::Scanner { .output(),
scanner: "gitleaks".to_string(), )
source: Box::new(e), .await
})?; .map_err(|_| CoreError::Scanner {
scanner: "gitleaks".to_string(),
source: "timed out after 5 minutes".into(),
})?
.map_err(|e| CoreError::Scanner {
scanner: "gitleaks".to_string(),
source: Box::new(e),
})?;
if output.stdout.is_empty() { if output.stdout.is_empty() {
return Ok(ScanOutput::default()); return Ok(ScanOutput::default());
+15 -8
View File
@@ -174,19 +174,26 @@ impl PipelineOrchestrator {
k.expose_secret().to_string() k.expose_secret().to_string()
}), }),
); );
let cve_alerts = match async { let cve_alerts = match tokio::time::timeout(
cve_scanner std::time::Duration::from_secs(600),
.scan_dependencies(&repo_id, &mut sbom_entries) async {
.await cve_scanner
} .scan_dependencies(&repo_id, &mut sbom_entries)
.instrument(tracing::info_span!("stage_cve_scanning")) .await
}
.instrument(tracing::info_span!("stage_cve_scanning")),
)
.await .await
{ {
Ok(alerts) => alerts, Ok(Ok(alerts)) => alerts,
Err(e) => { Ok(Err(e)) => {
tracing::warn!("[{repo_id}] CVE scanning failed: {e}"); tracing::warn!("[{repo_id}] CVE scanning failed: {e}");
Vec::new() Vec::new()
} }
Err(_) => {
tracing::warn!("[{repo_id}] CVE scanning timed out after 10 minutes");
Vec::new()
}
}; };
// Stage 4: Pattern Scanning (GDPR + OAuth) // Stage 4: Pattern Scanning (GDPR + OAuth)
+20 -14
View File
@@ -5,20 +5,26 @@ use compliance_core::CoreError;
#[tracing::instrument(skip_all, fields(repo_id = %repo_id))] #[tracing::instrument(skip_all, fields(repo_id = %repo_id))]
pub(super) async fn run_syft(repo_path: &Path, repo_id: &str) -> Result<Vec<SbomEntry>, CoreError> { pub(super) async fn run_syft(repo_path: &Path, repo_id: &str) -> Result<Vec<SbomEntry>, CoreError> {
let output = tokio::process::Command::new("syft") let output = tokio::time::timeout(
.arg(repo_path) std::time::Duration::from_secs(300),
.args(["-o", "cyclonedx-json"]) tokio::process::Command::new("syft")
// Enable remote license lookups for all ecosystems .arg(repo_path)
.env("SYFT_GOLANG_SEARCH_REMOTE_LICENSES", "true") .args(["-o", "cyclonedx-json"])
.env("SYFT_JAVASCRIPT_SEARCH_REMOTE_LICENSES", "true") .env("SYFT_GOLANG_SEARCH_REMOTE_LICENSES", "true")
.env("SYFT_PYTHON_SEARCH_REMOTE_LICENSES", "true") .env("SYFT_JAVASCRIPT_SEARCH_REMOTE_LICENSES", "true")
.env("SYFT_JAVA_USE_NETWORK", "true") .env("SYFT_PYTHON_SEARCH_REMOTE_LICENSES", "true")
.output() .env("SYFT_JAVA_USE_NETWORK", "true")
.await .output(),
.map_err(|e| CoreError::Scanner { )
scanner: "syft".to_string(), .await
source: Box::new(e), .map_err(|_| CoreError::Scanner {
})?; scanner: "syft".to_string(),
source: "timed out after 5 minutes".into(),
})?
.map_err(|e| CoreError::Scanner {
scanner: "syft".to_string(),
source: Box::new(e),
})?;
if !output.status.success() { if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr); let stderr = String::from_utf8_lossy(&output.stderr);
+24 -9
View File
@@ -19,15 +19,30 @@ impl Scanner for SemgrepScanner {
#[tracing::instrument(skip_all)] #[tracing::instrument(skip_all)]
async fn scan(&self, repo_path: &Path, repo_id: &str) -> Result<ScanOutput, CoreError> { async fn scan(&self, repo_path: &Path, repo_id: &str) -> Result<ScanOutput, CoreError> {
let output = tokio::process::Command::new("semgrep") let output = tokio::time::timeout(
.args(["--config=auto", "--json", "--quiet"]) std::time::Duration::from_secs(600),
.arg(repo_path) tokio::process::Command::new("semgrep")
.output() .args([
.await "--config=auto",
.map_err(|e| CoreError::Scanner { "--json",
scanner: "semgrep".to_string(), "--quiet",
source: Box::new(e), "--max-memory",
})?; "500",
"--jobs",
"1",
])
.arg(repo_path)
.output(),
)
.await
.map_err(|_| CoreError::Scanner {
scanner: "semgrep".to_string(),
source: "timed out after 10 minutes".into(),
})?
.map_err(|e| CoreError::Scanner {
scanner: "semgrep".to_string(),
source: Box::new(e),
})?;
if !output.status.success() && output.stdout.is_empty() { if !output.status.success() && output.stdout.is_empty() {
let stderr = String::from_utf8_lossy(&output.stderr); let stderr = String::from_utf8_lossy(&output.stderr);
+87 -19
View File
@@ -6,11 +6,16 @@ use compliance_core::models::embedding::{CodeEmbedding, EmbeddingBuildRun, Embed
use compliance_core::models::graph::CodeNode; use compliance_core::models::graph::CodeNode;
use compliance_graph::graph::chunking::extract_chunks; use compliance_graph::graph::chunking::extract_chunks;
use compliance_graph::graph::embedding_store::EmbeddingStore; use compliance_graph::graph::embedding_store::EmbeddingStore;
use futures_util::stream::{FuturesUnordered, StreamExt};
use tracing::{error, info}; use tracing::{error, info};
use crate::error::AgentError; use crate::error::AgentError;
use crate::llm::LlmClient; use crate::llm::LlmClient;
const EMBED_BATCH_SIZE: usize = 20;
const EMBED_CONCURRENCY: usize = 4;
const EMBED_FLUSH_EVERY: usize = 200;
/// RAG pipeline for building embeddings and performing retrieval /// RAG pipeline for building embeddings and performing retrieval
pub struct RagPipeline { pub struct RagPipeline {
llm: Arc<LlmClient>, llm: Arc<LlmClient>,
@@ -77,25 +82,33 @@ impl RagPipeline {
.await .await
.map_err(|e| AgentError::Other(format!("Failed to delete old embeddings: {e}")))?; .map_err(|e| AgentError::Other(format!("Failed to delete old embeddings: {e}")))?;
// Step 3: Batch embed (small batches to stay within model limits) // Step 3: Batch embed with bounded concurrency. Flush to Mongo and
let batch_size = 20; // update progress periodically so the dashboard can show live status.
let mut all_embeddings = Vec::new(); let mut pending = Vec::with_capacity(EMBED_FLUSH_EVERY);
let mut embedded_count = 0u32; let mut embedded_count = 0u32;
for batch_start in (0..chunks.len()).step_by(batch_size) { // Build the list of batch indices to process.
let batch_end = (batch_start + batch_size).min(chunks.len()); let batches: Vec<(usize, usize)> = (0..chunks.len())
let batch_chunks = &chunks[batch_start..batch_end]; .step_by(EMBED_BATCH_SIZE)
.map(|start| (start, (start + EMBED_BATCH_SIZE).min(chunks.len())))
.collect();
// Prepare texts: context_header + content let mut batch_iter = batches.into_iter();
let texts: Vec<String> = batch_chunks let mut in_flight = FuturesUnordered::new();
.iter()
.map(|c| format!("{}\n{}", c.context_header, c.content))
.collect();
match self.llm.embed(texts).await { // Prime up to EMBED_CONCURRENCY batches.
Ok(vectors) => { for _ in 0..EMBED_CONCURRENCY {
if let Some((start, end)) = batch_iter.next() {
in_flight.push(self.embed_batch(&chunks[start..end], start, end));
}
}
while let Some(result) = in_flight.next().await {
match result {
Ok((start, end, vectors)) => {
let batch_chunks = &chunks[start..end];
for (chunk, embedding) in batch_chunks.iter().zip(vectors) { for (chunk, embedding) in batch_chunks.iter().zip(vectors) {
all_embeddings.push(CodeEmbedding { pending.push(CodeEmbedding {
id: None, id: None,
repo_id: repo_id.to_string(), repo_id: repo_id.to_string(),
graph_build_id: graph_build_id.to_string(), graph_build_id: graph_build_id.to_string(),
@@ -113,9 +126,45 @@ impl RagPipeline {
}); });
} }
embedded_count += batch_chunks.len() as u32; embedded_count += batch_chunks.len() as u32;
// Flush pending embeddings to Mongo periodically and update progress.
if pending.len() >= EMBED_FLUSH_EVERY {
self.embedding_store
.store_embeddings(&pending)
.await
.map_err(|e| {
AgentError::Other(format!("Failed to store embeddings: {e}"))
})?;
pending.clear();
}
// Always update the progress counter on the build doc — even if
// we haven't flushed embeddings yet — so the UI shows movement.
if let Err(e) = self
.embedding_store
.update_build(
repo_id,
graph_build_id,
EmbeddingBuildStatus::Running,
embedded_count,
None,
)
.await
{
error!("[{repo_id}] Failed to update build progress: {e}");
}
// Queue the next batch to keep concurrency saturated.
if let Some((s, e)) = batch_iter.next() {
in_flight.push(self.embed_batch(&chunks[s..e], s, e));
}
} }
Err(e) => { Err(e) => {
error!("[{repo_id}] Embedding batch failed: {e}"); error!("[{repo_id}] Embedding batch failed: {e}");
// Flush whatever we have so partial progress isn't lost.
if !pending.is_empty() {
let _ = self.embedding_store.store_embeddings(&pending).await;
}
build.status = EmbeddingBuildStatus::Failed; build.status = EmbeddingBuildStatus::Failed;
build.error_message = Some(e.to_string()); build.error_message = Some(e.to_string());
build.completed_at = Some(Utc::now()); build.completed_at = Some(Utc::now());
@@ -134,11 +183,13 @@ impl RagPipeline {
} }
} }
// Step 4: Store all embeddings // Step 4: Flush any remaining embeddings
self.embedding_store if !pending.is_empty() {
.store_embeddings(&all_embeddings) self.embedding_store
.await .store_embeddings(&pending)
.map_err(|e| AgentError::Other(format!("Failed to store embeddings: {e}")))?; .await
.map_err(|e| AgentError::Other(format!("Failed to store embeddings: {e}")))?;
}
// Step 5: Update build status // Step 5: Update build status
build.status = EmbeddingBuildStatus::Completed; build.status = EmbeddingBuildStatus::Completed;
@@ -161,4 +212,21 @@ impl RagPipeline {
); );
Ok(build) Ok(build)
} }
/// Embed one batch of chunks. Returns the (start, end, vectors) tuple so
/// out-of-order completion from `FuturesUnordered` can still be reconciled
/// against the original chunk slice.
async fn embed_batch(
&self,
batch_chunks: &[compliance_graph::graph::chunking::CodeChunk],
start: usize,
end: usize,
) -> Result<(usize, usize, Vec<Vec<f64>>), AgentError> {
let texts: Vec<String> = batch_chunks
.iter()
.map(|c| format!("{}\n{}", c.context_header, c.content))
.collect();
let vectors = self.llm.embed(texts).await?;
Ok((start, end, vectors))
}
} }
@@ -32,7 +32,7 @@ pub fn AppShell() -> Element {
// Not authenticated — redirect to Keycloak login // Not authenticated — redirect to Keycloak login
rsx! { rsx! {
document::Script { document::Script {
dangerous_inner_html: "window.location.href = '/auth';" "window.location.href = '/auth';"
} }
} }
} }