Files
compliance-scanner-agent/compliance-dast/src/crawler/mod.rs
Sharang Parnerkar cea8f59e10 Add DAST, graph modules, toast notifications, and dashboard enhancements
Add DAST scanning and code knowledge graph features across the stack:
- compliance-dast and compliance-graph workspace crates
- Agent API handlers and routes for DAST targets/scans and graph builds
- Core models and traits for DAST and graph domains
- Dashboard pages for DAST targets/findings/overview and graph explorer/impact
- Toast notification system with auto-dismiss for async action feedback
- Button click animations and disabled states for better UX

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-04 13:53:50 +01:00

201 lines
6.5 KiB
Rust

use std::collections::HashSet;
use compliance_core::error::CoreError;
use compliance_core::traits::dast_agent::{DiscoveredEndpoint, EndpointParameter};
use scraper::{Html, Selector};
use tracing::info;
use url::Url;
/// Web crawler that discovers endpoints and forms.
///
/// Starting from a base URL, [`WebCrawler::crawl`] fetches pages over HTTP and
/// records every fetched URL plus every `<form>` found in HTML responses.
pub struct WebCrawler {
/// HTTP client used to issue the GET request for each crawled URL.
http: reqwest::Client,
/// Depth limit: queued URLs whose depth exceeds this value are skipped
/// (the start URL has depth 0).
max_depth: u32,
/// Delay in milliseconds slept before each request; `0` disables the delay.
rate_limit_ms: u64,
}
impl WebCrawler {
pub fn new(http: reqwest::Client, max_depth: u32, rate_limit_ms: u64) -> Self {
Self {
http,
max_depth,
rate_limit_ms,
}
}
/// Crawl a target starting from the base URL
pub async fn crawl(
&self,
base_url: &str,
excluded_paths: &[String],
) -> Result<Vec<DiscoveredEndpoint>, CoreError> {
let base = Url::parse(base_url)
.map_err(|e| CoreError::Dast(format!("Invalid base URL: {e}")))?;
let mut visited: HashSet<String> = HashSet::new();
let mut endpoints: Vec<DiscoveredEndpoint> = Vec::new();
let mut queue: Vec<(String, u32)> = vec![(base_url.to_string(), 0)];
while let Some((url, depth)) = queue.pop() {
if depth > self.max_depth {
continue;
}
if visited.contains(&url) {
continue;
}
// Check exclusions
if excluded_paths
.iter()
.any(|excl| url.contains(excl.as_str()))
{
continue;
}
visited.insert(url.clone());
// Rate limiting
if self.rate_limit_ms > 0 {
tokio::time::sleep(tokio::time::Duration::from_millis(self.rate_limit_ms)).await;
}
// Fetch the page
let response = match self.http.get(&url).send().await {
Ok(r) => r,
Err(_) => continue,
};
let status = response.status();
let content_type = response
.headers()
.get("content-type")
.and_then(|v| v.to_str().ok())
.unwrap_or("")
.to_string();
// Record this endpoint
endpoints.push(DiscoveredEndpoint {
url: url.clone(),
method: "GET".to_string(),
parameters: Vec::new(),
content_type: Some(content_type.clone()),
requires_auth: status.as_u16() == 401 || status.as_u16() == 403,
});
if !content_type.contains("text/html") {
continue;
}
let body = match response.text().await {
Ok(b) => b,
Err(_) => continue,
};
// Parse HTML for links and forms
let document = Html::parse_document(&body);
// Extract links
let link_selector =
Selector::parse("a[href]").unwrap_or_else(|_| Selector::parse("a").expect("valid selector"));
for element in document.select(&link_selector) {
if let Some(href) = element.value().attr("href") {
if let Some(absolute_url) = self.resolve_url(&base, &url, href) {
if self.is_same_origin(&base, &absolute_url) && !visited.contains(&absolute_url)
{
queue.push((absolute_url, depth + 1));
}
}
}
}
// Extract forms
let form_selector = Selector::parse("form")
.unwrap_or_else(|_| Selector::parse("form").expect("valid selector"));
let input_selector = Selector::parse("input, select, textarea")
.unwrap_or_else(|_| Selector::parse("input").expect("valid selector"));
for form in document.select(&form_selector) {
let action = form.value().attr("action").unwrap_or("");
let method = form
.value()
.attr("method")
.unwrap_or("GET")
.to_uppercase();
let form_url = self
.resolve_url(&base, &url, action)
.unwrap_or_else(|| url.clone());
let mut params = Vec::new();
for input in form.select(&input_selector) {
let name = input
.value()
.attr("name")
.unwrap_or("")
.to_string();
if name.is_empty() {
continue;
}
let input_type = input
.value()
.attr("type")
.unwrap_or("text")
.to_string();
let location = if method == "GET" {
"query".to_string()
} else {
"body".to_string()
};
params.push(EndpointParameter {
name,
location,
param_type: Some(input_type),
example_value: input.value().attr("value").map(|v| v.to_string()),
});
}
endpoints.push(DiscoveredEndpoint {
url: form_url,
method,
parameters: params,
content_type: Some("application/x-www-form-urlencoded".to_string()),
requires_auth: false,
});
}
}
info!(endpoints = endpoints.len(), "Crawling complete");
Ok(endpoints)
}
fn resolve_url(&self, _base: &Url, current_page: &str, href: &str) -> Option<String> {
// Skip anchors, javascript:, mailto:, etc.
if href.starts_with('#')
|| href.starts_with("javascript:")
|| href.starts_with("mailto:")
|| href.starts_with("tel:")
{
return None;
}
if let Ok(absolute) = Url::parse(href) {
return Some(absolute.to_string());
}
// Relative URL
let current = Url::parse(current_page).ok()?;
current.join(href).ok().map(|u| u.to_string())
}
fn is_same_origin(&self, base: &Url, url: &str) -> bool {
if let Ok(parsed) = Url::parse(url) {
parsed.host() == base.host() && parsed.scheme() == base.scheme()
} else {
false
}
}
}