Add DAST scanning and code knowledge graph features across the stack: - compliance-dast and compliance-graph workspace crates - Agent API handlers and routes for DAST targets/scans and graph builds - Core models and traits for DAST and graph domains - Dashboard pages for DAST targets/findings/overview and graph explorer/impact - Toast notification system with auto-dismiss for async action feedback - Button click animations and disabled states for better UX Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
201 lines
6.5 KiB
Rust
201 lines
6.5 KiB
Rust
use std::collections::HashSet;
|
|
|
|
use compliance_core::error::CoreError;
|
|
use compliance_core::traits::dast_agent::{DiscoveredEndpoint, EndpointParameter};
|
|
use scraper::{Html, Selector};
|
|
use tracing::info;
|
|
use url::Url;
|
|
|
|
/// Web crawler that discovers endpoints and forms
pub struct WebCrawler {
    // Shared HTTP client used for every page fetch.
    http: reqwest::Client,
    // Maximum link depth from the base URL; depth 0 is the base page itself,
    // so a value of 0 crawls only the starting page.
    max_depth: u32,
    // Delay inserted before each request, in milliseconds (0 disables throttling).
    rate_limit_ms: u64,
}
|
|
|
|
impl WebCrawler {
|
|
pub fn new(http: reqwest::Client, max_depth: u32, rate_limit_ms: u64) -> Self {
|
|
Self {
|
|
http,
|
|
max_depth,
|
|
rate_limit_ms,
|
|
}
|
|
}
|
|
|
|
/// Crawl a target starting from the base URL
|
|
pub async fn crawl(
|
|
&self,
|
|
base_url: &str,
|
|
excluded_paths: &[String],
|
|
) -> Result<Vec<DiscoveredEndpoint>, CoreError> {
|
|
let base = Url::parse(base_url)
|
|
.map_err(|e| CoreError::Dast(format!("Invalid base URL: {e}")))?;
|
|
|
|
let mut visited: HashSet<String> = HashSet::new();
|
|
let mut endpoints: Vec<DiscoveredEndpoint> = Vec::new();
|
|
let mut queue: Vec<(String, u32)> = vec![(base_url.to_string(), 0)];
|
|
|
|
while let Some((url, depth)) = queue.pop() {
|
|
if depth > self.max_depth {
|
|
continue;
|
|
}
|
|
|
|
if visited.contains(&url) {
|
|
continue;
|
|
}
|
|
|
|
// Check exclusions
|
|
if excluded_paths
|
|
.iter()
|
|
.any(|excl| url.contains(excl.as_str()))
|
|
{
|
|
continue;
|
|
}
|
|
|
|
visited.insert(url.clone());
|
|
|
|
// Rate limiting
|
|
if self.rate_limit_ms > 0 {
|
|
tokio::time::sleep(tokio::time::Duration::from_millis(self.rate_limit_ms)).await;
|
|
}
|
|
|
|
// Fetch the page
|
|
let response = match self.http.get(&url).send().await {
|
|
Ok(r) => r,
|
|
Err(_) => continue,
|
|
};
|
|
|
|
let status = response.status();
|
|
let content_type = response
|
|
.headers()
|
|
.get("content-type")
|
|
.and_then(|v| v.to_str().ok())
|
|
.unwrap_or("")
|
|
.to_string();
|
|
|
|
// Record this endpoint
|
|
endpoints.push(DiscoveredEndpoint {
|
|
url: url.clone(),
|
|
method: "GET".to_string(),
|
|
parameters: Vec::new(),
|
|
content_type: Some(content_type.clone()),
|
|
requires_auth: status.as_u16() == 401 || status.as_u16() == 403,
|
|
});
|
|
|
|
if !content_type.contains("text/html") {
|
|
continue;
|
|
}
|
|
|
|
let body = match response.text().await {
|
|
Ok(b) => b,
|
|
Err(_) => continue,
|
|
};
|
|
|
|
// Parse HTML for links and forms
|
|
let document = Html::parse_document(&body);
|
|
|
|
// Extract links
|
|
let link_selector =
|
|
Selector::parse("a[href]").unwrap_or_else(|_| Selector::parse("a").expect("valid selector"));
|
|
for element in document.select(&link_selector) {
|
|
if let Some(href) = element.value().attr("href") {
|
|
if let Some(absolute_url) = self.resolve_url(&base, &url, href) {
|
|
if self.is_same_origin(&base, &absolute_url) && !visited.contains(&absolute_url)
|
|
{
|
|
queue.push((absolute_url, depth + 1));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Extract forms
|
|
let form_selector = Selector::parse("form")
|
|
.unwrap_or_else(|_| Selector::parse("form").expect("valid selector"));
|
|
let input_selector = Selector::parse("input, select, textarea")
|
|
.unwrap_or_else(|_| Selector::parse("input").expect("valid selector"));
|
|
|
|
for form in document.select(&form_selector) {
|
|
let action = form.value().attr("action").unwrap_or("");
|
|
let method = form
|
|
.value()
|
|
.attr("method")
|
|
.unwrap_or("GET")
|
|
.to_uppercase();
|
|
|
|
let form_url = self
|
|
.resolve_url(&base, &url, action)
|
|
.unwrap_or_else(|| url.clone());
|
|
|
|
let mut params = Vec::new();
|
|
for input in form.select(&input_selector) {
|
|
let name = input
|
|
.value()
|
|
.attr("name")
|
|
.unwrap_or("")
|
|
.to_string();
|
|
if name.is_empty() {
|
|
continue;
|
|
}
|
|
|
|
let input_type = input
|
|
.value()
|
|
.attr("type")
|
|
.unwrap_or("text")
|
|
.to_string();
|
|
|
|
let location = if method == "GET" {
|
|
"query".to_string()
|
|
} else {
|
|
"body".to_string()
|
|
};
|
|
|
|
params.push(EndpointParameter {
|
|
name,
|
|
location,
|
|
param_type: Some(input_type),
|
|
example_value: input.value().attr("value").map(|v| v.to_string()),
|
|
});
|
|
}
|
|
|
|
endpoints.push(DiscoveredEndpoint {
|
|
url: form_url,
|
|
method,
|
|
parameters: params,
|
|
content_type: Some("application/x-www-form-urlencoded".to_string()),
|
|
requires_auth: false,
|
|
});
|
|
}
|
|
}
|
|
|
|
info!(endpoints = endpoints.len(), "Crawling complete");
|
|
Ok(endpoints)
|
|
}
|
|
|
|
fn resolve_url(&self, _base: &Url, current_page: &str, href: &str) -> Option<String> {
|
|
// Skip anchors, javascript:, mailto:, etc.
|
|
if href.starts_with('#')
|
|
|| href.starts_with("javascript:")
|
|
|| href.starts_with("mailto:")
|
|
|| href.starts_with("tel:")
|
|
{
|
|
return None;
|
|
}
|
|
|
|
if let Ok(absolute) = Url::parse(href) {
|
|
return Some(absolute.to_string());
|
|
}
|
|
|
|
// Relative URL
|
|
let current = Url::parse(current_page).ok()?;
|
|
current.join(href).ok().map(|u| u.to_string())
|
|
}
|
|
|
|
fn is_same_origin(&self, base: &Url, url: &str) -> bool {
|
|
if let Ok(parsed) = Url::parse(url) {
|
|
parsed.host() == base.host() && parsed.scheme() == base.scheme()
|
|
} else {
|
|
false
|
|
}
|
|
}
|
|
}
|