use std::collections::HashSet; use compliance_core::error::CoreError; use compliance_core::traits::dast_agent::{DiscoveredEndpoint, EndpointParameter}; use scraper::{Html, Selector}; use tracing::info; use url::Url; /// Web crawler that discovers endpoints and forms pub struct WebCrawler { http: reqwest::Client, max_depth: u32, rate_limit_ms: u64, } impl WebCrawler { pub fn new(http: reqwest::Client, max_depth: u32, rate_limit_ms: u64) -> Self { Self { http, max_depth, rate_limit_ms, } } /// Crawl a target starting from the base URL pub async fn crawl( &self, base_url: &str, excluded_paths: &[String], ) -> Result, CoreError> { let base = Url::parse(base_url) .map_err(|e| CoreError::Dast(format!("Invalid base URL: {e}")))?; let mut visited: HashSet = HashSet::new(); let mut endpoints: Vec = Vec::new(); let mut queue: Vec<(String, u32)> = vec![(base_url.to_string(), 0)]; while let Some((url, depth)) = queue.pop() { if depth > self.max_depth { continue; } if visited.contains(&url) { continue; } // Check exclusions if excluded_paths .iter() .any(|excl| url.contains(excl.as_str())) { continue; } visited.insert(url.clone()); // Rate limiting if self.rate_limit_ms > 0 { tokio::time::sleep(tokio::time::Duration::from_millis(self.rate_limit_ms)).await; } // Fetch the page let response = match self.http.get(&url).send().await { Ok(r) => r, Err(_) => continue, }; let status = response.status(); let content_type = response .headers() .get("content-type") .and_then(|v| v.to_str().ok()) .unwrap_or("") .to_string(); // Record this endpoint endpoints.push(DiscoveredEndpoint { url: url.clone(), method: "GET".to_string(), parameters: Vec::new(), content_type: Some(content_type.clone()), requires_auth: status.as_u16() == 401 || status.as_u16() == 403, }); if !content_type.contains("text/html") { continue; } let body = match response.text().await { Ok(b) => b, Err(_) => continue, }; // Parse HTML for links and forms let document = Html::parse_document(&body); // Extract links let link_selector = Selector::parse("a[href]").unwrap_or_else(|_| Selector::parse("a").expect("valid selector")); for element in document.select(&link_selector) { if let Some(href) = element.value().attr("href") { if let Some(absolute_url) = self.resolve_url(&base, &url, href) { if self.is_same_origin(&base, &absolute_url) && !visited.contains(&absolute_url) { queue.push((absolute_url, depth + 1)); } } } } // Extract forms let form_selector = Selector::parse("form") .unwrap_or_else(|_| Selector::parse("form").expect("valid selector")); let input_selector = Selector::parse("input, select, textarea") .unwrap_or_else(|_| Selector::parse("input").expect("valid selector")); for form in document.select(&form_selector) { let action = form.value().attr("action").unwrap_or(""); let method = form .value() .attr("method") .unwrap_or("GET") .to_uppercase(); let form_url = self .resolve_url(&base, &url, action) .unwrap_or_else(|| url.clone()); let mut params = Vec::new(); for input in form.select(&input_selector) { let name = input .value() .attr("name") .unwrap_or("") .to_string(); if name.is_empty() { continue; } let input_type = input .value() .attr("type") .unwrap_or("text") .to_string(); let location = if method == "GET" { "query".to_string() } else { "body".to_string() }; params.push(EndpointParameter { name, location, param_type: Some(input_type), example_value: input.value().attr("value").map(|v| v.to_string()), }); } endpoints.push(DiscoveredEndpoint { url: form_url, method, parameters: params, content_type: Some("application/x-www-form-urlencoded".to_string()), requires_auth: false, }); } } info!(endpoints = endpoints.len(), "Crawling complete"); Ok(endpoints) } fn resolve_url(&self, _base: &Url, current_page: &str, href: &str) -> Option { // Skip anchors, javascript:, mailto:, etc. if href.starts_with('#') || href.starts_with("javascript:") || href.starts_with("mailto:") || href.starts_with("tel:") { return None; } if let Ok(absolute) = Url::parse(href) { return Some(absolute.to_string()); } // Relative URL let current = Url::parse(current_page).ok()?; current.join(href).ok().map(|u| u.to_string()) } fn is_same_origin(&self, base: &Url, url: &str) -> bool { if let Ok(parsed) = Url::parse(url) { parsed.host() == base.host() && parsed.scheme() == base.scheme() } else { false } } }