compliance-scanner-agent/compliance-dast/src/crawler/mod.rs

use std::collections::HashSet;

use compliance_core::error::CoreError;
use compliance_core::traits::dast_agent::{DiscoveredEndpoint, EndpointParameter};
use scraper::{Html, Selector};
use tracing::info;
use url::Url;

/// Web crawler that discovers endpoints and forms
///
/// Construct it with [`WebCrawler::new`] and run it with [`WebCrawler::crawl`].
pub struct WebCrawler {
    /// HTTP client used to issue the GET request for every crawled page.
    http: reqwest::Client,
    /// Largest link depth that is still fetched; the start URL is depth 0.
    max_depth: u32,
    /// Pause, in milliseconds, awaited before each request; `0` disables the pause.
    rate_limit_ms: u64,
}

impl WebCrawler {
    pub fn new(http: reqwest::Client, max_depth: u32, rate_limit_ms: u64) -> Self {
        Self {
            http,
            max_depth,
            rate_limit_ms,
        }
    }

    /// Crawl a target starting from the base URL
    pub async fn crawl(
        &self,
        base_url: &str,
        excluded_paths: &[String],
    ) -> Result<Vec<DiscoveredEndpoint>, CoreError> {
        let base = Url::parse(base_url)
            .map_err(|e| CoreError::Dast(format!("Invalid base URL: {e}")))?;

        let mut visited: HashSet<String> = HashSet::new();
        let mut endpoints: Vec<DiscoveredEndpoint> = Vec::new();
        let mut queue: Vec<(String, u32)> = vec![(base_url.to_string(), 0)];

        while let Some((url, depth)) = queue.pop() {
            if depth > self.max_depth {
                continue;
            }

            if visited.contains(&url) {
                continue;
            }

            // Check exclusions
            if excluded_paths
                .iter()
                .any(|excl| url.contains(excl.as_str()))
            {
                continue;
            }

            visited.insert(url.clone());

            // Rate limiting
            if self.rate_limit_ms > 0 {
                tokio::time::sleep(tokio::time::Duration::from_millis(self.rate_limit_ms)).await;
            }

            // Fetch the page
            let response = match self.http.get(&url).send().await {
                Ok(r) => r,
                Err(_) => continue,
            };

            let status = response.status();
            let content_type = response
                .headers()
                .get("content-type")
                .and_then(|v| v.to_str().ok())
                .unwrap_or("")
                .to_string();

            // Record this endpoint
            endpoints.push(DiscoveredEndpoint {
                url: url.clone(),
                method: "GET".to_string(),
                parameters: Vec::new(),
                content_type: Some(content_type.clone()),
                requires_auth: status.as_u16() == 401 || status.as_u16() == 403,
            });

            if !content_type.contains("text/html") {
                continue;
            }

            let body = match response.text().await {
                Ok(b) => b,
                Err(_) => continue,
            };

            // Parse HTML for links and forms
            let document = Html::parse_document(&body);

            // Extract links
            let link_selector =
                Selector::parse("a[href]").unwrap_or_else(|_| Selector::parse("a").expect("valid selector"));
            for element in document.select(&link_selector) {
                if let Some(href) = element.value().attr("href") {
                    if let Some(absolute_url) = self.resolve_url(&base, &url, href) {
                        if self.is_same_origin(&base, &absolute_url) && !visited.contains(&absolute_url)
                        {
                            queue.push((absolute_url, depth + 1));
                        }
                    }
                }
            }

            // Extract forms
            let form_selector = Selector::parse("form")
                .unwrap_or_else(|_| Selector::parse("form").expect("valid selector"));
            let input_selector = Selector::parse("input, select, textarea")
                .unwrap_or_else(|_| Selector::parse("input").expect("valid selector"));

            for form in document.select(&form_selector) {
                let action = form.value().attr("action").unwrap_or("");
                let method = form
                    .value()
                    .attr("method")
                    .unwrap_or("GET")
                    .to_uppercase();

                let form_url = self
                    .resolve_url(&base, &url, action)
                    .unwrap_or_else(|| url.clone());

                let mut params = Vec::new();
                for input in form.select(&input_selector) {
                    let name = input
                        .value()
                        .attr("name")
                        .unwrap_or("")
                        .to_string();
                    if name.is_empty() {
                        continue;
                    }

                    let input_type = input
                        .value()
                        .attr("type")
                        .unwrap_or("text")
                        .to_string();

                    let location = if method == "GET" {
                        "query".to_string()
                    } else {
                        "body".to_string()
                    };

                    params.push(EndpointParameter {
                        name,
                        location,
                        param_type: Some(input_type),
                        example_value: input.value().attr("value").map(|v| v.to_string()),
                    });
                }

                endpoints.push(DiscoveredEndpoint {
                    url: form_url,
                    method,
                    parameters: params,
                    content_type: Some("application/x-www-form-urlencoded".to_string()),
                    requires_auth: false,
                });
            }
        }

        info!(endpoints = endpoints.len(), "Crawling complete");
        Ok(endpoints)
    }

    fn resolve_url(&self, _base: &Url, current_page: &str, href: &str) -> Option<String> {
        // Skip anchors, javascript:, mailto:, etc.
        if href.starts_with('#')
            || href.starts_with("javascript:")
            || href.starts_with("mailto:")
            || href.starts_with("tel:")
        {
            return None;
        }

        if let Ok(absolute) = Url::parse(href) {
            return Some(absolute.to_string());
        }

        // Relative URL
        let current = Url::parse(current_page).ok()?;
        current.join(href).ok().map(|u| u.to_string())
    }

    fn is_same_origin(&self, base: &Url, url: &str) -> bool {
        if let Ok(parsed) = Url::parse(url) {
            parsed.host() == base.host() && parsed.scheme() == base.scheme()
        } else {
            false
        }
    }
}