feat: rag-embedding-ai-chat (#1)
Co-authored-by: Sharang Parnerkar <parnerkarsharang@gmail.com> Reviewed-on: #1
This commit was merged in pull request #1.
This commit is contained in:
@@ -28,8 +28,8 @@ impl WebCrawler {
|
||||
base_url: &str,
|
||||
excluded_paths: &[String],
|
||||
) -> Result<Vec<DiscoveredEndpoint>, CoreError> {
|
||||
let base = Url::parse(base_url)
|
||||
.map_err(|e| CoreError::Dast(format!("Invalid base URL: {e}")))?;
|
||||
let base =
|
||||
Url::parse(base_url).map_err(|e| CoreError::Dast(format!("Invalid base URL: {e}")))?;
|
||||
|
||||
let mut visited: HashSet<String> = HashSet::new();
|
||||
let mut endpoints: Vec<DiscoveredEndpoint> = Vec::new();
|
||||
@@ -95,12 +95,15 @@ impl WebCrawler {
|
||||
let document = Html::parse_document(&body);
|
||||
|
||||
// Extract links
|
||||
let link_selector =
|
||||
Selector::parse("a[href]").unwrap_or_else(|_| Selector::parse("a").expect("valid selector"));
|
||||
let link_selector = match Selector::parse("a[href]") {
|
||||
Ok(s) => s,
|
||||
Err(_) => continue,
|
||||
};
|
||||
for element in document.select(&link_selector) {
|
||||
if let Some(href) = element.value().attr("href") {
|
||||
if let Some(absolute_url) = self.resolve_url(&base, &url, href) {
|
||||
if self.is_same_origin(&base, &absolute_url) && !visited.contains(&absolute_url)
|
||||
if self.is_same_origin(&base, &absolute_url)
|
||||
&& !visited.contains(&absolute_url)
|
||||
{
|
||||
queue.push((absolute_url, depth + 1));
|
||||
}
|
||||
@@ -109,18 +112,18 @@ impl WebCrawler {
|
||||
}
|
||||
|
||||
// Extract forms
|
||||
let form_selector = Selector::parse("form")
|
||||
.unwrap_or_else(|_| Selector::parse("form").expect("valid selector"));
|
||||
let input_selector = Selector::parse("input, select, textarea")
|
||||
.unwrap_or_else(|_| Selector::parse("input").expect("valid selector"));
|
||||
let form_selector = match Selector::parse("form") {
|
||||
Ok(s) => s,
|
||||
Err(_) => continue,
|
||||
};
|
||||
let input_selector = match Selector::parse("input, select, textarea") {
|
||||
Ok(s) => s,
|
||||
Err(_) => continue,
|
||||
};
|
||||
|
||||
for form in document.select(&form_selector) {
|
||||
let action = form.value().attr("action").unwrap_or("");
|
||||
let method = form
|
||||
.value()
|
||||
.attr("method")
|
||||
.unwrap_or("GET")
|
||||
.to_uppercase();
|
||||
let method = form.value().attr("method").unwrap_or("GET").to_uppercase();
|
||||
|
||||
let form_url = self
|
||||
.resolve_url(&base, &url, action)
|
||||
@@ -128,20 +131,12 @@ impl WebCrawler {
|
||||
|
||||
let mut params = Vec::new();
|
||||
for input in form.select(&input_selector) {
|
||||
let name = input
|
||||
.value()
|
||||
.attr("name")
|
||||
.unwrap_or("")
|
||||
.to_string();
|
||||
let name = input.value().attr("name").unwrap_or("").to_string();
|
||||
if name.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let input_type = input
|
||||
.value()
|
||||
.attr("type")
|
||||
.unwrap_or("text")
|
||||
.to_string();
|
||||
let input_type = input.value().attr("type").unwrap_or("text").to_string();
|
||||
|
||||
let location = if method == "GET" {
|
||||
"query".to_string()
|
||||
|
||||
Reference in New Issue
Block a user