// Reviewed-on: #16
// Co-authored-by: Sharang Parnerkar <parnerkarsharang@gmail.com>
use crate::models::NewsCard;
|
|
use dioxus::prelude::*;
|
|
|
|
// Server-side helpers and types are only needed for the server build.
|
|
// The #[server] macro generates a client stub for the web build that
|
|
// sends a network request instead of executing this function body.
|
|
#[cfg(feature = "server")]
|
|
pub(crate) mod inner {
|
|
use serde::Deserialize;
|
|
use std::collections::HashSet;
|
|
|
|
/// Individual result from the SearXNG search API.
|
|
#[derive(Debug, Deserialize)]
|
|
pub(crate) struct SearxngResult {
|
|
pub title: String,
|
|
pub url: String,
|
|
pub content: Option<String>,
|
|
#[serde(rename = "publishedDate")]
|
|
pub published_date: Option<String>,
|
|
pub thumbnail: Option<String>,
|
|
/// Relevance score assigned by SearXNG (higher = more relevant).
|
|
#[serde(default)]
|
|
pub score: f64,
|
|
}
|
|
|
|
/// Top-level response from the SearXNG search API.
|
|
#[derive(Debug, Deserialize)]
|
|
pub(crate) struct SearxngResponse {
|
|
pub results: Vec<SearxngResult>,
|
|
}
|
|
|
|
/// Extract the domain name from a URL to use as the source label.
|
|
///
|
|
/// Strips common prefixes like "www." for cleaner display.
|
|
///
|
|
/// # Arguments
|
|
///
|
|
/// * `url_str` - The full URL string
|
|
///
|
|
/// # Returns
|
|
///
|
|
/// The domain host or a fallback "Web" string
|
|
pub(crate) fn extract_source(url_str: &str) -> String {
|
|
url::Url::parse(url_str)
|
|
.ok()
|
|
.and_then(|u| u.host_str().map(String::from))
|
|
.map(|host| host.strip_prefix("www.").unwrap_or(&host).to_string())
|
|
.unwrap_or_else(|| "Web".into())
|
|
}
|
|
|
|
/// Deduplicate and rank search results for quality, similar to Perplexity.
|
|
///
|
|
/// Applies the following filters in order:
|
|
/// 1. Remove results with empty content (no snippet = low value)
|
|
/// 2. Deduplicate by domain (keep highest-scored result per domain)
|
|
/// 3. Sort by SearXNG relevance score (descending)
|
|
/// 4. Cap at `max_results`
|
|
///
|
|
/// # Arguments
|
|
///
|
|
/// * `results` - Raw search results from SearXNG
|
|
/// * `max_results` - Maximum number of results to return
|
|
///
|
|
/// # Returns
|
|
///
|
|
/// Filtered, deduplicated, and ranked results
|
|
pub(crate) fn rank_and_deduplicate(
|
|
mut results: Vec<SearxngResult>,
|
|
max_results: usize,
|
|
) -> Vec<SearxngResult> {
|
|
// Filter out results with no meaningful content
|
|
results.retain(|r| r.content.as_ref().is_some_and(|c| c.trim().len() >= 20));
|
|
|
|
// Sort by score descending so we keep the best result per domain
|
|
results.sort_by(|a, b| {
|
|
b.score
|
|
.partial_cmp(&a.score)
|
|
.unwrap_or(std::cmp::Ordering::Equal)
|
|
});
|
|
|
|
// Deduplicate by domain: keep only the first (highest-scored) per domain
|
|
let mut seen_domains = HashSet::new();
|
|
results.retain(|r| {
|
|
let domain = extract_source(&r.url);
|
|
seen_domains.insert(domain)
|
|
});
|
|
|
|
results.truncate(max_results);
|
|
results
|
|
}
|
|
}
|
|
|
|
/// Search for news using the SearXNG meta-search engine.
|
|
///
|
|
/// Uses Perplexity-style query enrichment and result ranking:
|
|
/// - Queries the "news" and "general" categories for fresh, relevant results
|
|
/// - Filters to the last month for recency
|
|
/// - Deduplicates by domain for source diversity
|
|
/// - Ranks by SearXNG relevance score
|
|
/// - Filters out results without meaningful content
|
|
///
|
|
/// # Arguments
|
|
///
|
|
/// * `query` - The search query string
|
|
///
|
|
/// # Returns
|
|
///
|
|
/// Up to 15 high-quality `NewsCard` results, or a `ServerFnError` on failure
|
|
///
|
|
/// # Errors
|
|
///
|
|
/// Returns `ServerFnError` if the SearXNG request fails or response parsing fails
|
|
#[post("/api/search")]
|
|
pub async fn search_topic(query: String) -> Result<Vec<NewsCard>, ServerFnError> {
|
|
use inner::{extract_source, rank_and_deduplicate, SearxngResponse};
|
|
|
|
let state: crate::infrastructure::ServerState =
|
|
dioxus_fullstack::FullstackContext::extract().await?;
|
|
let searxng_url = state.services.searxng_url.clone();
|
|
|
|
// Enrich the query with "latest news" context for better results,
|
|
// similar to how Perplexity reformulates queries before searching.
|
|
let enriched_query = format!("{query} latest news");
|
|
|
|
// Use POST with form-encoded body because SearXNG's default config
|
|
// sets `method: "POST"` which rejects GET requests with 405.
|
|
let search_url = format!("{searxng_url}/search");
|
|
let params = [
|
|
("q", enriched_query.as_str()),
|
|
("format", "json"),
|
|
("language", "en"),
|
|
("categories", "news,general"),
|
|
("time_range", "month"),
|
|
];
|
|
|
|
let client = reqwest::Client::new();
|
|
let resp = client
|
|
.post(&search_url)
|
|
.form(¶ms)
|
|
.send()
|
|
.await
|
|
.map_err(|e| ServerFnError::new(format!("SearXNG request failed: {e}")))?;
|
|
|
|
if !resp.status().is_success() {
|
|
return Err(ServerFnError::new(format!(
|
|
"SearXNG returned status {}",
|
|
resp.status()
|
|
)));
|
|
}
|
|
|
|
let body: SearxngResponse = resp
|
|
.json()
|
|
.await
|
|
.map_err(|e| ServerFnError::new(format!("Failed to parse SearXNG response: {e}")))?;
|
|
|
|
// Apply Perplexity-style ranking: filter empties, deduplicate domains, sort by score
|
|
let ranked = rank_and_deduplicate(body.results, 15);
|
|
|
|
let cards: Vec<NewsCard> = ranked
|
|
.into_iter()
|
|
.map(|r| {
|
|
let summary = r
|
|
.content
|
|
.clone()
|
|
.unwrap_or_default()
|
|
.chars()
|
|
.take(200)
|
|
.collect::<String>();
|
|
let content = r.content.unwrap_or_default();
|
|
NewsCard {
|
|
title: r.title,
|
|
source: extract_source(&r.url),
|
|
summary,
|
|
content,
|
|
category: query.clone(),
|
|
url: r.url,
|
|
thumbnail_url: r.thumbnail,
|
|
published_at: r.published_date.unwrap_or_else(|| "Recent".into()),
|
|
}
|
|
})
|
|
.collect();
|
|
|
|
Ok(cards)
|
|
}
|
|
|
|
/// Fetch trending topic keywords by running a broad news search and
|
|
/// extracting the most frequent meaningful terms from result titles.
|
|
///
|
|
/// This approach works regardless of whether SearXNG has autocomplete
|
|
/// configured, since it uses the standard search API.
|
|
///
|
|
/// # Returns
|
|
///
|
|
/// Up to 8 trending keyword strings, or a `ServerFnError` on failure
|
|
///
|
|
/// # Errors
|
|
///
|
|
/// Returns `ServerFnError` if the SearXNG search request fails
|
|
#[get("/api/trending")]
|
|
pub async fn get_trending_topics() -> Result<Vec<String>, ServerFnError> {
|
|
use inner::SearxngResponse;
|
|
use std::collections::HashMap;
|
|
|
|
let state: crate::infrastructure::ServerState =
|
|
dioxus_fullstack::FullstackContext::extract().await?;
|
|
let searxng_url = state.services.searxng_url.clone();
|
|
|
|
// Use POST to match SearXNG's default `method: "POST"` setting
|
|
let search_url = format!("{searxng_url}/search");
|
|
let params = [
|
|
("q", "trending technology AI"),
|
|
("format", "json"),
|
|
("language", "en"),
|
|
("categories", "news"),
|
|
("time_range", "week"),
|
|
];
|
|
|
|
let client = reqwest::Client::builder()
|
|
.timeout(std::time::Duration::from_secs(5))
|
|
.build()
|
|
.map_err(|e| ServerFnError::new(format!("HTTP client error: {e}")))?;
|
|
|
|
let resp = client
|
|
.post(&search_url)
|
|
.form(¶ms)
|
|
.send()
|
|
.await
|
|
.map_err(|e| ServerFnError::new(format!("SearXNG trending search failed: {e}")))?;
|
|
|
|
if !resp.status().is_success() {
|
|
return Err(ServerFnError::new(format!(
|
|
"SearXNG trending search returned status {}",
|
|
resp.status()
|
|
)));
|
|
}
|
|
|
|
let body: SearxngResponse = resp
|
|
.json()
|
|
.await
|
|
.map_err(|e| ServerFnError::new(format!("Failed to parse trending response: {e}")))?;
|
|
|
|
// Common stop words to exclude from trending keywords
|
|
const STOP_WORDS: &[&str] = &[
|
|
"the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by",
|
|
"from", "is", "are", "was", "were", "be", "been", "has", "have", "had", "do", "does",
|
|
"did", "will", "would", "could", "should", "may", "can", "not", "no", "it", "its", "this",
|
|
"that", "these", "how", "what", "why", "who", "when", "new", "says", "said", "about",
|
|
"after", "over", "into", "up", "out", "as", "all", "more", "than", "just", "now", "also",
|
|
"us", "we", "you", "your", "our", "if", "so", "like", "get", "make", "year", "years",
|
|
"one", "two",
|
|
];
|
|
|
|
// Count word frequency across all result titles. Words are lowercased
|
|
// and must be at least 3 characters to filter out noise.
|
|
let mut word_counts: HashMap<String, u32> = HashMap::new();
|
|
for result in &body.results {
|
|
for word in result.title.split_whitespace() {
|
|
// Strip punctuation from edges, lowercase
|
|
let clean: String = word
|
|
.trim_matches(|c: char| !c.is_alphanumeric())
|
|
.to_lowercase();
|
|
if clean.len() >= 3 && !STOP_WORDS.contains(&clean.as_str()) {
|
|
*word_counts.entry(clean).or_insert(0) += 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Sort by frequency descending, take top 8
|
|
let mut sorted: Vec<(String, u32)> = word_counts.into_iter().collect();
|
|
sorted.sort_by(|a, b| b.1.cmp(&a.1));
|
|
|
|
// Capitalize first letter for display
|
|
let topics: Vec<String> = sorted
|
|
.into_iter()
|
|
.filter(|(_, count)| *count >= 2)
|
|
.take(8)
|
|
.map(|(word, _)| {
|
|
let mut chars = word.chars();
|
|
match chars.next() {
|
|
Some(c) => c.to_uppercase().to_string() + chars.as_str(),
|
|
None => word,
|
|
}
|
|
})
|
|
.collect();
|
|
|
|
Ok(topics)
|
|
}
|
|
|
|
#[cfg(all(test, feature = "server"))]
mod tests {
    #![allow(clippy::unwrap_used, clippy::expect_used)]

    use super::inner::*;
    use pretty_assertions::assert_eq;

    // -----------------------------------------------------------------------
    // extract_source()
    // -----------------------------------------------------------------------

    #[test]
    fn extract_source_strips_www() {
        let source = extract_source("https://www.example.com/page");
        assert_eq!(source, "example.com");
    }

    #[test]
    fn extract_source_returns_domain() {
        let source = extract_source("https://techcrunch.com/article");
        assert_eq!(source, "techcrunch.com");
    }

    #[test]
    fn extract_source_invalid_url_returns_web() {
        assert_eq!(extract_source("not-a-url"), "Web");
    }

    #[test]
    fn extract_source_no_scheme_returns_web() {
        // A bare domain has no scheme, so url::Url::parse rejects it and
        // the fallback label is used.
        assert_eq!(extract_source("example.com/path"), "Web");
    }

    // -----------------------------------------------------------------------
    // rank_and_deduplicate()
    // -----------------------------------------------------------------------

    /// Build a `SearxngResult` fixture; an empty `content` string maps to `None`.
    fn make_result(url: &str, content: &str, score: f64) -> SearxngResult {
        let content = (!content.is_empty()).then(|| content.to_string());
        SearxngResult {
            title: "Title".into(),
            url: url.into(),
            content,
            published_date: None,
            thumbnail: None,
            score,
        }
    }

    #[test]
    fn rank_filters_empty_content() {
        let no_snippet = make_result("https://a.com", "", 10.0);
        let with_snippet = make_result(
            "https://b.com",
            "This is meaningful content that passes the length filter",
            5.0,
        );
        let ranked = rank_and_deduplicate(vec![no_snippet, with_snippet], 10);
        assert_eq!(ranked.len(), 1);
        assert_eq!(ranked[0].url, "https://b.com");
    }

    #[test]
    fn rank_filters_short_content() {
        let too_short = make_result("https://a.com", "short", 10.0);
        let long_enough = make_result(
            "https://b.com",
            "This content is long enough to pass the 20-char filter threshold",
            5.0,
        );
        let ranked = rank_and_deduplicate(vec![too_short, long_enough], 10);
        assert_eq!(ranked.len(), 1);
    }

    #[test]
    fn rank_deduplicates_by_domain_keeps_highest() {
        let low = make_result(
            "https://example.com/page1",
            "First result with enough content here for the filter",
            3.0,
        );
        let high = make_result(
            "https://example.com/page2",
            "Second result with enough content here for the filter",
            8.0,
        );
        let ranked = rank_and_deduplicate(vec![low, high], 10);
        assert_eq!(ranked.len(), 1);
        // Only the higher-scored page2 (score 8.0) survives deduplication.
        assert_eq!(ranked[0].url, "https://example.com/page2");
    }

    #[test]
    fn rank_sorts_by_score_descending() {
        let results = vec![
            make_result(
                "https://a.com/p",
                "Content A that is long enough to pass the filter check",
                1.0,
            ),
            make_result(
                "https://b.com/p",
                "Content B that is long enough to pass the filter check",
                5.0,
            ),
            make_result(
                "https://c.com/p",
                "Content C that is long enough to pass the filter check",
                3.0,
            ),
        ];
        let ranked = rank_and_deduplicate(results, 10);
        assert_eq!(ranked.len(), 3);
        // Scores must be monotonically non-increasing.
        assert!(ranked.windows(2).all(|pair| pair[0].score >= pair[1].score));
    }

    #[test]
    fn rank_truncates_to_max_results() {
        let results: Vec<_> = (0..20)
            .map(|i| {
                make_result(
                    &format!("https://site{i}.com/page"),
                    &format!("Content for site {i} that is long enough to pass the filter"),
                    f64::from(i),
                )
            })
            .collect();
        assert_eq!(rank_and_deduplicate(results, 5).len(), 5);
    }

    #[test]
    fn rank_empty_input_returns_empty() {
        assert!(rank_and_deduplicate(Vec::new(), 10).is_empty());
    }

    #[test]
    fn rank_all_filtered_returns_empty() {
        let no_content = make_result("https://a.com", "", 10.0);
        let short_content = make_result("https://b.com", "too short", 5.0);
        let ranked = rank_and_deduplicate(vec![no_content, short_content], 10);
        assert!(ranked.is_empty());
    }
}
|