Files
certifai/src/infrastructure/searxng.rs
Sharang Parnerkar e130969cd9
All checks were successful
CI / Clippy (pull_request) Successful in 2m21s
CI / Security Audit (pull_request) Has been skipped
CI / Tests (pull_request) Has been skipped
CI / Deploy (push) Has been skipped
CI / Deploy (pull_request) Has been skipped
CI / Format (push) Successful in 3s
CI / Clippy (push) Successful in 2m22s
CI / Security Audit (push) Has been skipped
CI / Tests (push) Has been skipped
CI / Format (pull_request) Successful in 2s
feat(infra): add ServerState, MongoDB, auth middleware, and DaisyUI theme toggle
Introduce centralized ServerState (Arc-wrapped, Box::leaked configs) loaded
once at startup, replacing per-request dotenvy/env::var calls across all
server functions. Add MongoDB Database wrapper with connection pooling.
Add tower middleware that gates all /api/ server function endpoints behind
session authentication (401 for unauthenticated callers, except check-auth).
Fix DaisyUI theme toggle to use certifai-dark/certifai-light theme names
and replace hardcoded hex colors in main.css with CSS variables.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 15:35:59 +01:00

288 lines
9.7 KiB
Rust

use crate::models::NewsCard;
use dioxus::prelude::*;
// Server-side helpers and types are only needed for the server build.
// The #[server] macro generates a client stub for the web build that
// sends a network request instead of executing this function body.
#[cfg(feature = "server")]
mod inner {
    use serde::Deserialize;
    use std::collections::HashSet;

    /// Individual result from the SearXNG search API.
    #[derive(Debug, Deserialize)]
    pub(super) struct SearxngResult {
        pub title: String,
        pub url: String,
        pub content: Option<String>,
        #[serde(rename = "publishedDate")]
        pub published_date: Option<String>,
        pub thumbnail: Option<String>,
        /// Relevance score assigned by SearXNG (higher = more relevant).
        /// Defaults to 0.0 when the instance omits it.
        #[serde(default)]
        pub score: f64,
    }

    /// Top-level response from the SearXNG search API.
    #[derive(Debug, Deserialize)]
    pub(super) struct SearxngResponse {
        pub results: Vec<SearxngResult>,
    }

    /// Extract the domain name from a URL to use as the source label.
    ///
    /// Strips common prefixes like "www." for cleaner display.
    ///
    /// # Arguments
    ///
    /// * `url_str` - The full URL string
    ///
    /// # Returns
    ///
    /// The domain host or a fallback "Web" string
    pub(super) fn extract_source(url_str: &str) -> String {
        url::Url::parse(url_str)
            .ok()
            .and_then(|u| {
                // Strip "www." on the borrowed host before allocating, so we
                // build exactly one String instead of two.
                u.host_str()
                    .map(|host| host.strip_prefix("www.").unwrap_or(host).to_string())
            })
            .unwrap_or_else(|| "Web".into())
    }

    /// Deduplicate and rank search results for quality, similar to Perplexity.
    ///
    /// Applies the following filters in order:
    /// 1. Remove results with empty content (no snippet = low value)
    /// 2. Deduplicate by domain (keep highest-scored result per domain)
    /// 3. Sort by SearXNG relevance score (descending)
    /// 4. Cap at `max_results`
    ///
    /// # Arguments
    ///
    /// * `results` - Raw search results from SearXNG
    /// * `max_results` - Maximum number of results to return
    ///
    /// # Returns
    ///
    /// Filtered, deduplicated, and ranked results
    pub(super) fn rank_and_deduplicate(
        mut results: Vec<SearxngResult>,
        max_results: usize,
    ) -> Vec<SearxngResult> {
        // Filter out results with no meaningful content (< 20 chars trimmed).
        results.retain(|r| r.content.as_ref().is_some_and(|c| c.trim().len() >= 20));
        // Sort by score descending so we keep the best result per domain.
        // `total_cmp` provides a true total order over f64 (NaN sorts
        // deterministically), whereas `partial_cmp(..).unwrap_or(Equal)` can
        // violate the sort's total-order contract when NaN scores appear.
        // `sort_unstable_by` is fine here: equal-score relative order is not
        // part of the function's contract.
        results.sort_unstable_by(|a, b| b.score.total_cmp(&a.score));
        // Deduplicate by domain: keep only the first (highest-scored) per
        // domain. `HashSet::insert` returns false on a repeat, dropping it.
        let mut seen_domains = HashSet::new();
        results.retain(|r| seen_domains.insert(extract_source(&r.url)));
        results.truncate(max_results);
        results
    }
}
/// Search for news using the SearXNG meta-search engine.
///
/// Uses Perplexity-style query enrichment and result ranking:
/// - Queries the "news" and "general" categories for fresh, relevant results
/// - Filters to the last month for recency
/// - Deduplicates by domain for source diversity
/// - Ranks by SearXNG relevance score
/// - Filters out results without meaningful content
///
/// # Arguments
///
/// * `query` - The search query string
///
/// # Returns
///
/// Up to 15 high-quality `NewsCard` results, or a `ServerFnError` on failure
///
/// # Errors
///
/// Returns `ServerFnError` if the SearXNG request fails or response parsing fails
#[post("/api/search")]
pub async fn search_topic(query: String) -> Result<Vec<NewsCard>, ServerFnError> {
use inner::{extract_source, rank_and_deduplicate, SearxngResponse};
let state: crate::infrastructure::ServerState =
dioxus_fullstack::FullstackContext::extract().await?;
let searxng_url = state.services.searxng_url.clone();
// Enrich the query with "latest news" context for better results,
// similar to how Perplexity reformulates queries before searching.
let enriched_query = format!("{query} latest news");
// Use POST with form-encoded body because SearXNG's default config
// sets `method: "POST"` which rejects GET requests with 405.
let search_url = format!("{searxng_url}/search");
let params = [
("q", enriched_query.as_str()),
("format", "json"),
("language", "en"),
("categories", "news,general"),
("time_range", "month"),
];
let client = reqwest::Client::new();
let resp = client
.post(&search_url)
.form(&params)
.send()
.await
.map_err(|e| ServerFnError::new(format!("SearXNG request failed: {e}")))?;
if !resp.status().is_success() {
return Err(ServerFnError::new(format!(
"SearXNG returned status {}",
resp.status()
)));
}
let body: SearxngResponse = resp
.json()
.await
.map_err(|e| ServerFnError::new(format!("Failed to parse SearXNG response: {e}")))?;
// Apply Perplexity-style ranking: filter empties, deduplicate domains, sort by score
let ranked = rank_and_deduplicate(body.results, 15);
let cards: Vec<NewsCard> = ranked
.into_iter()
.map(|r| {
let summary = r
.content
.clone()
.unwrap_or_default()
.chars()
.take(200)
.collect::<String>();
let content = r.content.unwrap_or_default();
NewsCard {
title: r.title,
source: extract_source(&r.url),
summary,
content,
category: query.clone(),
url: r.url,
thumbnail_url: r.thumbnail,
published_at: r.published_date.unwrap_or_else(|| "Recent".into()),
}
})
.collect();
Ok(cards)
}
/// Fetch trending topic keywords by running a broad news search and
/// extracting the most frequent meaningful terms from result titles.
///
/// This approach works regardless of whether SearXNG has autocomplete
/// configured, since it uses the standard search API.
///
/// # Returns
///
/// Up to 8 trending keyword strings, or a `ServerFnError` on failure
///
/// # Errors
///
/// Returns `ServerFnError` if the SearXNG search request fails
#[get("/api/trending")]
pub async fn get_trending_topics() -> Result<Vec<String>, ServerFnError> {
    use inner::SearxngResponse;
    use std::collections::HashMap;

    let state: crate::infrastructure::ServerState =
        dioxus_fullstack::FullstackContext::extract().await?;
    let searxng_url = state.services.searxng_url.clone();

    // Use POST to match SearXNG's default `method: "POST"` setting
    let search_url = format!("{searxng_url}/search");
    let params = [
        ("q", "trending technology AI"),
        ("format", "json"),
        ("language", "en"),
        ("categories", "news"),
        ("time_range", "week"),
    ];
    let client = reqwest::Client::builder()
        .timeout(std::time::Duration::from_secs(5))
        .build()
        .map_err(|e| ServerFnError::new(format!("HTTP client error: {e}")))?;
    let resp = client
        .post(&search_url)
        .form(&params)
        .send()
        .await
        .map_err(|e| ServerFnError::new(format!("SearXNG trending search failed: {e}")))?;
    if !resp.status().is_success() {
        return Err(ServerFnError::new(format!(
            "SearXNG trending search returned status {}",
            resp.status()
        )));
    }
    let body: SearxngResponse = resp
        .json()
        .await
        .map_err(|e| ServerFnError::new(format!("Failed to parse trending response: {e}")))?;

    // Common stop words to exclude from trending keywords
    const STOP_WORDS: &[&str] = &[
        "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by",
        "from", "is", "are", "was", "were", "be", "been", "has", "have", "had", "do", "does",
        "did", "will", "would", "could", "should", "may", "can", "not", "no", "it", "its", "this",
        "that", "these", "how", "what", "why", "who", "when", "new", "says", "said", "about",
        "after", "over", "into", "up", "out", "as", "all", "more", "than", "just", "now", "also",
        "us", "we", "you", "your", "our", "if", "so", "like", "get", "make", "year", "years",
        "one", "two",
    ];

    // Count word frequency across all result titles. Words are lowercased
    // and must be at least 3 characters to filter out noise.
    let mut word_counts: HashMap<String, u32> = HashMap::new();
    for result in &body.results {
        for word in result.title.split_whitespace() {
            // Strip punctuation from edges, lowercase
            let clean: String = word
                .trim_matches(|c: char| !c.is_alphanumeric())
                .to_lowercase();
            if clean.len() >= 3 && !STOP_WORDS.contains(&clean.as_str()) {
                *word_counts.entry(clean).or_insert(0) += 1;
            }
        }
    }

    // Sort by frequency descending, breaking ties alphabetically. Without
    // the secondary key, equally frequent words would fall back to HashMap
    // iteration order, which is randomized per process — making the
    // trending list flicker between identical responses.
    let mut sorted: Vec<(String, u32)> = word_counts.into_iter().collect();
    sorted.sort_unstable_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));

    // Keep words seen at least twice, cap at 8, capitalize first letter
    // for display (char-safe for multi-byte leading characters).
    let topics: Vec<String> = sorted
        .into_iter()
        .filter(|(_, count)| *count >= 2)
        .take(8)
        .map(|(word, _)| {
            let mut chars = word.chars();
            match chars.next() {
                Some(c) => c.to_uppercase().to_string() + chars.as_str(),
                None => word,
            }
        })
        .collect();
    Ok(topics)
}