certifai/src/infrastructure/searxng.rs

use crate::models::NewsCard;
use dioxus::prelude::*;

// Server-side helpers and types are only needed for the server build.
// The server-function macros generate a client stub for the web build that
// sends a network request instead of executing the function body.
#[cfg(feature = "server")]
pub(crate) mod inner {
    use serde::Deserialize;
    use std::collections::HashSet;

    /// Minimum snippet length, measured in bytes after trimming, for a result
    /// to be considered meaningful.
    const MIN_CONTENT_LEN: usize = 20;

    /// Source label used when a URL has no parseable host.
    const FALLBACK_SOURCE: &str = "Web";

    /// Individual result from the SearXNG search API.
    #[derive(Debug, Deserialize)]
    pub(crate) struct SearxngResult {
        /// Headline of the result.
        pub title: String,
        /// Link to the result page.
        pub url: String,
        /// Text snippet for the result, if the response carried one.
        pub content: Option<String>,
        /// Raw `publishedDate` value from the response; not parsed here.
        #[serde(rename = "publishedDate")]
        pub published_date: Option<String>,
        /// Thumbnail reference from the response, if any.
        pub thumbnail: Option<String>,
        /// Relevance score assigned by SearXNG (higher = more relevant).
        /// Falls back to `0.0` when the field is missing from the response.
        #[serde(default)]
        pub score: f64,
    }

    /// Top-level response from the SearXNG search API.
    #[derive(Debug, Deserialize)]
    pub(crate) struct SearxngResponse {
        /// All results returned for the query, in the order SearXNG sent them.
        pub results: Vec<SearxngResult>,
    }

    /// Extract the domain name from a URL to use as the source label.
    ///
    /// Strips a leading "www." for cleaner display. Other subdomains are kept.
    ///
    /// # Arguments
    ///
    /// * `url_str` - The full URL string
    ///
    /// # Returns
    ///
    /// The domain host, or the fallback "Web" string when the URL cannot be
    /// parsed or has no host (for example a bare domain without a scheme).
    pub(crate) fn extract_source(url_str: &str) -> String {
        url::Url::parse(url_str)
            .ok()
            .and_then(|parsed| {
                // Borrow the host while stripping so only one String is allocated.
                parsed
                    .host_str()
                    .map(|host| host.strip_prefix("www.").unwrap_or(host).to_owned())
            })
            .unwrap_or_else(|| FALLBACK_SOURCE.to_owned())
    }

    /// Map a relevance score to the value used for ordering.
    ///
    /// NaN has no meaningful rank, so it is demoted below every real score.
    /// This keeps the sort comparator a total order.
    fn rank_key(score: f64) -> f64 {
        if score.is_nan() {
            f64::NEG_INFINITY
        } else {
            score
        }
    }

    /// Deduplicate and rank search results for quality, similar to Perplexity.
    ///
    /// Applies the following steps in order:
    /// 1. Remove results with missing or very short content (no snippet = low value)
    /// 2. Sort by SearXNG relevance score (descending, NaN scores last)
    /// 3. Deduplicate by domain, keeping the first (highest-scored) result per domain
    /// 4. Cap at `max_results`
    ///
    /// The domain key is the label produced by [`extract_source`], so results
    /// whose URL cannot be parsed all share the "Web" label and count as a
    /// single domain.
    ///
    /// # Arguments
    ///
    /// * `results` - Raw search results from SearXNG
    /// * `max_results` - Maximum number of results to return
    ///
    /// # Returns
    ///
    /// Filtered, deduplicated, and ranked results
    pub(crate) fn rank_and_deduplicate(
        mut results: Vec<SearxngResult>,
        max_results: usize,
    ) -> Vec<SearxngResult> {
        // Filter out results with no meaningful content
        results.retain(|r| {
            r.content
                .as_deref()
                .is_some_and(|snippet| snippet.trim().len() >= MIN_CONTENT_LEN)
        });

        // Sort by score descending so the best result per domain comes first.
        // `sort_by` is stable, so equal scores keep their incoming order, and
        // `total_cmp` on NaN-free keys is a total order as `sort_by` requires.
        results.sort_by(|a, b| rank_key(b.score).total_cmp(&rank_key(a.score)));

        // Deduplicate by domain: `insert` returns false for a domain already
        // seen, which drops every result after the first one for that domain.
        let mut seen_domains = HashSet::new();
        results.retain(|r| seen_domains.insert(extract_source(&r.url)));

        results.truncate(max_results);
        results
    }
}

/// Search for news using the SearXNG meta-search engine.
///
/// Uses Perplexity-style query enrichment and result ranking:
/// - Queries the "news" and "general" categories for fresh, relevant results
/// - Filters to the last month for recency
/// - Deduplicates by domain for source diversity
/// - Ranks by SearXNG relevance score
/// - Filters out results without meaningful content
///
/// # Arguments
///
/// * `query` - The search query string
///
/// # Returns
///
/// Up to 15 high-quality `NewsCard` results, or a `ServerFnError` on failure
///
/// # Errors
///
/// Returns `ServerFnError` if the SearXNG request fails or response parsing fails
#[post("/api/search")]
pub async fn search_topic(query: String) -> Result<Vec<NewsCard>, ServerFnError> {
    use inner::{extract_source, rank_and_deduplicate, SearxngResponse};

    let state: crate::infrastructure::ServerState =
        dioxus_fullstack::FullstackContext::extract().await?;
    let searxng_url = state.services.searxng_url.clone();

    // Enrich the query with "latest news" context for better results,
    // similar to how Perplexity reformulates queries before searching.
    let enriched_query = format!("{query} latest news");

    // Use POST with form-encoded body because SearXNG's default config
    // sets `method: "POST"` which rejects GET requests with 405.
    let search_url = format!("{searxng_url}/search");
    let params = [
        ("q", enriched_query.as_str()),
        ("format", "json"),
        ("language", "en"),
        ("categories", "news,general"),
        ("time_range", "month"),
    ];

    let client = reqwest::Client::new();
    let resp = client
        .post(&search_url)
        .form(&params)
        .send()
        .await
        .map_err(|e| ServerFnError::new(format!("SearXNG request failed: {e}")))?;

    if !resp.status().is_success() {
        return Err(ServerFnError::new(format!(
            "SearXNG returned status {}",
            resp.status()
        )));
    }

    let body: SearxngResponse = resp
        .json()
        .await
        .map_err(|e| ServerFnError::new(format!("Failed to parse SearXNG response: {e}")))?;

    // Apply Perplexity-style ranking: filter empties, deduplicate domains, sort by score
    let ranked = rank_and_deduplicate(body.results, 15);

    let cards: Vec<NewsCard> = ranked
        .into_iter()
        .map(|r| {
            let summary = r
                .content
                .clone()
                .unwrap_or_default()
                .chars()
                .take(200)
                .collect::<String>();
            let content = r.content.unwrap_or_default();
            NewsCard {
                title: r.title,
                source: extract_source(&r.url),
                summary,
                content,
                category: query.clone(),
                url: r.url,
                thumbnail_url: r.thumbnail,
                published_at: r.published_date.unwrap_or_else(|| "Recent".into()),
            }
        })
        .collect();

    Ok(cards)
}

/// Fetch trending topic keywords by running a broad news search and
/// extracting the most frequent meaningful terms from result titles.
///
/// This approach works regardless of whether SearXNG has autocomplete
/// configured, since it uses the standard search API.
///
/// Keywords are ordered by how often they occur across titles (descending).
/// Keywords with the same count are ordered alphabetically so that the same
/// search response always produces the same list.
///
/// # Returns
///
/// Up to 8 trending keyword strings, or a `ServerFnError` on failure
///
/// # Errors
///
/// Returns `ServerFnError` if the server state cannot be extracted, the HTTP
/// client cannot be built, the SearXNG search request fails or times out,
/// SearXNG answers with a non-success status, or response parsing fails
#[get("/api/trending")]
pub async fn get_trending_topics() -> Result<Vec<String>, ServerFnError> {
    use inner::SearxngResponse;
    use std::collections::HashMap;

    // Common stop words to exclude from trending keywords
    const STOP_WORDS: &[&str] = &[
        "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by",
        "from", "is", "are", "was", "were", "be", "been", "has", "have", "had", "do", "does",
        "did", "will", "would", "could", "should", "may", "can", "not", "no", "it", "its", "this",
        "that", "these", "how", "what", "why", "who", "when", "new", "says", "said", "about",
        "after", "over", "into", "up", "out", "as", "all", "more", "than", "just", "now", "also",
        "us", "we", "you", "your", "our", "if", "so", "like", "get", "make", "year", "years",
        "one", "two",
    ];
    // Shortest keyword, in characters, that is still counted.
    const MIN_WORD_CHARS: usize = 3;
    // A keyword must appear in titles at least this often to count as trending.
    const MIN_OCCURRENCES: u32 = 2;
    // Maximum number of keywords returned.
    const MAX_TOPICS: usize = 8;

    let state: crate::infrastructure::ServerState =
        dioxus_fullstack::FullstackContext::extract().await?;
    let searxng_url = state.services.searxng_url.clone();

    // Use POST to match SearXNG's default `method: "POST"` setting
    let search_url = format!("{searxng_url}/search");
    let params = [
        ("q", "trending technology AI"),
        ("format", "json"),
        ("language", "en"),
        ("categories", "news"),
        ("time_range", "week"),
    ];

    let client = reqwest::Client::builder()
        .timeout(std::time::Duration::from_secs(5))
        .build()
        .map_err(|e| ServerFnError::new(format!("HTTP client error: {e}")))?;

    let resp = client
        .post(&search_url)
        .form(&params)
        .send()
        .await
        .map_err(|e| ServerFnError::new(format!("SearXNG trending search failed: {e}")))?;

    let status = resp.status();
    if !status.is_success() {
        return Err(ServerFnError::new(format!(
            "SearXNG trending search returned status {status}"
        )));
    }

    let body: SearxngResponse = resp
        .json()
        .await
        .map_err(|e| ServerFnError::new(format!("Failed to parse trending response: {e}")))?;

    // Count word frequency across all result titles. Words are lowercased
    // and must be at least 3 characters to filter out noise.
    let mut word_counts: HashMap<String, u32> = HashMap::new();
    for result in &body.results {
        for word in result.title.split_whitespace() {
            // Strip punctuation from edges, lowercase
            let clean = word
                .trim_matches(|c: char| !c.is_alphanumeric())
                .to_lowercase();
            // Count characters rather than bytes so short non-ASCII words
            // are not let through by their multi-byte encoding.
            let long_enough = clean.chars().count() >= MIN_WORD_CHARS;
            if long_enough && !STOP_WORDS.contains(&clean.as_str()) {
                *word_counts.entry(clean).or_default() += 1;
            }
        }
    }

    // Drop one-off words before sorting so only real candidates are ordered.
    let mut candidates: Vec<(String, u32)> = word_counts
        .into_iter()
        .filter(|(_, count)| *count >= MIN_OCCURRENCES)
        .collect();

    // Sort by frequency descending. HashMap iteration order is arbitrary, so
    // ties are broken alphabetically to keep the output stable across calls.
    // Keys are unique, which makes an unstable sort safe here.
    candidates.sort_unstable_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));

    // Capitalize first letter for display
    let topics: Vec<String> = candidates
        .into_iter()
        .take(MAX_TOPICS)
        .map(|(word, _)| {
            let mut rest = word.chars();
            match rest.next() {
                Some(first) => first.to_uppercase().chain(rest).collect::<String>(),
                None => word,
            }
        })
        .collect();

    Ok(topics)
}

#[cfg(all(test, feature = "server"))]
mod tests {
    #![allow(clippy::unwrap_used, clippy::expect_used)]

    use super::inner::*;
    use pretty_assertions::assert_eq;

    // -----------------------------------------------------------------------
    // extract_source()
    // -----------------------------------------------------------------------

    /// A leading "www." is removed from the host.
    #[test]
    fn extract_source_strips_www() {
        let source = extract_source("https://www.example.com/page");
        assert_eq!(source, "example.com");
    }

    /// A host without "www." is returned unchanged.
    #[test]
    fn extract_source_returns_domain() {
        let source = extract_source("https://techcrunch.com/article");
        assert_eq!(source, "techcrunch.com");
    }

    /// Text that is not a URL at all falls back to the generic label.
    #[test]
    fn extract_source_invalid_url_returns_web() {
        let source = extract_source("not-a-url");
        assert_eq!(source, "Web");
    }

    /// A bare domain has no scheme, so `url::Url::parse` rejects it and the
    /// generic label is used.
    #[test]
    fn extract_source_no_scheme_returns_web() {
        let source = extract_source("example.com/path");
        assert_eq!(source, "Web");
    }

    // -----------------------------------------------------------------------
    // rank_and_deduplicate()
    // -----------------------------------------------------------------------

    /// Build a result fixture; an empty `content` stands for a missing snippet.
    fn fixture(url: &str, content: &str, score: f64) -> SearxngResult {
        let snippet = match content {
            "" => None,
            text => Some(text.to_owned()),
        };
        SearxngResult {
            title: "Title".into(),
            url: url.to_owned(),
            content: snippet,
            published_date: None,
            thumbnail: None,
            score,
        }
    }

    /// URLs of the given results, in order.
    fn urls_of(results: &[SearxngResult]) -> Vec<&str> {
        results.iter().map(|r| r.url.as_str()).collect()
    }

    /// Results without a snippet are dropped even when they score highest.
    #[test]
    fn rank_filters_empty_content() {
        let no_snippet = fixture("https://a.com", "", 10.0);
        let with_snippet = fixture(
            "https://b.com",
            "This is meaningful content that passes the length filter",
            5.0,
        );

        let ranked = rank_and_deduplicate(vec![no_snippet, with_snippet], 10);

        assert_eq!(urls_of(&ranked), ["https://b.com"]);
    }

    /// Snippets below the length threshold are dropped.
    #[test]
    fn rank_filters_short_content() {
        let too_short = fixture("https://a.com", "short", 10.0);
        let long_enough = fixture(
            "https://b.com",
            "This content is long enough to pass the 20-char filter threshold",
            5.0,
        );

        let ranked = rank_and_deduplicate(vec![too_short, long_enough], 10);

        assert_eq!(ranked.len(), 1);
    }

    /// Two results on one domain collapse to the higher-scored one.
    #[test]
    fn rank_deduplicates_by_domain_keeps_highest() {
        let lower = fixture(
            "https://example.com/page1",
            "First result with enough content here for the filter",
            3.0,
        );
        let higher = fixture(
            "https://example.com/page2",
            "Second result with enough content here for the filter",
            8.0,
        );

        let ranked = rank_and_deduplicate(vec![lower, higher], 10);

        // Only page2 (score 8.0) should survive.
        assert_eq!(urls_of(&ranked), ["https://example.com/page2"]);
    }

    /// Output is ordered from highest to lowest score.
    #[test]
    fn rank_sorts_by_score_descending() {
        let input = vec![
            fixture(
                "https://a.com/p",
                "Content A that is long enough to pass the filter check",
                1.0,
            ),
            fixture(
                "https://b.com/p",
                "Content B that is long enough to pass the filter check",
                5.0,
            ),
            fixture(
                "https://c.com/p",
                "Content C that is long enough to pass the filter check",
                3.0,
            ),
        ];

        let ranked = rank_and_deduplicate(input, 10);

        assert_eq!(ranked.len(), 3);
        // Every neighbouring pair must be in non-increasing score order.
        assert!(ranked.windows(2).all(|pair| pair[0].score >= pair[1].score));
    }

    /// No more than `max_results` entries are returned.
    #[test]
    fn rank_truncates_to_max_results() {
        let mut input = Vec::with_capacity(20);
        for i in 0..20_i32 {
            input.push(fixture(
                &format!("https://site{i}.com/page"),
                &format!("Content for site {i} that is long enough to pass the filter"),
                f64::from(i),
            ));
        }

        let ranked = rank_and_deduplicate(input, 5);

        assert_eq!(ranked.len(), 5);
    }

    /// An empty input yields an empty output.
    #[test]
    fn rank_empty_input_returns_empty() {
        let ranked = rank_and_deduplicate(Vec::new(), 10);
        assert!(ranked.is_empty());
    }

    /// When every result fails the content filter, nothing is returned.
    #[test]
    fn rank_all_filtered_returns_empty() {
        let input = vec![
            fixture("https://a.com", "", 10.0),
            fixture("https://b.com", "too short", 5.0),
        ];

        let ranked = rank_and_deduplicate(input, 10);

        assert!(ranked.is_empty());
    }
}