Files
breakpilot-lehrer/klausur-service/backend/hybrid_search.py
Benjamin Boenisch 5a31f52310 Initial commit: breakpilot-lehrer - Lehrer KI Platform
Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website,
Klausur-Service, School-Service, Voice-Service, Geo-Service,
BreakPilot Drive, Agent-Core

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-11 23:47:26 +01:00

286 lines
9.2 KiB
Python

"""
Hybrid Search Module
Combines dense (semantic) search with sparse (BM25/keyword) search
for better retrieval, especially for German compound words.
Why Hybrid Search?
- Dense search: Great for semantic similarity ("Analyse" ≈ "Untersuchung")
- Sparse search: Great for exact matches, compound words ("Erwartungshorizont")
- Combined: Best of both worlds, 10-15% better recall
German compound nouns like "Erwartungshorizont", "Bewertungskriterien"
may not match semantically but should match lexically.
"""
import os
import re
from typing import List, Dict, Optional, Tuple
from collections import Counter
import math
# Configuration
HYBRID_ENABLED = os.getenv("HYBRID_SEARCH_ENABLED", "true").lower() == "true"
DENSE_WEIGHT = float(os.getenv("HYBRID_DENSE_WEIGHT", "0.7"))  # 70% dense
SPARSE_WEIGHT = float(os.getenv("HYBRID_SPARSE_WEIGHT", "0.3"))  # 30% sparse

# German stopwords for BM25 (function words with no retrieval signal).
GERMAN_STOPWORDS = {
    'der', 'die', 'das', 'den', 'dem', 'des', 'ein', 'eine', 'einer', 'eines',
    'einem', 'einen', 'und', 'oder', 'aber', 'als', 'auch', 'auf', 'aus', 'bei',
    'bis', 'durch', 'für', 'gegen', 'in', 'mit', 'nach', 'ohne', 'über', 'unter',
    'von', 'vor', 'zu', 'zum', 'zur', 'ist', 'sind', 'war', 'waren', 'wird',
    'werden', 'hat', 'haben', 'kann', 'können', 'muss', 'müssen', 'soll', 'sollen',
    'nicht', 'sich', 'es', 'er', 'sie', 'wir', 'ihr', 'man', 'was', 'wie', 'wo',
    'wenn', 'weil', 'dass', 'ob', 'so', 'sehr', 'nur', 'noch', 'schon', 'mehr',
    'also', 'dabei', 'sowie', 'bzw', 'etc', 'ca', 'vgl'
}


class BM25:
    """
    BM25 (Best Matching 25) implementation for German text.

    BM25 is a ranking function used for keyword-based retrieval. It
    considers term frequency, document length, and inverse document
    frequency. Call fit() on a corpus first, then score() or search().
    """

    def __init__(self, k1: float = 1.5, b: float = 0.75):
        """
        Initialize BM25 with tuning parameters.

        Args:
            k1: Term frequency saturation parameter (1.2-2.0 typical)
            b: Document length normalization (0.75 typical)
        """
        self.k1 = k1
        self.b = b
        self.corpus: List[List[str]] = []
        self.doc_lengths: List[int] = []
        self.avg_doc_length: float = 0.0
        self.doc_freqs: Counter = Counter()
        self.idf: Dict[str, float] = {}
        self.N = 0
        # Per-document term-frequency Counters, built once in fit() so
        # scoring does not rebuild a Counter for every (query, doc) pair.
        self._doc_term_freqs: List[Counter] = []

    def _tokenize(self, text: str) -> List[str]:
        """Tokenize German text: lowercase, keep umlaut letters, drop
        stopwords and tokens of 2 characters or fewer."""
        tokens = re.findall(r'[a-zäöüß]+', text.lower())
        return [t for t in tokens if t not in GERMAN_STOPWORDS and len(t) > 2]

    def fit(self, documents: List[str]) -> None:
        """
        Fit BM25 statistics on a corpus of documents.

        Args:
            documents: List of raw document texts.
        """
        self.corpus = [self._tokenize(doc) for doc in documents]
        self.N = len(self.corpus)
        self.doc_lengths = [len(doc) for doc in self.corpus]
        self.avg_doc_length = sum(self.doc_lengths) / max(self.N, 1)
        # Cache per-document term frequencies for scoring.
        self._doc_term_freqs = [Counter(doc) for doc in self.corpus]

        # Document frequency: number of documents each term appears in.
        self.doc_freqs = Counter()
        for doc in self.corpus:
            self.doc_freqs.update(set(doc))

        # Smoothed IDF; the +1 inside the log keeps every value positive.
        self.idf = {
            term: math.log((self.N - df + 0.5) / (df + 0.5) + 1)
            for term, df in self.doc_freqs.items()
        }

    def _score_tokens(self, query_tokens: List[str], doc_idx: int) -> float:
        """BM25 score of pre-tokenized query terms against one document."""
        doc_len = self.doc_lengths[doc_idx]
        term_freqs = self._doc_term_freqs[doc_idx]
        score = 0.0
        for term in query_tokens:
            idf = self.idf.get(term)
            if idf is None:
                # Term never seen in the corpus: contributes nothing.
                continue
            tf = term_freqs.get(term, 0)
            # BM25 formula: idf * tf*(k1+1) / (tf + k1*(1 - b + b*len/avg)).
            numerator = tf * (self.k1 + 1)
            denominator = tf + self.k1 * (1 - self.b + self.b * doc_len / self.avg_doc_length)
            score += idf * numerator / denominator
        return score

    def score(self, query: str, doc_idx: int) -> float:
        """
        Calculate BM25 score for a query against a document.

        Args:
            query: Query text
            doc_idx: Index of document in corpus
        Returns:
            BM25 score (higher = more relevant)
        """
        return self._score_tokens(self._tokenize(query), doc_idx)

    def search(self, query: str, top_k: int = 10) -> List[Tuple[int, float]]:
        """
        Search corpus for query.

        Args:
            query: Query text
            top_k: Number of results to return
        Returns:
            List of (doc_idx, score) tuples, sorted by score descending
        """
        # Tokenize the query once, not once per document.
        query_tokens = self._tokenize(query)
        scores = [(i, self._score_tokens(query_tokens, i)) for i in range(self.N)]
        scores.sort(key=lambda x: x[1], reverse=True)
        return scores[:top_k]
def normalize_scores(scores: List[float]) -> List[float]:
    """Min-max normalize scores into the [0, 1] range.

    An empty input yields an empty list; a constant input yields all 1.0
    (every score is equally "best").
    """
    if not scores:
        return []
    lo, hi = min(scores), max(scores)
    span = hi - lo
    if span == 0:
        return [1.0] * len(scores)
    return [(s - lo) / span for s in scores]
def combine_scores(
    dense_results: List[Dict],
    sparse_scores: List[Tuple[int, float]],
    documents: List[str],
    dense_weight: float = DENSE_WEIGHT,
    sparse_weight: float = SPARSE_WEIGHT,
) -> List[Dict]:
    """
    Fuse dense and sparse rankings with weighted Reciprocal Rank Fusion.

    Args:
        dense_results: Dense (vector) search hits; each dict may carry
            'id' and 'score' fields.
        sparse_scores: BM25 results as (doc_idx, score) tuples, already
            sorted best-first; doc_idx refers to positions in dense_results.
        documents: Original document texts (currently unused; kept for
            interface compatibility).
        dense_weight: Weight applied to the dense RRF term.
        sparse_weight: Weight applied to the sparse RRF term.
    Returns:
        Result dicts augmented with dense/sparse score and rank fields
        plus 'hybrid_score', sorted by hybrid score descending.
    """
    fused: Dict[str, Dict] = {}

    # Seed from the dense ranking. Every hit starts with a worst-case
    # sparse rank until a BM25 match is attached below.
    default_sparse_rank = len(dense_results) + 1
    for position, hit in enumerate(dense_results):
        key = hit.get("id", str(position))
        if key in fused:
            continue
        entry = hit.copy()
        entry["dense_score"] = hit.get("score", 0)
        entry["dense_rank"] = position + 1
        entry["sparse_score"] = 0
        entry["sparse_rank"] = default_sparse_rank
        fused[key] = entry

    # Attach BM25 scores: sparse doc indices map to positions in the
    # dense result list (BM25 was fit on those same texts).
    for position, (doc_idx, bm25_score) in enumerate(sparse_scores):
        if doc_idx >= len(dense_results):
            continue
        key = dense_results[doc_idx].get("id", str(doc_idx))
        if key in fused:
            fused[key]["sparse_score"] = bm25_score
            fused[key]["sparse_rank"] = position + 1

    # Weighted RRF: each ranking contributes 1/(k + rank); k=60 is the
    # constant from the original RRF paper.
    k = 60
    for entry in fused.values():
        dense_rrf = 1 / (k + entry.get("dense_rank", 1000))
        sparse_rrf = 1 / (k + entry.get("sparse_rank", 1000))
        entry["hybrid_score"] = (dense_weight * dense_rrf + sparse_weight * sparse_rrf)

    return sorted(fused.values(), key=lambda e: e.get("hybrid_score", 0), reverse=True)
async def hybrid_search(
    query: str,
    documents: List[str],
    dense_search_func,
    top_k: int = 10,
    dense_weight: float = DENSE_WEIGHT,
    sparse_weight: float = SPARSE_WEIGHT,
    **dense_kwargs
) -> Dict:
    """
    Perform hybrid search combining dense and sparse retrieval.

    Args:
        query: Search query.
        documents: Document texts for BM25. NOTE(review): currently
            unused — BM25 is fit on the texts of the dense hits instead;
            confirm this is intentional.
        dense_search_func: Async callable performing the dense search.
        top_k: Number of results to return.
        dense_weight: Weight for dense (semantic) scores.
        sparse_weight: Weight for sparse (BM25) scores.
        **dense_kwargs: Extra keyword args forwarded to the dense search.
    Returns:
        Dict with 'results', 'hybrid_enabled' and the weights used.
    """
    if not HYBRID_ENABLED:
        # Feature flag off: dense-only fallback.
        dense_only = await dense_search_func(query=query, limit=top_k, **dense_kwargs)
        return {
            "results": dense_only,
            "hybrid_enabled": False,
            "dense_weight": 1.0,
            "sparse_weight": 0.0,
        }

    # Over-fetch dense candidates so the fusion step has room to reorder.
    candidates = await dense_search_func(query=query, limit=top_k * 2, **dense_kwargs)

    # BM25 over the candidate texts (lexical re-ranking of the dense pool).
    candidate_texts = [hit.get("text", "") for hit in candidates]
    if candidate_texts:
        ranker = BM25()
        ranker.fit(candidate_texts)
        bm25_hits = ranker.search(query, top_k=top_k * 2)
    else:
        bm25_hits = []

    fused = combine_scores(
        dense_results=candidates,
        sparse_scores=bm25_hits,
        documents=candidate_texts,
        dense_weight=dense_weight,
        sparse_weight=sparse_weight,
    )
    return {
        "results": fused[:top_k],
        "hybrid_enabled": True,
        "dense_weight": dense_weight,
        "sparse_weight": sparse_weight,
    }
def get_hybrid_search_info() -> dict:
    """Report the module-level hybrid search configuration."""
    info = {
        "enabled": HYBRID_ENABLED,
        "dense_weight": DENSE_WEIGHT,
        "sparse_weight": SPARSE_WEIGHT,
        "algorithm": "BM25 + Dense Vector (RRF fusion)",
    }
    return info