Files
breakpilot-lehrer/klausur-service/backend/hyde.py
Benjamin Boenisch 5a31f52310 Initial commit: breakpilot-lehrer - Lehrer KI Platform
Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website,
Klausur-Service, School-Service, Voice-Service, Geo-Service,
BreakPilot Drive, Agent-Core

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-11 23:47:26 +01:00

210 lines
6.8 KiB
Python

"""
HyDE (Hypothetical Document Embeddings) Module
Improves RAG retrieval by generating hypothetical "ideal" documents
that would answer a query, then searching for similar real documents.
This bridges the semantic gap between:
- Short, informal user queries ("Was ist wichtig bei Gedichtanalyse?")
- Formal, detailed Erwartungshorizonte documents
Research shows HyDE can improve retrieval by 10-20% for queries
where there's a vocabulary mismatch between query and documents.
"""
import logging
import os
from typing import List, Optional

import httpx
# Configuration
# IMPORTANT: HyDE is DISABLED by default for privacy reasons!
# When enabled, user queries are sent to external LLM APIs (OpenAI/Anthropic)
# to generate hypothetical documents. This may expose search queries to third parties.
# Only enable if you have explicit user consent for data processing.
HYDE_ENABLED: bool = os.getenv("HYDE_ENABLED", "false").lower() == "true"
# Backend selector; any value other than "openai"/"anthropic" (e.g. "local")
# makes generate_hypothetical_document() fall back to the raw query.
HYDE_LLM_BACKEND: str = os.getenv("HYDE_LLM_BACKEND", "openai")  # openai, anthropic, or local
OPENAI_API_KEY: str = os.getenv("OPENAI_API_KEY", "")  # required when backend == "openai"
ANTHROPIC_API_KEY: str = os.getenv("ANTHROPIC_API_KEY", "")  # required when backend == "anthropic"
HYDE_MODEL: str = os.getenv("HYDE_MODEL", "gpt-4o-mini")  # Fast, cheap model for HyDE (OpenAI backend)
# German education-specific prompt template. {query} is filled in by
# generate_hypothetical_document(). The German wording is intentional: the
# generated pseudo-document must match the language and register of real
# Erwartungshorizont documents for embedding similarity to work. The phrase
# "Gegeben ist folgende Suchanfrage" is also used as a replace() anchor for
# context injection — keep it in sync if the template changes.
HYDE_PROMPT_TEMPLATE = """Du bist ein Experte für deutsche Bildungsstandards und Erwartungshorizonte.
Gegeben ist folgende Suchanfrage eines Lehrers:
"{query}"
Schreibe einen kurzen, fachlich korrekten Textabschnitt (2-3 Sätze), der wie ein Auszug aus einem offiziellen Erwartungshorizont für Abiturklausuren klingt und diese Anfrage beantworten würde.
Der Text sollte:
- Formelle, präzise Sprache verwenden
- Konkrete Bewertungskriterien oder Anforderungen nennen
- Wie ein echtes Dokument aus dem Bildungsministerium klingen
Antworte NUR mit dem Textabschnitt, ohne Einleitung oder Erklärung."""
class HyDEError(Exception):
    """Raised when HyDE processing fails (missing API key or upstream API error)."""
async def generate_hypothetical_document(
    query: str,
    subject: Optional[str] = None,
    niveau: Optional[str] = None,
) -> str:
    """
    Generate a hypothetical document that would answer the query.

    Best-effort: falls back to returning the original query whenever HyDE
    is disabled, no supported LLM backend is selected, or generation fails,
    so the caller's search can always proceed.

    Args:
        query: The user's search query
        subject: Optional subject context (e.g., "Deutsch", "Mathematik")
        niveau: Optional niveau context (e.g., "eA", "gA")

    Returns:
        A hypothetical document text optimized for embedding, or the
        original query if HyDE could not be applied.
    """
    if not HYDE_ENABLED:
        return query  # Fall back to original query

    # Enhance prompt with context if available
    context_info = ""
    if subject:
        context_info += f"\nFach: {subject}"
    if niveau:
        context_info += f"\nNiveau: {niveau}"

    prompt = HYDE_PROMPT_TEMPLATE.format(query=query)
    if context_info:
        # Inject the context block just before the query introduction.
        # NOTE(review): this relies on the exact anchor phrase existing in
        # HYDE_PROMPT_TEMPLATE — if the template changes, this silently
        # becomes a no-op. Keep the two in sync.
        prompt = prompt.replace(
            "Gegeben ist folgende Suchanfrage",
            f"Kontext:{context_info}\n\nGegeben ist folgende Suchanfrage"
        )

    try:
        if HYDE_LLM_BACKEND == "openai":
            return await _generate_openai(prompt)
        elif HYDE_LLM_BACKEND == "anthropic":
            return await _generate_anthropic(prompt)
        else:
            # "local" or unknown backend: no LLM available, return original query
            return query
    except Exception as e:
        # Deliberate broad catch: HyDE is strictly best-effort and must
        # never break the search. Log instead of printing to stdout.
        logging.getLogger(__name__).warning(
            "HyDE generation failed, using original query: %s", e
        )
        return query
async def _generate_openai(prompt: str) -> str:
    """Generate a hypothetical document via the OpenAI chat-completions API.

    Args:
        prompt: Fully rendered HyDE prompt.

    Returns:
        The model's reply text, stripped of surrounding whitespace.

    Raises:
        HyDEError: if the API key is missing or the API responds non-200.
    """
    if not OPENAI_API_KEY:
        raise HyDEError("OPENAI_API_KEY not configured for HyDE")

    request_headers = {
        "Authorization": f"Bearer {OPENAI_API_KEY}",
        "Content-Type": "application/json"
    }
    request_body = {
        "model": HYDE_MODEL,
        "messages": [
            {"role": "system", "content": "Du bist ein Experte für deutsche Bildungsstandards."},
            {"role": "user", "content": prompt}
        ],
        "max_tokens": 200,
        "temperature": 0.7,
    }

    async with httpx.AsyncClient() as http:
        resp = await http.post(
            "https://api.openai.com/v1/chat/completions",
            headers=request_headers,
            json=request_body,
            timeout=30.0
        )

    if resp.status_code != 200:
        raise HyDEError(f"OpenAI API error: {resp.status_code}")

    payload = resp.json()
    return payload["choices"][0]["message"]["content"].strip()
async def _generate_anthropic(prompt: str) -> str:
    """Generate a hypothetical document via the Anthropic messages API.

    Args:
        prompt: Fully rendered HyDE prompt.

    Returns:
        The model's reply text, stripped of surrounding whitespace.

    Raises:
        HyDEError: if the API key is missing or the API responds non-200.
    """
    if not ANTHROPIC_API_KEY:
        raise HyDEError("ANTHROPIC_API_KEY not configured for HyDE")

    # Configurable for consistency with the OpenAI backend (HYDE_MODEL);
    # default is unchanged from the previously hard-coded value.
    model = os.getenv("HYDE_ANTHROPIC_MODEL", "claude-3-haiku-20240307")

    async with httpx.AsyncClient() as client:
        response = await client.post(
            "https://api.anthropic.com/v1/messages",
            headers={
                "x-api-key": ANTHROPIC_API_KEY,
                "Content-Type": "application/json",
                "anthropic-version": "2023-06-01"
            },
            json={
                "model": model,
                "max_tokens": 200,
                "messages": [
                    {"role": "user", "content": prompt}
                ]
            },
            timeout=30.0
        )

    if response.status_code != 200:
        raise HyDEError(f"Anthropic API error: {response.status_code}")

    data = response.json()
    return data["content"][0]["text"].strip()
async def hyde_search(
    query: str,
    search_func,
    subject: Optional[str] = None,
    niveau: Optional[str] = None,
    **search_kwargs
) -> dict:
    """
    Run a search through the HyDE pipeline.

    A hypothetical "ideal" document is generated for the query (when HyDE
    is enabled and succeeds) and used as the search text; otherwise the
    raw query is searched as-is.

    Args:
        query: Original user query
        search_func: Async function to perform the actual search
        subject: Optional subject context
        niveau: Optional niveau context
        **search_kwargs: Additional arguments passed to search_func

    Returns:
        dict with keys "results", "hyde_used", "original_query", and
        "hypothetical_document" (None when HyDE was not applied).
    """
    pseudo_doc = await generate_hypothetical_document(query, subject, niveau)

    # generate_hypothetical_document() returns the query unchanged whenever
    # HyDE was disabled or generation failed — detect that here.
    used_hyde = pseudo_doc != query

    results = await search_func(query=pseudo_doc, **search_kwargs)

    return {
        "results": results,
        "hyde_used": used_hyde,
        "original_query": query,
        "hypothetical_document": pseudo_doc if used_hyde else None,
    }
def get_hyde_info() -> dict:
    """Report the current HyDE configuration and its privacy implications."""
    info = {
        "enabled": HYDE_ENABLED,
        "llm_backend": HYDE_LLM_BACKEND,
        "model": HYDE_MODEL,
    }
    info["openai_configured"] = bool(OPENAI_API_KEY)
    info["anthropic_configured"] = bool(ANTHROPIC_API_KEY)
    # Queries always leave the server when HyDE is on — surface that clearly.
    info["sends_data_externally"] = True  # ALWAYS true when enabled - queries go to LLM APIs
    info["privacy_warning"] = "When enabled, user search queries are sent to external LLM APIs"
    info["default_enabled"] = False  # Disabled by default for privacy
    return info