"""HyDE (Hypothetical Document Embeddings) module.

Improves RAG retrieval by generating a hypothetical "ideal" document that
would answer a query, then searching for real documents similar to it.

This bridges the semantic gap between:
- Short, informal user queries ("Was ist wichtig bei Gedichtanalyse?")
- Formal, detailed Erwartungshorizonte documents

Research shows HyDE can improve retrieval by 10-20% for queries where
there is a vocabulary mismatch between query and documents.
"""

import logging
import os
from typing import Any, Awaitable, Callable, List, Optional

try:
    import httpx
except ImportError:  # pragma: no cover - depends on the deployment
    # HyDE is opt-in; a missing HTTP client must not break importing this
    # module. The backends raise HyDEError when httpx is unavailable.
    httpx = None

logger = logging.getLogger(__name__)

# Configuration
# IMPORTANT: HyDE is DISABLED by default for privacy reasons!
# When enabled, user queries are sent to external LLM APIs (OpenAI/Anthropic)
# to generate hypothetical documents. This may expose search queries to
# third parties. Only enable if you have explicit user consent for data
# processing.
HYDE_ENABLED = os.getenv("HYDE_ENABLED", "false").lower() == "true"
HYDE_LLM_BACKEND = os.getenv("HYDE_LLM_BACKEND", "openai")  # openai, anthropic, or local
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")
HYDE_MODEL = os.getenv("HYDE_MODEL", "gpt-4o-mini")  # Fast, cheap model for HyDE
# Model used by the Anthropic backend; the default matches the value that was
# previously hard-coded in the request.
HYDE_ANTHROPIC_MODEL = os.getenv("HYDE_ANTHROPIC_MODEL", "claude-3-haiku-20240307")

# Request limits shared by both backends.
_REQUEST_TIMEOUT_SECONDS = 30.0
_MAX_TOKENS = 200

# German education-specific prompt template
HYDE_PROMPT_TEMPLATE = """Du bist ein Experte für deutsche Bildungsstandards und Erwartungshorizonte.

Gegeben ist folgende Suchanfrage eines Lehrers:
"{query}"

Schreibe einen kurzen, fachlich korrekten Textabschnitt (2-3 Sätze), der wie ein Auszug aus einem offiziellen Erwartungshorizont für Abiturklausuren klingt und diese Anfrage beantworten würde.

Der Text sollte:
- Formelle, präzise Sprache verwenden
- Konkrete Bewertungskriterien oder Anforderungen nennen
- Wie ein echtes Dokument aus dem Bildungsministerium klingen

Antworte NUR mit dem Textabschnitt, ohne Einleitung oder Erklärung."""

# Phrase in the template in front of which the optional context is inserted.
_CONTEXT_ANCHOR = "Gegeben ist folgende Suchanfrage"


class HyDEError(Exception):
    """Error during HyDE processing."""


def build_hyde_prompt(
    query: str,
    subject: Optional[str] = None,
    niveau: Optional[str] = None,
) -> str:
    """Build the LLM prompt for a query, optionally enriched with context.

    Args:
        query: The user's search query.
        subject: Optional subject context (e.g., "Deutsch", "Mathematik").
        niveau: Optional niveau context (e.g., "eA", "gA").

    Returns:
        The formatted prompt. When subject and/or niveau are given, a
        "Kontext:" section is inserted before the query section.
    """
    context_info = ""
    if subject:
        context_info += f"\nFach: {subject}"
    if niveau:
        context_info += f"\nNiveau: {niveau}"

    prompt = HYDE_PROMPT_TEMPLATE.format(query=query)
    if context_info:
        # Replace only the FIRST occurrence: the anchor precedes the query
        # placeholder in the template, so a query that happens to contain
        # the anchor phrase is left untouched.
        prompt = prompt.replace(
            _CONTEXT_ANCHOR,
            f"Kontext:{context_info}\n\n{_CONTEXT_ANCHOR}",
            1,
        )
    return prompt


async def generate_hypothetical_document(
    query: str,
    subject: Optional[str] = None,
    niveau: Optional[str] = None,
) -> str:
    """Generate a hypothetical document that would answer the query.

    This is best-effort: whenever HyDE is disabled, the query is blank, no
    LLM backend is configured, the backend fails, or it returns an empty
    text, the original query is returned unchanged.

    Args:
        query: The user's search query.
        subject: Optional subject context (e.g., "Deutsch", "Mathematik").
        niveau: Optional niveau context (e.g., "eA", "gA").

    Returns:
        A hypothetical document text optimized for embedding, or the
        original query as a fallback.
    """
    if not HYDE_ENABLED:
        return query  # Fall back to original query

    if not query or not query.strip():
        # Nothing to expand; avoid sending an empty request to a paid API.
        return query

    prompt = build_hyde_prompt(query, subject, niveau)

    try:
        # Backends are looked up by name at call time (not via a table built
        # at import time) so they stay replaceable on the module.
        if HYDE_LLM_BACKEND == "openai":
            document = await _generate_openai(prompt)
        elif HYDE_LLM_BACKEND == "anthropic":
            document = await _generate_anthropic(prompt)
        else:
            # No LLM available, return original query
            return query
    except Exception as e:
        # Deliberate catch-all: HyDE is an optional enhancement and must
        # never make the search itself fail.
        logger.warning("HyDE generation failed, using original query: %s", e)
        return query

    # An empty completion would turn into an empty search query.
    return document or query


async def _post_json(provider: str, url: str, headers: dict, payload: dict) -> Any:
    """POST a JSON payload to an LLM API and return the decoded JSON body.

    Args:
        provider: Provider name used in error messages ("OpenAI", "Anthropic").
        url: Endpoint URL.
        headers: HTTP request headers (including authentication).
        payload: JSON-serializable request body.

    Raises:
        HyDEError: If httpx is not installed or the API does not answer
            with HTTP 200.
    """
    if httpx is None:
        raise HyDEError("httpx is required for HyDE LLM calls but is not installed")

    async with httpx.AsyncClient() as client:
        response = await client.post(
            url,
            headers=headers,
            json=payload,
            timeout=_REQUEST_TIMEOUT_SECONDS,
        )
        if response.status_code != 200:
            raise HyDEError(f"{provider} API error: {response.status_code}")
        return response.json()


def _extract_text(provider: str, data: Any, *path: Any) -> str:
    """Walk `path` (dict keys / list indices) into `data` and return the text.

    Raises:
        HyDEError: If the response does not have the expected shape or the
            value found is not a string.
    """
    node = data
    try:
        for key in path:
            node = node[key]
    except (KeyError, IndexError, TypeError) as err:
        raise HyDEError(f"{provider} API returned an unexpected response") from err
    if not isinstance(node, str):
        raise HyDEError(f"{provider} API returned no text content")
    return node.strip()


async def _generate_openai(prompt: str) -> str:
    """Generate using OpenAI API.

    Raises:
        HyDEError: If no API key is configured, the request fails, or the
            response cannot be parsed.
    """
    if not OPENAI_API_KEY:
        raise HyDEError("OPENAI_API_KEY not configured for HyDE")

    data = await _post_json(
        "OpenAI",
        "https://api.openai.com/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {OPENAI_API_KEY}",
            "Content-Type": "application/json",
        },
        payload={
            "model": HYDE_MODEL,
            "messages": [
                {"role": "system", "content": "Du bist ein Experte für deutsche Bildungsstandards."},
                {"role": "user", "content": prompt},
            ],
            "max_tokens": _MAX_TOKENS,
            "temperature": 0.7,
        },
    )
    return _extract_text("OpenAI", data, "choices", 0, "message", "content")


async def _generate_anthropic(prompt: str) -> str:
    """Generate using Anthropic API.

    Raises:
        HyDEError: If no API key is configured, the request fails, or the
            response cannot be parsed.
    """
    if not ANTHROPIC_API_KEY:
        raise HyDEError("ANTHROPIC_API_KEY not configured for HyDE")

    data = await _post_json(
        "Anthropic",
        "https://api.anthropic.com/v1/messages",
        headers={
            "x-api-key": ANTHROPIC_API_KEY,
            "Content-Type": "application/json",
            "anthropic-version": "2023-06-01",
        },
        payload={
            "model": HYDE_ANTHROPIC_MODEL,
            "max_tokens": _MAX_TOKENS,
            "messages": [
                {"role": "user", "content": prompt},
            ],
        },
    )
    return _extract_text("Anthropic", data, "content", 0, "text")


async def hyde_search(
    query: str,
    search_func: Callable[..., Awaitable[Any]],
    subject: Optional[str] = None,
    niveau: Optional[str] = None,
    **search_kwargs: Any,
) -> dict:
    """Perform HyDE-enhanced search.

    Args:
        query: Original user query.
        search_func: Async function performing the actual search. It is
            called as ``search_func(query=..., **search_kwargs)``.
        subject: Optional subject context.
        niveau: Optional niveau context.
        **search_kwargs: Additional arguments passed to search_func.

    Returns:
        A dict with the keys ``results`` (whatever search_func returned),
        ``hyde_used``, ``original_query`` and ``hypothetical_document``
        (``None`` when HyDE was not used).
    """
    # Generate hypothetical document
    hypothetical_doc = await generate_hypothetical_document(query, subject, niveau)

    # The generator returns the query itself on every fallback path, so a
    # differing text means HyDE was actually used.
    hyde_used = hypothetical_doc != query

    # Perform search with hypothetical document
    results = await search_func(query=hypothetical_doc, **search_kwargs)

    return {
        "results": results,
        "hyde_used": hyde_used,
        "original_query": query,
        "hypothetical_document": hypothetical_doc if hyde_used else None,
    }


def _active_model() -> str:
    """Return the model name the configured backend actually sends."""
    if HYDE_LLM_BACKEND == "anthropic":
        return HYDE_ANTHROPIC_MODEL
    return HYDE_MODEL


def get_hyde_info() -> dict:
    """Get information about HyDE configuration.

    Returns:
        A dict describing whether HyDE is enabled, which backend and model
        are configured, which API keys are present, and the privacy
        implications of enabling it.
    """
    return {
        "enabled": HYDE_ENABLED,
        "llm_backend": HYDE_LLM_BACKEND,
        "model": _active_model(),
        "openai_configured": bool(OPENAI_API_KEY),
        "anthropic_configured": bool(ANTHROPIC_API_KEY),
        "sends_data_externally": True,  # ALWAYS true when enabled - queries go to LLM APIs
        "privacy_warning": "When enabled, user search queries are sent to external LLM APIs",
        "default_enabled": False,  # Disabled by default for privacy
    }