Files
breakpilot-lehrer/klausur-service/backend/hyde.py
Benjamin Boenisch 5a31f52310 Initial commit: breakpilot-lehrer - Lehrer KI Platform
Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website,
Klausur-Service, School-Service, Voice-Service, Geo-Service,
BreakPilot Drive, Agent-Core

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-11 23:47:26 +01:00

210 lines
6.8 KiB
Python

"""
HyDE (Hypothetical Document Embeddings) Module
Improves RAG retrieval by generating hypothetical "ideal" documents
that would answer a query, then searching for similar real documents.
This bridges the semantic gap between:
- Short, informal user queries ("Was ist wichtig bei Gedichtanalyse?")
- Formal, detailed Erwartungshorizonte documents
Research shows HyDE can improve retrieval by 10-20% for queries
where there's a vocabulary mismatch between query and documents.
"""
import logging
import os
from typing import List, Optional

import httpx
# Configuration
# IMPORTANT: HyDE is DISABLED by default for privacy reasons!
# When enabled, user queries are sent to external LLM APIs (OpenAI/Anthropic)
# to generate hypothetical documents. This may expose search queries to third parties.
# Only enable if you have explicit user consent for data processing.
HYDE_ENABLED: bool = os.getenv("HYDE_ENABLED", "false").lower() == "true"
# Backend selector; any value other than "openai"/"anthropic" (e.g. "local")
# makes generate_hypothetical_document() fall back to the raw query.
HYDE_LLM_BACKEND: str = os.getenv("HYDE_LLM_BACKEND", "openai")  # openai, anthropic, or local
OPENAI_API_KEY: str = os.getenv("OPENAI_API_KEY", "")  # required when backend == "openai"
ANTHROPIC_API_KEY: str = os.getenv("ANTHROPIC_API_KEY", "")  # required when backend == "anthropic"
HYDE_MODEL: str = os.getenv("HYDE_MODEL", "gpt-4o-mini")  # Fast, cheap model for HyDE (OpenAI backend)
# German education-specific prompt template. {query} is filled in by
# generate_hypothetical_document(). The German wording is intentional: the
# generated pseudo-document must match the language and register of real
# Erwartungshorizont documents for embedding similarity to work. The phrase
# "Gegeben ist folgende Suchanfrage" is also used as a replace() anchor for
# context injection — keep it in sync if the template changes.
HYDE_PROMPT_TEMPLATE = """Du bist ein Experte für deutsche Bildungsstandards und Erwartungshorizonte.
Gegeben ist folgende Suchanfrage eines Lehrers:
"{query}"
Schreibe einen kurzen, fachlich korrekten Textabschnitt (2-3 Sätze), der wie ein Auszug aus einem offiziellen Erwartungshorizont für Abiturklausuren klingt und diese Anfrage beantworten würde.
Der Text sollte:
- Formelle, präzise Sprache verwenden
- Konkrete Bewertungskriterien oder Anforderungen nennen
- Wie ein echtes Dokument aus dem Bildungsministerium klingen
Antworte NUR mit dem Textabschnitt, ohne Einleitung oder Erklärung."""
class HyDEError(Exception):
    """Raised when HyDE processing fails (missing API key or upstream API error)."""
async def generate_hypothetical_document(
    query: str,
    subject: Optional[str] = None,
    niveau: Optional[str] = None,
) -> str:
    """
    Generate a hypothetical document that would answer the query.

    Best-effort: falls back to returning the original query whenever HyDE
    is disabled, no supported LLM backend is selected, or generation fails,
    so the caller's search can always proceed.

    Args:
        query: The user's search query
        subject: Optional subject context (e.g., "Deutsch", "Mathematik")
        niveau: Optional niveau context (e.g., "eA", "gA")

    Returns:
        A hypothetical document text optimized for embedding, or the
        original query if HyDE could not be applied.
    """
    if not HYDE_ENABLED:
        return query  # Fall back to original query

    # Enhance prompt with context if available
    context_info = ""
    if subject:
        context_info += f"\nFach: {subject}"
    if niveau:
        context_info += f"\nNiveau: {niveau}"

    prompt = HYDE_PROMPT_TEMPLATE.format(query=query)
    if context_info:
        # Inject the context block just before the query introduction.
        # NOTE(review): this relies on the exact anchor phrase existing in
        # HYDE_PROMPT_TEMPLATE — if the template changes, this silently
        # becomes a no-op. Keep the two in sync.
        prompt = prompt.replace(
            "Gegeben ist folgende Suchanfrage",
            f"Kontext:{context_info}\n\nGegeben ist folgende Suchanfrage"
        )

    try:
        if HYDE_LLM_BACKEND == "openai":
            return await _generate_openai(prompt)
        elif HYDE_LLM_BACKEND == "anthropic":
            return await _generate_anthropic(prompt)
        else:
            # "local" or unknown backend: no LLM available, return original query
            return query
    except Exception as e:
        # Deliberate broad catch: HyDE is strictly best-effort and must
        # never break the search. Log instead of printing to stdout.
        logging.getLogger(__name__).warning(
            "HyDE generation failed, using original query: %s", e
        )
        return query
async def _generate_openai(prompt: str) -> str:
    """Generate a hypothetical document via the OpenAI chat-completions API.

    Args:
        prompt: Fully rendered HyDE prompt.

    Returns:
        The model's reply text, stripped of surrounding whitespace.

    Raises:
        HyDEError: if the API key is missing or the API responds non-200.
    """
    if not OPENAI_API_KEY:
        raise HyDEError("OPENAI_API_KEY not configured for HyDE")

    request_headers = {
        "Authorization": f"Bearer {OPENAI_API_KEY}",
        "Content-Type": "application/json"
    }
    request_body = {
        "model": HYDE_MODEL,
        "messages": [
            {"role": "system", "content": "Du bist ein Experte für deutsche Bildungsstandards."},
            {"role": "user", "content": prompt}
        ],
        "max_tokens": 200,
        "temperature": 0.7,
    }

    async with httpx.AsyncClient() as http:
        resp = await http.post(
            "https://api.openai.com/v1/chat/completions",
            headers=request_headers,
            json=request_body,
            timeout=30.0
        )

    if resp.status_code != 200:
        raise HyDEError(f"OpenAI API error: {resp.status_code}")

    payload = resp.json()
    return payload["choices"][0]["message"]["content"].strip()
async def _generate_anthropic(prompt: str) -> str:
    """Generate a hypothetical document via the Anthropic messages API.

    Args:
        prompt: Fully rendered HyDE prompt.

    Returns:
        The model's reply text, stripped of surrounding whitespace.

    Raises:
        HyDEError: if the API key is missing or the API responds non-200.
    """
    if not ANTHROPIC_API_KEY:
        raise HyDEError("ANTHROPIC_API_KEY not configured for HyDE")

    # Configurable for consistency with the OpenAI backend (HYDE_MODEL);
    # default is unchanged from the previously hard-coded value.
    model = os.getenv("HYDE_ANTHROPIC_MODEL", "claude-3-haiku-20240307")

    async with httpx.AsyncClient() as client:
        response = await client.post(
            "https://api.anthropic.com/v1/messages",
            headers={
                "x-api-key": ANTHROPIC_API_KEY,
                "Content-Type": "application/json",
                "anthropic-version": "2023-06-01"
            },
            json={
                "model": model,
                "max_tokens": 200,
                "messages": [
                    {"role": "user", "content": prompt}
                ]
            },
            timeout=30.0
        )

    if response.status_code != 200:
        raise HyDEError(f"Anthropic API error: {response.status_code}")

    data = response.json()
    return data["content"][0]["text"].strip()
async def hyde_search(
    query: str,
    search_func,
    subject: Optional[str] = None,
    niveau: Optional[str] = None,
    **search_kwargs
) -> dict:
    """
    Run a search through the HyDE pipeline.

    A hypothetical "ideal" document is generated for the query (when HyDE
    is enabled and succeeds) and used as the search text; otherwise the
    raw query is searched as-is.

    Args:
        query: Original user query
        search_func: Async function to perform the actual search
        subject: Optional subject context
        niveau: Optional niveau context
        **search_kwargs: Additional arguments passed to search_func

    Returns:
        dict with keys "results", "hyde_used", "original_query", and
        "hypothetical_document" (None when HyDE was not applied).
    """
    pseudo_doc = await generate_hypothetical_document(query, subject, niveau)

    # generate_hypothetical_document() returns the query unchanged whenever
    # HyDE was disabled or generation failed — detect that here.
    used_hyde = pseudo_doc != query

    results = await search_func(query=pseudo_doc, **search_kwargs)

    return {
        "results": results,
        "hyde_used": used_hyde,
        "original_query": query,
        "hypothetical_document": pseudo_doc if used_hyde else None,
    }
def get_hyde_info() -> dict:
    """Report the current HyDE configuration and its privacy implications."""
    info = {
        "enabled": HYDE_ENABLED,
        "llm_backend": HYDE_LLM_BACKEND,
        "model": HYDE_MODEL,
    }
    info["openai_configured"] = bool(OPENAI_API_KEY)
    info["anthropic_configured"] = bool(ANTHROPIC_API_KEY)
    # Queries always leave the server when HyDE is on — surface that clearly.
    info["sends_data_externally"] = True  # ALWAYS true when enabled - queries go to LLM APIs
    info["privacy_warning"] = "When enabled, user search queries are sent to external LLM APIs"
    info["default_enabled"] = False  # Disabled by default for privacy
    return info