Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website, Klausur-Service, School-Service, Voice-Service, Geo-Service, BreakPilot Drive, Agent-Core Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
210 lines
6.8 KiB
Python
210 lines
6.8 KiB
Python
"""
|
|
HyDE (Hypothetical Document Embeddings) Module
|
|
|
|
Improves RAG retrieval by generating hypothetical "ideal" documents
|
|
that would answer a query, then searching for similar real documents.
|
|
|
|
This bridges the semantic gap between:
|
|
- Short, informal user queries ("Was ist wichtig bei Gedichtanalyse?")
|
|
- Formal, detailed Erwartungshorizonte documents
|
|
|
|
Research shows HyDE can improve retrieval by 10-20% for queries
|
|
where there's a vocabulary mismatch between query and documents.
|
|
"""
|
|
|
|
import logging
import os
from typing import Optional, List

import httpx
|
|
|
|
# Configuration
#
# IMPORTANT: HyDE is DISABLED by default for privacy reasons!
# When enabled, user queries are sent to external LLM APIs (OpenAI/Anthropic)
# to generate hypothetical documents. This may expose search queries to third parties.
# Only enable if you have explicit user consent for data processing.

# Opt-in flag; anything other than the literal string "true" (case-insensitive)
# leaves HyDE off.
HYDE_ENABLED = os.getenv("HYDE_ENABLED", "false").lower() == "true"
# Which LLM provider to call for document generation.
HYDE_LLM_BACKEND = os.getenv("HYDE_LLM_BACKEND", "openai")  # openai, anthropic, or local
# Provider credentials; an empty string means "not configured".
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")
# Model used for the OpenAI backend (the Anthropic backend hard-codes its model).
HYDE_MODEL = os.getenv("HYDE_MODEL", "gpt-4o-mini")  # Fast, cheap model for HyDE

# German education-specific prompt template.
# Asks the LLM to write a short excerpt that reads like an official
# Erwartungshorizont (exam grading rubric), so its embedding matches
# the formal documents in the index. {query} is filled in at call time.
HYDE_PROMPT_TEMPLATE = """Du bist ein Experte für deutsche Bildungsstandards und Erwartungshorizonte.

Gegeben ist folgende Suchanfrage eines Lehrers:
"{query}"

Schreibe einen kurzen, fachlich korrekten Textabschnitt (2-3 Sätze), der wie ein Auszug aus einem offiziellen Erwartungshorizont für Abiturklausuren klingt und diese Anfrage beantworten würde.

Der Text sollte:
- Formelle, präzise Sprache verwenden
- Konkrete Bewertungskriterien oder Anforderungen nennen
- Wie ein echtes Dokument aus dem Bildungsministerium klingen

Antworte NUR mit dem Textabschnitt, ohne Einleitung oder Erklärung."""
|
|
|
|
|
|
class HyDEError(Exception):
    """Raised when hypothetical-document generation fails (missing API key or provider error)."""
|
|
|
|
|
|
async def generate_hypothetical_document(
    query: str,
    subject: Optional[str] = None,
    niveau: Optional[str] = None,
) -> str:
    """
    Generate a hypothetical "ideal" document that would answer the query.

    The generated text mimics a formal Erwartungshorizont excerpt so its
    embedding lands closer to real indexed documents than the raw query.

    Args:
        query: The user's search query.
        subject: Optional subject context (e.g., "Deutsch", "Mathematik").
        niveau: Optional niveau context (e.g., "eA", "gA").

    Returns:
        The hypothetical document text optimized for embedding, or the
        original query unchanged when HyDE is disabled, no supported LLM
        backend is configured, or generation fails (graceful degradation —
        search must keep working without HyDE).
    """
    # Privacy guard: never contact external LLM APIs unless explicitly enabled.
    if not HYDE_ENABLED:
        return query  # Fall back to original query

    # Collect optional subject/niveau context for the prompt.
    context_info = ""
    if subject:
        context_info += f"\nFach: {subject}"
    if niveau:
        context_info += f"\nNiveau: {niveau}"

    prompt = HYDE_PROMPT_TEMPLATE.format(query=query)
    if context_info:
        # Inject the context block just before the query section of the template.
        prompt = prompt.replace(
            "Gegeben ist folgende Suchanfrage",
            f"Kontext:{context_info}\n\nGegeben ist folgende Suchanfrage"
        )

    try:
        if HYDE_LLM_BACKEND == "openai":
            return await _generate_openai(prompt)
        elif HYDE_LLM_BACKEND == "anthropic":
            return await _generate_anthropic(prompt)
        else:
            # Unknown or "local" backend: no LLM available, use the raw query.
            return query
    except Exception:
        # HyDE is best-effort: log with traceback (instead of print, which
        # loses the stack and bypasses log routing) and degrade to the
        # original query rather than failing the whole search.
        logging.getLogger(__name__).warning(
            "HyDE generation failed, using original query", exc_info=True
        )
        return query
|
|
|
|
|
|
async def _generate_openai(prompt: str) -> str:
|
|
"""Generate using OpenAI API."""
|
|
if not OPENAI_API_KEY:
|
|
raise HyDEError("OPENAI_API_KEY not configured for HyDE")
|
|
|
|
async with httpx.AsyncClient() as client:
|
|
response = await client.post(
|
|
"https://api.openai.com/v1/chat/completions",
|
|
headers={
|
|
"Authorization": f"Bearer {OPENAI_API_KEY}",
|
|
"Content-Type": "application/json"
|
|
},
|
|
json={
|
|
"model": HYDE_MODEL,
|
|
"messages": [
|
|
{"role": "system", "content": "Du bist ein Experte für deutsche Bildungsstandards."},
|
|
{"role": "user", "content": prompt}
|
|
],
|
|
"max_tokens": 200,
|
|
"temperature": 0.7,
|
|
},
|
|
timeout=30.0
|
|
)
|
|
|
|
if response.status_code != 200:
|
|
raise HyDEError(f"OpenAI API error: {response.status_code}")
|
|
|
|
data = response.json()
|
|
return data["choices"][0]["message"]["content"].strip()
|
|
|
|
|
|
async def _generate_anthropic(prompt: str) -> str:
|
|
"""Generate using Anthropic API."""
|
|
if not ANTHROPIC_API_KEY:
|
|
raise HyDEError("ANTHROPIC_API_KEY not configured for HyDE")
|
|
|
|
async with httpx.AsyncClient() as client:
|
|
response = await client.post(
|
|
"https://api.anthropic.com/v1/messages",
|
|
headers={
|
|
"x-api-key": ANTHROPIC_API_KEY,
|
|
"Content-Type": "application/json",
|
|
"anthropic-version": "2023-06-01"
|
|
},
|
|
json={
|
|
"model": "claude-3-haiku-20240307",
|
|
"max_tokens": 200,
|
|
"messages": [
|
|
{"role": "user", "content": prompt}
|
|
]
|
|
},
|
|
timeout=30.0
|
|
)
|
|
|
|
if response.status_code != 200:
|
|
raise HyDEError(f"Anthropic API error: {response.status_code}")
|
|
|
|
data = response.json()
|
|
return data["content"][0]["text"].strip()
|
|
|
|
|
|
async def hyde_search(
    query: str,
    search_func,
    subject: Optional[str] = None,
    niveau: Optional[str] = None,
    **search_kwargs
) -> dict:
    """
    Run a search with HyDE query expansion.

    A hypothetical document is generated for the query and handed to
    ``search_func`` in place of the raw query. When HyDE is disabled or
    generation falls back, the original query is searched unchanged.

    Args:
        query: Original user query.
        search_func: Async callable performing the actual search; invoked
            as ``search_func(query=..., **search_kwargs)``.
        subject: Optional subject context.
        niveau: Optional niveau context.
        **search_kwargs: Extra keyword arguments forwarded to search_func.

    Returns:
        Dict with keys "results", "hyde_used", "original_query", and
        "hypothetical_document" (None when HyDE was not applied).
    """
    expanded_query = await generate_hypothetical_document(query, subject, niveau)

    # generate_hypothetical_document returns the query unchanged when HyDE
    # is off or failed, so any difference means HyDE actually kicked in.
    hyde_applied = expanded_query != query

    search_results = await search_func(query=expanded_query, **search_kwargs)

    return {
        "results": search_results,
        "hyde_used": hyde_applied,
        "original_query": query,
        "hypothetical_document": expanded_query if hyde_applied else None,
    }
|
|
|
|
|
|
def get_hyde_info() -> dict:
    """Return a snapshot of the HyDE configuration for diagnostics/status endpoints."""
    info = {
        "enabled": HYDE_ENABLED,
        "llm_backend": HYDE_LLM_BACKEND,
        "model": HYDE_MODEL,
        "openai_configured": bool(OPENAI_API_KEY),
        "anthropic_configured": bool(ANTHROPIC_API_KEY),
        # Queries always leave the host once HyDE is switched on.
        "sends_data_externally": True,
        "privacy_warning": "When enabled, user search queries are sent to external LLM APIs",
        # Shipped off by default for privacy.
        "default_enabled": False,
    }
    return info
|