Files
breakpilot-lehrer/klausur-service/backend/eh_pipeline.py
Benjamin Boenisch 5a31f52310 Initial commit: breakpilot-lehrer - Lehrer KI Platform
Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website,
Klausur-Service, School-Service, Voice-Service, Geo-Service,
BreakPilot Drive, Agent-Core

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-11 23:47:26 +01:00

421 lines
12 KiB
Python

"""
BYOEH Processing Pipeline
Handles chunking, embedding generation, and encryption for Erwartungshorizonte.
Supports multiple embedding backends:
- local: sentence-transformers (default, no API key needed)
- openai: OpenAI text-embedding-3-small (requires OPENAI_API_KEY)
"""
import base64
import hashlib
import hmac
import io
import os
from typing import List, Tuple, Optional

import httpx
from cryptography.hazmat.primitives import hashes
from cryptography.hazmat.primitives.ciphers.aead import AESGCM
from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC
# --- Embedding configuration ---
# Backend selector: "local" (sentence-transformers, no API key needed) or "openai".
EMBEDDING_BACKEND = os.getenv("EMBEDDING_BACKEND", "local")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
# Remote model used when EMBEDDING_BACKEND == "openai".
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text-embedding-3-small")
# Local embedding model (all-MiniLM-L6-v2: 384 dimensions, fast, good quality).
LOCAL_EMBEDDING_MODEL = os.getenv("LOCAL_EMBEDDING_MODEL", "all-MiniLM-L6-v2")
# Embedding vector dimensionality per backend; must match the models above.
VECTOR_DIMENSIONS = {
    "local": 384,    # all-MiniLM-L6-v2
    "openai": 1536,  # text-embedding-3-small
}
# Chunking parameters in characters, overridable via environment.
CHUNK_SIZE = int(os.getenv("BYOEH_CHUNK_SIZE", "1000"))
CHUNK_OVERLAP = int(os.getenv("BYOEH_CHUNK_OVERLAP", "200"))
# Lazily-loaded sentence-transformers model; populated on first use by _get_local_model().
_local_model = None
class ChunkingError(Exception):
    """Raised when text chunking or PDF text extraction fails."""
    pass


class EmbeddingError(Exception):
    """Raised when embedding generation fails (local model or OpenAI API)."""
    pass


class EncryptionError(Exception):
    """Raised when AES-GCM encryption or decryption fails."""
    pass
def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
    """
    Split text into overlapping chunks.

    Uses a simple recursive character splitter: try paragraph boundaries
    first, then lines, then sentences, then words, finally characters.
    Every chunk after the first is prefixed with up to *overlap* trailing
    characters of its predecessor, so emitted chunks can exceed
    chunk_size by up to *overlap* characters.

    Args:
        text: Input text to chunk
        chunk_size: Target chunk size in characters
        overlap: Overlap between consecutive chunks in characters

    Returns:
        List of non-empty, stripped text chunks ([] for empty input)
    """
    if not text or len(text) <= chunk_size:
        return [text] if text else []

    separators = ["\n\n", "\n", ". ", " ", ""]

    def split_recursive(text: str, sep_idx: int = 0) -> List[str]:
        # Split on progressively finer separators until every piece fits.
        if len(text) <= chunk_size:
            return [text]
        if sep_idx >= len(separators):
            # Last resort: hard split. Clamp the step so it stays positive
            # even if a caller passes overlap >= chunk_size (a non-positive
            # range() step would raise ValueError).
            step = max(1, chunk_size - overlap)
            return [text[i:i + chunk_size] for i in range(0, len(text), step)]
        sep = separators[sep_idx]
        if not sep:
            # Empty separator = character split
            parts = list(text)
        else:
            parts = text.split(sep)
        result = []
        current = ""
        for part in parts:
            # Greedily pack parts (re-joined with the separator) up to chunk_size.
            test_chunk = current + sep + part if current else part
            if len(test_chunk) <= chunk_size:
                current = test_chunk
            else:
                if current:
                    result.append(current)
                if len(part) > chunk_size:
                    # Single part is still too big: split it with finer separators.
                    result.extend(split_recursive(part, sep_idx + 1))
                    current = ""
                else:
                    current = part
        if current:
            result.append(current)
        return result

    raw_chunks = split_recursive(text)

    # Prefix each chunk (except the first) with the tail of its predecessor.
    final_chunks = []
    for i, chunk in enumerate(raw_chunks):
        if i > 0 and overlap > 0:
            prev_chunk = raw_chunks[i - 1]
            overlap_text = prev_chunk[-min(overlap, len(prev_chunk)):]
            chunk = overlap_text + chunk
        final_chunks.append(chunk.strip())
    return [c for c in final_chunks if c]
def get_vector_size() -> int:
    """Return the embedding dimension for the configured backend (default 384)."""
    try:
        return VECTOR_DIMENSIONS[EMBEDDING_BACKEND]
    except KeyError:
        return 384
def _get_local_model():
    """Return the cached sentence-transformers model, loading it on first use.

    Raises:
        EmbeddingError: if the sentence-transformers package is not installed.
    """
    global _local_model
    if _local_model is not None:
        return _local_model
    try:
        from sentence_transformers import SentenceTransformer
        print(f"Loading local embedding model: {LOCAL_EMBEDDING_MODEL}")
        _local_model = SentenceTransformer(LOCAL_EMBEDDING_MODEL)
        print(f"Model loaded successfully (dim={_local_model.get_sentence_embedding_dimension()})")
    except ImportError:
        raise EmbeddingError(
            "sentence-transformers not installed. "
            "Install with: pip install sentence-transformers"
        )
    return _local_model
def _generate_local_embeddings(texts: List[str]) -> List[List[float]]:
    """Embed *texts* with the local sentence-transformers model.

    Shows a progress bar only for batches larger than 10 texts.
    """
    if not texts:
        return []
    show_bar = len(texts) > 10
    vectors = _get_local_model().encode(texts, show_progress_bar=show_bar)
    return [vec.tolist() for vec in vectors]
async def _generate_openai_embeddings(texts: List[str]) -> List[List[float]]:
    """
    Generate embeddings for *texts* via the OpenAI embeddings API.

    Args:
        texts: List of text chunks to embed

    Returns:
        One embedding vector per input text, in input order

    Raises:
        EmbeddingError: if no API key is configured, the API returns a
            non-200 status, the request times out, or any other failure
            occurs while calling the API or parsing its response.
    """
    if not OPENAI_API_KEY:
        raise EmbeddingError("OPENAI_API_KEY not configured")
    try:
        async with httpx.AsyncClient() as client:
            response = await client.post(
                "https://api.openai.com/v1/embeddings",
                headers={
                    "Authorization": f"Bearer {OPENAI_API_KEY}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": EMBEDDING_MODEL,
                    "input": texts
                },
                timeout=60.0
            )
            if response.status_code != 200:
                raise EmbeddingError(f"OpenAI API error: {response.status_code} - {response.text}")
            data = response.json()
            embeddings = [item["embedding"] for item in data["data"]]
            return embeddings
    except EmbeddingError:
        # Already a typed error (e.g. non-200 status): propagate unchanged
        # instead of re-wrapping it into the generic message below.
        raise
    except httpx.TimeoutException:
        raise EmbeddingError("OpenAI API timeout")
    except Exception as e:
        # Chain the cause so the original traceback is preserved.
        raise EmbeddingError(f"Failed to generate embeddings: {str(e)}") from e
async def generate_embeddings(texts: List[str]) -> List[List[float]]:
    """
    Generate embeddings using the configured backend.

    Backends:
    - local: sentence-transformers (default, no API key needed)
    - openai: OpenAI text-embedding-3-small

    Args:
        texts: List of text chunks

    Returns:
        List of embedding vectors (empty list for empty input)

    Raises:
        EmbeddingError: if generation fails or the backend is unknown
    """
    if not texts:
        return []
    backend = EMBEDDING_BACKEND
    if backend == "openai":
        return await _generate_openai_embeddings(texts)
    if backend == "local":
        # The local model runs synchronously but is fast enough to call inline.
        return _generate_local_embeddings(texts)
    raise EmbeddingError(f"Unknown embedding backend: {EMBEDDING_BACKEND}")
async def generate_single_embedding(text: str) -> List[float]:
    """Generate the embedding vector for one text ([] if nothing was produced)."""
    result = await generate_embeddings([text])
    if not result:
        return []
    return result[0]
def derive_key(passphrase: str, salt: bytes) -> bytes:
    """
    Derive an encryption key from a passphrase using PBKDF2-HMAC-SHA256.

    Uses the standard library's C-accelerated ``hashlib.pbkdf2_hmac``,
    which yields output identical to ``cryptography``'s ``PBKDF2HMAC``
    for the same parameters (SHA-256, 100000 iterations, 32-byte key),
    so keys derived by earlier versions of this function still match.

    Args:
        passphrase: User passphrase
        salt: Random salt (16 bytes)

    Returns:
        32-byte AES key
    """
    return hashlib.pbkdf2_hmac("sha256", passphrase.encode(), salt, 100000, dklen=32)
def encrypt_text(text: str, passphrase: str, salt_hex: str) -> str:
    """
    Encrypt text using AES-256-GCM under a passphrase-derived key.

    Args:
        text: Plaintext to encrypt
        passphrase: User passphrase
        salt_hex: Salt as hex string

    Returns:
        Base64-encoded blob of IV (12 bytes) followed by the ciphertext

    Raises:
        EncryptionError: if key derivation or encryption fails
    """
    try:
        key = derive_key(passphrase, bytes.fromhex(salt_hex))
        # Fresh random 96-bit nonce per message, prepended for decrypt_text.
        nonce = os.urandom(12)
        sealed = AESGCM(key).encrypt(nonce, text.encode(), None)
        return base64.b64encode(nonce + sealed).decode()
    except Exception as e:
        raise EncryptionError(f"Encryption failed: {str(e)}")
def decrypt_text(encrypted_b64: str, passphrase: str, salt_hex: str) -> str:
    """
    Decrypt text produced by encrypt_text (AES-256-GCM).

    Args:
        encrypted_b64: Base64-encoded blob of IV (12 bytes) + ciphertext
        passphrase: User passphrase
        salt_hex: Salt as hex string

    Returns:
        Decrypted plaintext

    Raises:
        EncryptionError: if decoding, key derivation, or authentication fails
    """
    try:
        key = derive_key(passphrase, bytes.fromhex(salt_hex))
        blob = base64.b64decode(encrypted_b64)
        # Layout written by encrypt_text: 12-byte nonce, then ciphertext+tag.
        nonce, ciphertext = blob[:12], blob[12:]
        return AESGCM(key).decrypt(nonce, ciphertext, None).decode()
    except Exception as e:
        raise EncryptionError(f"Decryption failed: {str(e)}")
def hash_key(passphrase: str, salt_hex: str) -> str:
    """
    Create a SHA-256 fingerprint of the derived key for later verification.

    Args:
        passphrase: User passphrase
        salt_hex: Salt as hex string

    Returns:
        Hex-encoded SHA-256 digest of the derived key
    """
    derived = derive_key(passphrase, bytes.fromhex(salt_hex))
    return hashlib.sha256(derived).hexdigest()
def verify_key_hash(passphrase: str, salt_hex: str, expected_hash: str) -> bool:
    """
    Verify that a passphrase matches the stored key hash.

    Uses hmac.compare_digest for a constant-time comparison so the check
    does not leak information about the expected hash through timing.

    Args:
        passphrase: User passphrase to verify
        salt_hex: Salt as hex string
        expected_hash: Expected key hash (hex string from hash_key)

    Returns:
        True if the passphrase is correct
    """
    computed_hash = hash_key(passphrase, salt_hex)
    return hmac.compare_digest(computed_hash, expected_hash)
def extract_text_from_pdf(pdf_content: bytes) -> str:
    """
    Extract text from a PDF document.

    Pages are joined with blank lines; pages yielding no text are skipped.

    Args:
        pdf_content: Raw PDF bytes

    Returns:
        Extracted text

    Raises:
        ChunkingError: if PyPDF2 is missing or the PDF cannot be parsed
    """
    try:
        import PyPDF2
    except ImportError:
        raise ChunkingError("PyPDF2 not installed")
    try:
        reader = PyPDF2.PdfReader(io.BytesIO(pdf_content))
        page_texts = [page.extract_text() for page in reader.pages]
        return "\n\n".join(t for t in page_texts if t)
    except Exception as e:
        raise ChunkingError(f"Failed to extract PDF text: {str(e)}")
async def process_eh_for_indexing(
    eh_id: str,
    tenant_id: str,
    subject: str,
    text_content: str,
    passphrase: str,
    salt_hex: str
) -> Tuple[int, List[dict]]:
    """
    Run the full indexing pipeline for one Erwartungshorizont.

    Steps: chunk the plaintext, embed every chunk, re-encrypt each chunk
    for storage, and return the payloads prepared for Qdrant.

    Args:
        eh_id: Erwartungshorizont ID (accepted for caller parity; not used
            in the computation within this function)
        tenant_id: Tenant ID (likewise unused here)
        subject: Subject (deutsch, englisch, etc.; likewise unused here)
        text_content: Decrypted text content
        passphrase: User passphrase for re-encryption
        salt_hex: Salt for encryption

    Returns:
        Tuple of (chunk_count, chunks_data) where each chunks_data entry
        holds chunk_index, embedding, and encrypted_content
    """
    chunks = chunk_text(text_content)
    if not chunks:
        return 0, []
    embeddings = await generate_embeddings(chunks)
    encrypted_chunks = [
        {
            "chunk_index": idx,
            "embedding": vector,
            "encrypted_content": encrypt_text(chunk, passphrase, salt_hex),
        }
        for idx, (chunk, vector) in enumerate(zip(chunks, embeddings))
    ]
    return len(chunks), encrypted_chunks