Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website, Klausur-Service, School-Service, Voice-Service, Geo-Service, BreakPilot Drive, Agent-Core Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
421 lines
12 KiB
Python
421 lines
12 KiB
Python
"""
|
|
BYOEH Processing Pipeline
|
|
Handles chunking, embedding generation, and encryption for Erwartungshorizonte.
|
|
|
|
Supports multiple embedding backends:
|
|
- local: sentence-transformers (default, no API key needed)
|
|
- openai: OpenAI text-embedding-3-small (requires OPENAI_API_KEY)
|
|
"""
|
|
|
|
import base64
import hashlib
import hmac
import io
import os
from typing import List, Optional, Tuple

import httpx
from cryptography.hazmat.primitives import hashes
from cryptography.hazmat.primitives.ciphers.aead import AESGCM
from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC
|
|
|
|
# ---------------------------------------------------------------------------
# Embedding configuration
# ---------------------------------------------------------------------------
# Backend selector: "local" (sentence-transformers) or "openai".
EMBEDDING_BACKEND = os.getenv("EMBEDDING_BACKEND", "local")
# Only required when EMBEDDING_BACKEND == "openai".
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
# OpenAI embedding model name (ignored by the local backend).
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text-embedding-3-small")

# Local embedding model (all-MiniLM-L6-v2: 384 dimensions, fast, good quality)
LOCAL_EMBEDDING_MODEL = os.getenv("LOCAL_EMBEDDING_MODEL", "all-MiniLM-L6-v2")

# Vector dimensions per backend — must match what the chosen model emits,
# since the Qdrant collection is created with this size.
VECTOR_DIMENSIONS = {
    "local": 384,    # all-MiniLM-L6-v2
    "openai": 1536,  # text-embedding-3-small
}

# Chunking parameters (characters, not tokens).
CHUNK_SIZE = int(os.getenv("BYOEH_CHUNK_SIZE", "1000"))
CHUNK_OVERLAP = int(os.getenv("BYOEH_CHUNK_OVERLAP", "200"))

# Lazy-loaded sentence-transformers model (populated by _get_local_model).
_local_model = None
|
|
|
|
|
|
class ChunkingError(Exception):
    """Raised when text chunking or extraction fails."""
|
|
|
|
|
|
class EmbeddingError(Exception):
    """Raised when embedding generation fails."""
|
|
|
|
|
|
class EncryptionError(Exception):
    """Raised when encryption or decryption fails."""
|
|
|
|
|
|
def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
    """
    Split text into overlapping chunks.

    Uses a simple recursive character splitter approach:
    - Try to split on paragraph boundaries first
    - Then lines, sentences, words
    - Finally characters

    Args:
        text: Input text to chunk
        chunk_size: Target chunk size in characters
        overlap: Overlap (in characters) carried over from the previous chunk

    Returns:
        List of non-empty, stripped text chunks
    """
    # Short-circuit: empty input -> no chunks; short input -> single chunk.
    if not text or len(text) <= chunk_size:
        return [text] if text else []

    # Separators ordered coarse -> fine; "" means per-character split.
    separators = ["\n\n", "\n", ". ", " ", ""]

    def split_recursive(segment: str, sep_idx: int = 0) -> List[str]:
        if len(segment) <= chunk_size:
            return [segment]

        if sep_idx >= len(separators):
            # Last resort: hard split. Guard the step so a degenerate
            # configuration (overlap >= chunk_size) cannot make range()
            # raise ValueError on a non-positive step.
            step = max(1, chunk_size - overlap)
            return [segment[i:i + chunk_size] for i in range(0, len(segment), step)]

        sep = separators[sep_idx]
        # Empty separator means character-level split.
        parts = list(segment) if not sep else segment.split(sep)

        result = []
        current = ""

        for part in parts:
            test_chunk = current + sep + part if current else part

            if len(test_chunk) <= chunk_size:
                current = test_chunk
            else:
                if current:
                    result.append(current)
                if len(part) > chunk_size:
                    # Single part is still too big: recurse with the next,
                    # finer-grained separator.
                    result.extend(split_recursive(part, sep_idx + 1))
                    current = ""
                else:
                    current = part

        if current:
            result.append(current)

        return result

    raw_chunks = split_recursive(text)

    # Prepend the tail of the previous chunk to each chunk so neighbouring
    # chunks share context. Note: this can push a chunk past chunk_size.
    final_chunks = []
    for i, chunk in enumerate(raw_chunks):
        if i > 0 and overlap > 0:
            prev_chunk = raw_chunks[i - 1]
            overlap_text = prev_chunk[-min(overlap, len(prev_chunk)):]
            chunk = overlap_text + chunk
        final_chunks.append(chunk.strip())

    return [c for c in final_chunks if c]
|
|
|
|
|
|
def get_vector_size() -> int:
    """Return the embedding dimension for the active backend (384 if unknown)."""
    try:
        return VECTOR_DIMENSIONS[EMBEDDING_BACKEND]
    except KeyError:
        return 384
|
|
|
|
|
|
def _get_local_model():
    """Lazy-load and cache the sentence-transformers model.

    Returns:
        The SentenceTransformer instance, cached in the module-global
        ``_local_model`` after the first call.

    Raises:
        EmbeddingError: If sentence-transformers is not installed.
    """
    global _local_model
    if _local_model is None:
        # Keep the try body minimal: only the import can raise ImportError.
        try:
            from sentence_transformers import SentenceTransformer
        except ImportError as exc:
            # Chain the original error so the root cause stays visible.
            raise EmbeddingError(
                "sentence-transformers not installed. "
                "Install with: pip install sentence-transformers"
            ) from exc
        print(f"Loading local embedding model: {LOCAL_EMBEDDING_MODEL}")
        _local_model = SentenceTransformer(LOCAL_EMBEDDING_MODEL)
        print(f"Model loaded successfully (dim={_local_model.get_sentence_embedding_dimension()})")
    return _local_model
|
|
|
|
|
|
def _generate_local_embeddings(texts: List[str]) -> List[List[float]]:
    """Embed *texts* with the cached local sentence-transformers model."""
    if not texts:
        return []

    model = _get_local_model()
    # Only show a progress bar for larger batches.
    vectors = model.encode(texts, show_progress_bar=len(texts) > 10)
    return [vec.tolist() for vec in vectors]
|
|
|
|
|
|
async def _generate_openai_embeddings(texts: List[str]) -> List[List[float]]:
    """Generate embeddings using the OpenAI embeddings API.

    Args:
        texts: List of text chunks to embed.

    Returns:
        One embedding vector per input text, in order.

    Raises:
        EmbeddingError: If the API key is missing, the request times out,
            the API returns a non-200 status, or any other failure occurs.
    """
    if not OPENAI_API_KEY:
        raise EmbeddingError("OPENAI_API_KEY not configured")

    try:
        async with httpx.AsyncClient() as client:
            response = await client.post(
                "https://api.openai.com/v1/embeddings",
                headers={
                    "Authorization": f"Bearer {OPENAI_API_KEY}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": EMBEDDING_MODEL,
                    "input": texts
                },
                timeout=60.0
            )

        if response.status_code != 200:
            raise EmbeddingError(f"OpenAI API error: {response.status_code} - {response.text}")

        data = response.json()
        embeddings = [item["embedding"] for item in data["data"]]
        return embeddings

    except httpx.TimeoutException as exc:
        raise EmbeddingError("OpenAI API timeout") from exc
    except EmbeddingError:
        # Fix: don't re-wrap our own EmbeddingError (e.g. the non-200 case)
        # in the generic "Failed to generate embeddings" message below.
        raise
    except Exception as exc:
        raise EmbeddingError(f"Failed to generate embeddings: {str(exc)}") from exc
|
|
|
|
|
|
async def generate_embeddings(texts: List[str]) -> List[List[float]]:
    """
    Generate embeddings using the configured backend.

    Backends:
    - local: sentence-transformers (default, no API key needed)
    - openai: OpenAI text-embedding-3-small

    Args:
        texts: List of text chunks

    Returns:
        List of embedding vectors, one per input text

    Raises:
        EmbeddingError: If embedding generation fails or the backend is unknown
    """
    if not texts:
        return []

    if EMBEDDING_BACKEND == "openai":
        return await _generate_openai_embeddings(texts)

    if EMBEDDING_BACKEND == "local":
        # The local model runs synchronously but is fast enough to call inline.
        return _generate_local_embeddings(texts)

    raise EmbeddingError(f"Unknown embedding backend: {EMBEDDING_BACKEND}")
|
|
|
|
|
|
async def generate_single_embedding(text: str) -> List[float]:
    """Generate an embedding for one text; returns [] if nothing was produced."""
    result = await generate_embeddings([text])
    if not result:
        return []
    return result[0]
|
|
|
|
|
|
def derive_key(passphrase: str, salt: bytes) -> bytes:
    """
    Derive an encryption key from a passphrase using PBKDF2-HMAC-SHA256.

    Args:
        passphrase: User passphrase
        salt: Random salt (16 bytes)

    Returns:
        32-byte AES key
    """
    # NOTE(review): iteration count is fixed at 100k; raising it would
    # invalidate previously derived keys/hashes, so keep it in sync with
    # whatever is already stored.
    kdf = PBKDF2HMAC(
        algorithm=hashes.SHA256(),
        length=32,
        salt=salt,
        iterations=100000,
    )
    return kdf.derive(passphrase.encode())
|
|
|
|
|
|
def encrypt_text(text: str, passphrase: str, salt_hex: str) -> str:
    """
    Encrypt text using AES-256-GCM.

    A fresh 12-byte IV is generated per call and prepended to the
    ciphertext before base64 encoding, so each call produces different
    output for the same input.

    Args:
        text: Plaintext to encrypt
        passphrase: User passphrase
        salt_hex: Salt as hex string (used for key derivation)

    Returns:
        Base64-encoded ciphertext (IV + ciphertext)

    Raises:
        EncryptionError: If key derivation or encryption fails
    """
    try:
        salt = bytes.fromhex(salt_hex)
        key = derive_key(passphrase, salt)

        aesgcm = AESGCM(key)
        iv = os.urandom(12)  # 96-bit nonce, the recommended size for GCM

        ciphertext = aesgcm.encrypt(iv, text.encode(), None)

        # Combine IV + ciphertext so decrypt_text can recover the IV.
        combined = iv + ciphertext
        return base64.b64encode(combined).decode()

    except Exception as e:
        # Chain the cause so debugging retains the original traceback.
        raise EncryptionError(f"Encryption failed: {str(e)}") from e
|
|
|
|
|
|
def decrypt_text(encrypted_b64: str, passphrase: str, salt_hex: str) -> str:
    """
    Decrypt text using AES-256-GCM.

    Args:
        encrypted_b64: Base64-encoded ciphertext (12-byte IV + ciphertext)
        passphrase: User passphrase
        salt_hex: Salt as hex string (used for key derivation)

    Returns:
        Decrypted plaintext

    Raises:
        EncryptionError: If decryption fails — including when the
            passphrase is wrong (GCM tag verification failure).
    """
    try:
        salt = bytes.fromhex(salt_hex)
        key = derive_key(passphrase, salt)

        combined = base64.b64decode(encrypted_b64)
        # Layout written by encrypt_text: first 12 bytes are the IV.
        iv = combined[:12]
        ciphertext = combined[12:]

        aesgcm = AESGCM(key)
        plaintext = aesgcm.decrypt(iv, ciphertext, None)

        return plaintext.decode()

    except Exception as e:
        # Chain the cause (e.g. InvalidTag for a wrong passphrase).
        raise EncryptionError(f"Decryption failed: {str(e)}") from e
|
|
|
|
|
|
def hash_key(passphrase: str, salt_hex: str) -> str:
    """
    Create a SHA-256 hash of the derived key for passphrase verification.

    Args:
        passphrase: User passphrase
        salt_hex: Salt as hex string

    Returns:
        Hex-encoded key hash
    """
    derived = derive_key(passphrase, bytes.fromhex(salt_hex))
    return hashlib.sha256(derived).hexdigest()
|
|
|
|
|
|
def verify_key_hash(passphrase: str, salt_hex: str, expected_hash: str) -> bool:
    """
    Verify that a passphrase matches a stored key hash.

    Args:
        passphrase: User passphrase to verify
        salt_hex: Salt as hex string
        expected_hash: Expected key hash (hex)

    Returns:
        True if the passphrase is correct
    """
    computed_hash = hash_key(passphrase, salt_hex)
    # Constant-time comparison: plain `==` short-circuits on the first
    # differing byte and can leak hash prefixes via timing.
    return hmac.compare_digest(computed_hash, expected_hash)
|
|
|
|
|
|
def extract_text_from_pdf(pdf_content: bytes) -> str:
    """
    Extract text from a PDF file.

    Pages that yield no text (e.g. image-only scans) are skipped; the
    remaining page texts are joined with blank lines.

    Args:
        pdf_content: Raw PDF bytes

    Returns:
        Extracted text

    Raises:
        ChunkingError: If PyPDF2 is missing or extraction fails
    """
    # Keep the import in its own try so ImportError is not conflated with
    # parse failures below.
    try:
        import PyPDF2
    except ImportError as exc:
        raise ChunkingError("PyPDF2 not installed") from exc

    try:
        pdf_file = io.BytesIO(pdf_content)
        reader = PyPDF2.PdfReader(pdf_file)

        text_parts = []
        for page in reader.pages:
            text = page.extract_text()
            if text:
                text_parts.append(text)

        return "\n\n".join(text_parts)

    except Exception as e:
        raise ChunkingError(f"Failed to extract PDF text: {str(e)}") from e
|
|
|
|
|
|
async def process_eh_for_indexing(
    eh_id: str,
    tenant_id: str,
    subject: str,
    text_content: str,
    passphrase: str,
    salt_hex: str
) -> Tuple[int, List[dict]]:
    """
    Full processing pipeline for Erwartungshorizont indexing.

    Steps:
        1. Chunk the text
        2. Generate embeddings
        3. Encrypt chunks
        4. Return prepared data for Qdrant

    Args:
        eh_id: Erwartungshorizont ID
        tenant_id: Tenant ID
        subject: Subject (deutsch, englisch, etc.)
        text_content: Decrypted text content
        passphrase: User passphrase for re-encryption
        salt_hex: Salt for encryption

    Returns:
        Tuple of (chunk_count, chunks_data) where each entry carries the
        chunk index, its embedding, and the encrypted chunk content
    """
    # Step 1: split the plaintext into overlapping chunks.
    chunks = chunk_text(text_content)
    if not chunks:
        return 0, []

    # Step 2: embed every chunk with the configured backend.
    embeddings = await generate_embeddings(chunks)

    # Steps 3+4: re-encrypt each chunk and pair it with its vector.
    encrypted_chunks = [
        {
            "chunk_index": idx,
            "embedding": vector,
            "encrypted_content": encrypt_text(piece, passphrase, salt_hex),
        }
        for idx, (piece, vector) in enumerate(zip(chunks, embeddings))
    ]

    return len(chunks), encrypted_chunks
|