fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
420
klausur-service/backend/eh_pipeline.py
Normal file
420
klausur-service/backend/eh_pipeline.py
Normal file
@@ -0,0 +1,420 @@
|
||||
"""
|
||||
BYOEH Processing Pipeline
|
||||
Handles chunking, embedding generation, and encryption for Erwartungshorizonte.
|
||||
|
||||
Supports multiple embedding backends:
|
||||
- local: sentence-transformers (default, no API key needed)
|
||||
- openai: OpenAI text-embedding-3-small (requires OPENAI_API_KEY)
|
||||
"""
|
||||
|
||||
import os
|
||||
import io
|
||||
import base64
|
||||
import hashlib
|
||||
from typing import List, Tuple, Optional
|
||||
from cryptography.hazmat.primitives.ciphers.aead import AESGCM
|
||||
from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC
|
||||
from cryptography.hazmat.primitives import hashes
|
||||
import httpx
|
||||
|
||||
# Embedding Configuration
|
||||
# Backend: "local" (sentence-transformers) or "openai"
|
||||
EMBEDDING_BACKEND = os.getenv("EMBEDDING_BACKEND", "local")
|
||||
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
|
||||
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text-embedding-3-small")
|
||||
|
||||
# Local embedding model (all-MiniLM-L6-v2: 384 dimensions, fast, good quality)
|
||||
LOCAL_EMBEDDING_MODEL = os.getenv("LOCAL_EMBEDDING_MODEL", "all-MiniLM-L6-v2")
|
||||
|
||||
# Vector dimensions per backend
|
||||
VECTOR_DIMENSIONS = {
|
||||
"local": 384, # all-MiniLM-L6-v2
|
||||
"openai": 1536, # text-embedding-3-small
|
||||
}
|
||||
|
||||
CHUNK_SIZE = int(os.getenv("BYOEH_CHUNK_SIZE", "1000"))
|
||||
CHUNK_OVERLAP = int(os.getenv("BYOEH_CHUNK_OVERLAP", "200"))
|
||||
|
||||
# Lazy-loaded sentence-transformers model
|
||||
_local_model = None
|
||||
|
||||
|
||||
class ChunkingError(Exception):
    """Raised when text chunking or extraction fails."""
|
||||
|
||||
|
||||
class EmbeddingError(Exception):
    """Raised when embedding generation fails."""
|
||||
|
||||
|
||||
class EncryptionError(Exception):
    """Raised when encryption or decryption fails."""
|
||||
|
||||
|
||||
def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
    """
    Split text into overlapping chunks.

    Uses a simple recursive character splitter approach:
    - Try to split on paragraph boundaries first
    - Then sentences
    - Then words
    - Finally characters

    Args:
        text: Input text to chunk
        chunk_size: Target chunk size in characters
        overlap: Overlap between chunks (should be smaller than chunk_size)

    Returns:
        List of text chunks
    """
    if not text or len(text) <= chunk_size:
        return [text] if text else []

    separators = ["\n\n", "\n", ". ", " ", ""]

    # Bug fix: with overlap >= chunk_size the hard-split stride below would be
    # <= 0 and range() would raise ValueError; clamp the stride to at least 1.
    hard_step = max(1, chunk_size - overlap)

    def split_recursive(text: str, sep_idx: int = 0) -> List[str]:
        # Already small enough — return as a single chunk.
        if len(text) <= chunk_size:
            return [text]

        if sep_idx >= len(separators):
            # Last resort: hard split at fixed character offsets.
            return [text[i:i+chunk_size] for i in range(0, len(text), hard_step)]

        sep = separators[sep_idx]
        if not sep:
            # Empty separator = character split
            parts = list(text)
        else:
            parts = text.split(sep)

        result = []
        current = ""

        # Greedily repack parts up to chunk_size, re-inserting the separator
        # between parts that land in the same chunk. NOTE(review): the
        # separator between two *different* chunks is dropped — presumably
        # acceptable for retrieval, but confirm if exact reconstruction matters.
        for part in parts:
            test_chunk = current + sep + part if current else part

            if len(test_chunk) <= chunk_size:
                current = test_chunk
            else:
                if current:
                    result.append(current)
                # If a single part is too big, recursively split it with the
                # next (finer-grained) separator.
                if len(part) > chunk_size:
                    result.extend(split_recursive(part, sep_idx + 1))
                    current = ""
                else:
                    current = part

        if current:
            result.append(current)

        return result

    raw_chunks = split_recursive(text)

    # Prepend the tail of the previous chunk so neighboring chunks share up
    # to `overlap` characters of context.
    final_chunks = []
    for i, chunk in enumerate(raw_chunks):
        if i > 0 and overlap > 0:
            prev_chunk = raw_chunks[i-1]
            overlap_text = prev_chunk[-min(overlap, len(prev_chunk)):]
            chunk = overlap_text + chunk
        final_chunks.append(chunk.strip())

    # Drop chunks that became empty after stripping.
    return [c for c in final_chunks if c]
|
||||
|
||||
|
||||
def get_vector_size() -> int:
    """Return the embedding dimensionality of the active backend (384 if unknown)."""
    try:
        return VECTOR_DIMENSIONS[EMBEDDING_BACKEND]
    except KeyError:
        # Unrecognized backend: fall back to the local model's dimension.
        return 384
|
||||
|
||||
|
||||
def _get_local_model():
    """Return the process-wide SentenceTransformer, loading it on first use.

    Raises:
        EmbeddingError: If sentence-transformers is not installed.
    """
    global _local_model

    # Fast path: model already loaded.
    if _local_model is not None:
        return _local_model

    try:
        from sentence_transformers import SentenceTransformer
        print(f"Loading local embedding model: {LOCAL_EMBEDDING_MODEL}")
        model = SentenceTransformer(LOCAL_EMBEDDING_MODEL)
        print(f"Model loaded successfully (dim={model.get_sentence_embedding_dimension()})")
    except ImportError:
        raise EmbeddingError(
            "sentence-transformers not installed. "
            "Install with: pip install sentence-transformers"
        )

    _local_model = model
    return _local_model
|
||||
|
||||
|
||||
def _generate_local_embeddings(texts: List[str]) -> List[List[float]]:
    """Embed *texts* with the local sentence-transformers model."""
    if not texts:
        return []

    model = _get_local_model()
    # Only show a progress bar for non-trivial batches.
    show_bar = len(texts) > 10
    vectors = model.encode(texts, show_progress_bar=show_bar)
    return [vector.tolist() for vector in vectors]
|
||||
|
||||
|
||||
async def _generate_openai_embeddings(texts: List[str]) -> List[List[float]]:
    """Generate embeddings using the OpenAI embeddings API.

    Args:
        texts: Text chunks to embed (result order matches input order).

    Returns:
        One embedding vector per input text.

    Raises:
        EmbeddingError: If the API key is missing, the request times out,
            the API returns a non-200 status, or any other failure occurs.
    """
    if not OPENAI_API_KEY:
        raise EmbeddingError("OPENAI_API_KEY not configured")

    try:
        async with httpx.AsyncClient() as client:
            response = await client.post(
                "https://api.openai.com/v1/embeddings",
                headers={
                    "Authorization": f"Bearer {OPENAI_API_KEY}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": EMBEDDING_MODEL,
                    "input": texts
                },
                timeout=60.0
            )

            if response.status_code != 200:
                raise EmbeddingError(f"OpenAI API error: {response.status_code} - {response.text}")

            data = response.json()
            embeddings = [item["embedding"] for item in data["data"]]
            return embeddings

    except httpx.TimeoutException:
        raise EmbeddingError("OpenAI API timeout")
    except EmbeddingError:
        # Bug fix: previously the generic handler below caught our own
        # EmbeddingError (e.g. the non-200 branch) and double-wrapped its
        # message; re-raise it unchanged instead.
        raise
    except Exception as e:
        # Chain the cause so the original traceback is preserved.
        raise EmbeddingError(f"Failed to generate embeddings: {str(e)}") from e
|
||||
|
||||
|
||||
async def generate_embeddings(texts: List[str]) -> List[List[float]]:
    """Embed a batch of texts via the configured backend.

    The backend is selected by EMBEDDING_BACKEND:
    - local: sentence-transformers (default, no API key needed)
    - openai: OpenAI text-embedding-3-small

    Args:
        texts: List of text chunks

    Returns:
        List of embedding vectors, one per input text.

    Raises:
        EmbeddingError: If embedding generation fails or the backend
            name is not recognized.
    """
    if not texts:
        return []

    if EMBEDDING_BACKEND == "openai":
        return await _generate_openai_embeddings(texts)
    if EMBEDDING_BACKEND == "local":
        # The local model runs synchronously, but it is fast enough to
        # call inline from async code.
        return _generate_local_embeddings(texts)

    raise EmbeddingError(f"Unknown embedding backend: {EMBEDDING_BACKEND}")
|
||||
|
||||
|
||||
async def generate_single_embedding(text: str) -> List[float]:
    """Embed a single text; returns an empty list if nothing was produced."""
    result = await generate_embeddings([text])
    if not result:
        return []
    return result[0]
|
||||
|
||||
|
||||
def derive_key(passphrase: str, salt: bytes) -> bytes:
    """Derive a 32-byte AES key from a passphrase via PBKDF2-HMAC-SHA256.

    Args:
        passphrase: User passphrase
        salt: Random salt (16 bytes)

    Returns:
        32-byte AES key
    """
    # 100k iterations of SHA-256; must match the parameters used when the
    # stored key hash was created, or verification will fail.
    kdf = PBKDF2HMAC(
        algorithm=hashes.SHA256(),
        salt=salt,
        length=32,
        iterations=100000,
    )
    return kdf.derive(passphrase.encode())
|
||||
|
||||
|
||||
def encrypt_text(text: str, passphrase: str, salt_hex: str) -> str:
    """Encrypt text with AES-256-GCM under a passphrase-derived key.

    Args:
        text: Plaintext to encrypt
        passphrase: User passphrase
        salt_hex: Salt as hex string

    Returns:
        Base64-encoded payload: 12-byte IV followed by the ciphertext.

    Raises:
        EncryptionError: If any step of the encryption fails.
    """
    try:
        key = derive_key(passphrase, bytes.fromhex(salt_hex))

        # 12-byte nonce is the standard size for GCM.
        nonce = os.urandom(12)
        sealed = AESGCM(key).encrypt(nonce, text.encode(), None)

        # Store the nonce in front so decrypt_text can recover it.
        payload = nonce + sealed
        return base64.b64encode(payload).decode()

    except Exception as e:
        raise EncryptionError(f"Encryption failed: {str(e)}")
|
||||
|
||||
|
||||
def decrypt_text(encrypted_b64: str, passphrase: str, salt_hex: str) -> str:
    """Decrypt an AES-256-GCM payload produced by encrypt_text.

    Args:
        encrypted_b64: Base64-encoded ciphertext (IV + ciphertext)
        passphrase: User passphrase
        salt_hex: Salt as hex string

    Returns:
        Decrypted plaintext

    Raises:
        EncryptionError: If decoding, key derivation, or GCM
            authentication fails (e.g. wrong passphrase).
    """
    try:
        key = derive_key(passphrase, bytes.fromhex(salt_hex))

        payload = base64.b64decode(encrypted_b64)
        # Layout: first 12 bytes are the nonce, the rest is ciphertext+tag.
        nonce, ciphertext = payload[:12], payload[12:]

        plaintext = AESGCM(key).decrypt(nonce, ciphertext, None)
        return plaintext.decode()

    except Exception as e:
        raise EncryptionError(f"Decryption failed: {str(e)}")
|
||||
|
||||
|
||||
def hash_key(passphrase: str, salt_hex: str) -> str:
    """Return the hex SHA-256 digest of the passphrase-derived key.

    Allows later passphrase verification without storing the key itself.

    Args:
        passphrase: User passphrase
        salt_hex: Salt as hex string

    Returns:
        Hex-encoded key hash
    """
    derived = derive_key(passphrase, bytes.fromhex(salt_hex))
    return hashlib.sha256(derived).hexdigest()
|
||||
|
||||
|
||||
def verify_key_hash(passphrase: str, salt_hex: str, expected_hash: str) -> bool:
    """Verify a passphrase against a previously stored key hash.

    Args:
        passphrase: User passphrase to verify
        salt_hex: Salt as hex string
        expected_hash: Expected key hash (hex SHA-256 of the derived key)

    Returns:
        True if passphrase is correct
    """
    import hmac  # local import keeps this security fix self-contained

    computed_hash = hash_key(passphrase, salt_hex)
    # Security fix: use a constant-time comparison. A plain `==` on the hex
    # digests short-circuits at the first differing character, leaking timing
    # information about how much of the hash matches.
    return hmac.compare_digest(computed_hash, expected_hash)
|
||||
|
||||
|
||||
def extract_text_from_pdf(pdf_content: bytes) -> str:
    """Extract the text of every page of a PDF.

    Args:
        pdf_content: Raw PDF bytes

    Returns:
        Page texts joined by blank lines; pages with no extractable
        text are skipped.

    Raises:
        ChunkingError: If PyPDF2 is unavailable or extraction fails.
    """
    try:
        import PyPDF2

        reader = PyPDF2.PdfReader(io.BytesIO(pdf_content))
        page_texts = [page.extract_text() for page in reader.pages]
        return "\n\n".join(t for t in page_texts if t)

    except ImportError:
        raise ChunkingError("PyPDF2 not installed")
    except Exception as e:
        raise ChunkingError(f"Failed to extract PDF text: {str(e)}")
|
||||
|
||||
|
||||
async def process_eh_for_indexing(
    eh_id: str,
    tenant_id: str,
    subject: str,
    text_content: str,
    passphrase: str,
    salt_hex: str
) -> Tuple[int, List[dict]]:
    """Full processing pipeline for Erwartungshorizont indexing.

    Pipeline: chunk the plaintext, embed every chunk, then re-encrypt the
    chunk contents so only ciphertext is stored alongside the vectors.

    Args:
        eh_id: Erwartungshorizont ID
        tenant_id: Tenant ID
        subject: Subject (deutsch, englisch, etc.)
        text_content: Decrypted text content
        passphrase: User passphrase for re-encryption
        salt_hex: Salt for encryption

    Returns:
        Tuple of (chunk_count, chunks_data) prepared for Qdrant.
    """
    # NOTE(review): eh_id/tenant_id/subject are not consumed here —
    # presumably the caller attaches them to the Qdrant points; confirm.

    # Step 1: chunk the text.
    chunks = chunk_text(text_content)
    if not chunks:
        return 0, []

    # Step 2: embed every chunk.
    embeddings = await generate_embeddings(chunks)

    # Step 3: encrypt each chunk and pair it with its embedding.
    prepared = [
        {
            "chunk_index": idx,
            "embedding": vector,
            "encrypted_content": encrypt_text(piece, passphrase, salt_hex),
        }
        for idx, (piece, vector) in enumerate(zip(chunks, embeddings))
    ]

    return len(chunks), prepared
|
||||
Reference in New Issue
Block a user