fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
420
klausur-service/backend/eh_pipeline.py
Normal file
420
klausur-service/backend/eh_pipeline.py
Normal file
@@ -0,0 +1,420 @@
|
||||
"""
|
||||
BYOEH Processing Pipeline
|
||||
Handles chunking, embedding generation, and encryption for Erwartungshorizonte.
|
||||
|
||||
Supports multiple embedding backends:
|
||||
- local: sentence-transformers (default, no API key needed)
|
||||
- openai: OpenAI text-embedding-3-small (requires OPENAI_API_KEY)
|
||||
"""
|
||||
|
||||
import os
|
||||
import io
|
||||
import base64
|
||||
import hashlib
|
||||
from typing import List, Tuple, Optional
|
||||
from cryptography.hazmat.primitives.ciphers.aead import AESGCM
|
||||
from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC
|
||||
from cryptography.hazmat.primitives import hashes
|
||||
import httpx
|
||||
|
||||
# Embedding Configuration
|
||||
# Backend: "local" (sentence-transformers) or "openai"
|
||||
EMBEDDING_BACKEND = os.getenv("EMBEDDING_BACKEND", "local")
|
||||
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
|
||||
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text-embedding-3-small")
|
||||
|
||||
# Local embedding model (all-MiniLM-L6-v2: 384 dimensions, fast, good quality)
|
||||
LOCAL_EMBEDDING_MODEL = os.getenv("LOCAL_EMBEDDING_MODEL", "all-MiniLM-L6-v2")
|
||||
|
||||
# Vector dimensions per backend
|
||||
VECTOR_DIMENSIONS = {
|
||||
"local": 384, # all-MiniLM-L6-v2
|
||||
"openai": 1536, # text-embedding-3-small
|
||||
}
|
||||
|
||||
CHUNK_SIZE = int(os.getenv("BYOEH_CHUNK_SIZE", "1000"))
|
||||
CHUNK_OVERLAP = int(os.getenv("BYOEH_CHUNK_OVERLAP", "200"))
|
||||
|
||||
# Lazy-loaded sentence-transformers model
|
||||
_local_model = None
|
||||
|
||||
|
||||
class ChunkingError(Exception):
    """Raised when text chunking or extraction fails."""
|
||||
|
||||
|
||||
class EmbeddingError(Exception):
    """Raised when embedding generation fails."""
|
||||
|
||||
|
||||
class EncryptionError(Exception):
    """Raised when encryption or decryption fails."""
|
||||
|
||||
|
||||
def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
    """
    Split text into overlapping chunks.

    Uses a simple recursive character splitter approach:
    - Try to split on paragraph boundaries first
    - Then sentences
    - Then words
    - Finally characters

    Args:
        text: Input text to chunk
        chunk_size: Target chunk size in characters
        overlap: Overlap between chunks (should be smaller than chunk_size)

    Returns:
        List of text chunks
    """
    if not text or len(text) <= chunk_size:
        return [text] if text else []

    separators = ["\n\n", "\n", ". ", " ", ""]

    # Bug fix: with overlap >= chunk_size the hard-split stride below would be
    # <= 0 and range() would raise ValueError; clamp the stride to at least 1.
    hard_step = max(1, chunk_size - overlap)

    def split_recursive(text: str, sep_idx: int = 0) -> List[str]:
        # Already small enough — return as a single chunk.
        if len(text) <= chunk_size:
            return [text]

        if sep_idx >= len(separators):
            # Last resort: hard split at fixed character offsets.
            return [text[i:i+chunk_size] for i in range(0, len(text), hard_step)]

        sep = separators[sep_idx]
        if not sep:
            # Empty separator = character split
            parts = list(text)
        else:
            parts = text.split(sep)

        result = []
        current = ""

        # Greedily repack parts up to chunk_size, re-inserting the separator
        # between parts that land in the same chunk. NOTE(review): the
        # separator between two *different* chunks is dropped — presumably
        # acceptable for retrieval, but confirm if exact reconstruction matters.
        for part in parts:
            test_chunk = current + sep + part if current else part

            if len(test_chunk) <= chunk_size:
                current = test_chunk
            else:
                if current:
                    result.append(current)
                # If a single part is too big, recursively split it with the
                # next (finer-grained) separator.
                if len(part) > chunk_size:
                    result.extend(split_recursive(part, sep_idx + 1))
                    current = ""
                else:
                    current = part

        if current:
            result.append(current)

        return result

    raw_chunks = split_recursive(text)

    # Prepend the tail of the previous chunk so neighboring chunks share up
    # to `overlap` characters of context.
    final_chunks = []
    for i, chunk in enumerate(raw_chunks):
        if i > 0 and overlap > 0:
            prev_chunk = raw_chunks[i-1]
            overlap_text = prev_chunk[-min(overlap, len(prev_chunk)):]
            chunk = overlap_text + chunk
        final_chunks.append(chunk.strip())

    # Drop chunks that became empty after stripping.
    return [c for c in final_chunks if c]
|
||||
|
||||
|
||||
def get_vector_size() -> int:
    """Return the embedding dimensionality of the active backend (384 if unknown)."""
    try:
        return VECTOR_DIMENSIONS[EMBEDDING_BACKEND]
    except KeyError:
        # Unrecognized backend: fall back to the local model's dimension.
        return 384
|
||||
|
||||
|
||||
def _get_local_model():
    """Return the process-wide SentenceTransformer, loading it on first use.

    Raises:
        EmbeddingError: If sentence-transformers is not installed.
    """
    global _local_model

    # Fast path: model already loaded.
    if _local_model is not None:
        return _local_model

    try:
        from sentence_transformers import SentenceTransformer
        print(f"Loading local embedding model: {LOCAL_EMBEDDING_MODEL}")
        model = SentenceTransformer(LOCAL_EMBEDDING_MODEL)
        print(f"Model loaded successfully (dim={model.get_sentence_embedding_dimension()})")
    except ImportError:
        raise EmbeddingError(
            "sentence-transformers not installed. "
            "Install with: pip install sentence-transformers"
        )

    _local_model = model
    return _local_model
|
||||
|
||||
|
||||
def _generate_local_embeddings(texts: List[str]) -> List[List[float]]:
    """Embed *texts* with the local sentence-transformers model."""
    if not texts:
        return []

    model = _get_local_model()
    # Only show a progress bar for non-trivial batches.
    show_bar = len(texts) > 10
    vectors = model.encode(texts, show_progress_bar=show_bar)
    return [vector.tolist() for vector in vectors]
|
||||
|
||||
|
||||
async def _generate_openai_embeddings(texts: List[str]) -> List[List[float]]:
    """Generate embeddings using the OpenAI embeddings API.

    Args:
        texts: Text chunks to embed (result order matches input order).

    Returns:
        One embedding vector per input text.

    Raises:
        EmbeddingError: If the API key is missing, the request times out,
            the API returns a non-200 status, or any other failure occurs.
    """
    if not OPENAI_API_KEY:
        raise EmbeddingError("OPENAI_API_KEY not configured")

    try:
        async with httpx.AsyncClient() as client:
            response = await client.post(
                "https://api.openai.com/v1/embeddings",
                headers={
                    "Authorization": f"Bearer {OPENAI_API_KEY}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": EMBEDDING_MODEL,
                    "input": texts
                },
                timeout=60.0
            )

            if response.status_code != 200:
                raise EmbeddingError(f"OpenAI API error: {response.status_code} - {response.text}")

            data = response.json()
            embeddings = [item["embedding"] for item in data["data"]]
            return embeddings

    except httpx.TimeoutException:
        raise EmbeddingError("OpenAI API timeout")
    except EmbeddingError:
        # Bug fix: previously the generic handler below caught our own
        # EmbeddingError (e.g. the non-200 branch) and double-wrapped its
        # message; re-raise it unchanged instead.
        raise
    except Exception as e:
        # Chain the cause so the original traceback is preserved.
        raise EmbeddingError(f"Failed to generate embeddings: {str(e)}") from e
|
||||
|
||||
|
||||
async def generate_embeddings(texts: List[str]) -> List[List[float]]:
    """Embed a batch of texts via the configured backend.

    The backend is selected by EMBEDDING_BACKEND:
    - local: sentence-transformers (default, no API key needed)
    - openai: OpenAI text-embedding-3-small

    Args:
        texts: List of text chunks

    Returns:
        List of embedding vectors, one per input text.

    Raises:
        EmbeddingError: If embedding generation fails or the backend
            name is not recognized.
    """
    if not texts:
        return []

    if EMBEDDING_BACKEND == "openai":
        return await _generate_openai_embeddings(texts)
    if EMBEDDING_BACKEND == "local":
        # The local model runs synchronously, but it is fast enough to
        # call inline from async code.
        return _generate_local_embeddings(texts)

    raise EmbeddingError(f"Unknown embedding backend: {EMBEDDING_BACKEND}")
|
||||
|
||||
|
||||
async def generate_single_embedding(text: str) -> List[float]:
    """Embed a single text; returns an empty list if nothing was produced."""
    result = await generate_embeddings([text])
    if not result:
        return []
    return result[0]
|
||||
|
||||
|
||||
def derive_key(passphrase: str, salt: bytes) -> bytes:
    """Derive a 32-byte AES key from a passphrase via PBKDF2-HMAC-SHA256.

    Args:
        passphrase: User passphrase
        salt: Random salt (16 bytes)

    Returns:
        32-byte AES key
    """
    # 100k iterations of SHA-256; must match the parameters used when the
    # stored key hash was created, or verification will fail.
    kdf = PBKDF2HMAC(
        algorithm=hashes.SHA256(),
        salt=salt,
        length=32,
        iterations=100000,
    )
    return kdf.derive(passphrase.encode())
|
||||
|
||||
|
||||
def encrypt_text(text: str, passphrase: str, salt_hex: str) -> str:
    """Encrypt text with AES-256-GCM under a passphrase-derived key.

    Args:
        text: Plaintext to encrypt
        passphrase: User passphrase
        salt_hex: Salt as hex string

    Returns:
        Base64-encoded payload: 12-byte IV followed by the ciphertext.

    Raises:
        EncryptionError: If any step of the encryption fails.
    """
    try:
        key = derive_key(passphrase, bytes.fromhex(salt_hex))

        # 12-byte nonce is the standard size for GCM.
        nonce = os.urandom(12)
        sealed = AESGCM(key).encrypt(nonce, text.encode(), None)

        # Store the nonce in front so decrypt_text can recover it.
        payload = nonce + sealed
        return base64.b64encode(payload).decode()

    except Exception as e:
        raise EncryptionError(f"Encryption failed: {str(e)}")
|
||||
|
||||
|
||||
def decrypt_text(encrypted_b64: str, passphrase: str, salt_hex: str) -> str:
    """Decrypt an AES-256-GCM payload produced by encrypt_text.

    Args:
        encrypted_b64: Base64-encoded ciphertext (IV + ciphertext)
        passphrase: User passphrase
        salt_hex: Salt as hex string

    Returns:
        Decrypted plaintext

    Raises:
        EncryptionError: If decoding, key derivation, or GCM
            authentication fails (e.g. wrong passphrase).
    """
    try:
        key = derive_key(passphrase, bytes.fromhex(salt_hex))

        payload = base64.b64decode(encrypted_b64)
        # Layout: first 12 bytes are the nonce, the rest is ciphertext+tag.
        nonce, ciphertext = payload[:12], payload[12:]

        plaintext = AESGCM(key).decrypt(nonce, ciphertext, None)
        return plaintext.decode()

    except Exception as e:
        raise EncryptionError(f"Decryption failed: {str(e)}")
|
||||
|
||||
|
||||
def hash_key(passphrase: str, salt_hex: str) -> str:
    """Return the hex SHA-256 digest of the passphrase-derived key.

    Allows later passphrase verification without storing the key itself.

    Args:
        passphrase: User passphrase
        salt_hex: Salt as hex string

    Returns:
        Hex-encoded key hash
    """
    derived = derive_key(passphrase, bytes.fromhex(salt_hex))
    return hashlib.sha256(derived).hexdigest()
|
||||
|
||||
|
||||
def verify_key_hash(passphrase: str, salt_hex: str, expected_hash: str) -> bool:
    """Verify a passphrase against a previously stored key hash.

    Args:
        passphrase: User passphrase to verify
        salt_hex: Salt as hex string
        expected_hash: Expected key hash (hex SHA-256 of the derived key)

    Returns:
        True if passphrase is correct
    """
    import hmac  # local import keeps this security fix self-contained

    computed_hash = hash_key(passphrase, salt_hex)
    # Security fix: use a constant-time comparison. A plain `==` on the hex
    # digests short-circuits at the first differing character, leaking timing
    # information about how much of the hash matches.
    return hmac.compare_digest(computed_hash, expected_hash)
|
||||
|
||||
|
||||
def extract_text_from_pdf(pdf_content: bytes) -> str:
    """Extract the text of every page of a PDF.

    Args:
        pdf_content: Raw PDF bytes

    Returns:
        Page texts joined by blank lines; pages with no extractable
        text are skipped.

    Raises:
        ChunkingError: If PyPDF2 is unavailable or extraction fails.
    """
    try:
        import PyPDF2

        reader = PyPDF2.PdfReader(io.BytesIO(pdf_content))
        page_texts = [page.extract_text() for page in reader.pages]
        return "\n\n".join(t for t in page_texts if t)

    except ImportError:
        raise ChunkingError("PyPDF2 not installed")
    except Exception as e:
        raise ChunkingError(f"Failed to extract PDF text: {str(e)}")
|
||||
|
||||
|
||||
async def process_eh_for_indexing(
    eh_id: str,
    tenant_id: str,
    subject: str,
    text_content: str,
    passphrase: str,
    salt_hex: str
) -> Tuple[int, List[dict]]:
    """Full processing pipeline for Erwartungshorizont indexing.

    Pipeline: chunk the plaintext, embed every chunk, then re-encrypt the
    chunk contents so only ciphertext is stored alongside the vectors.

    Args:
        eh_id: Erwartungshorizont ID
        tenant_id: Tenant ID
        subject: Subject (deutsch, englisch, etc.)
        text_content: Decrypted text content
        passphrase: User passphrase for re-encryption
        salt_hex: Salt for encryption

    Returns:
        Tuple of (chunk_count, chunks_data) prepared for Qdrant.
    """
    # NOTE(review): eh_id/tenant_id/subject are not consumed here —
    # presumably the caller attaches them to the Qdrant points; confirm.

    # Step 1: chunk the text.
    chunks = chunk_text(text_content)
    if not chunks:
        return 0, []

    # Step 2: embed every chunk.
    embeddings = await generate_embeddings(chunks)

    # Step 3: encrypt each chunk and pair it with its embedding.
    prepared = [
        {
            "chunk_index": idx,
            "embedding": vector,
            "encrypted_content": encrypt_text(piece, passphrase, salt_hex),
        }
        for idx, (piece, vector) in enumerate(zip(chunks, embeddings))
    ]

    return len(chunks), prepared
|
||||
Reference in New Issue
Block a user