Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website, Klausur-Service, School-Service, Voice-Service, Geo-Service, BreakPilot Drive, Agent-Core Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
421 lines
12 KiB
Python
421 lines
12 KiB
Python
"""
|
|
BYOEH Processing Pipeline
|
|
Handles chunking, embedding generation, and encryption for Erwartungshorizonte.
|
|
|
|
Supports multiple embedding backends:
|
|
- local: sentence-transformers (default, no API key needed)
|
|
- openai: OpenAI text-embedding-3-small (requires OPENAI_API_KEY)
|
|
"""
|
|
|
|
import base64
import hashlib
import hmac
import io
import os
from typing import List, Optional, Tuple

import httpx
from cryptography.hazmat.primitives import hashes
from cryptography.hazmat.primitives.ciphers.aead import AESGCM
from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC
|
|
|
|
# ---------------------------------------------------------------------------
# Embedding configuration
# ---------------------------------------------------------------------------
# Backend selector: "local" (sentence-transformers) or "openai".
EMBEDDING_BACKEND = os.getenv("EMBEDDING_BACKEND", "local")
# Only required when EMBEDDING_BACKEND == "openai".
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
# OpenAI embedding model name (ignored by the local backend).
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text-embedding-3-small")

# Local embedding model (all-MiniLM-L6-v2: 384 dimensions, fast, good quality)
LOCAL_EMBEDDING_MODEL = os.getenv("LOCAL_EMBEDDING_MODEL", "all-MiniLM-L6-v2")

# Vector dimensions per backend — must match what the chosen model emits,
# since the Qdrant collection is created with this size.
VECTOR_DIMENSIONS = {
    "local": 384,    # all-MiniLM-L6-v2
    "openai": 1536,  # text-embedding-3-small
}

# Chunking parameters (characters, not tokens).
CHUNK_SIZE = int(os.getenv("BYOEH_CHUNK_SIZE", "1000"))
CHUNK_OVERLAP = int(os.getenv("BYOEH_CHUNK_OVERLAP", "200"))

# Lazy-loaded sentence-transformers model (populated by _get_local_model).
_local_model = None
|
|
|
|
|
|
class ChunkingError(Exception):
    """Raised when text chunking or extraction fails."""
|
|
|
|
|
|
class EmbeddingError(Exception):
    """Raised when embedding generation fails."""
|
|
|
|
|
|
class EncryptionError(Exception):
    """Raised when encryption or decryption fails."""
|
|
|
|
|
|
def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
    """
    Split text into overlapping chunks.

    Uses a simple recursive character splitter approach:
    - Try to split on paragraph boundaries first
    - Then lines, sentences, words
    - Finally characters

    Args:
        text: Input text to chunk
        chunk_size: Target chunk size in characters
        overlap: Overlap (in characters) carried over from the previous chunk

    Returns:
        List of non-empty, stripped text chunks
    """
    # Short-circuit: empty input -> no chunks; short input -> single chunk.
    if not text or len(text) <= chunk_size:
        return [text] if text else []

    # Separators ordered coarse -> fine; "" means per-character split.
    separators = ["\n\n", "\n", ". ", " ", ""]

    def split_recursive(segment: str, sep_idx: int = 0) -> List[str]:
        if len(segment) <= chunk_size:
            return [segment]

        if sep_idx >= len(separators):
            # Last resort: hard split. Guard the step so a degenerate
            # configuration (overlap >= chunk_size) cannot make range()
            # raise ValueError on a non-positive step.
            step = max(1, chunk_size - overlap)
            return [segment[i:i + chunk_size] for i in range(0, len(segment), step)]

        sep = separators[sep_idx]
        # Empty separator means character-level split.
        parts = list(segment) if not sep else segment.split(sep)

        result = []
        current = ""

        for part in parts:
            test_chunk = current + sep + part if current else part

            if len(test_chunk) <= chunk_size:
                current = test_chunk
            else:
                if current:
                    result.append(current)
                if len(part) > chunk_size:
                    # Single part is still too big: recurse with the next,
                    # finer-grained separator.
                    result.extend(split_recursive(part, sep_idx + 1))
                    current = ""
                else:
                    current = part

        if current:
            result.append(current)

        return result

    raw_chunks = split_recursive(text)

    # Prepend the tail of the previous chunk to each chunk so neighbouring
    # chunks share context. Note: this can push a chunk past chunk_size.
    final_chunks = []
    for i, chunk in enumerate(raw_chunks):
        if i > 0 and overlap > 0:
            prev_chunk = raw_chunks[i - 1]
            overlap_text = prev_chunk[-min(overlap, len(prev_chunk)):]
            chunk = overlap_text + chunk
        final_chunks.append(chunk.strip())

    return [c for c in final_chunks if c]
|
|
|
|
|
|
def get_vector_size() -> int:
    """Return the embedding dimension for the active backend (384 if unknown)."""
    try:
        return VECTOR_DIMENSIONS[EMBEDDING_BACKEND]
    except KeyError:
        return 384
|
|
|
|
|
|
def _get_local_model():
    """Lazy-load and cache the sentence-transformers model.

    Returns:
        The SentenceTransformer instance, cached in the module-global
        ``_local_model`` after the first call.

    Raises:
        EmbeddingError: If sentence-transformers is not installed.
    """
    global _local_model
    if _local_model is None:
        # Keep the try body minimal: only the import can raise ImportError.
        try:
            from sentence_transformers import SentenceTransformer
        except ImportError as exc:
            # Chain the original error so the root cause stays visible.
            raise EmbeddingError(
                "sentence-transformers not installed. "
                "Install with: pip install sentence-transformers"
            ) from exc
        print(f"Loading local embedding model: {LOCAL_EMBEDDING_MODEL}")
        _local_model = SentenceTransformer(LOCAL_EMBEDDING_MODEL)
        print(f"Model loaded successfully (dim={_local_model.get_sentence_embedding_dimension()})")
    return _local_model
|
|
|
|
|
|
def _generate_local_embeddings(texts: List[str]) -> List[List[float]]:
    """Embed *texts* with the cached local sentence-transformers model."""
    if not texts:
        return []

    model = _get_local_model()
    # Only show a progress bar for larger batches.
    vectors = model.encode(texts, show_progress_bar=len(texts) > 10)
    return [vec.tolist() for vec in vectors]
|
|
|
|
|
|
async def _generate_openai_embeddings(texts: List[str]) -> List[List[float]]:
    """Generate embeddings using the OpenAI embeddings API.

    Args:
        texts: List of text chunks to embed.

    Returns:
        One embedding vector per input text, in order.

    Raises:
        EmbeddingError: If the API key is missing, the request times out,
            the API returns a non-200 status, or any other failure occurs.
    """
    if not OPENAI_API_KEY:
        raise EmbeddingError("OPENAI_API_KEY not configured")

    try:
        async with httpx.AsyncClient() as client:
            response = await client.post(
                "https://api.openai.com/v1/embeddings",
                headers={
                    "Authorization": f"Bearer {OPENAI_API_KEY}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": EMBEDDING_MODEL,
                    "input": texts
                },
                timeout=60.0
            )

        if response.status_code != 200:
            raise EmbeddingError(f"OpenAI API error: {response.status_code} - {response.text}")

        data = response.json()
        embeddings = [item["embedding"] for item in data["data"]]
        return embeddings

    except httpx.TimeoutException as exc:
        raise EmbeddingError("OpenAI API timeout") from exc
    except EmbeddingError:
        # Fix: don't re-wrap our own EmbeddingError (e.g. the non-200 case)
        # in the generic "Failed to generate embeddings" message below.
        raise
    except Exception as exc:
        raise EmbeddingError(f"Failed to generate embeddings: {str(exc)}") from exc
|
|
|
|
|
|
async def generate_embeddings(texts: List[str]) -> List[List[float]]:
    """
    Generate embeddings using the configured backend.

    Backends:
    - local: sentence-transformers (default, no API key needed)
    - openai: OpenAI text-embedding-3-small

    Args:
        texts: List of text chunks

    Returns:
        List of embedding vectors, one per input text

    Raises:
        EmbeddingError: If embedding generation fails or the backend is unknown
    """
    if not texts:
        return []

    if EMBEDDING_BACKEND == "openai":
        return await _generate_openai_embeddings(texts)

    if EMBEDDING_BACKEND == "local":
        # The local model runs synchronously but is fast enough to call inline.
        return _generate_local_embeddings(texts)

    raise EmbeddingError(f"Unknown embedding backend: {EMBEDDING_BACKEND}")
|
|
|
|
|
|
async def generate_single_embedding(text: str) -> List[float]:
    """Generate an embedding for one text; returns [] if nothing was produced."""
    result = await generate_embeddings([text])
    if not result:
        return []
    return result[0]
|
|
|
|
|
|
def derive_key(passphrase: str, salt: bytes) -> bytes:
    """
    Derive an encryption key from a passphrase using PBKDF2-HMAC-SHA256.

    Args:
        passphrase: User passphrase
        salt: Random salt (16 bytes)

    Returns:
        32-byte AES key
    """
    # NOTE(review): iteration count is fixed at 100k; raising it would
    # invalidate previously derived keys/hashes, so keep it in sync with
    # whatever is already stored.
    kdf = PBKDF2HMAC(
        algorithm=hashes.SHA256(),
        length=32,
        salt=salt,
        iterations=100000,
    )
    return kdf.derive(passphrase.encode())
|
|
|
|
|
|
def encrypt_text(text: str, passphrase: str, salt_hex: str) -> str:
    """
    Encrypt text using AES-256-GCM.

    A fresh 12-byte IV is generated per call and prepended to the
    ciphertext before base64 encoding, so each call produces different
    output for the same input.

    Args:
        text: Plaintext to encrypt
        passphrase: User passphrase
        salt_hex: Salt as hex string (used for key derivation)

    Returns:
        Base64-encoded ciphertext (IV + ciphertext)

    Raises:
        EncryptionError: If key derivation or encryption fails
    """
    try:
        salt = bytes.fromhex(salt_hex)
        key = derive_key(passphrase, salt)

        aesgcm = AESGCM(key)
        iv = os.urandom(12)  # 96-bit nonce, the recommended size for GCM

        ciphertext = aesgcm.encrypt(iv, text.encode(), None)

        # Combine IV + ciphertext so decrypt_text can recover the IV.
        combined = iv + ciphertext
        return base64.b64encode(combined).decode()

    except Exception as e:
        # Chain the cause so debugging retains the original traceback.
        raise EncryptionError(f"Encryption failed: {str(e)}") from e
|
|
|
|
|
|
def decrypt_text(encrypted_b64: str, passphrase: str, salt_hex: str) -> str:
    """
    Decrypt text using AES-256-GCM.

    Args:
        encrypted_b64: Base64-encoded ciphertext (12-byte IV + ciphertext)
        passphrase: User passphrase
        salt_hex: Salt as hex string (used for key derivation)

    Returns:
        Decrypted plaintext

    Raises:
        EncryptionError: If decryption fails — including when the
            passphrase is wrong (GCM tag verification failure).
    """
    try:
        salt = bytes.fromhex(salt_hex)
        key = derive_key(passphrase, salt)

        combined = base64.b64decode(encrypted_b64)
        # Layout written by encrypt_text: first 12 bytes are the IV.
        iv = combined[:12]
        ciphertext = combined[12:]

        aesgcm = AESGCM(key)
        plaintext = aesgcm.decrypt(iv, ciphertext, None)

        return plaintext.decode()

    except Exception as e:
        # Chain the cause (e.g. InvalidTag for a wrong passphrase).
        raise EncryptionError(f"Decryption failed: {str(e)}") from e
|
|
|
|
|
|
def hash_key(passphrase: str, salt_hex: str) -> str:
    """
    Create a SHA-256 hash of the derived key for passphrase verification.

    Args:
        passphrase: User passphrase
        salt_hex: Salt as hex string

    Returns:
        Hex-encoded key hash
    """
    derived = derive_key(passphrase, bytes.fromhex(salt_hex))
    return hashlib.sha256(derived).hexdigest()
|
|
|
|
|
|
def verify_key_hash(passphrase: str, salt_hex: str, expected_hash: str) -> bool:
    """
    Verify that a passphrase matches a stored key hash.

    Args:
        passphrase: User passphrase to verify
        salt_hex: Salt as hex string
        expected_hash: Expected key hash (hex)

    Returns:
        True if the passphrase is correct
    """
    computed_hash = hash_key(passphrase, salt_hex)
    # Constant-time comparison: plain `==` short-circuits on the first
    # differing byte and can leak hash prefixes via timing.
    return hmac.compare_digest(computed_hash, expected_hash)
|
|
|
|
|
|
def extract_text_from_pdf(pdf_content: bytes) -> str:
    """
    Extract text from a PDF file.

    Pages that yield no text (e.g. image-only scans) are skipped; the
    remaining page texts are joined with blank lines.

    Args:
        pdf_content: Raw PDF bytes

    Returns:
        Extracted text

    Raises:
        ChunkingError: If PyPDF2 is missing or extraction fails
    """
    # Keep the import in its own try so ImportError is not conflated with
    # parse failures below.
    try:
        import PyPDF2
    except ImportError as exc:
        raise ChunkingError("PyPDF2 not installed") from exc

    try:
        pdf_file = io.BytesIO(pdf_content)
        reader = PyPDF2.PdfReader(pdf_file)

        text_parts = []
        for page in reader.pages:
            text = page.extract_text()
            if text:
                text_parts.append(text)

        return "\n\n".join(text_parts)

    except Exception as e:
        raise ChunkingError(f"Failed to extract PDF text: {str(e)}") from e
|
|
|
|
|
|
async def process_eh_for_indexing(
    eh_id: str,
    tenant_id: str,
    subject: str,
    text_content: str,
    passphrase: str,
    salt_hex: str
) -> Tuple[int, List[dict]]:
    """
    Full processing pipeline for Erwartungshorizont indexing.

    Steps:
        1. Chunk the text
        2. Generate embeddings
        3. Encrypt chunks
        4. Return prepared data for Qdrant

    Args:
        eh_id: Erwartungshorizont ID
        tenant_id: Tenant ID
        subject: Subject (deutsch, englisch, etc.)
        text_content: Decrypted text content
        passphrase: User passphrase for re-encryption
        salt_hex: Salt for encryption

    Returns:
        Tuple of (chunk_count, chunks_data) where each entry carries the
        chunk index, its embedding, and the encrypted chunk content
    """
    # Step 1: split the plaintext into overlapping chunks.
    chunks = chunk_text(text_content)
    if not chunks:
        return 0, []

    # Step 2: embed every chunk with the configured backend.
    embeddings = await generate_embeddings(chunks)

    # Steps 3+4: re-encrypt each chunk and pair it with its vector.
    encrypted_chunks = [
        {
            "chunk_index": idx,
            "embedding": vector,
            "encrypted_content": encrypt_text(piece, passphrase, salt_hex),
        }
        for idx, (piece, vector) in enumerate(zip(chunks, embeddings))
    ]

    return len(chunks), encrypted_chunks
|