"""
BYOEH Processing Pipeline

Handles chunking, embedding generation, and encryption for
Erwartungshorizonte.

Supports multiple embedding backends:
- local: sentence-transformers (default, no API key needed)
- openai: OpenAI text-embedding-3-small (requires OPENAI_API_KEY)
"""

import os
import io
import base64
import hashlib
import hmac
from typing import List, Tuple, Optional

from cryptography.hazmat.primitives.ciphers.aead import AESGCM
from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC
from cryptography.hazmat.primitives import hashes
import httpx

# Embedding Configuration
# Backend: "local" (sentence-transformers) or "openai"
EMBEDDING_BACKEND = os.getenv("EMBEDDING_BACKEND", "local")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text-embedding-3-small")

# Local embedding model (all-MiniLM-L6-v2: 384 dimensions, fast, good quality)
LOCAL_EMBEDDING_MODEL = os.getenv("LOCAL_EMBEDDING_MODEL", "all-MiniLM-L6-v2")

# Vector dimensions per backend
VECTOR_DIMENSIONS = {
    "local": 384,  # all-MiniLM-L6-v2
    "openai": 1536,  # text-embedding-3-small
}

CHUNK_SIZE = int(os.getenv("BYOEH_CHUNK_SIZE", "1000"))
CHUNK_OVERLAP = int(os.getenv("BYOEH_CHUNK_OVERLAP", "200"))

# Lazy-loaded sentence-transformers model
_local_model = None

# Separators tried in order when splitting: paragraphs, lines, sentences,
# words. When none of them applies the text is sliced at fixed width.
_SEPARATORS = ("\n\n", "\n", ". ", " ")


class ChunkingError(Exception):
    """Error during text chunking."""


class EmbeddingError(Exception):
    """Error during embedding generation."""


class EncryptionError(Exception):
    """Error during encryption/decryption."""


def _split_recursive(
    text: str, chunk_size: int, sep_idx: int = 0
) -> List[Tuple[str, str]]:
    """
    Split text into pieces of at most ``chunk_size`` characters.

    Args:
        text: Text to split
        chunk_size: Maximum piece length (must be >= 1)
        sep_idx: Index into ``_SEPARATORS`` of the separator to try first

    Returns:
        List of ``(glue, piece)`` pairs in source order. ``glue`` is the
        separator that stood directly before ``piece`` in the input ("" for
        the first piece and for fixed-width slices), so callers can re-join
        neighbouring pieces without gluing words together.
    """
    if len(text) <= chunk_size:
        return [("", text)]

    if sep_idx >= len(_SEPARATORS):
        # No separator left: slice at fixed width.
        return [
            ("", text[start:start + chunk_size])
            for start in range(0, len(text), chunk_size)
        ]

    sep = _SEPARATORS[sep_idx]
    pieces: List[Tuple[str, str]] = []
    current = ""
    current_glue = ""

    for idx, part in enumerate(text.split(sep)):
        glue = sep if idx else ""
        candidate = current + sep + part if current else part

        if len(candidate) <= chunk_size:
            if not current:
                current_glue = glue
            current = candidate
            continue

        if current:
            pieces.append((current_glue, current))

        if len(part) > chunk_size:
            # A single part is too big: retry with the next finer separator.
            nested = _split_recursive(part, chunk_size, sep_idx + 1)
            if nested:
                # The first nested piece follows this level's separator.
                nested[0] = (glue, nested[0][1])
            pieces.extend(nested)
            current = ""
            current_glue = ""
        else:
            current = part
            current_glue = glue

    if current:
        pieces.append((current_glue, current))

    return pieces


def chunk_text(
    text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP
) -> List[str]:
    """
    Split text into overlapping chunks.

    Uses a simple recursive character splitter approach:
    - Try to split on paragraph boundaries first
    - Then lines and sentences
    - Then words
    - Finally fixed-width character slices

    Every chunk after the first is prefixed with the last ``overlap``
    characters of the preceding piece plus the separator that stood between
    them, so a returned chunk can be up to ``chunk_size + overlap`` characters
    plus that separator. Chunks are stripped of surrounding whitespace and
    empty chunks are dropped.

    Args:
        text: Input text to chunk
        chunk_size: Target chunk size in characters (must be >= 1)
        overlap: Overlap between chunks; values <= 0 disable the overlap

    Returns:
        List of text chunks

    Raises:
        ChunkingError: If chunk_size is smaller than 1
    """
    if not text:
        return []

    if chunk_size < 1:
        raise ChunkingError(f"chunk_size must be >= 1, got {chunk_size}")

    if len(text) <= chunk_size:
        stripped = text.strip()
        return [stripped] if stripped else []

    chunks: List[str] = []
    previous = ""
    for glue, piece in _split_recursive(text, chunk_size):
        if previous and overlap > 0:
            # Re-insert the separator consumed by the split so the overlap
            # tail and the new piece are not fused into one word.
            piece_with_overlap = previous[-overlap:] + glue + piece
        else:
            piece_with_overlap = piece

        stripped = piece_with_overlap.strip()
        if stripped:
            chunks.append(stripped)
        previous = piece

    return chunks


def get_vector_size() -> int:
    """Get the vector dimension for the current embedding backend."""
    return VECTOR_DIMENSIONS.get(EMBEDDING_BACKEND, 384)


def _get_local_model():
    """
    Lazy-load the sentence-transformers model.

    Returns:
        The cached SentenceTransformer instance

    Raises:
        EmbeddingError: If sentence-transformers is not installed
    """
    global _local_model
    if _local_model is None:
        try:
            from sentence_transformers import SentenceTransformer
            print(f"Loading local embedding model: {LOCAL_EMBEDDING_MODEL}")
            _local_model = SentenceTransformer(LOCAL_EMBEDDING_MODEL)
            print(f"Model loaded successfully (dim={_local_model.get_sentence_embedding_dimension()})")
        except ImportError as exc:
            raise EmbeddingError(
                "sentence-transformers not installed. "
                "Install with: pip install sentence-transformers"
            ) from exc
    return _local_model


def _generate_local_embeddings(texts: List[str]) -> List[List[float]]:
    """Generate embeddings using local sentence-transformers model."""
    if not texts:
        return []

    model = _get_local_model()
    embeddings = model.encode(texts, show_progress_bar=len(texts) > 10)
    return [emb.tolist() for emb in embeddings]


async def _generate_openai_embeddings(texts: List[str]) -> List[List[float]]:
    """
    Generate embeddings using OpenAI API.

    Args:
        texts: List of text chunks

    Returns:
        List of embedding vectors, ordered like ``texts``

    Raises:
        EmbeddingError: If the API key is missing, the request times out,
            the API answers with a non-200 status, or the response cannot
            be parsed
    """
    if not texts:
        return []

    if not OPENAI_API_KEY:
        raise EmbeddingError("OPENAI_API_KEY not configured")

    try:
        async with httpx.AsyncClient() as client:
            response = await client.post(
                "https://api.openai.com/v1/embeddings",
                headers={
                    "Authorization": f"Bearer {OPENAI_API_KEY}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": EMBEDDING_MODEL,
                    "input": texts
                },
                timeout=60.0
            )

        if response.status_code != 200:
            raise EmbeddingError(f"OpenAI API error: {response.status_code} - {response.text}")

        data = response.json()
        # Order by the per-item "index" so vectors line up with `texts`;
        # the sort is stable, so items without an index keep response order.
        items = sorted(data["data"], key=lambda item: item.get("index", 0))
        return [item["embedding"] for item in items]

    except EmbeddingError:
        # Already a precise error (e.g. non-200 status); do not re-wrap it.
        raise
    except httpx.TimeoutException as exc:
        raise EmbeddingError("OpenAI API timeout") from exc
    except Exception as exc:
        raise EmbeddingError(f"Failed to generate embeddings: {exc}") from exc


async def generate_embeddings(texts: List[str]) -> List[List[float]]:
    """
    Generate embeddings using configured backend.

    Backends:
    - local: sentence-transformers (default, no API key needed)
    - openai: OpenAI text-embedding-3-small

    Args:
        texts: List of text chunks

    Returns:
        List of embedding vectors

    Raises:
        EmbeddingError: If embedding generation fails
    """
    if not texts:
        return []

    if EMBEDDING_BACKEND == "local":
        # Local model runs synchronously but is fast.
        # NOTE(review): this call is not awaited, so it blocks the event
        # loop while encoding - confirm that is acceptable for large inputs.
        return _generate_local_embeddings(texts)
    elif EMBEDDING_BACKEND == "openai":
        return await _generate_openai_embeddings(texts)
    else:
        raise EmbeddingError(f"Unknown embedding backend: {EMBEDDING_BACKEND}")


async def generate_single_embedding(text: str) -> List[float]:
    """Generate embedding for a single text; returns [] if none was produced."""
    embeddings = await generate_embeddings([text])
    return embeddings[0] if embeddings else []


def derive_key(passphrase: str, salt: bytes) -> bytes:
    """
    Derive encryption key from passphrase using PBKDF2.

    Args:
        passphrase: User passphrase
        salt: Random salt (16 bytes)

    Returns:
        32-byte AES key
    """
    # The parameters below determine the derived key. Changing any of them
    # makes existing ciphertexts undecryptable and stored key hashes
    # unverifiable.
    kdf = PBKDF2HMAC(
        algorithm=hashes.SHA256(),
        length=32,
        salt=salt,
        iterations=100000,
    )
    return kdf.derive(passphrase.encode("utf-8"))


def _encrypt_with_key(text: str, key: bytes) -> str:
    """Encrypt text with an already-derived AES key; returns base64(IV + ciphertext)."""
    iv = os.urandom(12)  # fresh 12-byte IV per message
    ciphertext = AESGCM(key).encrypt(iv, text.encode("utf-8"), None)
    return base64.b64encode(iv + ciphertext).decode()


def encrypt_text(text: str, passphrase: str, salt_hex: str) -> str:
    """
    Encrypt text using AES-256-GCM.

    Args:
        text: Plaintext to encrypt
        passphrase: User passphrase
        salt_hex: Salt as hex string

    Returns:
        Base64-encoded ciphertext (IV + ciphertext)

    Raises:
        EncryptionError: If key derivation or encryption fails
    """
    try:
        key = derive_key(passphrase, bytes.fromhex(salt_hex))
        return _encrypt_with_key(text, key)
    except Exception as exc:
        raise EncryptionError(f"Encryption failed: {exc}") from exc


def decrypt_text(encrypted_b64: str, passphrase: str, salt_hex: str) -> str:
    """
    Decrypt text using AES-256-GCM.

    Args:
        encrypted_b64: Base64-encoded ciphertext (IV + ciphertext)
        passphrase: User passphrase
        salt_hex: Salt as hex string

    Returns:
        Decrypted plaintext

    Raises:
        EncryptionError: If decoding, key derivation or decryption fails
    """
    try:
        key = derive_key(passphrase, bytes.fromhex(salt_hex))

        combined = base64.b64decode(encrypted_b64)
        # Layout written by encrypt_text: 12-byte IV, then the ciphertext.
        iv, ciphertext = combined[:12], combined[12:]

        plaintext = AESGCM(key).decrypt(iv, ciphertext, None)
        return plaintext.decode("utf-8")
    except Exception as exc:
        raise EncryptionError(f"Decryption failed: {exc}") from exc


def hash_key(passphrase: str, salt_hex: str) -> str:
    """
    Create SHA-256 hash of derived key for verification.

    Args:
        passphrase: User passphrase
        salt_hex: Salt as hex string

    Returns:
        Hex-encoded key hash
    """
    salt = bytes.fromhex(salt_hex)
    key = derive_key(passphrase, salt)
    return hashlib.sha256(key).hexdigest()


def verify_key_hash(passphrase: str, salt_hex: str, expected_hash: str) -> bool:
    """
    Verify passphrase matches stored key hash.

    Args:
        passphrase: User passphrase to verify
        salt_hex: Salt as hex string
        expected_hash: Expected key hash

    Returns:
        True if passphrase is correct
    """
    if not isinstance(expected_hash, str):
        return False

    computed_hash = hash_key(passphrase, salt_hex)
    # Constant-time comparison; bytes are used because compare_digest
    # rejects non-ASCII str arguments.
    return hmac.compare_digest(
        computed_hash.encode("utf-8"), expected_hash.encode("utf-8")
    )


def extract_text_from_pdf(pdf_content: bytes) -> str:
    """
    Extract text from PDF file.

    Args:
        pdf_content: Raw PDF bytes

    Returns:
        Extracted text; pages are joined by a blank line and pages without
        extractable text are skipped

    Raises:
        ChunkingError: If PyPDF2 is not installed or the PDF cannot be read
    """
    # Import separately so an ImportError raised while parsing is not
    # misreported as "PyPDF2 not installed".
    try:
        import PyPDF2
    except ImportError as exc:
        raise ChunkingError("PyPDF2 not installed") from exc

    try:
        reader = PyPDF2.PdfReader(io.BytesIO(pdf_content))
        page_texts = (page.extract_text() for page in reader.pages)
        return "\n\n".join(text for text in page_texts if text)
    except Exception as exc:
        raise ChunkingError(f"Failed to extract PDF text: {exc}") from exc


async def process_eh_for_indexing(
    eh_id: str,
    tenant_id: str,
    subject: str,
    text_content: str,
    passphrase: str,
    salt_hex: str
) -> Tuple[int, List[dict]]:
    """
    Full processing pipeline for Erwartungshorizont indexing.

    1. Chunk the text
    2. Generate embeddings
    3. Encrypt chunks
    4. Return prepared data for Qdrant

    Args:
        eh_id: Erwartungshorizont ID (not used by this function)
        tenant_id: Tenant ID (not used by this function)
        subject: Subject (deutsch, englisch, etc.; not used by this function)
        text_content: Decrypted text content
        passphrase: User passphrase for re-encryption
        salt_hex: Salt for encryption

    Returns:
        Tuple of (chunk_count, chunks_data); each entry of chunks_data has
        the keys "chunk_index", "embedding" and "encrypted_content"

    Raises:
        ChunkingError: If chunking fails
        EmbeddingError: If embedding generation fails or returns a different
            number of vectors than chunks
        EncryptionError: If key derivation or encryption fails
    """
    # 1. Chunk the text
    chunks = chunk_text(text_content)

    if not chunks:
        return 0, []

    # 2. Generate embeddings
    embeddings = await generate_embeddings(chunks)
    if len(embeddings) != len(chunks):
        # zip() would silently drop chunks while the full count is reported.
        raise EmbeddingError(
            f"Expected {len(chunks)} embeddings, got {len(embeddings)}"
        )

    # 3. Encrypt chunks for storage. The key is derived once, not per chunk:
    # PBKDF2 is deliberately slow and its inputs are the same for all chunks.
    try:
        key = derive_key(passphrase, bytes.fromhex(salt_hex))
        encrypted_contents = [_encrypt_with_key(chunk, key) for chunk in chunks]
    except Exception as exc:
        raise EncryptionError(f"Encryption failed: {exc}") from exc

    encrypted_chunks = [
        {
            "chunk_index": index,
            "embedding": embedding,
            "encrypted_content": encrypted_content
        }
        for index, (embedding, encrypted_content)
        in enumerate(zip(embeddings, encrypted_contents))
    ]

    return len(chunks), encrypted_chunks