"""
Embedding Service Configuration

Environment variables for embedding generation, re-ranking, and PDF extraction.

All settings are read once, at import time, from the process environment.
Integer settings are parsed with ``int()``, so a non-numeric value raises
``ValueError`` when this module is imported.
"""

import os

# =============================================================================
# Embedding Configuration
# =============================================================================

# Backend: "local" (sentence-transformers) or "openai"
EMBEDDING_BACKEND = os.getenv("EMBEDDING_BACKEND", "local")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
OPENAI_EMBEDDING_MODEL = os.getenv("OPENAI_EMBEDDING_MODEL", "text-embedding-3-small")

# Local embedding model
# Recommended: BAAI/bge-m3 (MIT, 1024 dim, multilingual)
LOCAL_EMBEDDING_MODEL = os.getenv("LOCAL_EMBEDDING_MODEL", "BAAI/bge-m3")

# Chunking configuration
CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", "1000"))
CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", "200"))
CHUNKING_STRATEGY = os.getenv("CHUNKING_STRATEGY", "semantic")

# =============================================================================
# Re-Ranker Configuration
# =============================================================================

# Backend: "local" (sentence-transformers CrossEncoder) or "cohere"
RERANKER_BACKEND = os.getenv("RERANKER_BACKEND", "local")
COHERE_API_KEY = os.getenv("COHERE_API_KEY", "")

# Local re-ranker model
# Recommended: BAAI/bge-reranker-v2-m3 (Apache 2.0, multilingual)
LOCAL_RERANKER_MODEL = os.getenv("LOCAL_RERANKER_MODEL", "BAAI/bge-reranker-v2-m3")

# =============================================================================
# PDF Extraction Configuration
# =============================================================================

# Backend: "auto", "unstructured", "pypdf"
PDF_EXTRACTION_BACKEND = os.getenv("PDF_EXTRACTION_BACKEND", "auto")
UNSTRUCTURED_API_KEY = os.getenv("UNSTRUCTURED_API_KEY", "")
UNSTRUCTURED_API_URL = os.getenv("UNSTRUCTURED_API_URL", "")

# =============================================================================
# Service Configuration
# =============================================================================

SERVICE_PORT = int(os.getenv("EMBEDDING_SERVICE_PORT", "8087"))
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")

# Dimension returned when a model name cannot be resolved from the table below.
DEFAULT_DIMENSIONS = 384

# Model dimensions lookup
MODEL_DIMENSIONS = {
    # Multilingual / German-optimized
    "BAAI/bge-m3": 1024,
    "deepset/mxbai-embed-de-large-v1": 1024,
    "jinaai/jina-embeddings-v2-base-de": 768,
    "intfloat/multilingual-e5-large": 1024,
    # English-focused (smaller, faster)
    "all-MiniLM-L6-v2": 384,
    "all-mpnet-base-v2": 768,
    # OpenAI
    "text-embedding-3-small": 1536,
    "text-embedding-3-large": 3072,
}


def get_model_dimensions(model_name: str) -> int:
    """Get embedding dimensions for a model.

    Resolution order:

    1. Exact match against ``MODEL_DIMENSIONS``.
    2. Substring match in either direction (a known key contained in
       ``model_name``, or ``model_name`` contained in a known key). Keys are
       tried in the table's insertion order and the first hit wins.
    3. ``DEFAULT_DIMENSIONS`` when nothing matches.

    Args:
        model_name: Model identifier, e.g. ``"BAAI/bge-m3"``.

    Returns:
        The embedding dimension for the model, or ``DEFAULT_DIMENSIONS`` if
        the name is empty or unknown.
    """
    # An empty string is a substring of every key, so without this guard the
    # fuzzy match below would return the first table entry instead of the
    # default. The same check also covers a ``None`` name.
    if not model_name:
        return DEFAULT_DIMENSIONS

    exact = MODEL_DIMENSIONS.get(model_name)
    if exact is not None:
        return exact

    # Fuzzy match: tolerate a missing or extra organisation prefix or suffix.
    for key, dim in MODEL_DIMENSIONS.items():
        if key in model_name or model_name in key:
            return dim

    return DEFAULT_DIMENSIONS


def get_current_dimensions() -> int:
    """Get dimensions for the currently configured model.

    Uses ``LOCAL_EMBEDDING_MODEL`` when ``EMBEDDING_BACKEND`` is ``"local"``;
    any other backend value resolves ``OPENAI_EMBEDDING_MODEL``.
    """
    if EMBEDDING_BACKEND == "local":
        return get_model_dimensions(LOCAL_EMBEDDING_MODEL)
    return get_model_dimensions(OPENAI_EMBEDDING_MODEL)