This repository has been archived on 2026-02-15. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
breakpilot-pwa/klausur-service/embedding-service/config.py
Benjamin Admin 21a844cb8a fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.

This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 09:51:32 +01:00

87 lines
3.1 KiB
Python

"""
Embedding Service Configuration
Environment variables for embedding generation, re-ranking, and PDF extraction.
"""
import os
# =============================================================================
# Embedding Configuration
# =============================================================================
# Selects the embedding backend: "local" (sentence-transformers) or "openai".
EMBEDDING_BACKEND = os.environ.get("EMBEDDING_BACKEND", "local")

# OpenAI backend: API key (empty string when unset) and model name.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
OPENAI_EMBEDDING_MODEL = os.environ.get(
    "OPENAI_EMBEDDING_MODEL", "text-embedding-3-small"
)

# Local backend: model name.
# Recommended: BAAI/bge-m3 (MIT, 1024 dim, multilingual).
LOCAL_EMBEDDING_MODEL = os.environ.get("LOCAL_EMBEDDING_MODEL", "BAAI/bge-m3")

# Chunking. The two numeric settings are parsed with int() at import time, so
# a non-integer environment value raises ValueError when this module loads.
CHUNK_SIZE = int(os.environ.get("CHUNK_SIZE", "1000"))
CHUNK_OVERLAP = int(os.environ.get("CHUNK_OVERLAP", "200"))
CHUNKING_STRATEGY = os.environ.get("CHUNKING_STRATEGY", "semantic")
# =============================================================================
# Re-Ranker Configuration
# =============================================================================
# Selects the re-ranking backend: "local" (sentence-transformers CrossEncoder)
# or "cohere".
RERANKER_BACKEND = os.environ.get("RERANKER_BACKEND", "local")

# Cohere backend: API key (empty string when unset).
COHERE_API_KEY = os.environ.get("COHERE_API_KEY", "")

# Local backend: re-ranker model name.
# Recommended: BAAI/bge-reranker-v2-m3 (Apache 2.0, multilingual).
LOCAL_RERANKER_MODEL = os.environ.get(
    "LOCAL_RERANKER_MODEL", "BAAI/bge-reranker-v2-m3"
)
# =============================================================================
# PDF Extraction Configuration
# =============================================================================
# Selects the PDF extraction backend: "auto", "unstructured" or "pypdf".
PDF_EXTRACTION_BACKEND = os.environ.get("PDF_EXTRACTION_BACKEND", "auto")

# Unstructured API credentials and endpoint; both are empty strings when unset.
UNSTRUCTURED_API_KEY = os.environ.get("UNSTRUCTURED_API_KEY", "")
UNSTRUCTURED_API_URL = os.environ.get("UNSTRUCTURED_API_URL", "")
# =============================================================================
# Service Configuration
# =============================================================================
# Port is read from EMBEDDING_SERVICE_PORT and parsed with int() at import
# time; a non-integer value raises ValueError when this module loads.
SERVICE_PORT = int(os.environ.get("EMBEDDING_SERVICE_PORT", "8087"))

# Log level name, taken verbatim from LOG_LEVEL.
LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO")
# Model dimensions lookup: embedding model name -> embedding vector size.
# NOTE: insertion order matters. get_model_dimensions() falls back to a
# first-match substring scan over this table, so earlier entries win.
MODEL_DIMENSIONS = {
    # Multilingual / German-optimized
    "BAAI/bge-m3": 1024,
    "deepset/mxbai-embed-de-large-v1": 1024,
    "jinaai/jina-embeddings-v2-base-de": 768,
    "intfloat/multilingual-e5-large": 1024,
    # English-focused (smaller, faster)
    "all-MiniLM-L6-v2": 384,
    "all-mpnet-base-v2": 768,
    # OpenAI
    "text-embedding-3-small": 1536,
    "text-embedding-3-large": 3072,
}

# Dimension reported for model names that cannot be resolved via the table.
_DEFAULT_DIMENSIONS = 384


def get_model_dimensions(model_name: str) -> int:
    """Return the embedding dimension for ``model_name``.

    Resolution order:

    1. Exact key in ``MODEL_DIMENSIONS``.
    2. First table entry (in insertion order) whose key contains
       ``model_name`` or is contained in it, so prefixed names such as
       ``"sentence-transformers/all-MiniLM-L6-v2"`` still resolve.
    3. The fallback of 384.

    Args:
        model_name: Model identifier to look up. An empty or ``None`` name
            resolves to the fallback.

    Returns:
        The embedding dimension as an ``int``.
    """
    if not model_name:
        # An empty string is a substring of every key, so the partial-match
        # scan below would wrongly report the first table entry's dimension
        # (and ``None`` would raise TypeError there).
        return _DEFAULT_DIMENSIONS

    exact = MODEL_DIMENSIONS.get(model_name)
    if exact is not None:
        return exact

    # Partial match in either direction; the first hit in table order wins.
    for known_name, dim in MODEL_DIMENSIONS.items():
        if known_name in model_name or model_name in known_name:
            return dim

    return _DEFAULT_DIMENSIONS
def get_current_dimensions() -> int:
    """Return the embedding dimension of the model chosen by the active backend.

    The local model name is used when ``EMBEDDING_BACKEND`` equals
    ``"local"``; every other backend value resolves the OpenAI model name.
    """
    active_model = (
        LOCAL_EMBEDDING_MODEL
        if EMBEDDING_BACKEND == "local"
        else OPENAI_EMBEDDING_MODEL
    )
    return get_model_dimensions(active_model)