feat: BreakPilot PWA - Full codebase (clean push without large binaries)
Some checks failed
Tests / Go Tests (push) Has been cancelled
Tests / Python Tests (push) Has been cancelled
Tests / Integration Tests (push) Has been cancelled
Tests / Go Lint (push) Has been cancelled
Tests / Python Lint (push) Has been cancelled
Tests / Security Scan (push) Has been cancelled
Tests / All Checks Passed (push) Has been cancelled
Security Scanning / Secret Scanning (push) Has been cancelled
Security Scanning / Dependency Vulnerability Scan (push) Has been cancelled
Security Scanning / Go Security Scan (push) Has been cancelled
Security Scanning / Python Security Scan (push) Has been cancelled
Security Scanning / Node.js Security Scan (push) Has been cancelled
Security Scanning / Docker Image Security (push) Has been cancelled
Security Scanning / Security Summary (push) Has been cancelled
CI/CD Pipeline / Go Tests (push) Has been cancelled
CI/CD Pipeline / Python Tests (push) Has been cancelled
CI/CD Pipeline / Website Tests (push) Has been cancelled
CI/CD Pipeline / Linting (push) Has been cancelled
CI/CD Pipeline / Security Scan (push) Has been cancelled
CI/CD Pipeline / Docker Build & Push (push) Has been cancelled
CI/CD Pipeline / Integration Tests (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / CI Summary (push) Has been cancelled
ci/woodpecker/manual/build-ci-image Pipeline was successful
ci/woodpecker/manual/main Pipeline failed
Some checks failed
Tests / Go Tests (push) Has been cancelled
Tests / Python Tests (push) Has been cancelled
Tests / Integration Tests (push) Has been cancelled
Tests / Go Lint (push) Has been cancelled
Tests / Python Lint (push) Has been cancelled
Tests / Security Scan (push) Has been cancelled
Tests / All Checks Passed (push) Has been cancelled
Security Scanning / Secret Scanning (push) Has been cancelled
Security Scanning / Dependency Vulnerability Scan (push) Has been cancelled
Security Scanning / Go Security Scan (push) Has been cancelled
Security Scanning / Python Security Scan (push) Has been cancelled
Security Scanning / Node.js Security Scan (push) Has been cancelled
Security Scanning / Docker Image Security (push) Has been cancelled
Security Scanning / Security Summary (push) Has been cancelled
CI/CD Pipeline / Go Tests (push) Has been cancelled
CI/CD Pipeline / Python Tests (push) Has been cancelled
CI/CD Pipeline / Website Tests (push) Has been cancelled
CI/CD Pipeline / Linting (push) Has been cancelled
CI/CD Pipeline / Security Scan (push) Has been cancelled
CI/CD Pipeline / Docker Build & Push (push) Has been cancelled
CI/CD Pipeline / Integration Tests (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / CI Summary (push) Has been cancelled
ci/woodpecker/manual/build-ci-image Pipeline was successful
ci/woodpecker/manual/main Pipeline failed
All services: admin-v2, studio-v2, website, ai-compliance-sdk, consent-service, klausur-service, voice-service, and infrastructure. Large PDFs and compiled binaries excluded via .gitignore.
This commit is contained in:
86
klausur-service/embedding-service/config.py
Normal file
86
klausur-service/embedding-service/config.py
Normal file
@@ -0,0 +1,86 @@
|
||||
"""
|
||||
Embedding Service Configuration
|
||||
|
||||
Environment variables for embedding generation, re-ranking, and PDF extraction.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
def _env_int(name: str, default: int) -> int:
    """Read an integer setting from the environment.

    Args:
        name: Name of the environment variable.
        default: Value returned when the variable is unset or blank.

    Returns:
        The parsed integer, or *default* when nothing usable is set.

    Raises:
        ValueError: If the variable holds a non-blank value that is not a
            valid integer. The message names the offending variable.
    """
    raw = os.getenv(name, "").strip()
    # A variable that is declared but left blank (e.g. `CHUNK_SIZE=` in an
    # env file) is treated like an unset one instead of failing in int("").
    if not raw:
        return default
    try:
        return int(raw)
    except ValueError as err:
        raise ValueError(
            f"Environment variable {name} must be an integer, got {raw!r}"
        ) from err


# =============================================================================
# Embedding Configuration
# =============================================================================

# Backend: "local" (sentence-transformers) or "openai"
EMBEDDING_BACKEND = os.getenv("EMBEDDING_BACKEND", "local")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
OPENAI_EMBEDDING_MODEL = os.getenv("OPENAI_EMBEDDING_MODEL", "text-embedding-3-small")

# Local embedding model
# Recommended: BAAI/bge-m3 (MIT, 1024 dim, multilingual)
LOCAL_EMBEDDING_MODEL = os.getenv("LOCAL_EMBEDDING_MODEL", "BAAI/bge-m3")

# Chunking configuration
CHUNK_SIZE = _env_int("CHUNK_SIZE", 1000)
CHUNK_OVERLAP = _env_int("CHUNK_OVERLAP", 200)
CHUNKING_STRATEGY = os.getenv("CHUNKING_STRATEGY", "semantic")

# =============================================================================
# Re-Ranker Configuration
# =============================================================================

# Backend: "local" (sentence-transformers CrossEncoder) or "cohere"
RERANKER_BACKEND = os.getenv("RERANKER_BACKEND", "local")
COHERE_API_KEY = os.getenv("COHERE_API_KEY", "")

# Local re-ranker model
# Recommended: BAAI/bge-reranker-v2-m3 (Apache 2.0, multilingual)
LOCAL_RERANKER_MODEL = os.getenv("LOCAL_RERANKER_MODEL", "BAAI/bge-reranker-v2-m3")

# =============================================================================
# PDF Extraction Configuration
# =============================================================================

# Backend: "auto", "unstructured", "pypdf"
PDF_EXTRACTION_BACKEND = os.getenv("PDF_EXTRACTION_BACKEND", "auto")
UNSTRUCTURED_API_KEY = os.getenv("UNSTRUCTURED_API_KEY", "")
UNSTRUCTURED_API_URL = os.getenv("UNSTRUCTURED_API_URL", "")

# =============================================================================
# Service Configuration
# =============================================================================

# The listening port is read from EMBEDDING_SERVICE_PORT (not SERVICE_PORT).
SERVICE_PORT = _env_int("EMBEDDING_SERVICE_PORT", 8087)
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
|
||||
|
||||
# Embedding vector size for each known model name.
# Entry order is significant: partial-name lookups walk this table from top
# to bottom and take the first hit, so keep new entries in a sensible order.
MODEL_DIMENSIONS = {
    # --- Multilingual / German-optimized ---
    "BAAI/bge-m3": 1024,
    "deepset/mxbai-embed-de-large-v1": 1024,
    "jinaai/jina-embeddings-v2-base-de": 768,
    "intfloat/multilingual-e5-large": 1024,
    # --- English-focused (smaller, faster) ---
    "all-MiniLM-L6-v2": 384,
    "all-mpnet-base-v2": 768,
    # --- OpenAI ---
    "text-embedding-3-small": 1536,
    "text-embedding-3-large": 3072,
}
|
||||
|
||||
|
||||
def get_model_dimensions(model_name: str) -> int:
    """Return the embedding vector size for *model_name*.

    Resolution order:

    1. An exact key in ``MODEL_DIMENSIONS``.
    2. The first table entry (in table order) whose key contains
       *model_name* or is contained in it, so prefixed or shortened
       identifiers still resolve.
    3. A fallback of 384.

    Args:
        model_name: Model identifier to look up.

    Returns:
        The number of dimensions, or 384 when the name is empty or unknown.
    """
    fallback = 384

    # The empty string is a substring of every string, so without this guard
    # an unset model name would "match" the first table entry instead of
    # falling through to the default.
    if not model_name:
        return fallback

    exact = MODEL_DIMENSIONS.get(model_name)
    if exact is not None:
        return exact

    return next(
        (
            dim
            for key, dim in MODEL_DIMENSIONS.items()
            if key in model_name or model_name in key
        ),
        fallback,
    )
|
||||
|
||||
|
||||
def get_current_dimensions() -> int:
    """Return the embedding size of the model chosen by the active backend.

    With ``EMBEDDING_BACKEND == "local"`` the local model name is used; any
    other backend value resolves through the OpenAI model name.
    """
    active_model = (
        LOCAL_EMBEDDING_MODEL
        if EMBEDDING_BACKEND == "local"
        else OPENAI_EMBEDDING_MODEL
    )
    return get_model_dimensions(active_model)
|
||||
Reference in New Issue
Block a user