Initial commit: breakpilot-core - Shared Infrastructure
Docker Compose with 24+ services: - PostgreSQL (PostGIS), Valkey, MinIO, Qdrant - Vault (PKI/TLS), Nginx (Reverse Proxy) - Backend Core API, Consent Service, Billing Service - RAG Service, Embedding Service - Gitea, Woodpecker CI/CD - Night Scheduler, Health Aggregator - Jitsi (Web/XMPP/JVB/Jicofo), Mailpit Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
36
embedding-service/Dockerfile
Normal file
36
embedding-service/Dockerfile
Normal file
@@ -0,0 +1,36 @@
|
||||
# Embedding Service Dockerfile
|
||||
# Handles ML-heavy operations: embeddings, re-ranking, PDF extraction
|
||||
|
||||
FROM python:3.11-slim
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Install system dependencies for PDF extraction
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
libmagic1 \
|
||||
poppler-utils \
|
||||
tesseract-ocr \
|
||||
tesseract-ocr-deu \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install PyTorch CPU-only (smaller image)
|
||||
RUN pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu
|
||||
|
||||
# Copy and install requirements
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
# Note: Models are downloaded on first startup (not during build)
|
||||
# This makes the build faster but first startup slower
|
||||
# To pre-download models, mount a persistent volume for /root/.cache/huggingface
|
||||
|
||||
# Copy application code
|
||||
COPY . .
|
||||
|
||||
# Health check
|
||||
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
|
||||
CMD python -c "import httpx; httpx.get('http://localhost:8087/health').raise_for_status()"
|
||||
|
||||
# Run the service
|
||||
EXPOSE 8087
|
||||
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8087"]
|
||||
86
embedding-service/config.py
Normal file
86
embedding-service/config.py
Normal file
@@ -0,0 +1,86 @@
|
||||
"""
|
||||
Embedding Service Configuration
|
||||
|
||||
Environment variables for embedding generation, re-ranking, and PDF extraction.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
# =============================================================================
|
||||
# Embedding Configuration
|
||||
# =============================================================================
|
||||
|
||||
# Backend: "local" (sentence-transformers) or "openai"
|
||||
EMBEDDING_BACKEND = os.getenv("EMBEDDING_BACKEND", "local")
|
||||
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
|
||||
OPENAI_EMBEDDING_MODEL = os.getenv("OPENAI_EMBEDDING_MODEL", "text-embedding-3-small")
|
||||
|
||||
# Local embedding model
|
||||
# Recommended: BAAI/bge-m3 (MIT, 1024 dim, multilingual)
|
||||
LOCAL_EMBEDDING_MODEL = os.getenv("LOCAL_EMBEDDING_MODEL", "BAAI/bge-m3")
|
||||
|
||||
# Chunking configuration
|
||||
CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", "1000"))
|
||||
CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", "200"))
|
||||
CHUNKING_STRATEGY = os.getenv("CHUNKING_STRATEGY", "semantic")
|
||||
|
||||
# =============================================================================
|
||||
# Re-Ranker Configuration
|
||||
# =============================================================================
|
||||
|
||||
# Backend: "local" (sentence-transformers CrossEncoder) or "cohere"
|
||||
RERANKER_BACKEND = os.getenv("RERANKER_BACKEND", "local")
|
||||
COHERE_API_KEY = os.getenv("COHERE_API_KEY", "")
|
||||
|
||||
# Local re-ranker model
|
||||
# Recommended: BAAI/bge-reranker-v2-m3 (Apache 2.0, multilingual)
|
||||
LOCAL_RERANKER_MODEL = os.getenv("LOCAL_RERANKER_MODEL", "BAAI/bge-reranker-v2-m3")
|
||||
|
||||
# =============================================================================
|
||||
# PDF Extraction Configuration
|
||||
# =============================================================================
|
||||
|
||||
# Backend: "auto", "unstructured", "pypdf"
|
||||
PDF_EXTRACTION_BACKEND = os.getenv("PDF_EXTRACTION_BACKEND", "auto")
|
||||
UNSTRUCTURED_API_KEY = os.getenv("UNSTRUCTURED_API_KEY", "")
|
||||
UNSTRUCTURED_API_URL = os.getenv("UNSTRUCTURED_API_URL", "")
|
||||
|
||||
# =============================================================================
|
||||
# Service Configuration
|
||||
# =============================================================================
|
||||
|
||||
SERVICE_PORT = int(os.getenv("EMBEDDING_SERVICE_PORT", "8087"))
|
||||
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
|
||||
|
||||
# Model dimensions lookup
|
||||
MODEL_DIMENSIONS = {
|
||||
# Multilingual / German-optimized
|
||||
"BAAI/bge-m3": 1024,
|
||||
"deepset/mxbai-embed-de-large-v1": 1024,
|
||||
"jinaai/jina-embeddings-v2-base-de": 768,
|
||||
"intfloat/multilingual-e5-large": 1024,
|
||||
# English-focused (smaller, faster)
|
||||
"all-MiniLM-L6-v2": 384,
|
||||
"all-mpnet-base-v2": 768,
|
||||
# OpenAI
|
||||
"text-embedding-3-small": 1536,
|
||||
"text-embedding-3-large": 3072,
|
||||
}
|
||||
|
||||
|
||||
def get_model_dimensions(model_name: str) -> int:
|
||||
"""Get embedding dimensions for a model."""
|
||||
if model_name in MODEL_DIMENSIONS:
|
||||
return MODEL_DIMENSIONS[model_name]
|
||||
for key, dim in MODEL_DIMENSIONS.items():
|
||||
if key in model_name or model_name in key:
|
||||
return dim
|
||||
return 384 # Default fallback
|
||||
|
||||
|
||||
def get_current_dimensions() -> int:
|
||||
"""Get dimensions for the currently configured model."""
|
||||
if EMBEDDING_BACKEND == "local":
|
||||
return get_model_dimensions(LOCAL_EMBEDDING_MODEL)
|
||||
else:
|
||||
return get_model_dimensions(OPENAI_EMBEDDING_MODEL)
|
||||
696
embedding-service/main.py
Normal file
696
embedding-service/main.py
Normal file
@@ -0,0 +1,696 @@
|
||||
"""
|
||||
Embedding Service - FastAPI Application
|
||||
|
||||
Provides REST endpoints for:
|
||||
- Embedding generation (local sentence-transformers or OpenAI)
|
||||
- Re-ranking (local CrossEncoder or Cohere)
|
||||
- PDF text extraction (Unstructured or pypdf)
|
||||
- Text chunking (semantic or recursive)
|
||||
|
||||
This service handles all ML-heavy operations, keeping the main klausur-service lightweight.
|
||||
"""
|
||||
|
||||
import os
|
||||
import logging
|
||||
from typing import List, Optional
|
||||
from contextlib import asynccontextmanager
|
||||
|
||||
from fastapi import FastAPI, HTTPException, UploadFile, File
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
import config
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
level=getattr(logging, config.LOG_LEVEL.upper(), logging.INFO),
|
||||
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
||||
)
|
||||
logger = logging.getLogger("embedding-service")
|
||||
|
||||
# =============================================================================
|
||||
# Lazy-loaded models
|
||||
# =============================================================================
|
||||
|
||||
_embedding_model = None
|
||||
_reranker_model = None
|
||||
|
||||
|
||||
def get_embedding_model():
|
||||
"""Lazy-load the sentence-transformers embedding model."""
|
||||
global _embedding_model
|
||||
if _embedding_model is None:
|
||||
from sentence_transformers import SentenceTransformer
|
||||
logger.info(f"Loading embedding model: {config.LOCAL_EMBEDDING_MODEL}")
|
||||
_embedding_model = SentenceTransformer(config.LOCAL_EMBEDDING_MODEL)
|
||||
logger.info(f"Model loaded (dim={_embedding_model.get_sentence_embedding_dimension()})")
|
||||
return _embedding_model
|
||||
|
||||
|
||||
def get_reranker_model():
|
||||
"""Lazy-load the CrossEncoder reranker model."""
|
||||
global _reranker_model
|
||||
if _reranker_model is None:
|
||||
from sentence_transformers import CrossEncoder
|
||||
logger.info(f"Loading reranker model: {config.LOCAL_RERANKER_MODEL}")
|
||||
_reranker_model = CrossEncoder(config.LOCAL_RERANKER_MODEL)
|
||||
logger.info("Reranker loaded")
|
||||
return _reranker_model
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Request/Response Models
|
||||
# =============================================================================
|
||||
|
||||
class EmbedRequest(BaseModel):
|
||||
texts: List[str] = Field(..., description="List of texts to embed")
|
||||
|
||||
|
||||
class EmbedResponse(BaseModel):
|
||||
embeddings: List[List[float]]
|
||||
model: str
|
||||
dimensions: int
|
||||
|
||||
|
||||
class EmbedSingleRequest(BaseModel):
|
||||
text: str = Field(..., description="Single text to embed")
|
||||
|
||||
|
||||
class EmbedSingleResponse(BaseModel):
|
||||
embedding: List[float]
|
||||
model: str
|
||||
dimensions: int
|
||||
|
||||
|
||||
class RerankRequest(BaseModel):
|
||||
query: str = Field(..., description="Search query")
|
||||
documents: List[str] = Field(..., description="Documents to re-rank")
|
||||
top_k: int = Field(default=5, description="Number of top results to return")
|
||||
|
||||
|
||||
class RerankResult(BaseModel):
|
||||
index: int
|
||||
score: float
|
||||
text: str
|
||||
|
||||
|
||||
class RerankResponse(BaseModel):
|
||||
results: List[RerankResult]
|
||||
model: str
|
||||
|
||||
|
||||
class ChunkRequest(BaseModel):
|
||||
text: str = Field(..., description="Text to chunk")
|
||||
chunk_size: int = Field(default=1000, description="Target chunk size")
|
||||
overlap: int = Field(default=200, description="Overlap between chunks")
|
||||
strategy: str = Field(default="semantic", description="Chunking strategy: semantic or recursive")
|
||||
|
||||
|
||||
class ChunkResponse(BaseModel):
|
||||
chunks: List[str]
|
||||
count: int
|
||||
strategy: str
|
||||
|
||||
|
||||
class ExtractPDFResponse(BaseModel):
|
||||
text: str
|
||||
backend_used: str
|
||||
pages: int
|
||||
table_count: int
|
||||
|
||||
|
||||
class HealthResponse(BaseModel):
|
||||
status: str
|
||||
embedding_model: str
|
||||
embedding_dimensions: int
|
||||
reranker_model: str
|
||||
pdf_backends: List[str]
|
||||
|
||||
|
||||
class ModelsResponse(BaseModel):
|
||||
embedding_backend: str
|
||||
embedding_model: str
|
||||
embedding_dimensions: int
|
||||
reranker_backend: str
|
||||
reranker_model: str
|
||||
pdf_backend: str
|
||||
available_pdf_backends: List[str]
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Embedding Functions
|
||||
# =============================================================================
|
||||
|
||||
def generate_local_embeddings(texts: List[str]) -> List[List[float]]:
|
||||
"""Generate embeddings using local model."""
|
||||
if not texts:
|
||||
return []
|
||||
model = get_embedding_model()
|
||||
embeddings = model.encode(texts, show_progress_bar=len(texts) > 10)
|
||||
return [emb.tolist() for emb in embeddings]
|
||||
|
||||
|
||||
async def generate_openai_embeddings(texts: List[str]) -> List[List[float]]:
|
||||
"""Generate embeddings using OpenAI API."""
|
||||
import httpx
|
||||
|
||||
if not config.OPENAI_API_KEY:
|
||||
raise HTTPException(status_code=500, detail="OPENAI_API_KEY not configured")
|
||||
|
||||
async with httpx.AsyncClient() as client:
|
||||
response = await client.post(
|
||||
"https://api.openai.com/v1/embeddings",
|
||||
headers={
|
||||
"Authorization": f"Bearer {config.OPENAI_API_KEY}",
|
||||
"Content-Type": "application/json"
|
||||
},
|
||||
json={
|
||||
"model": config.OPENAI_EMBEDDING_MODEL,
|
||||
"input": texts
|
||||
},
|
||||
timeout=60.0
|
||||
)
|
||||
|
||||
if response.status_code != 200:
|
||||
raise HTTPException(
|
||||
status_code=response.status_code,
|
||||
detail=f"OpenAI API error: {response.text}"
|
||||
)
|
||||
|
||||
data = response.json()
|
||||
return [item["embedding"] for item in data["data"]]
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Re-ranking Functions
|
||||
# =============================================================================
|
||||
|
||||
def rerank_local(query: str, documents: List[str], top_k: int = 5) -> List[RerankResult]:
|
||||
"""Re-rank documents using local CrossEncoder."""
|
||||
if not documents:
|
||||
return []
|
||||
|
||||
model = get_reranker_model()
|
||||
pairs = [(query, doc) for doc in documents]
|
||||
scores = model.predict(pairs)
|
||||
|
||||
results = [
|
||||
RerankResult(index=i, score=float(score), text=doc)
|
||||
for i, (score, doc) in enumerate(zip(scores, documents))
|
||||
]
|
||||
results.sort(key=lambda x: x.score, reverse=True)
|
||||
return results[:top_k]
|
||||
|
||||
|
||||
async def rerank_cohere(query: str, documents: List[str], top_k: int = 5) -> List[RerankResult]:
|
||||
"""Re-rank documents using Cohere API."""
|
||||
import httpx
|
||||
|
||||
if not config.COHERE_API_KEY:
|
||||
raise HTTPException(status_code=500, detail="COHERE_API_KEY not configured")
|
||||
|
||||
async with httpx.AsyncClient() as client:
|
||||
response = await client.post(
|
||||
"https://api.cohere.ai/v2/rerank",
|
||||
headers={
|
||||
"Authorization": f"Bearer {config.COHERE_API_KEY}",
|
||||
"Content-Type": "application/json"
|
||||
},
|
||||
json={
|
||||
"model": "rerank-multilingual-v3.0",
|
||||
"query": query,
|
||||
"documents": documents,
|
||||
"top_n": top_k,
|
||||
"return_documents": False,
|
||||
},
|
||||
timeout=30.0
|
||||
)
|
||||
|
||||
if response.status_code != 200:
|
||||
raise HTTPException(
|
||||
status_code=response.status_code,
|
||||
detail=f"Cohere API error: {response.text}"
|
||||
)
|
||||
|
||||
data = response.json()
|
||||
return [
|
||||
RerankResult(
|
||||
index=item["index"],
|
||||
score=item["relevance_score"],
|
||||
text=documents[item["index"]]
|
||||
)
|
||||
for item in data.get("results", [])
|
||||
]
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Chunking Functions
|
||||
# =============================================================================
|
||||
|
||||
# German abbreviations that don't end sentences
|
||||
GERMAN_ABBREVIATIONS = {
|
||||
'bzw', 'ca', 'chr', 'd.h', 'dr', 'etc', 'evtl', 'ggf', 'inkl', 'max',
|
||||
'min', 'mio', 'mrd', 'nr', 'prof', 's', 'sog', 'u.a', 'u.ä', 'usw',
|
||||
'v.a', 'vgl', 'vs', 'z.b', 'z.t', 'zzgl'
|
||||
}
|
||||
|
||||
|
||||
def chunk_text_recursive(text: str, chunk_size: int, overlap: int) -> List[str]:
|
||||
"""Recursive character-based chunking."""
|
||||
import re
|
||||
|
||||
if not text or len(text) <= chunk_size:
|
||||
return [text] if text else []
|
||||
|
||||
separators = ["\n\n", "\n", ". ", " ", ""]
|
||||
|
||||
def split_recursive(text: str, sep_idx: int = 0) -> List[str]:
|
||||
if len(text) <= chunk_size:
|
||||
return [text]
|
||||
|
||||
if sep_idx >= len(separators):
|
||||
return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size - overlap)]
|
||||
|
||||
sep = separators[sep_idx]
|
||||
if not sep:
|
||||
parts = list(text)
|
||||
else:
|
||||
parts = text.split(sep)
|
||||
|
||||
result = []
|
||||
current = ""
|
||||
|
||||
for part in parts:
|
||||
test_chunk = current + sep + part if current else part
|
||||
|
||||
if len(test_chunk) <= chunk_size:
|
||||
current = test_chunk
|
||||
else:
|
||||
if current:
|
||||
result.append(current)
|
||||
if len(part) > chunk_size:
|
||||
result.extend(split_recursive(part, sep_idx + 1))
|
||||
current = ""
|
||||
else:
|
||||
current = part
|
||||
|
||||
if current:
|
||||
result.append(current)
|
||||
|
||||
return result
|
||||
|
||||
raw_chunks = split_recursive(text)
|
||||
|
||||
# Add overlap
|
||||
final_chunks = []
|
||||
for i, chunk in enumerate(raw_chunks):
|
||||
if i > 0 and overlap > 0:
|
||||
prev_chunk = raw_chunks[i-1]
|
||||
overlap_text = prev_chunk[-min(overlap, len(prev_chunk)):]
|
||||
chunk = overlap_text + chunk
|
||||
final_chunks.append(chunk.strip())
|
||||
|
||||
return [c for c in final_chunks if c]
|
||||
|
||||
|
||||
def chunk_text_semantic(text: str, chunk_size: int, overlap_sentences: int = 1) -> List[str]:
|
||||
"""Semantic sentence-aware chunking."""
|
||||
import re
|
||||
|
||||
if not text:
|
||||
return []
|
||||
|
||||
if len(text) <= chunk_size:
|
||||
return [text.strip()]
|
||||
|
||||
# Split into sentences (simplified for German)
|
||||
text = re.sub(r'\s+', ' ', text).strip()
|
||||
|
||||
# Protect abbreviations
|
||||
protected = text
|
||||
for abbrev in GERMAN_ABBREVIATIONS:
|
||||
pattern = re.compile(r'\b' + re.escape(abbrev) + r'\.', re.IGNORECASE)
|
||||
protected = pattern.sub(abbrev.replace('.', '<DOT>') + '<ABBR>', protected)
|
||||
|
||||
# Protect decimals and ordinals
|
||||
protected = re.sub(r'(\d)\.(\d)', r'\1<DECIMAL>\2', protected)
|
||||
protected = re.sub(r'(\d+)\.(\s)', r'\1<ORD>\2', protected)
|
||||
|
||||
# Split on sentence endings
|
||||
sentence_pattern = r'(?<=[.!?])\s+(?=[A-ZÄÖÜ])|(?<=[.!?])$'
|
||||
raw_sentences = re.split(sentence_pattern, protected)
|
||||
|
||||
# Restore protected characters
|
||||
sentences = []
|
||||
for s in raw_sentences:
|
||||
s = s.replace('<DOT>', '.').replace('<ABBR>', '.').replace('<DECIMAL>', '.').replace('<ORD>', '.')
|
||||
s = s.strip()
|
||||
if s:
|
||||
sentences.append(s)
|
||||
|
||||
# Build chunks
|
||||
chunks = []
|
||||
current_parts = []
|
||||
current_length = 0
|
||||
overlap_buffer = []
|
||||
|
||||
for sentence in sentences:
|
||||
sentence_len = len(sentence)
|
||||
|
||||
if sentence_len > chunk_size:
|
||||
if current_parts:
|
||||
chunks.append(' '.join(current_parts))
|
||||
overlap_buffer = current_parts[-overlap_sentences:] if overlap_sentences > 0 else []
|
||||
current_parts = list(overlap_buffer)
|
||||
current_length = sum(len(s) + 1 for s in current_parts)
|
||||
|
||||
if overlap_buffer:
|
||||
chunks.append(' '.join(overlap_buffer) + ' ' + sentence)
|
||||
else:
|
||||
chunks.append(sentence)
|
||||
overlap_buffer = [sentence]
|
||||
current_parts = list(overlap_buffer)
|
||||
current_length = len(sentence) + 1
|
||||
continue
|
||||
|
||||
if current_length + sentence_len + 1 > chunk_size and current_parts:
|
||||
chunks.append(' '.join(current_parts))
|
||||
overlap_buffer = current_parts[-overlap_sentences:] if overlap_sentences > 0 else []
|
||||
current_parts = list(overlap_buffer)
|
||||
current_length = sum(len(s) + 1 for s in current_parts)
|
||||
|
||||
current_parts.append(sentence)
|
||||
current_length += sentence_len + 1
|
||||
|
||||
if current_parts:
|
||||
chunks.append(' '.join(current_parts))
|
||||
|
||||
return [re.sub(r'\s+', ' ', c).strip() for c in chunks if c.strip()]
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# PDF Extraction Functions
|
||||
# =============================================================================
|
||||
|
||||
def detect_pdf_backends() -> List[str]:
|
||||
"""Detect available PDF backends."""
|
||||
available = []
|
||||
|
||||
try:
|
||||
from unstructured.partition.pdf import partition_pdf
|
||||
available.append("unstructured")
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
from pypdf import PdfReader
|
||||
available.append("pypdf")
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
return available
|
||||
|
||||
|
||||
def extract_pdf_unstructured(pdf_content: bytes) -> ExtractPDFResponse:
|
||||
"""Extract PDF using Unstructured."""
|
||||
import tempfile
|
||||
from unstructured.partition.pdf import partition_pdf
|
||||
from unstructured.documents.elements import Title, ListItem, Table, Header, Footer
|
||||
|
||||
with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
|
||||
tmp.write(pdf_content)
|
||||
tmp_path = tmp.name
|
||||
|
||||
try:
|
||||
elements = partition_pdf(
|
||||
filename=tmp_path,
|
||||
strategy="auto",
|
||||
include_page_breaks=True,
|
||||
infer_table_structure=True,
|
||||
languages=["deu", "eng"],
|
||||
)
|
||||
|
||||
text_parts = []
|
||||
tables = []
|
||||
page_count = 1
|
||||
|
||||
for element in elements:
|
||||
if hasattr(element, "metadata") and hasattr(element.metadata, "page_number"):
|
||||
page_count = max(page_count, element.metadata.page_number or 1)
|
||||
|
||||
if isinstance(element, (Header, Footer)):
|
||||
continue
|
||||
|
||||
element_text = str(element)
|
||||
|
||||
if isinstance(element, Table):
|
||||
tables.append(element_text)
|
||||
text_parts.append(f"\n[TABELLE]\n{element_text}\n[/TABELLE]\n")
|
||||
elif isinstance(element, Title):
|
||||
text_parts.append(f"\n## {element_text}\n")
|
||||
elif isinstance(element, ListItem):
|
||||
text_parts.append(f"• {element_text}")
|
||||
else:
|
||||
text_parts.append(element_text)
|
||||
|
||||
return ExtractPDFResponse(
|
||||
text="\n".join(text_parts),
|
||||
backend_used="unstructured",
|
||||
pages=page_count,
|
||||
table_count=len(tables)
|
||||
)
|
||||
finally:
|
||||
import os as os_module
|
||||
try:
|
||||
os_module.unlink(tmp_path)
|
||||
except:
|
||||
pass
|
||||
|
||||
|
||||
def extract_pdf_pypdf(pdf_content: bytes) -> ExtractPDFResponse:
|
||||
"""Extract PDF using pypdf."""
|
||||
import io
|
||||
from pypdf import PdfReader
|
||||
|
||||
pdf_file = io.BytesIO(pdf_content)
|
||||
reader = PdfReader(pdf_file)
|
||||
|
||||
text_parts = []
|
||||
for page in reader.pages:
|
||||
text = page.extract_text()
|
||||
if text:
|
||||
text_parts.append(text)
|
||||
|
||||
return ExtractPDFResponse(
|
||||
text="\n\n".join(text_parts),
|
||||
backend_used="pypdf",
|
||||
pages=len(reader.pages),
|
||||
table_count=0
|
||||
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Application Lifecycle
|
||||
# =============================================================================
|
||||
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI):
|
||||
"""Preload models on startup."""
|
||||
logger.info("Starting Embedding Service...")
|
||||
|
||||
if config.EMBEDDING_BACKEND == "local":
|
||||
try:
|
||||
get_embedding_model()
|
||||
logger.info("Embedding model preloaded")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to preload embedding model: {e}")
|
||||
|
||||
if config.RERANKER_BACKEND == "local":
|
||||
try:
|
||||
get_reranker_model()
|
||||
logger.info("Reranker model preloaded")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to preload reranker model: {e}")
|
||||
|
||||
logger.info("Embedding Service ready")
|
||||
yield
|
||||
logger.info("Shutting down Embedding Service")
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# FastAPI Application
|
||||
# =============================================================================
|
||||
|
||||
app = FastAPI(
|
||||
title="Embedding Service",
|
||||
description="ML service for embeddings, re-ranking, and PDF extraction",
|
||||
version="1.0.0",
|
||||
lifespan=lifespan
|
||||
)
|
||||
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=["*"],
|
||||
allow_credentials=True,
|
||||
allow_methods=["*"],
|
||||
allow_headers=["*"],
|
||||
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Endpoints
|
||||
# =============================================================================
|
||||
|
||||
@app.get("/health", response_model=HealthResponse)
|
||||
async def health_check():
|
||||
"""Health check endpoint."""
|
||||
return HealthResponse(
|
||||
status="healthy",
|
||||
embedding_model=config.LOCAL_EMBEDDING_MODEL if config.EMBEDDING_BACKEND == "local" else config.OPENAI_EMBEDDING_MODEL,
|
||||
embedding_dimensions=config.get_current_dimensions(),
|
||||
reranker_model=config.LOCAL_RERANKER_MODEL if config.RERANKER_BACKEND == "local" else "cohere-rerank",
|
||||
pdf_backends=detect_pdf_backends()
|
||||
)
|
||||
|
||||
|
||||
@app.get("/models", response_model=ModelsResponse)
|
||||
async def get_models():
|
||||
"""Get information about configured models."""
|
||||
return ModelsResponse(
|
||||
embedding_backend=config.EMBEDDING_BACKEND,
|
||||
embedding_model=config.LOCAL_EMBEDDING_MODEL if config.EMBEDDING_BACKEND == "local" else config.OPENAI_EMBEDDING_MODEL,
|
||||
embedding_dimensions=config.get_current_dimensions(),
|
||||
reranker_backend=config.RERANKER_BACKEND,
|
||||
reranker_model=config.LOCAL_RERANKER_MODEL if config.RERANKER_BACKEND == "local" else "cohere-rerank-multilingual-v3.0",
|
||||
pdf_backend=config.PDF_EXTRACTION_BACKEND,
|
||||
available_pdf_backends=detect_pdf_backends()
|
||||
)
|
||||
|
||||
|
||||
@app.post("/embed", response_model=EmbedResponse)
|
||||
async def embed_texts(request: EmbedRequest):
|
||||
"""Generate embeddings for multiple texts."""
|
||||
if not request.texts:
|
||||
return EmbedResponse(
|
||||
embeddings=[],
|
||||
model=config.LOCAL_EMBEDDING_MODEL if config.EMBEDDING_BACKEND == "local" else config.OPENAI_EMBEDDING_MODEL,
|
||||
dimensions=config.get_current_dimensions()
|
||||
)
|
||||
|
||||
try:
|
||||
if config.EMBEDDING_BACKEND == "local":
|
||||
embeddings = generate_local_embeddings(request.texts)
|
||||
else:
|
||||
embeddings = await generate_openai_embeddings(request.texts)
|
||||
|
||||
return EmbedResponse(
|
||||
embeddings=embeddings,
|
||||
model=config.LOCAL_EMBEDDING_MODEL if config.EMBEDDING_BACKEND == "local" else config.OPENAI_EMBEDDING_MODEL,
|
||||
dimensions=config.get_current_dimensions()
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Embedding error: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@app.post("/embed-single", response_model=EmbedSingleResponse)
|
||||
async def embed_single_text(request: EmbedSingleRequest):
|
||||
"""Generate embedding for a single text."""
|
||||
try:
|
||||
if config.EMBEDDING_BACKEND == "local":
|
||||
embeddings = generate_local_embeddings([request.text])
|
||||
else:
|
||||
embeddings = await generate_openai_embeddings([request.text])
|
||||
|
||||
return EmbedSingleResponse(
|
||||
embedding=embeddings[0] if embeddings else [],
|
||||
model=config.LOCAL_EMBEDDING_MODEL if config.EMBEDDING_BACKEND == "local" else config.OPENAI_EMBEDDING_MODEL,
|
||||
dimensions=config.get_current_dimensions()
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Embedding error: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@app.post("/rerank", response_model=RerankResponse)
|
||||
async def rerank_documents(request: RerankRequest):
|
||||
"""Re-rank documents based on query relevance."""
|
||||
if not request.documents:
|
||||
return RerankResponse(
|
||||
results=[],
|
||||
model=config.LOCAL_RERANKER_MODEL if config.RERANKER_BACKEND == "local" else "cohere-rerank"
|
||||
)
|
||||
|
||||
try:
|
||||
if config.RERANKER_BACKEND == "local":
|
||||
results = rerank_local(request.query, request.documents, request.top_k)
|
||||
else:
|
||||
results = await rerank_cohere(request.query, request.documents, request.top_k)
|
||||
|
||||
return RerankResponse(
|
||||
results=results,
|
||||
model=config.LOCAL_RERANKER_MODEL if config.RERANKER_BACKEND == "local" else "cohere-rerank-multilingual-v3.0"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Rerank error: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@app.post("/chunk", response_model=ChunkResponse)
|
||||
async def chunk_text(request: ChunkRequest):
|
||||
"""Chunk text into smaller pieces."""
|
||||
if not request.text:
|
||||
return ChunkResponse(chunks=[], count=0, strategy=request.strategy)
|
||||
|
||||
try:
|
||||
if request.strategy == "semantic":
|
||||
overlap_sentences = max(1, request.overlap // 100)
|
||||
chunks = chunk_text_semantic(request.text, request.chunk_size, overlap_sentences)
|
||||
else:
|
||||
chunks = chunk_text_recursive(request.text, request.chunk_size, request.overlap)
|
||||
|
||||
return ChunkResponse(
|
||||
chunks=chunks,
|
||||
count=len(chunks),
|
||||
strategy=request.strategy
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Chunking error: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@app.post("/extract-pdf", response_model=ExtractPDFResponse)
|
||||
async def extract_pdf(file: UploadFile = File(...)):
|
||||
"""Extract text from PDF file."""
|
||||
pdf_content = await file.read()
|
||||
available = detect_pdf_backends()
|
||||
|
||||
if not available:
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail="No PDF backend available. Install: pip install pypdf unstructured"
|
||||
)
|
||||
|
||||
backend = config.PDF_EXTRACTION_BACKEND
|
||||
if backend == "auto":
|
||||
backend = "unstructured" if "unstructured" in available else "pypdf"
|
||||
|
||||
try:
|
||||
if backend == "unstructured" and "unstructured" in available:
|
||||
return extract_pdf_unstructured(pdf_content)
|
||||
elif "pypdf" in available:
|
||||
return extract_pdf_pypdf(pdf_content)
|
||||
else:
|
||||
raise HTTPException(status_code=500, detail=f"Backend {backend} not available")
|
||||
except Exception as e:
|
||||
logger.error(f"PDF extraction error: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Main
|
||||
# =============================================================================
|
||||
|
||||
if __name__ == "__main__":
|
||||
import uvicorn
|
||||
uvicorn.run(app, host="0.0.0.0", port=config.SERVICE_PORT)
|
||||
23
embedding-service/requirements.txt
Normal file
23
embedding-service/requirements.txt
Normal file
@@ -0,0 +1,23 @@
|
||||
# Embedding Service Dependencies
|
||||
# This service handles ML-heavy operations (embeddings, re-ranking, PDF extraction)
|
||||
|
||||
# Web Framework
|
||||
fastapi>=0.109.0
|
||||
uvicorn[standard]>=0.27.0
|
||||
pydantic>=2.0.0
|
||||
python-multipart>=0.0.6
|
||||
|
||||
# ML / Embeddings
|
||||
torch>=2.0.0
|
||||
sentence-transformers>=2.2.0
|
||||
|
||||
# PDF Extraction
|
||||
unstructured>=0.12.0
|
||||
pypdf>=4.0.0
|
||||
python-magic>=0.4.27
|
||||
|
||||
# HTTP Client (for OpenAI/Cohere API calls)
|
||||
httpx>=0.26.0
|
||||
|
||||
# Utilities
|
||||
python-dotenv>=1.0.0
|
||||
Reference in New Issue
Block a user