From 92ca5b7ba57c31282aecf232aaa824d6bdece241 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Fri, 27 Feb 2026 07:46:57 +0100 Subject: [PATCH] feat(rag): use Ollama for embeddings instead of embedding-service Switch to Ollama's bge-m3 model (1024-dim) for generating embeddings, solving the dimension mismatch with Qdrant collections. Embedding-service still used for chunking, reranking, and PDF extraction. Co-Authored-By: Claude Opus 4.6 --- docker-compose.yml | 4 ++ rag-service/embedding_client.py | 83 ++++++++++++++++++++++++--------- 2 files changed, 65 insertions(+), 22 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 1fecb6b..3da1012 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -385,8 +385,12 @@ services: MINIO_BUCKET: ${MINIO_BUCKET:-breakpilot-rag} MINIO_SECURE: "false" EMBEDDING_SERVICE_URL: http://embedding-service:8087 + OLLAMA_URL: ${OLLAMA_URL:-http://host.docker.internal:11434} + OLLAMA_EMBED_MODEL: ${OLLAMA_EMBED_MODEL:-bge-m3} JWT_SECRET: ${JWT_SECRET:-your-super-secret-jwt-key-change-in-production} ENVIRONMENT: ${ENVIRONMENT:-development} + extra_hosts: + - "host.docker.internal:host-gateway" depends_on: qdrant: condition: service_healthy diff --git a/rag-service/embedding_client.py b/rag-service/embedding_client.py index fe0bf0d..a1d5125 100644 --- a/rag-service/embedding_client.py +++ b/rag-service/embedding_client.py @@ -1,4 +1,5 @@ import logging +import os from typing import Optional import httpx @@ -8,44 +9,82 @@ from config import settings logger = logging.getLogger("rag-service.embedding") _TIMEOUT = httpx.Timeout(timeout=120.0, connect=10.0) +_EMBED_TIMEOUT = httpx.Timeout(timeout=300.0, connect=10.0) + +# Ollama config for embeddings (bge-m3, 1024-dim) +_OLLAMA_URL = os.getenv("OLLAMA_URL", "http://ollama:11434") +_OLLAMA_EMBED_MODEL = os.getenv("OLLAMA_EMBED_MODEL", "bge-m3") + +# Batch size for Ollama embedding requests +_EMBED_BATCH_SIZE = int(os.getenv("EMBED_BATCH_SIZE", "32")) class EmbeddingClient: - """HTTP client for the embedding-service (port 8087).""" + """ + Hybrid client: + - Embeddings via Ollama (bge-m3, 1024-dim) for Qdrant compatibility + - Chunking + PDF extraction via embedding-service (port 8087) + """ def __init__(self) -> None: - self._base_url: str = settings.EMBEDDING_SERVICE_URL.rstrip("/") + self._embed_svc_url: str = settings.EMBEDDING_SERVICE_URL.rstrip("/") + self._ollama_url: str = _OLLAMA_URL.rstrip("/") + self._embed_model: str = _OLLAMA_EMBED_MODEL - def _url(self, path: str) -> str: - return f"{self._base_url}{path}" + def _svc_url(self, path: str) -> str: + return f"{self._embed_svc_url}{path}" # ------------------------------------------------------------------ - # Embeddings + # Embeddings (via Ollama) # ------------------------------------------------------------------ async def generate_embeddings(self, texts: list[str]) -> list[list[float]]: """ - Send a batch of texts to the embedding service and return a list of - embedding vectors. + Generate embeddings via Ollama's bge-m3 model. + Processes in batches to avoid timeout on large uploads. """ - async with httpx.AsyncClient(timeout=_TIMEOUT) as client: - response = await client.post( - self._url("/embed"), - json={"texts": texts}, - ) - response.raise_for_status() - data = response.json() - return data.get("embeddings", []) + all_embeddings: list[list[float]] = [] + + for i in range(0, len(texts), _EMBED_BATCH_SIZE): + batch = texts[i : i + _EMBED_BATCH_SIZE] + batch_embeddings = [] + + async with httpx.AsyncClient(timeout=_EMBED_TIMEOUT) as client: + for text in batch: + response = await client.post( + f"{self._ollama_url}/api/embeddings", + json={ + "model": self._embed_model, + "prompt": text, + }, + ) + response.raise_for_status() + data = response.json() + embedding = data.get("embedding", []) + if not embedding: + raise ValueError( + f"Ollama returned empty embedding for model {self._embed_model}" + ) + batch_embeddings.append(embedding) + + all_embeddings.extend(batch_embeddings) + + if i + _EMBED_BATCH_SIZE < len(texts): + logger.info( + "Embedding progress: %d/%d", len(all_embeddings), len(texts) + ) + + return all_embeddings async def generate_single_embedding(self, text: str) -> list[float]: """Convenience wrapper for a single text.""" results = await self.generate_embeddings([text]) if not results: - raise ValueError("Embedding service returned empty result") + raise ValueError("Ollama returned empty result") return results[0] # ------------------------------------------------------------------ - # Reranking + # Reranking (via embedding-service) # ------------------------------------------------------------------ async def rerank_documents( @@ -60,7 +99,7 @@ class EmbeddingClient: """ async with httpx.AsyncClient(timeout=_TIMEOUT) as client: response = await client.post( - self._url("/rerank"), + self._svc_url("/rerank"), json={ "query": query, "documents": documents, @@ -72,7 +111,7 @@ class EmbeddingClient: return data.get("results", []) # ------------------------------------------------------------------ - # Chunking + # Chunking (via embedding-service) # ------------------------------------------------------------------ async def chunk_text( @@ -88,7 +127,7 @@ class EmbeddingClient: """ async with httpx.AsyncClient(timeout=_TIMEOUT) as client: response = await client.post( - self._url("/chunk"), + self._svc_url("/chunk"), json={ "text": text, "strategy": strategy, @@ -101,7 +140,7 @@ class EmbeddingClient: return data.get("chunks", []) # ------------------------------------------------------------------ - # PDF extraction + # PDF extraction (via embedding-service) # ------------------------------------------------------------------ async def extract_pdf(self, pdf_bytes: bytes) -> str: @@ -111,7 +150,7 @@ class EmbeddingClient: """ async with httpx.AsyncClient(timeout=_TIMEOUT) as client: response = await client.post( - self._url("/extract-pdf"), + self._svc_url("/extract-pdf"), files={"file": ("document.pdf", pdf_bytes, "application/pdf")}, ) response.raise_for_status()