From 92ca5b7ba57c31282aecf232aaa824d6bdece241 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBook-Pro.local>
Date: Fri, 27 Feb 2026 07:46:57 +0100
Subject: [PATCH] feat(rag): use Ollama for embeddings instead of
 embedding-service

Switch to Ollama's bge-m3 model (1024-dim) for generating embeddings,
solving the dimension mismatch with Qdrant collections. Embedding-service
still used for chunking, reranking, and PDF extraction.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 docker-compose.yml              |  4 ++
 rag-service/embedding_client.py | 83 ++++++++++++++++++++++++---------
 2 files changed, 65 insertions(+), 22 deletions(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index 1fecb6b..3da1012 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -385,8 +385,12 @@ services:
       MINIO_BUCKET: ${MINIO_BUCKET:-breakpilot-rag}
       MINIO_SECURE: "false"
       EMBEDDING_SERVICE_URL: http://embedding-service:8087
+      OLLAMA_URL: ${OLLAMA_URL:-http://host.docker.internal:11434}
+      OLLAMA_EMBED_MODEL: ${OLLAMA_EMBED_MODEL:-bge-m3}
       JWT_SECRET: ${JWT_SECRET:-your-super-secret-jwt-key-change-in-production}
       ENVIRONMENT: ${ENVIRONMENT:-development}
+    extra_hosts:
+      - "host.docker.internal:host-gateway"
     depends_on:
       qdrant:
         condition: service_healthy
diff --git a/rag-service/embedding_client.py b/rag-service/embedding_client.py
index fe0bf0d..a1d5125 100644
--- a/rag-service/embedding_client.py
+++ b/rag-service/embedding_client.py
@@ -1,4 +1,5 @@
 import logging
+import os
 from typing import Optional
 
 import httpx
@@ -8,44 +9,82 @@ from config import settings
 logger = logging.getLogger("rag-service.embedding")
 
 _TIMEOUT = httpx.Timeout(timeout=120.0, connect=10.0)
+_EMBED_TIMEOUT = httpx.Timeout(timeout=300.0, connect=10.0)
+
+# Ollama config for embeddings (bge-m3, 1024-dim)
+_OLLAMA_URL = os.getenv("OLLAMA_URL", "http://ollama:11434")
+_OLLAMA_EMBED_MODEL = os.getenv("OLLAMA_EMBED_MODEL", "bge-m3")
+
+# Batch size for Ollama embedding requests
+_EMBED_BATCH_SIZE = int(os.getenv("EMBED_BATCH_SIZE", "32"))
 
 
 class EmbeddingClient:
-    """HTTP client for the embedding-service (port 8087)."""
+    """
+    Hybrid client:
+    - Embeddings via Ollama (bge-m3, 1024-dim) for Qdrant compatibility
+    - Chunking + PDF extraction via embedding-service (port 8087)
+    """
 
     def __init__(self) -> None:
-        self._base_url: str = settings.EMBEDDING_SERVICE_URL.rstrip("/")
+        self._embed_svc_url: str = settings.EMBEDDING_SERVICE_URL.rstrip("/")
+        self._ollama_url: str = _OLLAMA_URL.rstrip("/")
+        self._embed_model: str = _OLLAMA_EMBED_MODEL
 
-    def _url(self, path: str) -> str:
-        return f"{self._base_url}{path}"
+    def _svc_url(self, path: str) -> str:
+        return f"{self._embed_svc_url}{path}"
 
     # ------------------------------------------------------------------
-    # Embeddings
+    # Embeddings (via Ollama)
     # ------------------------------------------------------------------
 
     async def generate_embeddings(self, texts: list[str]) -> list[list[float]]:
         """
-        Send a batch of texts to the embedding service and return a list of
-        embedding vectors.
+        Generate embeddings via Ollama's bge-m3 model.
+        Processes in batches to avoid timeout on large uploads.
         """
-        async with httpx.AsyncClient(timeout=_TIMEOUT) as client:
-            response = await client.post(
-                self._url("/embed"),
-                json={"texts": texts},
-            )
-            response.raise_for_status()
-            data = response.json()
-            return data.get("embeddings", [])
+        all_embeddings: list[list[float]] = []
+
+        for i in range(0, len(texts), _EMBED_BATCH_SIZE):
+            batch = texts[i : i + _EMBED_BATCH_SIZE]
+            batch_embeddings = []
+
+            async with httpx.AsyncClient(timeout=_EMBED_TIMEOUT) as client:
+                for text in batch:
+                    response = await client.post(
+                        f"{self._ollama_url}/api/embeddings",
+                        json={
+                            "model": self._embed_model,
+                            "prompt": text,
+                        },
+                    )
+                    response.raise_for_status()
+                    data = response.json()
+                    embedding = data.get("embedding", [])
+                    if not embedding:
+                        raise ValueError(
+                            f"Ollama returned empty embedding for model {self._embed_model}"
+                        )
+                    batch_embeddings.append(embedding)
+
+            all_embeddings.extend(batch_embeddings)
+
+            if i + _EMBED_BATCH_SIZE < len(texts):
+                logger.info(
+                    "Embedding progress: %d/%d", len(all_embeddings), len(texts)
+                )
+
+        return all_embeddings
 
     async def generate_single_embedding(self, text: str) -> list[float]:
         """Convenience wrapper for a single text."""
         results = await self.generate_embeddings([text])
         if not results:
-            raise ValueError("Embedding service returned empty result")
+            raise ValueError("Ollama returned empty result")
         return results[0]
 
     # ------------------------------------------------------------------
-    # Reranking
+    # Reranking (via embedding-service)
     # ------------------------------------------------------------------
 
     async def rerank_documents(
@@ -60,7 +99,7 @@ class EmbeddingClient:
         """
         async with httpx.AsyncClient(timeout=_TIMEOUT) as client:
             response = await client.post(
-                self._url("/rerank"),
+                self._svc_url("/rerank"),
                 json={
                     "query": query,
                     "documents": documents,
@@ -72,7 +111,7 @@ class EmbeddingClient:
             return data.get("results", [])
 
     # ------------------------------------------------------------------
-    # Chunking
+    # Chunking (via embedding-service)
     # ------------------------------------------------------------------
 
     async def chunk_text(
@@ -88,7 +127,7 @@ class EmbeddingClient:
         """
         async with httpx.AsyncClient(timeout=_TIMEOUT) as client:
             response = await client.post(
-                self._url("/chunk"),
+                self._svc_url("/chunk"),
                 json={
                     "text": text,
                     "strategy": strategy,
@@ -101,7 +140,7 @@ class EmbeddingClient:
             return data.get("chunks", [])
 
     # ------------------------------------------------------------------
-    # PDF extraction
+    # PDF extraction (via embedding-service)
     # ------------------------------------------------------------------
 
     async def extract_pdf(self, pdf_bytes: bytes) -> str:
@@ -111,7 +150,7 @@ class EmbeddingClient:
         """
         async with httpx.AsyncClient(timeout=_TIMEOUT) as client:
             response = await client.post(
-                self._url("/extract-pdf"),
+                self._svc_url("/extract-pdf"),
                 files={"file": ("document.pdf", pdf_bytes, "application/pdf")},
             )
             response.raise_for_status()