"""
DSFA RAG Embedding Service Integration.

Handles embedding generation, text extraction, and fallback logic.
"""

import hashlib
import logging
import math
import os
import re
import struct
from typing import List

import httpx

logger = logging.getLogger(__name__)

# Embedding service configuration
EMBEDDING_SERVICE_URL = os.getenv("EMBEDDING_SERVICE_URL", "http://172.18.0.13:8087")

# Length of the vectors produced by the hash-based fallback.
_FALLBACK_DIM = 1024

# Patterns used by the plain-HTML fallback extractor; compiled once.
_SCRIPT_RE = re.compile(r'<script[^>]*>.*?</script>', re.DOTALL | re.IGNORECASE)
_STYLE_RE = re.compile(r'<style[^>]*>.*?</style>', re.DOTALL | re.IGNORECASE)
_TAG_RE = re.compile(r'<[^>]+>')
_WHITESPACE_RE = re.compile(r'\s+')


async def get_embedding(text: str) -> List[float]:
    """
    Get embedding for text using the embedding-service.

    Uses BGE-M3 model which produces 1024-dimensional vectors.

    Args:
        text: The text to embed.

    Returns:
        The value of the ``embedding`` key of the service response (an empty
        list if the key is absent). If the request fails or the response body
        is not valid JSON, a deterministic hash-based pseudo-embedding is
        returned instead.
    """
    async with httpx.AsyncClient(timeout=60.0) as client:
        try:
            response = await client.post(
                f"{EMBEDDING_SERVICE_URL}/embed-single",
                json={"text": text},
            )
            response.raise_for_status()
            data = response.json()
            return data.get("embedding", [])
        except (httpx.HTTPError, ValueError) as e:
            # ValueError covers a malformed (non-JSON) response body.
            logger.error("Embedding service error: %s", e)
            # Fallback to hash-based pseudo-embedding for development
            return _generate_fallback_embedding(text)


async def get_embeddings_batch(texts: List[str]) -> List[List[float]]:
    """
    Get embeddings for multiple texts in batch.

    Args:
        texts: The texts to embed.

    Returns:
        The value of the ``embeddings`` key of the service response (an empty
        list if the key is absent). If the request fails or the response body
        is not valid JSON, one hash-based pseudo-embedding per input text is
        returned instead.
    """
    async with httpx.AsyncClient(timeout=120.0) as client:
        try:
            response = await client.post(
                f"{EMBEDDING_SERVICE_URL}/embed",
                json={"texts": texts},
            )
            response.raise_for_status()
            data = response.json()
            return data.get("embeddings", [])
        except (httpx.HTTPError, ValueError) as e:
            logger.error("Embedding service batch error: %s", e)
            # Fallback
            return [_generate_fallback_embedding(t) for t in texts]


async def extract_text_from_url(url: str) -> str:
    """
    Extract text from a document URL (PDF, HTML, etc.).

    First asks the embedding-service's ``/extract-pdf`` endpoint to do the
    extraction. If that fails, the URL is fetched directly and, when the
    response is HTML, reduced to plain text with a simple regex-based strip.

    Args:
        url: Location of the document.

    Returns:
        The extracted text, or ``""`` if the fallback response is not HTML or
        the fallback fetch fails as well.
    """
    # NOTE(review): the fallback fetches ``url`` from this host and follows
    # redirects. If callers can pass untrusted URLs, confirm it is validated
    # upstream (SSRF).
    async with httpx.AsyncClient(timeout=120.0) as client:
        try:
            # First try to use the embedding-service's extract-pdf endpoint
            response = await client.post(
                f"{EMBEDDING_SERVICE_URL}/extract-pdf",
                json={"url": url},
            )
            response.raise_for_status()
            data = response.json()
            return data.get("text", "")
        except (httpx.HTTPError, ValueError) as e:
            logger.error("PDF extraction error for %s: %s", url, e)

        # Fallback: try to fetch HTML content directly
        try:
            response = await client.get(url, follow_redirects=True)
            response.raise_for_status()
            content_type = response.headers.get("content-type", "")
            if "html" not in content_type:
                return ""
            return _html_to_text(response.text)
        except Exception as fetch_err:
            # Best-effort path: any failure here yields an empty result.
            logger.error("Fallback fetch error for %s: %s", url, fetch_err)
            return ""


def _html_to_text(html: str) -> str:
    """Strip script/style elements and all tags from *html*, collapsing whitespace."""
    # Remove scripts and styles (including their contents) before the generic
    # tag strip, otherwise their bodies would end up in the extracted text.
    html = _SCRIPT_RE.sub('', html)
    html = _STYLE_RE.sub('', html)
    # Remove tags
    text = _TAG_RE.sub(' ', html)
    # Clean whitespace
    return _WHITESPACE_RE.sub(' ', text).strip()


def _generate_fallback_embedding(text: str) -> List[float]:
    """
    Generate deterministic pseudo-embedding from text hash.

    Used as fallback when embedding service is unavailable. The SHA-256 digest
    of *text* is read as eight 32-bit floats, each reduced modulo 1.0, and that
    sequence is repeated until it is 1024 values long.

    Args:
        text: The text to derive the pseudo-embedding from.

    Returns:
        A list of 1024 finite floats.
    """
    hash_bytes = hashlib.sha256(text.encode()).digest()

    seed: List[float] = []
    for i in range(0, min(len(hash_bytes), 128), 4):
        # Explicit little-endian so the result does not depend on the host's
        # native byte order.
        val = struct.unpack('<f', hash_bytes[i:i + 4])[0]
        # Arbitrary bit patterns can decode to NaN or +/-inf, and both turn
        # into NaN under ``% 1.0``; map them to 0.0 so the vector stays finite.
        seed.append(val % 1.0 if math.isfinite(val) else 0.0)

    # Pad to 1024 dimensions by repeating the seed values cyclically.
    repeats = -(-_FALLBACK_DIM // len(seed))  # ceiling division
    return (seed * repeats)[:_FALLBACK_DIM]