"""
DSFA RAG Embedding Service Integration.

Handles embedding generation, text extraction, and fallback logic.
"""

import hashlib
import html
import logging
import math
import os
import re
import struct
from typing import List

import httpx

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Embedding service configuration
# Base URL that the endpoint paths used in this module are appended to.
# Read from the EMBEDDING_SERVICE_URL environment variable; the hard-coded
# address below is used only when that variable is unset.
EMBEDDING_SERVICE_URL = os.getenv("EMBEDDING_SERVICE_URL", "http://172.18.0.13:8087")


async def get_embedding(text: str) -> List[float]:
    """
    Fetch the embedding vector for a single text from the embedding-service.

    The text is POSTed to the service's ``/embed-single`` endpoint and the
    ``"embedding"`` field of the JSON reply is returned (an empty list when
    the reply has no such field). Per the service contract noted in this
    module, the backing model is BGE-M3 with 1024-dimensional vectors.

    If the request fails with an ``httpx.HTTPError`` (transport failure or
    non-2xx status), the error is logged and a hash-based pseudo-embedding
    is returned instead, so development setups work without the service.
    """
    endpoint = f"{EMBEDDING_SERVICE_URL}/embed-single"
    async with httpx.AsyncClient(timeout=60.0) as client:
        try:
            reply = await client.post(endpoint, json={"text": text})
            reply.raise_for_status()
            payload = reply.json()
        except httpx.HTTPError as exc:
            logger.error(f"Embedding service error: {exc}")
            # Fallback to hash-based pseudo-embedding for development
            return _generate_fallback_embedding(text)
        return payload.get("embedding", [])


async def get_embeddings_batch(texts: List[str]) -> List[List[float]]:
    """
    Get embeddings for multiple texts in one request.

    The texts are POSTed to the embedding-service's ``/embed`` endpoint and
    the ``"embeddings"`` field of the JSON reply is returned (an empty list
    when the reply has no such field).

    Args:
        texts: Texts to embed. An empty list short-circuits to ``[]``
            without contacting the service.

    Returns:
        The embeddings reported by the service. If the request fails with an
        ``httpx.HTTPError``, the error is logged and one hash-based
        pseudo-embedding per input text is returned instead.
    """
    # Nothing to embed: skip the HTTP round-trip (and a possible spurious
    # error log if the service rejects empty batches).
    if not texts:
        return []

    async with httpx.AsyncClient(timeout=120.0) as client:
        try:
            response = await client.post(
                f"{EMBEDDING_SERVICE_URL}/embed",
                json={"texts": texts},
            )
            response.raise_for_status()
            data = response.json()
            return data.get("embeddings", [])
        except httpx.HTTPError as e:
            logger.error(f"Embedding service batch error: {e}")
            # Fallback: deterministic pseudo-embeddings, one per input text
            return [_generate_fallback_embedding(t) for t in texts]


def _html_to_text(markup: str) -> str:
    """Reduce an HTML document to whitespace-normalized plain text."""
    # Remove script and style elements together with their contents
    markup = re.sub(r'<script[^>]*>.*?</script>', '', markup, flags=re.DOTALL | re.IGNORECASE)
    markup = re.sub(r'<style[^>]*>.*?</style>', '', markup, flags=re.DOTALL | re.IGNORECASE)
    # Replace every remaining tag with a space so adjacent words stay separated
    text = re.sub(r'<[^>]+>', ' ', markup)
    # Decode character references (&amp;, &nbsp;, &auml;, ...) only after the
    # tags are gone, so an escaped "&lt;b&gt;" is kept as literal text.
    text = html.unescape(text)
    # Collapse whitespace runs (including decoded non-breaking spaces)
    return re.sub(r'\s+', ' ', text).strip()


async def extract_text_from_url(url: str) -> str:
    """
    Extract text from a document URL (PDF, HTML, etc.).

    First asks the embedding-service's ``/extract-pdf`` endpoint to process
    the URL and returns the ``"text"`` field of its JSON reply (``""`` when
    the field is absent). If that request fails with an ``httpx.HTTPError``,
    the error is logged and the URL is fetched directly: an HTML response is
    converted to plain text, any other content type yields ``""``.

    Args:
        url: Location of the document to extract text from.

    Returns:
        The extracted text, or ``""`` when the fallback response is not HTML
        or the fallback fetch itself fails (the failure is logged).
    """
    async with httpx.AsyncClient(timeout=120.0) as client:
        try:
            # First try to use the embedding-service's extract-pdf endpoint
            response = await client.post(
                f"{EMBEDDING_SERVICE_URL}/extract-pdf",
                json={"url": url},
            )
            response.raise_for_status()
            data = response.json()
            return data.get("text", "")
        except httpx.HTTPError as e:
            logger.error(f"PDF extraction error for {url}: {e}")

        # Fallback: try to fetch HTML content directly
        try:
            response = await client.get(url, follow_redirects=True)
            response.raise_for_status()
            content_type = response.headers.get("content-type", "")
            # Media types are case-insensitive, so compare in lower case
            if "html" not in content_type.lower():
                return ""
            return _html_to_text(response.text)
        except Exception as fetch_err:
            # Deliberately broad: extraction is best-effort and must not raise
            logger.error(f"Fallback fetch error for {url}: {fetch_err}")
            return ""


def _generate_fallback_embedding(text: str) -> List[float]:
    """
    Generate a deterministic pseudo-embedding from the SHA-256 hash of text.

    Used as fallback when the embedding service is unavailable. The vector
    carries no semantic meaning; it only guarantees that equal texts map to
    equal vectors.

    Args:
        text: Input text to hash.

    Returns:
        A list of 1024 finite floats in the range [0.0, 1.0]: the eight
        values derived from the 32-byte digest, repeated to fill the vector.
    """
    dimensions = 1024
    digest = hashlib.sha256(text.encode()).digest()

    base = []
    # Explicit little-endian layout keeps the output identical on every
    # platform (native order would differ on big-endian hosts).
    for (raw,) in struct.iter_unpack("<f", digest):
        # Arbitrary hash bits can decode to NaN or +/-inf, and `x % 1.0` of
        # those is NaN, which is not a usable vector component; map such
        # values to 0.0 instead.
        base.append(raw % 1.0 if math.isfinite(raw) else 0.0)

    # Tile the base values up to the target dimensionality
    repeats = -(-dimensions // len(base))  # ceiling division
    return (base * repeats)[:dimensions]