backend-lehrer (10 files): - game/database.py (785 → 5), correction_api.py (683 → 4) - classroom_engine/antizipation.py (676 → 5) - llm_gateway schools/edu_search already done in prior batch klausur-service (12 files): - orientation_crop_api.py (694 → 5), pdf_export.py (677 → 4) - zeugnis_crawler.py (676 → 5), grid_editor_api.py (671 → 5) - eh_templates.py (658 → 5), mail/api.py (651 → 5) - qdrant_service.py (638 → 5), training_api.py (625 → 4) website (6 pages): - middleware (696 → 8), mail (733 → 6), consent (628 → 8) - compliance/risks (622 → 5), export (502 → 5), brandbook (629 → 7) studio-v2 (3 components): - B2BMigrationWizard (848 → 3), CleanupPanel (765 → 2) - dashboard-experimental (739 → 2) admin-lehrer (4 files): - uebersetzungen (769 → 4), manager (670 → 2) - ChunkBrowserQA (675 → 6), dsfa/page (674 → 5) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
194 lines
5.1 KiB
Python
194 lines
5.1 KiB
Python
"""
Qdrant Vector Database Service — core client and BYOEH functions.
"""
import os
from typing import Dict, List, Optional

from qdrant_client import QdrantClient
from qdrant_client.http import models
from qdrant_client.models import (
    Distance,
    FieldCondition,
    Filter,
    MatchValue,
    PointStruct,
    VectorParams,
)

# Qdrant server location; overridable via environment for deployed setups.
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
# Single shared collection for all BYOEH data; tenant isolation is enforced
# through payload filters (see search_eh), not per-tenant collections.
COLLECTION_NAME = "bp_eh"
VECTOR_SIZE = 1536  # embedding dimensionality of OpenAI text-embedding-3-small

# Lazily created process-wide client (see get_qdrant_client). The quoted
# annotation keeps the name lazy at annotation-evaluation time.
_client: Optional["QdrantClient"] = None

def get_qdrant_client() -> QdrantClient:
    """Get or create the Qdrant client singleton.

    Returns:
        The process-wide QdrantClient, created lazily on first call so that
        importing this module does not open a connection.
    """
    global _client
    if _client is None:
        _client = QdrantClient(url=QDRANT_URL)
    return _client

async def init_qdrant_collection() -> bool:
    """Initialize the BYOEH Qdrant collection if it does not exist.

    Idempotent: safe to call on every service startup.

    Returns:
        True when the collection exists or was created; False when Qdrant
        was unreachable or creation failed (callers treat False as
        "vector search unavailable", so no exception is propagated).
    """
    try:
        client = get_qdrant_client()

        # One round-trip existence check instead of an error-based probe.
        existing = {c.name for c in client.get_collections().collections}

        if COLLECTION_NAME not in existing:
            client.create_collection(
                collection_name=COLLECTION_NAME,
                vectors_config=VectorParams(
                    size=VECTOR_SIZE,
                    distance=Distance.COSINE,
                ),
            )
            print(f"Created Qdrant collection: {COLLECTION_NAME}")
        else:
            print(f"Qdrant collection {COLLECTION_NAME} already exists")

        return True
    except Exception as e:
        # Deliberate best-effort: log and report failure instead of crashing.
        print(f"Failed to initialize Qdrant: {e}")
        return False

async def index_eh_chunks(
    eh_id: str,
    tenant_id: str,
    subject: str,
    chunks: List[Dict]
) -> int:
    """
    Index EH chunks in Qdrant.

    Args:
        eh_id: Erwartungshorizont ID
        tenant_id: Tenant/School ID for isolation
        subject: Subject (deutsch, englisch, etc.)
        chunks: List of {text, embedding, encrypted_content}

    Returns:
        Number of indexed chunks
    """
    import uuid  # local import keeps this fix self-contained

    client = get_qdrant_client()

    points = []
    for i, chunk in enumerate(chunks):
        # Qdrant only accepts unsigned integers or UUIDs as point IDs, so
        # the previous f"{eh_id}_{i}" string would be rejected by the
        # server. Derive a deterministic UUID from that same string:
        # stable across re-index runs, so upserts overwrite instead of
        # creating duplicates.
        point_id = str(uuid.uuid5(uuid.NAMESPACE_URL, f"{eh_id}_{i}"))
        points.append(
            PointStruct(
                id=point_id,
                vector=chunk["embedding"],
                payload={
                    "tenant_id": tenant_id,
                    "eh_id": eh_id,
                    "chunk_index": i,
                    "subject": subject,
                    "encrypted_content": chunk.get("encrypted_content", ""),
                    "training_allowed": False  # ALWAYS FALSE - critical for compliance
                }
            )
        )

    if points:
        client.upsert(collection_name=COLLECTION_NAME, points=points)

    return len(points)

async def search_eh(
    query_embedding: List[float],
    tenant_id: str,
    subject: Optional[str] = None,
    limit: int = 5
) -> List[Dict]:
    """
    Semantic search in tenant's Erwartungshorizonte.

    Args:
        query_embedding: Query vector (1536 dimensions)
        tenant_id: Tenant ID for isolation
        subject: Optional subject filter
        limit: Max results

    Returns:
        List of matching chunks with scores, best match first.
    """
    client = get_qdrant_client()

    # The tenant filter is mandatory — it is the isolation boundary
    # between schools sharing the single collection.
    must_conditions = [
        FieldCondition(key="tenant_id", match=MatchValue(value=tenant_id))
    ]
    if subject:
        must_conditions.append(
            FieldCondition(key="subject", match=MatchValue(value=subject))
        )

    results = client.search(
        collection_name=COLLECTION_NAME,
        query_vector=query_embedding,
        query_filter=Filter(must=must_conditions),
        limit=limit,
    )

    return [
        {
            "id": str(r.id),
            "score": r.score,
            "eh_id": r.payload.get("eh_id"),
            "chunk_index": r.payload.get("chunk_index"),
            "encrypted_content": r.payload.get("encrypted_content"),
            "subject": r.payload.get("subject"),
        }
        for r in results
    ]

async def delete_eh_vectors(eh_id: str) -> int:
    """
    Delete all vectors for a specific Erwartungshorizont.

    Args:
        eh_id: Erwartungshorizont ID

    Returns:
        Number of deleted points
    """
    client = get_qdrant_client()

    eh_filter = Filter(
        must=[FieldCondition(key="eh_id", match=MatchValue(value=eh_id))]
    )

    deleted = 0
    offset = None
    # Paginate through all matches: a single scroll capped at 1000 would
    # silently leave points behind for EHs with more than 1000 chunks.
    while True:
        records, offset = client.scroll(
            collection_name=COLLECTION_NAME,
            scroll_filter=eh_filter,
            limit=1000,
            offset=offset,
        )
        if not records:
            break
        point_ids = [str(p.id) for p in records]
        client.delete(
            collection_name=COLLECTION_NAME,
            points_selector=models.PointIdsList(points=point_ids)
        )
        deleted += len(point_ids)
        if offset is None:  # no further pages
            break

    return deleted

async def get_collection_info() -> Dict:
    """Get collection statistics.

    Returns:
        Dict with name, vector/point counts and status, or
        {"error": ...} when Qdrant is unreachable.
    """
    try:
        client = get_qdrant_client()
        info = client.get_collection(COLLECTION_NAME)
        return {
            "name": COLLECTION_NAME,
            "vectors_count": info.vectors_count,
            "points_count": info.points_count,
            "status": info.status.value
        }
    except Exception as e:
        # Best-effort diagnostics endpoint: report instead of raising.
        return {"error": str(e)}