# breakpilot-lehrer/klausur-service/backend/zeugnis/storage.py

"""
Zeugnis Crawler - Embedding generation, MinIO upload, and Qdrant indexing.
"""

import io
import os
import uuid
from datetime import datetime
from typing import Optional, List, Dict, Any


# =============================================================================
# Configuration
# =============================================================================

# Every setting below is read from the environment once, at import time;
# the second argument to os.getenv() is the fallback used when it is unset.

# URL passed to QdrantClient in index_in_qdrant().
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
# Endpoint handed to the Minio client in upload_to_minio().
MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT", "localhost:9000")
# NOTE(review): the fallback credentials look like placeholders — confirm that
# real deployments always set MINIO_ACCESS_KEY / MINIO_SECRET_KEY.
MINIO_ACCESS_KEY = os.getenv("MINIO_ACCESS_KEY", "test-access-key")
MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY", "test-secret-key")
# Bucket that upload_to_minio() writes into (created there if missing).
MINIO_BUCKET = os.getenv("MINIO_BUCKET", "breakpilot-rag")
# Selects the embedding provider; this module handles "local" and "openai",
# any other value makes generate_embeddings() return an empty list.
EMBEDDING_BACKEND = os.getenv("EMBEDDING_BACKEND", "local")

# Name of the Qdrant collection that index_in_qdrant() creates and upserts into.
ZEUGNIS_COLLECTION = "bp_zeugnis"


# =============================================================================
# Embedding Generation
# =============================================================================

# Module-level cache for the lazily created SentenceTransformer instance.
_embedding_model = None


def get_embedding_model():
    """Return the cached local embedding model, loading it on first use.

    The model is only loaded when EMBEDDING_BACKEND is "local". If the
    sentence-transformers package cannot be imported, a warning is printed
    and the cache stays empty, so the load is attempted again on the next
    call.

    Returns:
        The cached model, or None when the backend is not "local" or the
        model could not be loaded.
    """
    global _embedding_model

    # Guard clause: nothing to do if a model is cached or the local backend
    # is not selected.
    already_loaded = _embedding_model is not None
    if already_loaded or EMBEDDING_BACKEND != "local":
        return _embedding_model

    try:
        # Imported lazily so the module stays importable without the package.
        from sentence_transformers import SentenceTransformer
        loaded = SentenceTransformer("all-MiniLM-L6-v2")
        _embedding_model = loaded
        print("Loaded local embedding model: all-MiniLM-L6-v2")
    except ImportError:
        print("Warning: sentence-transformers not installed")

    return _embedding_model


async def generate_embeddings(texts: List[str]) -> List[List[float]]:
    """Compute one embedding vector per input text.

    The provider is chosen by EMBEDDING_BACKEND:

    * "local"  - encode with the model from get_embedding_model().
    * "openai" - call the OpenAI embeddings API with the
      "text-embedding-3-small" model, using OPENAI_API_KEY.

    Args:
        texts: Texts to embed.

    Returns:
        A list of float vectors in input order. An empty list is returned
        when `texts` is empty, the backend is unknown, the local model is
        unavailable, or OPENAI_API_KEY is not set.
    """
    if not texts:
        return []

    if EMBEDDING_BACKEND == "local":
        encoder = get_embedding_model()
        if not encoder:
            return []
        vectors = encoder.encode(texts, show_progress_bar=False)
        # encode() yields array-like rows; convert each to a plain list.
        return [vector.tolist() for vector in vectors]

    if EMBEDDING_BACKEND != "openai":
        return []

    # Imported lazily so the dependency is only needed for this backend.
    import openai

    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        print("Warning: OPENAI_API_KEY not set")
        return []

    openai_client = openai.AsyncOpenAI(api_key=api_key)
    result = await openai_client.embeddings.create(
        input=texts,
        model="text-embedding-3-small",
    )
    return [entry.embedding for entry in result.data]


# =============================================================================
# MinIO Storage
# =============================================================================

async def upload_to_minio(
    content: bytes,
    bundesland: str,
    filename: str,
    content_type: str = "application/pdf",
    year: Optional[int] = None,
) -> Optional[str]:
    """Store a document in the MinIO bucket and return its object name.

    The object is written to
    ``landes-daten/<bundesland>/zeugnis/<year>/<filename>`` inside
    MINIO_BUCKET; the bucket is created first if it does not exist yet.
    TLS is used only when the MINIO_SECURE environment variable is "true"
    (case-insensitive).

    Args:
        content: Raw bytes of the document.
        bundesland: Path segment placed after "landes-daten/".
        filename: Final path segment of the object name.
        content_type: MIME type stored with the object.
        year: Year path segment; the current year is used when this is
            falsy (None or 0).

    Returns:
        The object name on success, or None if anything went wrong (the
        error is printed, not raised).
    """
    try:
        # Imported lazily so a missing package becomes a failed upload
        # instead of an import error for the whole module.
        from minio import Minio

        use_tls = os.getenv("MINIO_SECURE", "false").lower() == "true"
        storage = Minio(
            MINIO_ENDPOINT,
            access_key=MINIO_ACCESS_KEY,
            secret_key=MINIO_SECRET_KEY,
            secure=use_tls,
        )

        bucket_missing = not storage.bucket_exists(MINIO_BUCKET)
        if bucket_missing:
            storage.make_bucket(MINIO_BUCKET)

        folder_year = str(year or datetime.now().year)
        object_key = f"landes-daten/{bundesland}/zeugnis/{folder_year}/{filename}"

        stream = io.BytesIO(content)
        storage.put_object(
            MINIO_BUCKET,
            object_key,
            stream,
            len(content),
            content_type=content_type,
        )
    except Exception as exc:
        print(f"MinIO upload failed: {exc}")
        return None

    return object_key


# =============================================================================
# Qdrant Indexing
# =============================================================================

async def index_in_qdrant(
    doc_id: str,
    chunks: List[str],
    embeddings: List[List[float]],
    metadata: Dict[str, Any],
) -> int:
    """Index document chunks in Qdrant.

    Each (chunk, embedding) pair becomes one point in ZEUGNIS_COLLECTION
    with a random UUID as its id. The collection is created on demand with
    cosine distance and a vector size taken from the first embedding.

    Args:
        doc_id: Identifier stored as "document_id" in every point payload.
        chunks: Chunk texts; only the first 500 characters of each are
            stored in the payload as a preview.
        embeddings: One vector per chunk, in the same order as `chunks`.
        metadata: Optional document attributes; the keys "bundesland",
            "doc_type", "title", "url" and "training_allowed" are copied
            into the payload. None is treated as an empty mapping.

    Returns:
        The number of points upserted. 0 is returned when there is nothing
        to index or when any step fails (the error is printed, not raised).
    """
    chunk_count = len(chunks) if chunks else 0
    vector_count = len(embeddings) if embeddings else 0

    # zip() below silently drops unpaired items, so make a mismatch visible.
    if chunk_count != vector_count:
        print(
            f"Warning: {chunk_count} chunks but {vector_count} embeddings "
            f"for document {doc_id}; only paired items are indexed"
        )

    # Nothing to index: return before touching Qdrant. Otherwise an empty
    # embeddings list would create the collection with a guessed vector
    # size, which may not match the vectors produced by the real backend
    # and would make every later upsert fail.
    if min(chunk_count, vector_count) == 0:
        return 0

    metadata = metadata or {}

    try:
        from qdrant_client import QdrantClient
        from qdrant_client.models import VectorParams, Distance, PointStruct

        client = QdrantClient(url=QDRANT_URL)

        # Ensure collection exists
        collections = client.get_collections().collections
        if not any(c.name == ZEUGNIS_COLLECTION for c in collections):
            client.create_collection(
                collection_name=ZEUGNIS_COLLECTION,
                vectors_config=VectorParams(
                    size=len(embeddings[0]),
                    distance=Distance.COSINE,
                ),
            )
            print(f"Created Qdrant collection: {ZEUGNIS_COLLECTION}")

        # One timestamp for the whole batch so all chunks of a document agree.
        indexed_at = datetime.now().isoformat()

        points = [
            PointStruct(
                id=str(uuid.uuid4()),
                vector=embedding,
                payload={
                    "document_id": doc_id,
                    "chunk_index": i,
                    "chunk_text": chunk[:500],  # Store first 500 chars for preview
                    "bundesland": metadata.get("bundesland"),
                    "doc_type": metadata.get("doc_type"),
                    "title": metadata.get("title"),
                    "source_url": metadata.get("url"),
                    "training_allowed": metadata.get("training_allowed", False),
                    "indexed_at": indexed_at,
                },
            )
            for i, (chunk, embedding) in enumerate(zip(chunks, embeddings))
        ]

        client.upsert(
            collection_name=ZEUGNIS_COLLECTION,
            points=points,
        )

        return len(points)
    except Exception as e:
        print(f"Qdrant indexing failed: {e}")
        return 0