"""
Zeugnis Crawler - Embedding generation, MinIO upload, and Qdrant indexing.
"""

import io
import os
import uuid
from datetime import datetime
from typing import Optional, List, Dict, Any


# =============================================================================
# Configuration
# =============================================================================

# All settings are read from the environment once, at import time.
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT", "localhost:9000")
MINIO_ACCESS_KEY = os.getenv("MINIO_ACCESS_KEY", "test-access-key")
MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY", "test-secret-key")
MINIO_BUCKET = os.getenv("MINIO_BUCKET", "breakpilot-rag")
EMBEDDING_BACKEND = os.getenv("EMBEDDING_BACKEND", "local")

ZEUGNIS_COLLECTION = "bp_zeugnis"


# =============================================================================
# Embedding Generation
# =============================================================================

# Lazily created by get_embedding_model(); stays None until a load succeeds.
_embedding_model = None


def get_embedding_model():
    """Return the cached local embedding model, loading it on first use.

    The model is only loaded when ``EMBEDDING_BACKEND`` is ``"local"``.
    ``sentence_transformers`` is imported lazily so the module can be
    imported without it.

    Returns:
        The ``SentenceTransformer`` instance, or ``None`` when the backend is
        not ``"local"`` or ``sentence-transformers`` is not installed. After
        an ``ImportError`` the cache stays empty, so the next call retries.
    """
    global _embedding_model
    if _embedding_model is None and EMBEDDING_BACKEND == "local":
        try:
            from sentence_transformers import SentenceTransformer

            _embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
            print("Loaded local embedding model: all-MiniLM-L6-v2")
        except ImportError:
            print("Warning: sentence-transformers not installed")
    return _embedding_model


async def generate_embeddings(texts: List[str]) -> List[List[float]]:
    """Generate one embedding vector per input text.

    Args:
        texts: Texts to embed.

    Returns:
        A list of embedding vectors in input order. An empty list is returned
        when ``texts`` is empty, when the configured backend is unavailable
        (local model missing, ``OPENAI_API_KEY`` unset), or when
        ``EMBEDDING_BACKEND`` is neither ``"local"`` nor ``"openai"``.
    """
    if not texts:
        return []

    if EMBEDDING_BACKEND == "local":
        model = get_embedding_model()
        # Compare against None explicitly rather than relying on the
        # truthiness of the model object.
        if model is not None:
            embeddings = model.encode(texts, show_progress_bar=False)
            return [emb.tolist() for emb in embeddings]
        return []

    if EMBEDDING_BACKEND == "openai":
        import openai

        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            print("Warning: OPENAI_API_KEY not set")
            return []

        client = openai.AsyncOpenAI(api_key=api_key)
        response = await client.embeddings.create(
            input=texts,
            model="text-embedding-3-small",
        )
        return [item.embedding for item in response.data]

    return []


# =============================================================================
# MinIO Storage
# =============================================================================

async def upload_to_minio(
    content: bytes,
    bundesland: str,
    filename: str,
    content_type: str = "application/pdf",
    year: Optional[int] = None,
) -> Optional[str]:
    """Upload a document to the configured MinIO bucket.

    The object is stored under
    ``landes-daten/{bundesland}/zeugnis/{year}/{filename}``; the bucket is
    created if it does not exist yet.

    NOTE: the MinIO client calls below are not awaited, i.e. they run
    synchronously inside this coroutine.

    Args:
        content: Raw document bytes.
        bundesland: Path segment identifying the federal state.
        filename: Object file name (last path segment).
        content_type: MIME type stored with the object.
        year: Year path segment; a falsy value falls back to the current year.

    Returns:
        The object name on success, or ``None`` if anything failed (the error
        is printed, not raised).
    """
    try:
        from minio import Minio

        client = Minio(
            MINIO_ENDPOINT,
            access_key=MINIO_ACCESS_KEY,
            secret_key=MINIO_SECRET_KEY,
            secure=os.getenv("MINIO_SECURE", "false").lower() == "true",
        )

        # Ensure bucket exists
        if not client.bucket_exists(MINIO_BUCKET):
            client.make_bucket(MINIO_BUCKET)

        # Build path
        year_str = str(year) if year else str(datetime.now().year)
        object_name = f"landes-daten/{bundesland}/zeugnis/{year_str}/{filename}"

        # Upload
        client.put_object(
            MINIO_BUCKET,
            object_name,
            io.BytesIO(content),
            len(content),
            content_type=content_type,
        )

        return object_name
    except Exception as e:
        # Best-effort: callers get None instead of an exception.
        print(f"MinIO upload failed: {e}")
        return None


# =============================================================================
# Qdrant Indexing
# =============================================================================

async def index_in_qdrant(
    doc_id: str,
    chunks: List[str],
    embeddings: List[List[float]],
    metadata: Dict[str, Any],
) -> int:
    """Index document chunks and their embeddings in Qdrant.

    Creates the ``ZEUGNIS_COLLECTION`` collection on first use, sized to the
    first embedding vector, then upserts one point per (chunk, embedding)
    pair with a random UUID as point id.

    NOTE: the Qdrant client calls below are not awaited, i.e. they run
    synchronously inside this coroutine.

    Args:
        doc_id: Identifier stored as ``document_id`` in every point payload.
        chunks: Text chunks of the document.
        embeddings: Embedding vectors, expected to be parallel to ``chunks``.
        metadata: Document metadata; the keys ``bundesland``, ``doc_type``,
            ``title``, ``url`` and ``training_allowed`` are copied into the
            payload when present.

    Returns:
        Number of points upserted; ``0`` when there is nothing to index or
        when indexing failed (the error is printed, not raised).
    """
    # Nothing to index: return before touching Qdrant so that a missing
    # collection is never created with a guessed vector size.
    if not chunks or not embeddings:
        return 0

    if len(chunks) != len(embeddings):
        # zip() below stops at the shorter input; make the truncation visible.
        print(
            f"Warning: chunk/embedding count mismatch "
            f"({len(chunks)} chunks, {len(embeddings)} embeddings); "
            f"indexing only the first {min(len(chunks), len(embeddings))}"
        )

    try:
        from qdrant_client import QdrantClient
        from qdrant_client.models import VectorParams, Distance, PointStruct

        client = QdrantClient(url=QDRANT_URL)

        # Ensure collection exists
        collections = client.get_collections().collections
        if not any(c.name == ZEUGNIS_COLLECTION for c in collections):
            client.create_collection(
                collection_name=ZEUGNIS_COLLECTION,
                vectors_config=VectorParams(
                    size=len(embeddings[0]),
                    distance=Distance.COSINE,
                ),
            )
            print(f"Created Qdrant collection: {ZEUGNIS_COLLECTION}")

        # One timestamp per call so all chunks of a document agree.
        indexed_at = datetime.now().isoformat()

        # Create points
        points = [
            PointStruct(
                id=str(uuid.uuid4()),
                vector=embedding,
                payload={
                    "document_id": doc_id,
                    "chunk_index": i,
                    "chunk_text": chunk[:500],  # Store first 500 chars for preview
                    "bundesland": metadata.get("bundesland"),
                    "doc_type": metadata.get("doc_type"),
                    "title": metadata.get("title"),
                    "source_url": metadata.get("url"),
                    "training_allowed": metadata.get("training_allowed", False),
                    "indexed_at": indexed_at,
                },
            )
            for i, (chunk, embedding) in enumerate(zip(chunks, embeddings))
        ]

        # Upsert
        if points:
            client.upsert(
                collection_name=ZEUGNIS_COLLECTION,
                points=points,
            )

        return len(points)
    except Exception as e:
        # Best-effort: callers get 0 instead of an exception.
        print(f"Qdrant indexing failed: {e}")
        return 0