Files
breakpilot-lehrer/klausur-service/backend/zeugnis/storage.py
Benjamin Admin 165c493d1e
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m22s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 23s
Restructure: Move 52 files into 7 domain packages
korrektur/ zeugnis/ admin/ compliance/ worksheet/ training/ metrics/
52 shims, relative imports, RAG untouched.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 22:10:48 +02:00

181 lines
5.7 KiB
Python

"""
Zeugnis Crawler - Embedding generation, MinIO upload, and Qdrant indexing.
"""
import io
import os
import uuid
from datetime import datetime
from typing import Optional, List, Dict, Any
# =============================================================================
# Configuration
# =============================================================================
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT", "localhost:9000")
MINIO_ACCESS_KEY = os.getenv("MINIO_ACCESS_KEY", "test-access-key")
MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY", "test-secret-key")
MINIO_BUCKET = os.getenv("MINIO_BUCKET", "breakpilot-rag")
EMBEDDING_BACKEND = os.getenv("EMBEDDING_BACKEND", "local")
ZEUGNIS_COLLECTION = "bp_zeugnis"
# =============================================================================
# Embedding Generation
# =============================================================================
_embedding_model = None
def get_embedding_model():
"""Get or initialize embedding model."""
global _embedding_model
if _embedding_model is None and EMBEDDING_BACKEND == "local":
try:
from sentence_transformers import SentenceTransformer
_embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
print("Loaded local embedding model: all-MiniLM-L6-v2")
except ImportError:
print("Warning: sentence-transformers not installed")
return _embedding_model
async def generate_embeddings(texts: List[str]) -> List[List[float]]:
"""Generate embeddings for a list of texts."""
if not texts:
return []
if EMBEDDING_BACKEND == "local":
model = get_embedding_model()
if model:
embeddings = model.encode(texts, show_progress_bar=False)
return [emb.tolist() for emb in embeddings]
return []
elif EMBEDDING_BACKEND == "openai":
import openai
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
print("Warning: OPENAI_API_KEY not set")
return []
client = openai.AsyncOpenAI(api_key=api_key)
response = await client.embeddings.create(
input=texts,
model="text-embedding-3-small"
)
return [item.embedding for item in response.data]
return []
# =============================================================================
# MinIO Storage
# =============================================================================
async def upload_to_minio(
content: bytes,
bundesland: str,
filename: str,
content_type: str = "application/pdf",
year: Optional[int] = None,
) -> Optional[str]:
"""Upload document to MinIO."""
try:
from minio import Minio
client = Minio(
MINIO_ENDPOINT,
access_key=MINIO_ACCESS_KEY,
secret_key=MINIO_SECRET_KEY,
secure=os.getenv("MINIO_SECURE", "false").lower() == "true"
)
# Ensure bucket exists
if not client.bucket_exists(MINIO_BUCKET):
client.make_bucket(MINIO_BUCKET)
# Build path
year_str = str(year) if year else str(datetime.now().year)
object_name = f"landes-daten/{bundesland}/zeugnis/{year_str}/{filename}"
# Upload
client.put_object(
MINIO_BUCKET,
object_name,
io.BytesIO(content),
len(content),
content_type=content_type,
)
return object_name
except Exception as e:
print(f"MinIO upload failed: {e}")
return None
# =============================================================================
# Qdrant Indexing
# =============================================================================
async def index_in_qdrant(
doc_id: str,
chunks: List[str],
embeddings: List[List[float]],
metadata: Dict[str, Any],
) -> int:
"""Index document chunks in Qdrant."""
try:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct
client = QdrantClient(url=QDRANT_URL)
# Ensure collection exists
collections = client.get_collections().collections
if not any(c.name == ZEUGNIS_COLLECTION for c in collections):
vector_size = len(embeddings[0]) if embeddings else 384
client.create_collection(
collection_name=ZEUGNIS_COLLECTION,
vectors_config=VectorParams(
size=vector_size,
distance=Distance.COSINE,
),
)
print(f"Created Qdrant collection: {ZEUGNIS_COLLECTION}")
# Create points
points = []
for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
point_id = str(uuid.uuid4())
points.append(PointStruct(
id=point_id,
vector=embedding,
payload={
"document_id": doc_id,
"chunk_index": i,
"chunk_text": chunk[:500], # Store first 500 chars for preview
"bundesland": metadata.get("bundesland"),
"doc_type": metadata.get("doc_type"),
"title": metadata.get("title"),
"source_url": metadata.get("url"),
"training_allowed": metadata.get("training_allowed", False),
"indexed_at": datetime.now().isoformat(),
}
))
# Upsert
if points:
client.upsert(
collection_name=ZEUGNIS_COLLECTION,
points=points,
)
return len(points)
except Exception as e:
print(f"Qdrant indexing failed: {e}")
return 0