[split-required] Split 500-850 LOC files (batch 2)
backend-lehrer (10 files): - game/database.py (785 → 5), correction_api.py (683 → 4) - classroom_engine/antizipation.py (676 → 5) - llm_gateway schools/edu_search already done in prior batch klausur-service (12 files): - orientation_crop_api.py (694 → 5), pdf_export.py (677 → 4) - zeugnis_crawler.py (676 → 5), grid_editor_api.py (671 → 5) - eh_templates.py (658 → 5), mail/api.py (651 → 5) - qdrant_service.py (638 → 5), training_api.py (625 → 4) website (6 pages): - middleware (696 → 8), mail (733 → 6), consent (628 → 8) - compliance/risks (622 → 5), export (502 → 5), brandbook (629 → 7) studio-v2 (3 components): - B2BMigrationWizard (848 → 3), CleanupPanel (765 → 2) - dashboard-experimental (739 → 2) admin-lehrer (4 files): - uebersetzungen (769 → 4), manager (670 → 2) - ChunkBrowserQA (675 → 6), dsfa/page (674 → 5) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
180
klausur-service/backend/zeugnis_storage.py
Normal file
180
klausur-service/backend/zeugnis_storage.py
Normal file
@@ -0,0 +1,180 @@
|
||||
"""
|
||||
Zeugnis storage - embedding generation, MinIO upload, and Qdrant indexing helpers for the Zeugnis crawler.
|
||||
"""
|
||||
|
||||
import io
|
||||
import os
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
from typing import Optional, List, Dict, Any
|
||||
|
||||
|
||||
# =============================================================================
# Configuration
# =============================================================================

# Qdrant vector database endpoint (local dev default).
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
# MinIO object-storage connection; the key defaults are dev/test credentials
# and must be overridden via environment variables in any real deployment.
MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT", "localhost:9000")
MINIO_ACCESS_KEY = os.getenv("MINIO_ACCESS_KEY", "test-access-key")
MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY", "test-secret-key")
MINIO_BUCKET = os.getenv("MINIO_BUCKET", "breakpilot-rag")
# Selects the embedding backend: "local" (sentence-transformers) or "openai".
EMBEDDING_BACKEND = os.getenv("EMBEDDING_BACKEND", "local")

# Qdrant collection that holds the Zeugnis document chunks.
ZEUGNIS_COLLECTION = "bp_zeugnis"


# =============================================================================
# Embedding Generation
# =============================================================================

# Lazily-initialized sentence-transformers model; populated on first use by
# get_embedding_model() and shared across calls.
_embedding_model = None
|
||||
|
||||
|
||||
def get_embedding_model():
    """Return the process-wide embedding model, loading it on first use.

    Loading only happens when EMBEDDING_BACKEND is "local"; for any other
    backend, or when sentence-transformers is not installed, the cached
    value (None) is returned unchanged.
    """
    global _embedding_model
    # Fast path: already loaded, or this process is not using local embeddings.
    if _embedding_model is not None or EMBEDDING_BACKEND != "local":
        return _embedding_model
    try:
        from sentence_transformers import SentenceTransformer
        _embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
        print("Loaded local embedding model: all-MiniLM-L6-v2")
    except ImportError:
        # Optional dependency missing — callers treat None as "no model".
        print("Warning: sentence-transformers not installed")
    return _embedding_model
|
||||
|
||||
|
||||
async def generate_embeddings(texts: List[str]) -> List[List[float]]:
    """Embed *texts* with the configured backend, one vector per text.

    Returns an empty list when the input is empty, the backend is unknown,
    or the backend is unavailable/misconfigured (missing model or API key).
    """
    if not texts:
        return []

    if EMBEDDING_BACKEND == "openai":
        import openai

        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            print("Warning: OPENAI_API_KEY not set")
            return []
        client = openai.AsyncOpenAI(api_key=api_key)
        response = await client.embeddings.create(
            input=texts,
            model="text-embedding-3-small"
        )
        return [item.embedding for item in response.data]

    if EMBEDDING_BACKEND == "local":
        model = get_embedding_model()
        if model is None:
            return []
        vectors = model.encode(texts, show_progress_bar=False)
        # encode() yields numpy arrays; convert to plain lists for JSON use.
        return [vec.tolist() for vec in vectors]

    # Unknown backend value — nothing to do.
    return []
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# MinIO Storage
|
||||
# =============================================================================
|
||||
|
||||
async def upload_to_minio(
    content: bytes,
    bundesland: str,
    filename: str,
    content_type: str = "application/pdf",
    year: Optional[int] = None,
) -> Optional[str]:
    """Upload a document to MinIO and return its object name.

    The object is stored under
    ``landes-daten/<bundesland>/zeugnis/<year>/<filename>``; the current
    year is used when *year* is not given.  The bucket is created on first
    use.  Best-effort: any failure is printed and ``None`` is returned
    rather than raising.

    Args:
        content: Raw document bytes.
        bundesland: German federal state the document belongs to.
        filename: Object file name within the year folder.
        content_type: MIME type stored with the object.
        year: Folder year; defaults to the current year.

    Returns:
        The MinIO object name on success, otherwise None.
    """
    try:
        from minio import Minio

        client = Minio(
            MINIO_ENDPOINT,
            access_key=MINIO_ACCESS_KEY,
            secret_key=MINIO_SECRET_KEY,
            secure=os.getenv("MINIO_SECURE", "false").lower() == "true"
        )

        # Ensure the target bucket exists before writing.
        if not client.bucket_exists(MINIO_BUCKET):
            client.make_bucket(MINIO_BUCKET)

        # Build path.  BUG FIX: the path previously ended in the literal
        # "(unknown)", ignoring the `filename` parameter entirely, so all
        # uploads for a given bundesland/year overwrote one object.
        year_str = str(year) if year else str(datetime.now().year)
        object_name = f"landes-daten/{bundesland}/zeugnis/{year_str}/{filename}"

        # Upload
        client.put_object(
            MINIO_BUCKET,
            object_name,
            io.BytesIO(content),
            len(content),
            content_type=content_type,
        )

        return object_name
    except Exception as e:
        # Deliberate best-effort swallow: signal failure via None.
        print(f"MinIO upload failed: {e}")
        return None
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Qdrant Indexing
|
||||
# =============================================================================
|
||||
|
||||
async def index_in_qdrant(
    doc_id: str,
    chunks: List[str],
    embeddings: List[List[float]],
    metadata: Dict[str, Any],
) -> int:
    """Upsert chunk embeddings into the Zeugnis Qdrant collection.

    The collection is created lazily on first use (COSINE distance, vector
    size taken from the first embedding, falling back to 384).  Each chunk
    becomes one point carrying the document metadata and a 500-character
    text preview.

    Returns:
        The number of points written, or 0 on failure.
    """
    try:
        from qdrant_client import QdrantClient
        from qdrant_client.models import VectorParams, Distance, PointStruct

        client = QdrantClient(url=QDRANT_URL)

        # Lazily create the collection the first time anything is indexed.
        existing = {c.name for c in client.get_collections().collections}
        if ZEUGNIS_COLLECTION not in existing:
            dim = len(embeddings[0]) if embeddings else 384
            client.create_collection(
                collection_name=ZEUGNIS_COLLECTION,
                vectors_config=VectorParams(size=dim, distance=Distance.COSINE),
            )
            print(f"Created Qdrant collection: {ZEUGNIS_COLLECTION}")

        points = [
            PointStruct(
                id=str(uuid.uuid4()),
                vector=vector,
                payload={
                    "document_id": doc_id,
                    "chunk_index": idx,
                    "chunk_text": text[:500],  # Store first 500 chars for preview
                    "bundesland": metadata.get("bundesland"),
                    "doc_type": metadata.get("doc_type"),
                    "title": metadata.get("title"),
                    "source_url": metadata.get("url"),
                    "training_allowed": metadata.get("training_allowed", False),
                    "indexed_at": datetime.now().isoformat(),
                },
            )
            for idx, (text, vector) in enumerate(zip(chunks, embeddings))
        ]

        if points:
            client.upsert(
                collection_name=ZEUGNIS_COLLECTION,
                points=points,
            )
        return len(points)
    except Exception as e:
        # Deliberate best-effort swallow: signal failure via the 0 count.
        print(f"Qdrant indexing failed: {e}")
        return 0
|
||||
Reference in New Issue
Block a user