[split-required] Split 500-850 LOC files (batch 2)
backend-lehrer (10 files): - game/database.py (785 → 5), correction_api.py (683 → 4) - classroom_engine/antizipation.py (676 → 5) - llm_gateway schools/edu_search already done in prior batch klausur-service (12 files): - orientation_crop_api.py (694 → 5), pdf_export.py (677 → 4) - zeugnis_crawler.py (676 → 5), grid_editor_api.py (671 → 5) - eh_templates.py (658 → 5), mail/api.py (651 → 5) - qdrant_service.py (638 → 5), training_api.py (625 → 4) website (6 pages): - middleware (696 → 8), mail (733 → 6), consent (628 → 8) - compliance/risks (622 → 5), export (502 → 5), brandbook (629 → 7) studio-v2 (3 components): - B2BMigrationWizard (848 → 3), CleanupPanel (765 → 2) - dashboard-experimental (739 → 2) admin-lehrer (4 files): - uebersetzungen (769 → 4), manager (670 → 2) - ChunkBrowserQA (675 → 6), dsfa/page (674 → 5) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
180
klausur-service/backend/zeugnis_storage.py
Normal file
180
klausur-service/backend/zeugnis_storage.py
Normal file
@@ -0,0 +1,180 @@
|
||||
"""
|
||||
Zeugnis storage - embedding generation, MinIO upload, and Qdrant indexing helpers for the Zeugnis crawler.
|
||||
"""
|
||||
|
||||
import io
|
||||
import os
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
from typing import Optional, List, Dict, Any
|
||||
|
||||
|
||||
# =============================================================================
# Configuration
# =============================================================================

# Qdrant vector database endpoint (local dev default).
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
# MinIO object-storage connection; the key defaults are dev/test credentials
# and must be overridden via environment variables in any real deployment.
MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT", "localhost:9000")
MINIO_ACCESS_KEY = os.getenv("MINIO_ACCESS_KEY", "test-access-key")
MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY", "test-secret-key")
MINIO_BUCKET = os.getenv("MINIO_BUCKET", "breakpilot-rag")
# Selects the embedding backend: "local" (sentence-transformers) or "openai".
EMBEDDING_BACKEND = os.getenv("EMBEDDING_BACKEND", "local")

# Qdrant collection that holds the Zeugnis document chunks.
ZEUGNIS_COLLECTION = "bp_zeugnis"


# =============================================================================
# Embedding Generation
# =============================================================================

# Lazily-initialized sentence-transformers model; populated on first use by
# get_embedding_model() and shared across calls.
_embedding_model = None
|
||||
|
||||
|
||||
def get_embedding_model():
    """Return the process-wide embedding model, loading it on first use.

    Loading only happens when EMBEDDING_BACKEND is "local"; for any other
    backend, or when sentence-transformers is not installed, the cached
    value (None) is returned unchanged.
    """
    global _embedding_model
    # Fast path: already loaded, or this process is not using local embeddings.
    if _embedding_model is not None or EMBEDDING_BACKEND != "local":
        return _embedding_model
    try:
        from sentence_transformers import SentenceTransformer
        _embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
        print("Loaded local embedding model: all-MiniLM-L6-v2")
    except ImportError:
        # Optional dependency missing — callers treat None as "no model".
        print("Warning: sentence-transformers not installed")
    return _embedding_model
|
||||
|
||||
|
||||
async def generate_embeddings(texts: List[str]) -> List[List[float]]:
    """Embed *texts* with the configured backend, one vector per text.

    Returns an empty list when the input is empty, the backend is unknown,
    or the backend is unavailable/misconfigured (missing model or API key).
    """
    if not texts:
        return []

    if EMBEDDING_BACKEND == "openai":
        import openai

        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            print("Warning: OPENAI_API_KEY not set")
            return []
        client = openai.AsyncOpenAI(api_key=api_key)
        response = await client.embeddings.create(
            input=texts,
            model="text-embedding-3-small"
        )
        return [item.embedding for item in response.data]

    if EMBEDDING_BACKEND == "local":
        model = get_embedding_model()
        if model is None:
            return []
        vectors = model.encode(texts, show_progress_bar=False)
        # encode() yields numpy arrays; convert to plain lists for JSON use.
        return [vec.tolist() for vec in vectors]

    # Unknown backend value — nothing to do.
    return []
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# MinIO Storage
|
||||
# =============================================================================
|
||||
|
||||
async def upload_to_minio(
    content: bytes,
    bundesland: str,
    filename: str,
    content_type: str = "application/pdf",
    year: Optional[int] = None,
) -> Optional[str]:
    """Upload a document to MinIO and return its object name.

    The object is stored under
    ``landes-daten/<bundesland>/zeugnis/<year>/<filename>``; the current
    year is used when *year* is not given.  The bucket is created on first
    use.  Best-effort: any failure is printed and ``None`` is returned
    rather than raising.

    Args:
        content: Raw document bytes.
        bundesland: German federal state the document belongs to.
        filename: Object file name within the year folder.
        content_type: MIME type stored with the object.
        year: Folder year; defaults to the current year.

    Returns:
        The MinIO object name on success, otherwise None.
    """
    try:
        from minio import Minio

        client = Minio(
            MINIO_ENDPOINT,
            access_key=MINIO_ACCESS_KEY,
            secret_key=MINIO_SECRET_KEY,
            secure=os.getenv("MINIO_SECURE", "false").lower() == "true"
        )

        # Ensure the target bucket exists before writing.
        if not client.bucket_exists(MINIO_BUCKET):
            client.make_bucket(MINIO_BUCKET)

        # Build path.  BUG FIX: the path previously ended in the literal
        # "(unknown)", ignoring the `filename` parameter entirely, so all
        # uploads for a given bundesland/year overwrote one object.
        year_str = str(year) if year else str(datetime.now().year)
        object_name = f"landes-daten/{bundesland}/zeugnis/{year_str}/{filename}"

        # Upload
        client.put_object(
            MINIO_BUCKET,
            object_name,
            io.BytesIO(content),
            len(content),
            content_type=content_type,
        )

        return object_name
    except Exception as e:
        # Deliberate best-effort swallow: signal failure via None.
        print(f"MinIO upload failed: {e}")
        return None
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Qdrant Indexing
|
||||
# =============================================================================
|
||||
|
||||
async def index_in_qdrant(
    doc_id: str,
    chunks: List[str],
    embeddings: List[List[float]],
    metadata: Dict[str, Any],
) -> int:
    """Upsert chunk embeddings into the Zeugnis Qdrant collection.

    The collection is created lazily on first use (COSINE distance, vector
    size taken from the first embedding, falling back to 384).  Each chunk
    becomes one point carrying the document metadata and a 500-character
    text preview.

    Returns:
        The number of points written, or 0 on failure.
    """
    try:
        from qdrant_client import QdrantClient
        from qdrant_client.models import VectorParams, Distance, PointStruct

        client = QdrantClient(url=QDRANT_URL)

        # Lazily create the collection the first time anything is indexed.
        existing = {c.name for c in client.get_collections().collections}
        if ZEUGNIS_COLLECTION not in existing:
            dim = len(embeddings[0]) if embeddings else 384
            client.create_collection(
                collection_name=ZEUGNIS_COLLECTION,
                vectors_config=VectorParams(size=dim, distance=Distance.COSINE),
            )
            print(f"Created Qdrant collection: {ZEUGNIS_COLLECTION}")

        points = [
            PointStruct(
                id=str(uuid.uuid4()),
                vector=vector,
                payload={
                    "document_id": doc_id,
                    "chunk_index": idx,
                    "chunk_text": text[:500],  # Store first 500 chars for preview
                    "bundesland": metadata.get("bundesland"),
                    "doc_type": metadata.get("doc_type"),
                    "title": metadata.get("title"),
                    "source_url": metadata.get("url"),
                    "training_allowed": metadata.get("training_allowed", False),
                    "indexed_at": datetime.now().isoformat(),
                },
            )
            for idx, (text, vector) in enumerate(zip(chunks, embeddings))
        ]

        if points:
            client.upsert(
                collection_name=ZEUGNIS_COLLECTION,
                points=points,
            )
        return len(points)
    except Exception as e:
        # Deliberate best-effort swallow: signal failure via the 0 count.
        print(f"Qdrant indexing failed: {e}")
        return 0
|
||||
Reference in New Issue
Block a user