backend-lehrer (10 files): - game/database.py (785 → 5), correction_api.py (683 → 4) - classroom_engine/antizipation.py (676 → 5) - llm_gateway schools/edu_search already done in prior batch klausur-service (12 files): - orientation_crop_api.py (694 → 5), pdf_export.py (677 → 4) - zeugnis_crawler.py (676 → 5), grid_editor_api.py (671 → 5) - eh_templates.py (658 → 5), mail/api.py (651 → 5) - qdrant_service.py (638 → 5), training_api.py (625 → 4) website (6 pages): - middleware (696 → 8), mail (733 → 6), consent (628 → 8) - compliance/risks (622 → 5), export (502 → 5), brandbook (629 → 7) studio-v2 (3 components): - B2BMigrationWizard (848 → 3), CleanupPanel (765 → 2) - dashboard-experimental (739 → 2) admin-lehrer (4 files): - uebersetzungen (769 → 4), manager (670 → 2) - ChunkBrowserQA (675 → 6), dsfa/page (674 → 5) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
181 lines
5.7 KiB
Python
181 lines
5.7 KiB
Python
"""
|
|
Zeugnis Crawler - Embedding generation, MinIO upload, and Qdrant indexing.
|
|
"""
|
|
|
|
import io
|
|
import os
|
|
import uuid
|
|
from datetime import datetime
|
|
from typing import Optional, List, Dict, Any
|
|
|
|
|
|
# =============================================================================
|
|
# Configuration
|
|
# =============================================================================
|
|
|
|
# All settings are resolved from the process environment once, at import
# time; the second argument of each lookup is the fallback used when the
# variable is not set.

# Qdrant endpoint used for vector indexing.
QDRANT_URL = os.environ.get("QDRANT_URL", "http://localhost:6333")

# MinIO connection settings and target bucket for uploaded documents.
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "localhost:9000")
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "test-access-key")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "test-secret-key")
MINIO_BUCKET = os.environ.get("MINIO_BUCKET", "breakpilot-rag")

# Embedding backend selector; the code in this module branches on
# "local" and "openai".
EMBEDDING_BACKEND = os.environ.get("EMBEDDING_BACKEND", "local")

# Name of the Qdrant collection that receives the indexed chunks.
ZEUGNIS_COLLECTION = "bp_zeugnis"
|
|
|
|
|
|
# =============================================================================
|
|
# Embedding Generation
|
|
# =============================================================================
|
|
|
|
# Module-level cache for the lazily loaded local embedding model.
_embedding_model = None


def get_embedding_model():
    """Return the cached local embedding model, loading it on first use.

    The model ("all-MiniLM-L6-v2") is only loaded when ``EMBEDDING_BACKEND``
    is ``"local"``.

    Returns:
        The loaded model, or ``None`` when a different backend is configured
        or the ``sentence-transformers`` package cannot be imported. In the
        latter case a warning is printed and the next call tries again,
        because nothing is cached.
    """
    global _embedding_model

    already_loaded = _embedding_model is not None
    if already_loaded or EMBEDDING_BACKEND != "local":
        return _embedding_model

    try:
        # Imported here so the package is only needed for the local backend.
        from sentence_transformers import SentenceTransformer

        _embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
        print("Loaded local embedding model: all-MiniLM-L6-v2")
    except ImportError:
        print("Warning: sentence-transformers not installed")

    return _embedding_model
|
|
|
|
|
|
async def generate_embeddings(texts: List[str]) -> List[List[float]]:
    """Compute one embedding vector per input text.

    The backend is selected by ``EMBEDDING_BACKEND``:

    * ``"local"``  - encodes with the model from ``get_embedding_model()``.
    * ``"openai"`` - calls the OpenAI embeddings endpoint with the
      ``text-embedding-3-small`` model; requires ``OPENAI_API_KEY``.

    Args:
        texts: Texts to embed.

    Returns:
        A list of vectors (lists of floats). An empty list is returned when
        ``texts`` is empty, the local model is unavailable, the OpenAI API
        key is missing, or the configured backend is not recognised.

    Errors raised by the OpenAI client are not caught here.
    """
    if not texts:
        return []

    if EMBEDDING_BACKEND == "local":
        local_model = get_embedding_model()
        if not local_model:
            return []
        vectors = local_model.encode(texts, show_progress_bar=False)
        return [vector.tolist() for vector in vectors]

    if EMBEDDING_BACKEND == "openai":
        # Imported lazily so the package is only needed for this backend.
        import openai

        openai_key = os.getenv("OPENAI_API_KEY")
        if not openai_key:
            print("Warning: OPENAI_API_KEY not set")
            return []

        openai_client = openai.AsyncOpenAI(api_key=openai_key)
        result = await openai_client.embeddings.create(
            input=texts,
            model="text-embedding-3-small",
        )
        return [entry.embedding for entry in result.data]

    # Unknown backend: nothing to compute.
    return []
|
|
|
|
|
|
# =============================================================================
|
|
# MinIO Storage
|
|
# =============================================================================
|
|
|
|
async def upload_to_minio(
    content: bytes,
    bundesland: str,
    filename: str,
    content_type: str = "application/pdf",
    year: Optional[int] = None,
) -> Optional[str]:
    """Store a document in the configured MinIO bucket.

    The object is written under
    ``landes-daten/<bundesland>/zeugnis/<year>/<filename>``. The bucket is
    created first if it does not exist yet. TLS is used only when the
    ``MINIO_SECURE`` environment variable equals ``"true"`` (case-insensitive).

    Args:
        content: Raw bytes of the document.
        bundesland: Path segment placed directly after ``landes-daten/``.
        filename: Final path segment of the object name.
        content_type: MIME type stored with the object.
        year: Year path segment; the current year is used when this is
            ``None`` (or any other falsy value).

    Returns:
        The object name on success, or ``None`` if any step failed. The
        failure is printed, not raised.
    """
    try:
        # Imported lazily so the module can be loaded without the package.
        from minio import Minio

        use_tls = os.getenv("MINIO_SECURE", "false").lower() == "true"
        storage = Minio(
            MINIO_ENDPOINT,
            access_key=MINIO_ACCESS_KEY,
            secret_key=MINIO_SECRET_KEY,
            secure=use_tls,
        )

        bucket_present = storage.bucket_exists(MINIO_BUCKET)
        if not bucket_present:
            storage.make_bucket(MINIO_BUCKET)

        year_segment = str(year or datetime.now().year)
        object_name = f"landes-daten/{bundesland}/zeugnis/{year_segment}/{filename}"

        payload = io.BytesIO(content)
        storage.put_object(
            MINIO_BUCKET,
            object_name,
            payload,
            len(content),
            content_type=content_type,
        )
    except Exception as exc:
        print(f"MinIO upload failed: {exc}")
        return None

    return object_name
|
|
|
|
|
|
# =============================================================================
|
|
# Qdrant Indexing
|
|
# =============================================================================
|
|
|
|
async def index_in_qdrant(
    doc_id: str,
    chunks: List[str],
    embeddings: List[List[float]],
    metadata: Dict[str, Any],
) -> int:
    """Index document chunks in Qdrant.

    Chunks and embeddings are paired positionally; if one sequence is longer
    than the other, the surplus items are ignored. The target collection
    (``ZEUGNIS_COLLECTION``) is created with cosine distance when it does not
    exist, sized from the first embedding vector.

    Args:
        doc_id: Identifier stored as ``document_id`` in every point payload.
        chunks: Chunk texts; only the first 500 characters of each are stored.
        embeddings: One vector per chunk.
        metadata: Source of the ``bundesland``, ``doc_type``, ``title``,
            ``url`` and ``training_allowed`` payload fields.

    Returns:
        Number of points upserted. ``0`` when there is nothing to index or
        when any step failed (the failure is printed, not raised).
    """
    try:
        pairs = list(zip(chunks, embeddings))
        if not pairs:
            # Nothing to index. Return before touching Qdrant so that an
            # empty call never creates the collection with a guessed vector
            # size, which would pin the wrong dimension for later upserts.
            return 0

        # Imported lazily so the module can be loaded without the package.
        from qdrant_client import QdrantClient
        from qdrant_client.models import VectorParams, Distance, PointStruct

        client = QdrantClient(url=QDRANT_URL)

        # Ensure collection exists
        collections = client.get_collections().collections
        if not any(c.name == ZEUGNIS_COLLECTION for c in collections):
            client.create_collection(
                collection_name=ZEUGNIS_COLLECTION,
                vectors_config=VectorParams(
                    # Size comes from a real vector, never from a default.
                    size=len(pairs[0][1]),
                    distance=Distance.COSINE,
                ),
            )
            print(f"Created Qdrant collection: {ZEUGNIS_COLLECTION}")

        # One timestamp for the whole batch so all chunks of a document agree.
        indexed_at = datetime.now().isoformat()

        points = [
            PointStruct(
                id=str(uuid.uuid4()),
                vector=embedding,
                payload={
                    "document_id": doc_id,
                    "chunk_index": i,
                    "chunk_text": chunk[:500],  # Store first 500 chars for preview
                    "bundesland": metadata.get("bundesland"),
                    "doc_type": metadata.get("doc_type"),
                    "title": metadata.get("title"),
                    "source_url": metadata.get("url"),
                    "training_allowed": metadata.get("training_allowed", False),
                    "indexed_at": indexed_at,
                },
            )
            for i, (chunk, embedding) in enumerate(pairs)
        ]

        client.upsert(
            collection_name=ZEUGNIS_COLLECTION,
            points=points,
        )

        return len(points)
    except Exception as e:
        print(f"Qdrant indexing failed: {e}")
        return 0
|