Files
breakpilot-lehrer/klausur-service/backend/minio_storage.py
Benjamin Boenisch 5a31f52310 Initial commit: breakpilot-lehrer - Lehrer KI Platform
Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website,
Klausur-Service, School-Service, Voice-Service, Geo-Service,
BreakPilot Drive, Agent-Core

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-11 23:47:26 +01:00

361 lines
9.7 KiB
Python

"""
MinIO Storage Service for RAG Documents
Provides S3-compatible object storage for PDFs and other training documents.
"""
import os
from typing import Optional, List, Dict, BinaryIO
from datetime import datetime, timedelta
from pathlib import Path
import io
# MinIO connection settings, read from the process environment. Per the
# original note, real credentials are expected to come from Vault or the
# environment; the fallbacks below are placeholder test defaults for CI.
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "localhost:9000")
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "test-access-key")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "test-secret-key")
MINIO_BUCKET = os.environ.get("MINIO_BUCKET", "breakpilot-rag")
# Only the string "true" (in any letter case) switches the client to TLS.
MINIO_SECURE = os.environ.get("MINIO_SECURE", "false").lower() == "true"

# False while the access key is still the placeholder test default.
# NOTE(review): nothing in the visible part of this module reads this flag.
_MINIO_CONFIGURED = MINIO_ACCESS_KEY != "test-access-key"

# Cached client instance; filled in on first use by _get_minio_client() so
# that importing this module never requires the minio package.
_minio_client = None
def _get_minio_client():
    """
    Return the shared MinIO client, building it on first use.

    The ``minio`` package is imported here rather than at module level so the
    module stays importable when the package is missing.

    Returns:
        The cached client, or None when the package is not installed or the
        client could not be constructed. A failure is not cached, so the
        next call tries again.
    """
    global _minio_client

    # Fast path: a client was already built by an earlier call.
    if _minio_client is not None:
        return _minio_client

    try:
        from minio import Minio

        new_client = Minio(
            MINIO_ENDPOINT,
            access_key=MINIO_ACCESS_KEY,
            secret_key=MINIO_SECRET_KEY,
            secure=MINIO_SECURE,
        )
    except ImportError:
        print("Warning: minio package not installed. MinIO storage disabled.")
        return None
    except Exception as e:
        print(f"Warning: Failed to connect to MinIO: {e}")
        return None

    _minio_client = new_client
    return new_client
async def init_minio_bucket() -> bool:
    """
    Make sure the configured bucket exists, creating it when absent.

    Returns:
        True when the bucket exists after the call; False when no client is
        available or the existence check / creation raised.
    """
    storage = _get_minio_client()
    if storage is None:
        return False

    try:
        bucket_present = storage.bucket_exists(MINIO_BUCKET)
        if not bucket_present:
            storage.make_bucket(MINIO_BUCKET)
            print(f"Created MinIO bucket: {MINIO_BUCKET}")
    except Exception as e:
        # Failures are reported on stdout and signalled through the return
        # value instead of being raised to the caller.
        print(f"Failed to initialize MinIO bucket: {e}")
        return False

    return True
async def upload_document(
    file_data: bytes,
    object_name: str,
    content_type: str = "application/pdf",
    metadata: Optional[Dict[str, str]] = None,
) -> Optional[str]:
    """
    Store a document in the configured MinIO bucket.

    Args:
        file_data: Raw file content.
        object_name: Object key inside the bucket
            (e.g., "landes-daten/ni/klausur/2024/doc.pdf").
        content_type: MIME type recorded for the object.
        metadata: Optional metadata stored with the object.

    Returns:
        "<bucket>/<object_name>" on success, None when no client is
        available or the upload raised.
    """
    storage = _get_minio_client()
    if storage is None:
        return None

    object_metadata = metadata if metadata else {}

    try:
        # The bucket is created on demand. The result is not checked here; a
        # missing bucket surfaces as a put_object error handled below.
        await init_minio_bucket()

        payload = io.BytesIO(file_data)
        storage.put_object(
            bucket_name=MINIO_BUCKET,
            object_name=object_name,
            data=payload,
            length=len(file_data),
            content_type=content_type,
            metadata=object_metadata,
        )
        stored_path = f"{MINIO_BUCKET}/{object_name}"
        return stored_path
    except Exception as e:
        print(f"Failed to upload to MinIO: {e}")
        return None
async def download_document(object_name: str) -> Optional[bytes]:
    """
    Download a document from MinIO.

    Args:
        object_name: Path in bucket

    Returns:
        File content as bytes, or None when no client is available or the
        download failed.
    """
    client = _get_minio_client()
    if client is None:
        return None

    try:
        response = client.get_object(MINIO_BUCKET, object_name)
        try:
            return response.read()
        finally:
            # Always hand the HTTP connection back, even when read() fails
            # part-way through; otherwise the connection leaks.
            response.close()
            response.release_conn()
    except Exception as e:
        print(f"Failed to download from MinIO: {e}")
        return None
async def list_documents(
    prefix: str = "",
    recursive: bool = True,
) -> List[Dict]:
    """
    Enumerate the objects stored in the configured bucket.

    Args:
        prefix: Only keys starting with this prefix are returned
            (e.g., "landes-daten/ni/").
        recursive: Passed through to the client's listing call.

    Returns:
        One dict per object with the keys "name", "size", "last_modified"
        (ISO 8601 string or None) and "etag". An empty list is returned when
        no client is available or the listing raised.
    """
    storage = _get_minio_client()
    if storage is None:
        return []

    documents: List[Dict] = []
    try:
        listing = storage.list_objects(
            MINIO_BUCKET,
            prefix=prefix,
            recursive=recursive,
        )
        for entry in listing:
            if entry.last_modified:
                modified = entry.last_modified.isoformat()
            else:
                modified = None
            documents.append(
                {
                    "name": entry.object_name,
                    "size": entry.size,
                    "last_modified": modified,
                    "etag": entry.etag,
                }
            )
    except Exception as e:
        # A failure part-way through discards the partial result.
        print(f"Failed to list MinIO objects: {e}")
        return []

    return documents
async def delete_document(object_name: str) -> bool:
    """
    Remove a single object from the configured bucket.

    Args:
        object_name: Object key inside the bucket.

    Returns:
        True when the removal call completed without raising; False when no
        client is available or the call raised.
    """
    storage = _get_minio_client()
    if storage is None:
        return False

    try:
        storage.remove_object(MINIO_BUCKET, object_name)
    except Exception as e:
        print(f"Failed to delete from MinIO: {e}")
        return False
    else:
        return True
async def get_presigned_url(
    object_name: str,
    expires: int = 3600,
) -> Optional[str]:
    """
    Create a time-limited download URL for an object.

    Args:
        object_name: Object key inside the bucket.
        expires: Lifetime of the URL in seconds (default: one hour).

    Returns:
        The presigned URL, or None when no client is available or the URL
        could not be generated.
    """
    storage = _get_minio_client()
    if storage is None:
        return None

    try:
        # Built inside the try so an invalid `expires` value is reported the
        # same way as a client error.
        lifetime = timedelta(seconds=expires)
        return storage.presigned_get_object(
            MINIO_BUCKET,
            object_name,
            expires=lifetime,
        )
    except Exception as e:
        print(f"Failed to generate presigned URL: {e}")
        return None
async def get_storage_stats() -> Dict:
    """
    Get storage statistics for the configured bucket.

    The listing is consumed in a single streaming pass, so memory use does
    not grow with the number of objects in the bucket.

    Returns:
        On success a dict with "connected" (True), "bucket",
        "total_objects", "total_size_bytes", "total_size_mb" (rounded to two
        decimals) and "by_prefix" (object count per "<first>/<second>" key
        segment pair). On failure a dict with "error" and "connected"
        (False).
    """
    client = _get_minio_client()
    if client is None:
        return {"error": "MinIO not available", "connected": False}

    try:
        total_count = 0
        total_size = 0
        by_prefix: Dict[str, int] = {}

        for obj in client.list_objects(MINIO_BUCKET, recursive=True):
            total_count += 1
            # A listing entry without a size counts as zero bytes instead of
            # failing the whole statistics call.
            total_size += obj.size or 0

            # Group by the first two key segments, e.g. "landes-daten/ni".
            # Keys with no "/" are counted in the totals only.
            parts = obj.object_name.split("/")
            if len(parts) >= 2:
                prefix = f"{parts[0]}/{parts[1]}"
                by_prefix[prefix] = by_prefix.get(prefix, 0) + 1

        return {
            "connected": True,
            "bucket": MINIO_BUCKET,
            "total_objects": total_count,
            "total_size_bytes": total_size,
            "total_size_mb": round(total_size / (1024 * 1024), 2),
            "by_prefix": by_prefix,
        }
    except Exception as e:
        return {"error": str(e), "connected": False}
# =============================================================================
# RAG-specific Storage Functions
# =============================================================================
def get_minio_path(
    data_type: str,  # "landes-daten" or "lehrer-daten"
    bundesland: str,  # "ni", "by", etc.
    use_case: str,  # "klausur", "zeugnis", "lehrplan"
    year: int,
    filename: str,
) -> str:
    """
    Build the object key for a document, per the RAG-Admin-Spec.md layout.

    The layout is "<data_type>/<bundesland>/<use_case>/<year>/<filename>".
    Only the state code is lower-cased; every other segment is used as given.

    Example: landes-daten/ni/klausur/2024/2024_Deutsch_eA_I_EWH.pdf
    """
    state_code = bundesland.lower()
    segments = (data_type, state_code, use_case, year, filename)
    # Each segment is rendered through an f-string so that non-string values
    # (the integer year) are formatted exactly as in a single f-string.
    return "/".join(f"{segment}" for segment in segments)
async def upload_rag_document(
    file_data: bytes,
    filename: str,
    bundesland: str = "ni",
    use_case: str = "klausur",
    year: Optional[int] = None,
    metadata: Optional[Dict[str, str]] = None,
) -> Optional[str]:
    """
    Store a document under the "landes-daten" branch of the RAG layout.

    Args:
        file_data: PDF content.
        filename: Original filename, used as the last key segment.
        bundesland: State code (ni, by, etc.).
        use_case: klausur, zeugnis, lehrplan.
        year: Document year; the current year is used when omitted.
        metadata: Extra metadata. Entries here take precedence over the
            default RAG metadata when the keys collide.

    Returns:
        The value returned by upload_document: the stored path on success,
        None on failure.
    """
    effective_year = datetime.now().year if year is None else year

    target_key = get_minio_path(
        "landes-daten",
        bundesland,
        use_case,
        effective_year,
        filename,
    )

    # Default RAG metadata; caller-supplied entries are merged on top.
    rag_metadata = {
        "bundesland": bundesland,
        "use_case": use_case,
        "year": str(effective_year),
        "training_allowed": "true",  # Landes-Daten allow training
    }
    rag_metadata.update(metadata or {})

    return await upload_document(
        file_data=file_data,
        object_name=target_key,
        content_type="application/pdf",
        metadata=rag_metadata,
    )
async def upload_teacher_document(
    file_data: bytes,
    filename: str,
    tenant_id: str,
    teacher_id: str,
    metadata: Optional[Dict[str, str]] = None,
) -> Optional[str]:
    """
    Upload a teacher's document (BYOEH - encrypted, no training).

    The object is stored at
    "lehrer-daten/<tenant_id>/<teacher_id>/<filename>.enc".

    Args:
        file_data: Encrypted PDF content
        filename: Original filename (".enc" is appended unless already present)
        tenant_id: School/tenant ID
        teacher_id: Teacher ID
        metadata: Optional extra metadata. It cannot override the enforced
            keys "tenant_id", "teacher_id", "training_allowed" and
            "encrypted"; colliding entries are dropped.

    Returns:
        MinIO path on success, None on failure
    """
    enc_filename = filename if filename.endswith(".enc") else f"{filename}.enc"
    object_path = f"lehrer-daten/{tenant_id}/{teacher_id}/{enc_filename}"

    # Values the caller must never be able to change.
    enforced_metadata = {
        "tenant_id": tenant_id,
        "teacher_id": teacher_id,
        "training_allowed": "false",  # CRITICAL: Never train on teacher data
        "encrypted": "true",
    }

    # Drop caller entries that collide with an enforced key. The comparison
    # ignores case because object metadata travels as HTTP headers, where
    # "Training_Allowed" and "training_allowed" would name the same field.
    caller_metadata = {
        key: value
        for key, value in (metadata or {}).items()
        if str(key).lower() not in enforced_metadata
    }

    # Enforced values are merged last so they always win.
    teacher_metadata = {**caller_metadata, **enforced_metadata}

    return await upload_document(
        file_data=file_data,
        object_name=object_path,
        content_type="application/octet-stream",
        metadata=teacher_metadata,
    )