Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website, Klausur-Service, School-Service, Voice-Service, Geo-Service, BreakPilot Drive, Agent-Core Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
361 lines
9.7 KiB
Python
361 lines
9.7 KiB
Python
"""
|
|
MinIO Storage Service for RAG Documents
|
|
Provides S3-compatible object storage for PDFs and other training documents.
|
|
"""
|
|
|
|
import os
|
|
from typing import Optional, List, Dict, BinaryIO
|
|
from datetime import datetime, timedelta
|
|
from pathlib import Path
|
|
import io
|
|
|
|
# MinIO connection settings, read from environment variables. The fallback
# values are placeholder test defaults (the original note says real
# credentials are expected to come from Vault or the environment).
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "localhost:9000")
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "test-access-key")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "test-secret-key")
MINIO_BUCKET = os.environ.get("MINIO_BUCKET", "breakpilot-rag")
# Only the exact string "true" (case-insensitive) turns TLS on.
MINIO_SECURE = os.environ.get("MINIO_SECURE", "false").lower() == "true"

# False while the access key is still the placeholder test default.
_MINIO_CONFIGURED = not (MINIO_ACCESS_KEY == "test-access-key")

# Slot for the lazily created client; filled in by _get_minio_client() so the
# minio package is only imported when storage is actually used.
_minio_client = None
|
|
|
|
|
|
def _get_minio_client():
    """Return the shared MinIO client, constructing it on first use.

    The ``minio`` package is imported inside the function so this module can
    still be imported when the package is absent.

    Returns:
        The cached client instance, or None (after printing a warning) when
        the import or the client construction fails.
    """
    global _minio_client

    # Fast path: an earlier call already built the client.
    if _minio_client is not None:
        return _minio_client

    try:
        from minio import Minio

        connection_options = {
            "access_key": MINIO_ACCESS_KEY,
            "secret_key": MINIO_SECRET_KEY,
            "secure": MINIO_SECURE,
        }
        _minio_client = Minio(MINIO_ENDPOINT, **connection_options)
    except ImportError:
        print("Warning: minio package not installed. MinIO storage disabled.")
        return None
    except Exception as exc:
        print(f"Warning: Failed to connect to MinIO: {exc}")
        return None

    return _minio_client
|
|
|
|
|
|
async def init_minio_bucket() -> bool:
    """Make sure the configured bucket exists, creating it when missing.

    Returns:
        True when the bucket exists or was created; False when no client is
        available or a MinIO call raised (the error is printed).
    """
    client = _get_minio_client()
    if client is None:
        return False

    try:
        bucket_missing = not client.bucket_exists(MINIO_BUCKET)
        if bucket_missing:
            client.make_bucket(MINIO_BUCKET)
            print(f"Created MinIO bucket: {MINIO_BUCKET}")
    except Exception as exc:
        print(f"Failed to initialize MinIO bucket: {exc}")
        return False
    else:
        return True
|
|
|
|
|
|
async def upload_document(
    file_data: bytes,
    object_name: str,
    content_type: str = "application/pdf",
    metadata: Optional[Dict[str, str]] = None,
) -> Optional[str]:
    """
    Store a document in the configured MinIO bucket.

    Args:
        file_data: File content as bytes
        object_name: Path in bucket (e.g., "landes-daten/ni/klausur/2024/doc.pdf")
        content_type: MIME type
        metadata: Optional metadata dict

    Returns:
        "<bucket>/<object_name>" on success, or None when no client is
        available or the upload raised (the error is printed)
    """
    client = _get_minio_client()
    if client is None:
        return None

    try:
        # Create the bucket first if it is missing; the result is not checked,
        # a missing bucket surfaces as a put_object error below.
        await init_minio_bucket()

        put_arguments = {
            "bucket_name": MINIO_BUCKET,
            "object_name": object_name,
            "data": io.BytesIO(file_data),
            "length": len(file_data),
            "content_type": content_type,
            "metadata": metadata if metadata else {},
        }
        client.put_object(**put_arguments)

        stored_path = f"{MINIO_BUCKET}/{object_name}"
        return stored_path
    except Exception as exc:
        print(f"Failed to upload to MinIO: {exc}")
        return None
|
|
|
|
|
|
async def download_document(object_name: str) -> Optional[bytes]:
    """
    Download a document from MinIO.

    Args:
        object_name: Path in bucket

    Returns:
        File content as bytes, or None when no client is available or the
        download raised (the error is printed)
    """
    client = _get_minio_client()
    if client is None:
        return None

    try:
        response = client.get_object(MINIO_BUCKET, object_name)
        try:
            return response.read()
        finally:
            # Always hand the HTTP connection back to the pool, even when
            # read() fails part-way; otherwise the connection leaks.
            response.close()
            response.release_conn()
    except Exception as e:
        print(f"Failed to download from MinIO: {e}")
        return None
|
|
|
|
|
|
async def list_documents(
    prefix: str = "",
    recursive: bool = True,
) -> List[Dict]:
    """
    Enumerate the objects stored in the configured MinIO bucket.

    Args:
        prefix: Path prefix to filter (e.g., "landes-daten/ni/")
        recursive: Whether to list recursively

    Returns:
        One dict per object with the keys "name", "size", "last_modified"
        (ISO-8601 string or None) and "etag"; an empty list when no client
        is available or listing raised (the error is printed)
    """
    client = _get_minio_client()
    if client is None:
        return []

    try:
        listing = client.list_objects(
            MINIO_BUCKET,
            prefix=prefix,
            recursive=recursive,
        )

        documents = []
        for entry in listing:
            info = {
                "name": entry.object_name,
                "size": entry.size,
                "last_modified": entry.last_modified.isoformat() if entry.last_modified else None,
                "etag": entry.etag,
            }
            documents.append(info)
        return documents
    except Exception as exc:
        print(f"Failed to list MinIO objects: {exc}")
        return []
|
|
|
|
|
|
async def delete_document(object_name: str) -> bool:
    """
    Remove a document from the configured MinIO bucket.

    Args:
        object_name: Path in bucket

    Returns:
        True when the remove call completed; False when no client is
        available or the call raised (the error is printed)
    """
    client = _get_minio_client()
    if client is None:
        return False

    try:
        client.remove_object(MINIO_BUCKET, object_name)
    except Exception as exc:
        print(f"Failed to delete from MinIO: {exc}")
        return False
    else:
        return True
|
|
|
|
|
|
async def get_presigned_url(
    object_name: str,
    expires: int = 3600,
) -> Optional[str]:
    """
    Create a time-limited download URL for a stored document.

    Args:
        object_name: Path in bucket
        expires: URL expiration time in seconds (default 1 hour)

    Returns:
        Presigned URL, or None when no client is available or URL
        generation raised (the error is printed)
    """
    client = _get_minio_client()
    if client is None:
        return None

    try:
        # Built inside the try so a bad `expires` value is reported like any
        # other failure instead of propagating.
        lifetime = timedelta(seconds=expires)
        return client.presigned_get_object(
            MINIO_BUCKET,
            object_name,
            expires=lifetime,
        )
    except Exception as exc:
        print(f"Failed to generate presigned URL: {exc}")
        return None
|
|
|
|
|
|
async def get_storage_stats() -> Dict:
    """Summarise bucket contents: object count, total size, per-prefix counts.

    Returns:
        On success a dict with "connected" (True), "bucket", "total_objects",
        "total_size_bytes", "total_size_mb" (rounded to 2 decimals) and
        "by_prefix" (object count per two-segment path prefix). On failure a
        dict with "error" and "connected" (False).
    """
    client = _get_minio_client()
    if client is None:
        return {"error": "MinIO not available", "connected": False}

    try:
        # Materialise the listing once; it is used for both size and grouping.
        entries = list(client.list_objects(MINIO_BUCKET, recursive=True))
        size_in_bytes = sum(entry.size for entry in entries)
        object_count = len(entries)

        # Count objects per "<first>/<second>" path prefix; objects whose name
        # has fewer than two "/"-separated segments are not grouped.
        per_prefix: Dict[str, int] = {}
        for entry in entries:
            segments = entry.object_name.split("/")
            if len(segments) < 2:
                continue
            group_key = "/".join(segments[:2])
            per_prefix[group_key] = per_prefix.get(group_key, 0) + 1

        bytes_per_mib = 1024 * 1024
        return {
            "connected": True,
            "bucket": MINIO_BUCKET,
            "total_objects": object_count,
            "total_size_bytes": size_in_bytes,
            "total_size_mb": round(size_in_bytes / bytes_per_mib, 2),
            "by_prefix": per_prefix,
        }
    except Exception as exc:
        return {"error": str(exc), "connected": False}
|
|
|
|
|
|
# =============================================================================
# RAG-specific Storage Functions
# =============================================================================
|
|
|
|
def get_minio_path(
    data_type: str,  # "landes-daten" or "lehrer-daten"
    bundesland: str,  # "ni", "by", etc.
    use_case: str,  # "klausur", "zeugnis", "lehrplan"
    year: int,
    filename: str,
) -> str:
    """
    Build the object key used for RAG documents (layout per RAG-Admin-Spec.md).

    The key has the form <data_type>/<bundesland>/<use_case>/<year>/<filename>;
    only the bundesland segment is lower-cased.

    Example: landes-daten/ni/klausur/2024/2024_Deutsch_eA_I_EWH.pdf
    """
    state_code = bundesland.lower()
    return "{}/{}/{}/{}/{}".format(data_type, state_code, use_case, year, filename)
|
|
|
|
|
|
async def upload_rag_document(
    file_data: bytes,
    filename: str,
    bundesland: str = "ni",
    use_case: str = "klausur",
    year: Optional[int] = None,
    metadata: Optional[Dict[str, str]] = None,
) -> Optional[str]:
    """
    Store a document under the "landes-daten" branch of the RAG layout.

    Args:
        file_data: PDF content
        filename: Original filename
        bundesland: State code (ni, by, etc.)
        use_case: klausur, zeugnis, lehrplan
        year: Document year (defaults to current year)
        metadata: Optional metadata; entries here replace the generated
            RAG entries that share the same key

    Returns:
        MinIO path on success, None on failure
    """
    effective_year = datetime.now().year if year is None else year

    object_path = get_minio_path(
        data_type="landes-daten",
        bundesland=bundesland,
        use_case=use_case,
        year=effective_year,
        filename=filename,
    )

    # Generated RAG metadata first, caller-supplied entries layered on top.
    rag_metadata = {
        "bundesland": bundesland,
        "use_case": use_case,
        "year": str(effective_year),
        "training_allowed": "true",  # Landes-Daten allow training
    }
    rag_metadata.update(metadata or {})

    return await upload_document(
        file_data=file_data,
        object_name=object_path,
        content_type="application/pdf",
        metadata=rag_metadata,
    )
|
|
|
|
|
|
async def upload_teacher_document(
    file_data: bytes,
    filename: str,
    tenant_id: str,
    teacher_id: str,
    metadata: Optional[Dict[str, str]] = None,
) -> Optional[str]:
    """
    Upload a teacher's document (BYOEH - encrypted, no training).

    The object is stored at lehrer-daten/<tenant_id>/<teacher_id>/<filename>,
    with ".enc" appended to the filename unless it already ends with it.

    Args:
        file_data: Encrypted PDF content
        filename: Original filename (will be stored as .enc)
        tenant_id: School/tenant ID
        teacher_id: Teacher ID
        metadata: Optional extra metadata. It cannot override the enforced
            keys "tenant_id", "teacher_id", "training_allowed" and
            "encrypted".

    Returns:
        MinIO path on success, None on failure
    """
    enc_filename = filename if filename.endswith(".enc") else f"{filename}.enc"
    object_path = f"lehrer-daten/{tenant_id}/{teacher_id}/{enc_filename}"

    # Teacher data - never allow training.
    # Caller metadata is unpacked FIRST so the enforced keys below always win;
    # a caller-supplied "training_allowed": "true" must never take effect.
    teacher_metadata = {
        **(metadata or {}),
        "tenant_id": tenant_id,
        "teacher_id": teacher_id,
        "training_allowed": "false",  # CRITICAL: Never train on teacher data
        "encrypted": "true",
    }

    return await upload_document(
        file_data=file_data,
        object_name=object_path,
        content_type="application/octet-stream",
        metadata=teacher_metadata,
    )
|