""" MinIO Storage Service for RAG Documents Provides S3-compatible object storage for PDFs and other training documents. """ import os from typing import Optional, List, Dict, BinaryIO from datetime import datetime, timedelta from pathlib import Path import io # MinIO Configuration - Credentials from Vault or environment (test defaults for CI) MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT", "localhost:9000") MINIO_ACCESS_KEY = os.getenv("MINIO_ACCESS_KEY", "test-access-key") MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY", "test-secret-key") MINIO_BUCKET = os.getenv("MINIO_BUCKET", "breakpilot-rag") MINIO_SECURE = os.getenv("MINIO_SECURE", "false").lower() == "true" # Flag to check if using test defaults (MinIO operations will fail gracefully) _MINIO_CONFIGURED = MINIO_ACCESS_KEY != "test-access-key" # Lazy import to avoid issues when minio is not installed _minio_client = None def _get_minio_client(): """Get or create MinIO client singleton.""" global _minio_client if _minio_client is None: try: from minio import Minio _minio_client = Minio( MINIO_ENDPOINT, access_key=MINIO_ACCESS_KEY, secret_key=MINIO_SECRET_KEY, secure=MINIO_SECURE, ) except ImportError: print("Warning: minio package not installed. MinIO storage disabled.") return None except Exception as e: print(f"Warning: Failed to connect to MinIO: {e}") return None return _minio_client async def init_minio_bucket() -> bool: """Initialize MinIO bucket if not exists.""" client = _get_minio_client() if client is None: return False try: if not client.bucket_exists(MINIO_BUCKET): client.make_bucket(MINIO_BUCKET) print(f"Created MinIO bucket: {MINIO_BUCKET}") return True except Exception as e: print(f"Failed to initialize MinIO bucket: {e}") return False async def upload_document( file_data: bytes, object_name: str, content_type: str = "application/pdf", metadata: Optional[Dict[str, str]] = None, ) -> Optional[str]: """ Upload a document to MinIO. Args: file_data: File content as bytes object_name: Path in bucket (e.g., "landes-daten/ni/klausur/2024/doc.pdf") content_type: MIME type metadata: Optional metadata dict Returns: Full object path or None on failure """ client = _get_minio_client() if client is None: return None try: # Ensure bucket exists await init_minio_bucket() # Upload file data_stream = io.BytesIO(file_data) client.put_object( bucket_name=MINIO_BUCKET, object_name=object_name, data=data_stream, length=len(file_data), content_type=content_type, metadata=metadata or {}, ) return f"{MINIO_BUCKET}/{object_name}" except Exception as e: print(f"Failed to upload to MinIO: {e}") return None async def download_document(object_name: str) -> Optional[bytes]: """ Download a document from MinIO. Args: object_name: Path in bucket Returns: File content as bytes or None on failure """ client = _get_minio_client() if client is None: return None try: response = client.get_object(MINIO_BUCKET, object_name) data = response.read() response.close() response.release_conn() return data except Exception as e: print(f"Failed to download from MinIO: {e}") return None async def list_documents( prefix: str = "", recursive: bool = True, ) -> List[Dict]: """ List documents in MinIO bucket. Args: prefix: Path prefix to filter (e.g., "landes-daten/ni/") recursive: Whether to list recursively Returns: List of document info dicts """ client = _get_minio_client() if client is None: return [] try: objects = client.list_objects( MINIO_BUCKET, prefix=prefix, recursive=recursive, ) return [ { "name": obj.object_name, "size": obj.size, "last_modified": obj.last_modified.isoformat() if obj.last_modified else None, "etag": obj.etag, } for obj in objects ] except Exception as e: print(f"Failed to list MinIO objects: {e}") return [] async def delete_document(object_name: str) -> bool: """ Delete a document from MinIO. Args: object_name: Path in bucket Returns: True on success """ client = _get_minio_client() if client is None: return False try: client.remove_object(MINIO_BUCKET, object_name) return True except Exception as e: print(f"Failed to delete from MinIO: {e}") return False async def get_presigned_url( object_name: str, expires: int = 3600, ) -> Optional[str]: """ Get a presigned URL for temporary access to a document. Args: object_name: Path in bucket expires: URL expiration time in seconds (default 1 hour) Returns: Presigned URL or None on failure """ client = _get_minio_client() if client is None: return None try: url = client.presigned_get_object( MINIO_BUCKET, object_name, expires=timedelta(seconds=expires), ) return url except Exception as e: print(f"Failed to generate presigned URL: {e}") return None async def get_storage_stats() -> Dict: """Get storage statistics.""" client = _get_minio_client() if client is None: return {"error": "MinIO not available", "connected": False} try: # Count objects and calculate total size objects = list(client.list_objects(MINIO_BUCKET, recursive=True)) total_size = sum(obj.size for obj in objects) total_count = len(objects) # Group by prefix by_prefix: Dict[str, int] = {} for obj in objects: parts = obj.object_name.split("/") if len(parts) >= 2: prefix = f"{parts[0]}/{parts[1]}" by_prefix[prefix] = by_prefix.get(prefix, 0) + 1 return { "connected": True, "bucket": MINIO_BUCKET, "total_objects": total_count, "total_size_bytes": total_size, "total_size_mb": round(total_size / (1024 * 1024), 2), "by_prefix": by_prefix, } except Exception as e: return {"error": str(e), "connected": False} # ============================================================================= # RAG-specific Storage Functions # ============================================================================= def get_minio_path( data_type: str, # "landes-daten" or "lehrer-daten" bundesland: str, # "ni", "by", etc. use_case: str, # "klausur", "zeugnis", "lehrplan" year: int, filename: str, ) -> str: """ Generate MinIO path following the RAG-Admin-Spec.md structure. Example: landes-daten/ni/klausur/2024/2024_Deutsch_eA_I_EWH.pdf """ return f"{data_type}/{bundesland.lower()}/{use_case}/{year}/{filename}" async def upload_rag_document( file_data: bytes, filename: str, bundesland: str = "ni", use_case: str = "klausur", year: Optional[int] = None, metadata: Optional[Dict[str, str]] = None, ) -> Optional[str]: """ Upload a document to the RAG storage structure. Args: file_data: PDF content filename: Original filename bundesland: State code (ni, by, etc.) use_case: klausur, zeugnis, lehrplan year: Document year (defaults to current year) metadata: Optional metadata Returns: MinIO path on success """ if year is None: year = datetime.now().year object_path = get_minio_path( data_type="landes-daten", bundesland=bundesland, use_case=use_case, year=year, filename=filename, ) # Add RAG metadata rag_metadata = { "bundesland": bundesland, "use_case": use_case, "year": str(year), "training_allowed": "true", # Landes-Daten allow training **(metadata or {}), } return await upload_document( file_data=file_data, object_name=object_path, content_type="application/pdf", metadata=rag_metadata, ) async def upload_teacher_document( file_data: bytes, filename: str, tenant_id: str, teacher_id: str, metadata: Optional[Dict[str, str]] = None, ) -> Optional[str]: """ Upload a teacher's document (BYOEH - encrypted, no training). Args: file_data: Encrypted PDF content filename: Original filename (will be stored as .enc) tenant_id: School/tenant ID teacher_id: Teacher ID metadata: Optional metadata Returns: MinIO path on success """ enc_filename = filename if filename.endswith(".enc") else f"{filename}.enc" object_path = f"lehrer-daten/{tenant_id}/{teacher_id}/{enc_filename}" # Teacher data - never allow training teacher_metadata = { "tenant_id": tenant_id, "teacher_id": teacher_id, "training_allowed": "false", # CRITICAL: Never train on teacher data "encrypted": "true", **(metadata or {}), } return await upload_document( file_data=file_data, object_name=object_path, content_type="application/octet-stream", metadata=teacher_metadata, )