Files
breakpilot-lehrer/klausur-service/backend/upload_api_chunked.py
Benjamin Admin bd4b956e3c [split-required] Split final 43 files (500-668 LOC) to complete refactoring
klausur-service (11 files):
- cv_gutter_repair, ocr_pipeline_regression, upload_api
- ocr_pipeline_sessions, smart_spell, nru_worksheet_generator
- ocr_pipeline_overlays, mail/aggregator, zeugnis_api
- cv_syllable_detect, self_rag

backend-lehrer (17 files):
- classroom_engine/suggestions, generators/quiz_generator
- worksheets_api, llm_gateway/comparison, state_engine_api
- classroom/models (→ 4 submodules), services/file_processor
- alerts_agent/api/wizard+digests+routes, content_generators/pdf
- classroom/routes/sessions, llm_gateway/inference
- classroom_engine/analytics, auth/keycloak_auth
- alerts_agent/processing/rule_engine, ai_processor/print_versions

agent-core (5 files):
- brain/memory_store, brain/knowledge_graph, brain/context_manager
- orchestrator/supervisor, sessions/session_manager

admin-lehrer (5 components):
- GridOverlay, StepGridReview, DevOpsPipelineSidebar
- DataFlowDiagram, sbom/wizard/page

website (2 files):
- DependencyMap, lehrer/abitur-archiv

Other: nibis_ingestion, grid_detection_service, export-doclayout-onnx

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 09:41:42 +02:00

321 lines
8.9 KiB
Python

"""
Chunked Upload API — init, chunk, finalize, simple upload, status, cancel, list.
Extracted from upload_api.py for modularity.
GDPR-compliant (DSGVO): data stays on the local WLAN; nothing is transmitted externally.
"""
import os
import uuid
import shutil
import hashlib
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, Optional
from fastapi import APIRouter, HTTPException, UploadFile, File, Form
from pydantic import BaseModel
# Storage locations. Each one can be overridden through the environment
# variable of the same name; the literals are the fallback defaults.
UPLOAD_DIR = Path(os.getenv("UPLOAD_DIR", "/app/uploads"))
CHUNK_DIR = Path(os.getenv("CHUNK_DIR", "/app/chunks"))
EH_UPLOAD_DIR = Path(os.getenv("EH_UPLOAD_DIR", "/app/eh-uploads"))

# Create the storage directories at import time so the handlers can write
# into them without checking for their existence first.
for _storage_dir in (UPLOAD_DIR, CHUNK_DIR, EH_UPLOAD_DIR):
    _storage_dir.mkdir(parents=True, exist_ok=True)

# Upload sessions live in this process-local dict, keyed by upload_id
# (kept in memory for simplicity). In production, use Redis or a database.
_upload_sessions: Dict[str, dict] = {}

router = APIRouter(prefix="/api/v1/upload", tags=["Mobile Upload"])
class InitUploadRequest(BaseModel):
    """Request body for starting a chunked upload session."""

    # Client-supplied name of the file being uploaded.
    filename: str
    # Total file size as declared by the client (presumably bytes — TODO
    # confirm); the handlers visible in this file only store it.
    filesize: int
    # Number of chunks the client intends to send; chunk indices are
    # validated against the range 0 .. chunks - 1.
    chunks: int
    # "klausur" or "rag"; "rag" selects EH_UPLOAD_DIR, any other value
    # falls back to UPLOAD_DIR.
    destination: str = "klausur" # "klausur" or "rag"
class InitUploadResponse(BaseModel):
    """Response returned after an upload session has been created."""

    # Identifier of the new session; required by the chunk, finalize,
    # status and cancel endpoints.
    upload_id: str
    # Chunk size in bytes reported to the client.
    chunk_size: int
    # Number of chunks expected for this session.
    total_chunks: int
    # Human-readable status text.
    message: str
class ChunkUploadResponse(BaseModel):
    """Acknowledgement returned after a single chunk has been stored."""

    # Session the chunk belongs to.
    upload_id: str
    # Zero-based index of the chunk that was just stored.
    chunk_index: int
    # Whether the chunk was accepted.
    received: bool
    # Count of distinct chunk indices received so far for this session.
    chunks_received: int
    # Number of chunks expected for this session.
    total_chunks: int
class FinalizeResponse(BaseModel):
    """Result of assembling all chunks of a session into the final file."""

    # Session that was finalized.
    upload_id: str
    # Name of the stored file (timestamp prefix plus the cleaned-up name).
    filename: str
    # Full path of the stored file, as a string.
    filepath: str
    # Number of bytes written to the stored file.
    filesize: int
    # Hex-encoded SHA-256 digest of the stored file's content.
    checksum: str
    # Human-readable status text.
    message: str
@router.post("/init", response_model=InitUploadResponse)
async def init_upload(request: InitUploadRequest):
    """
    Initialize a chunked upload session.

    Creates a per-session directory below CHUNK_DIR and registers the session
    in the in-memory session store. Returns an upload_id that must be used
    for subsequent chunk uploads.

    Raises:
        HTTPException: 400 if the declared number of chunks is smaller than 1.
    """
    # A session without chunks can never receive data and would make the
    # progress calculation divide by zero, so reject it before creating
    # anything on disk.
    if request.chunks < 1:
        raise HTTPException(
            status_code=400,
            detail=f"Ungueltige Chunk-Anzahl: {request.chunks}"
        )

    upload_id = str(uuid.uuid4())

    # Create session directory
    session_dir = CHUNK_DIR / upload_id
    session_dir.mkdir(parents=True, exist_ok=True)

    # Store session info
    _upload_sessions[upload_id] = {
        "filename": request.filename,
        "filesize": request.filesize,
        "total_chunks": request.chunks,
        "received_chunks": set(),
        "destination": request.destination,
        "session_dir": str(session_dir),
        "created_at": datetime.now(timezone.utc).isoformat(),
    }

    return InitUploadResponse(
        upload_id=upload_id,
        chunk_size=5 * 1024 * 1024,  # 5 MB
        total_chunks=request.chunks,
        message="Upload-Session erstellt"
    )
@router.post("/chunk", response_model=ChunkUploadResponse)
async def upload_chunk(
    chunk: UploadFile = File(...),
    upload_id: str = Form(...),
    chunk_index: int = Form(...)
):
    """
    Upload a single chunk of a file.

    Chunks are stored temporarily in the session directory until finalize is
    called. Sending the same chunk index again overwrites the stored chunk.

    Raises:
        HTTPException: 404 if the upload session is unknown, 400 if the
            chunk index is outside the range declared at init.
    """
    if upload_id not in _upload_sessions:
        raise HTTPException(status_code=404, detail="Upload-Session nicht gefunden")
    session = _upload_sessions[upload_id]

    if chunk_index < 0 or chunk_index >= session["total_chunks"]:
        raise HTTPException(
            status_code=400,
            detail=f"Ungueltiger Chunk-Index: {chunk_index}"
        )

    # Read the payload BEFORE opening the target file. Opening in "wb" mode
    # truncates immediately, so a failed read on a retried chunk would
    # otherwise wipe a chunk that is already marked as received.
    content = await chunk.read()

    # Save chunk
    chunk_path = Path(session["session_dir"]) / f"chunk_{chunk_index:05d}"
    chunk_path.write_bytes(content)

    # Track received chunks (a set, so re-uploads are not counted twice)
    session["received_chunks"].add(chunk_index)

    return ChunkUploadResponse(
        upload_id=upload_id,
        chunk_index=chunk_index,
        received=True,
        chunks_received=len(session["received_chunks"]),
        total_chunks=session["total_chunks"]
    )
@router.post("/finalize", response_model=FinalizeResponse)
async def finalize_upload(upload_id: str = Form(...)):
    """
    Finalize the upload by combining all chunks into a single file.

    Validates that all chunks were received, concatenates them in index
    order into the destination directory, and calculates a SHA-256 checksum
    of the result. On success the chunk directory and the session entry are
    removed.

    Raises:
        HTTPException: 404 if the upload session is unknown, 400 if chunks
            are still missing, 500 if a chunk file has disappeared from disk.
    """
    if upload_id not in _upload_sessions:
        raise HTTPException(status_code=404, detail="Upload-Session nicht gefunden")
    session = _upload_sessions[upload_id]

    # Check if all chunks received
    total_chunks = session["total_chunks"]
    received_count = len(session["received_chunks"])
    if received_count != total_chunks:
        missing = total_chunks - received_count
        raise HTTPException(
            status_code=400,
            detail=f"Nicht alle Chunks empfangen. Fehlend: {missing}"
        )

    # Determine destination directory
    dest_dir = EH_UPLOAD_DIR if session["destination"] == "rag" else UPLOAD_DIR

    # Generate unique filename. The name comes from the client, so keep only
    # its last path component (after normalising Windows separators);
    # otherwise "/" or ".." segments would steer the write outside dest_dir
    # or make open() fail on a non-existent sub-directory.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    base_name = Path(session["filename"].replace("\\", "/")).name
    safe_filename = base_name.replace(" ", "_")
    final_filename = f"{timestamp}_{safe_filename}"
    final_path = dest_dir / final_filename

    # Combine chunks
    hasher = hashlib.sha256()
    total_size = 0
    session_dir = Path(session["session_dir"])
    try:
        with open(final_path, "wb") as outfile:
            for i in range(total_chunks):
                chunk_path = session_dir / f"chunk_{i:05d}"
                if not chunk_path.exists():
                    raise HTTPException(
                        status_code=500,
                        detail=f"Chunk {i} nicht gefunden"
                    )
                data = chunk_path.read_bytes()
                outfile.write(data)
                hasher.update(data)
                total_size += len(data)
    except Exception:
        # Do not leave a half-assembled file in the destination directory;
        # the session and its chunks are kept so the client can retry.
        final_path.unlink(missing_ok=True)
        raise

    # Clean up chunks
    shutil.rmtree(session["session_dir"], ignore_errors=True)
    del _upload_sessions[upload_id]

    checksum = hasher.hexdigest()
    return FinalizeResponse(
        upload_id=upload_id,
        filename=final_filename,
        filepath=str(final_path),
        filesize=total_size,
        checksum=checksum,
        message="Upload erfolgreich abgeschlossen"
    )
@router.post("/simple")
async def simple_upload(
    file: UploadFile = File(...),
    destination: str = Form("klausur")
):
    """
    Simple single-request upload for smaller files (<10MB).

    For larger files, use the chunked upload endpoints. The file is streamed
    to the destination directory in 1 MB pieces while its SHA-256 checksum
    is calculated.

    Returns:
        A dict with the stored filename, its path, the number of bytes
        written, the hex checksum and a status message.
    """
    # Determine destination directory
    dest_dir = EH_UPLOAD_DIR if destination == "rag" else UPLOAD_DIR

    # Generate unique filename. The name comes from the client, so keep only
    # its last path component (after normalising Windows separators) to stop
    # "/" or ".." segments from steering the write outside dest_dir.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    base_name = Path((file.filename or "").replace("\\", "/")).name
    safe_filename = base_name.replace(" ", "_") if base_name else "upload.pdf"
    final_filename = f"{timestamp}_{safe_filename}"
    final_path = dest_dir / final_filename

    # Calculate checksum while writing
    hasher = hashlib.sha256()
    total_size = 0
    try:
        with open(final_path, "wb") as f:
            while True:
                chunk = await file.read(1024 * 1024)  # Read 1MB at a time
                if not chunk:
                    break
                f.write(chunk)
                hasher.update(chunk)
                total_size += len(chunk)
    except BaseException:
        # Also covers task cancellation while awaiting the next piece:
        # remove the partial file, then let the error propagate.
        final_path.unlink(missing_ok=True)
        raise

    return {
        "filename": final_filename,
        "filepath": str(final_path),
        "filesize": total_size,
        "checksum": hasher.hexdigest(),
        "message": "Upload erfolgreich"
    }
@router.get("/status/{upload_id}")
async def get_upload_status(upload_id: str):
    """
    Get the status of an ongoing upload.

    Returns:
        A dict with the session's filename, expected and received chunk
        counts, progress in percent (one decimal), destination and creation
        timestamp.

    Raises:
        HTTPException: 404 if the upload session is unknown.
    """
    if upload_id not in _upload_sessions:
        raise HTTPException(status_code=404, detail="Upload-Session nicht gefunden")
    session = _upload_sessions[upload_id]

    total_chunks = session["total_chunks"]
    received_count = len(session["received_chunks"])
    # A session declared with zero (or fewer) chunks has no meaningful
    # progress; report 0.0 instead of failing with a ZeroDivisionError.
    if total_chunks > 0:
        progress_percent = round(received_count / total_chunks * 100, 1)
    else:
        progress_percent = 0.0

    return {
        "upload_id": upload_id,
        "filename": session["filename"],
        "total_chunks": total_chunks,
        "received_chunks": received_count,
        "progress_percent": progress_percent,
        "destination": session["destination"],
        "created_at": session["created_at"]
    }
@router.delete("/cancel/{upload_id}")
async def cancel_upload(upload_id: str):
    """
    Abort a running upload: drop its session entry and delete the chunks
    stored for it so far.

    Raises:
        HTTPException: 404 if the upload session is unknown.
    """
    cancelled = _upload_sessions.pop(upload_id, None)
    if cancelled is None:
        raise HTTPException(status_code=404, detail="Upload-Session nicht gefunden")

    # Best-effort removal of the temporary chunk directory.
    shutil.rmtree(cancelled["session_dir"], ignore_errors=True)

    return {"message": "Upload abgebrochen", "upload_id": upload_id}
@router.get("/list")
async def list_uploads(destination: str = "klausur"):
    """
    List all uploaded files in the specified destination.

    Only regular files with a ".pdf" suffix (case-insensitive) are reported.

    Returns:
        A dict with the destination, the total number of matching files and
        up to 50 of them, newest first.
    """
    dest_dir = EH_UPLOAD_DIR if destination == "rag" else UPLOAD_DIR

    dated_entries = []
    for path in dest_dir.iterdir():
        if path.suffix.lower() != ".pdf" or not path.is_file():
            continue
        try:
            info = path.stat()
        except FileNotFoundError:
            # The file was removed between the directory scan and stat().
            continue
        dated_entries.append((
            info.st_mtime,
            {
                "filename": path.name,
                "size": info.st_size,
                "modified": datetime.fromtimestamp(info.st_mtime).isoformat(),
            },
        ))

    # Order by the raw modification time rather than the formatted string:
    # naive local-time strings repeat during a DST fall-back hour and would
    # then sort incorrectly.
    dated_entries.sort(key=lambda item: item[0], reverse=True)
    files = [entry for _, entry in dated_entries]

    return {
        "destination": destination,
        "count": len(files),
        "files": files[:50]  # Limit to 50 most recent
    }