[split-required] Split final 43 files (500-668 LOC) to complete refactoring
klausur-service (11 files): - cv_gutter_repair, ocr_pipeline_regression, upload_api - ocr_pipeline_sessions, smart_spell, nru_worksheet_generator - ocr_pipeline_overlays, mail/aggregator, zeugnis_api - cv_syllable_detect, self_rag backend-lehrer (17 files): - classroom_engine/suggestions, generators/quiz_generator - worksheets_api, llm_gateway/comparison, state_engine_api - classroom/models (→ 4 submodules), services/file_processor - alerts_agent/api/wizard+digests+routes, content_generators/pdf - classroom/routes/sessions, llm_gateway/inference - classroom_engine/analytics, auth/keycloak_auth - alerts_agent/processing/rule_engine, ai_processor/print_versions agent-core (5 files): - brain/memory_store, brain/knowledge_graph, brain/context_manager - orchestrator/supervisor, sessions/session_manager admin-lehrer (5 components): - GridOverlay, StepGridReview, DevOpsPipelineSidebar - DataFlowDiagram, sbom/wizard/page website (2 files): - DependencyMap, lehrer/abitur-archiv Other: nibis_ingestion, grid_detection_service, export-doclayout-onnx Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
320
klausur-service/backend/upload_api_chunked.py
Normal file
320
klausur-service/backend/upload_api_chunked.py
Normal file
@@ -0,0 +1,320 @@
|
||||
"""
|
||||
Chunked Upload API — init, chunk, finalize, simple upload, status, cancel, list.
|
||||
|
||||
Extracted from upload_api.py for modularity.
|
||||
|
||||
DSGVO-konform: Data stays local in WLAN, no external transmission.
|
||||
"""
|
||||
|
||||
import os
|
||||
import uuid
|
||||
import shutil
|
||||
import hashlib
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone
|
||||
from typing import Dict, Optional
|
||||
|
||||
from fastapi import APIRouter, HTTPException, UploadFile, File, Form
|
||||
from pydantic import BaseModel
|
||||
|
||||
# --- Configuration -----------------------------------------------------------
# All target paths are overridable via environment variables so the service
# runs both inside containers (/app/...) and in local development.
UPLOAD_DIR = Path(os.getenv("UPLOAD_DIR", "/app/uploads"))
CHUNK_DIR = Path(os.getenv("CHUNK_DIR", "/app/chunks"))
EH_UPLOAD_DIR = Path(os.getenv("EH_UPLOAD_DIR", "/app/eh-uploads"))

# Create every target directory up front so request handlers never have to.
for _dir in (UPLOAD_DIR, CHUNK_DIR, EH_UPLOAD_DIR):
    _dir.mkdir(parents=True, exist_ok=True)

# Upload-session bookkeeping, keyed by upload_id.
# Kept in process memory for simplicity — in production use Redis or a
# database (sessions are lost on restart and not shared between workers).
_upload_sessions: Dict[str, dict] = {}

router = APIRouter(prefix="/api/v1/upload", tags=["Mobile Upload"])
|
||||
|
||||
|
||||
class InitUploadRequest(BaseModel):
    """Payload for starting a chunked upload session."""

    filename: str   # original client-side file name
    filesize: int   # announced total size in bytes
    chunks: int     # number of chunks the client intends to send
    # Target store: "klausur" (default) or "rag".
    destination: str = "klausur"
|
||||
|
||||
|
||||
class InitUploadResponse(BaseModel):
    """Reply to a successful session init."""

    upload_id: str     # token to use for all subsequent chunk requests
    chunk_size: int    # chunk size (bytes) the client should send
    total_chunks: int  # echoed chunk count from the request
    message: str       # human-readable status text
|
||||
|
||||
|
||||
class ChunkUploadResponse(BaseModel):
    """Acknowledgement for a single received chunk."""

    upload_id: str       # session the chunk belongs to
    chunk_index: int     # zero-based index of the stored chunk
    received: bool       # always True on success
    chunks_received: int # how many distinct chunks have arrived so far
    total_chunks: int    # expected total for this session
|
||||
|
||||
|
||||
class FinalizeResponse(BaseModel):
    """Result of assembling all chunks into the final file."""

    upload_id: str  # session that was finalized
    filename: str   # stored (timestamped, sanitized) file name
    filepath: str   # absolute path of the assembled file
    filesize: int   # assembled size in bytes
    checksum: str   # SHA-256 hex digest of the file content
    message: str    # human-readable status text
|
||||
|
||||
|
||||
@router.post("/init", response_model=InitUploadResponse)
async def init_upload(request: InitUploadRequest):
    """
    Initialize a chunked upload session.

    Validates the announced chunk count and file size, creates a scratch
    directory for the incoming chunks, and registers the session in the
    in-memory session table.

    Returns an upload_id that must be used for subsequent chunk uploads.

    Raises:
        HTTPException 400: if chunks < 1 or filesize < 0.
    """
    # Reject degenerate sessions: chunks=0 would later cause a
    # ZeroDivisionError in the status endpoint and an empty finalize.
    if request.chunks < 1:
        raise HTTPException(status_code=400, detail="Ungueltige Chunk-Anzahl")
    if request.filesize < 0:
        raise HTTPException(status_code=400, detail="Ungueltige Dateigroesse")

    upload_id = str(uuid.uuid4())

    # Scratch directory that holds the individual chunk files.
    session_dir = CHUNK_DIR / upload_id
    session_dir.mkdir(parents=True, exist_ok=True)

    # Store session info (in memory; see module note about production use).
    _upload_sessions[upload_id] = {
        "filename": request.filename,
        "filesize": request.filesize,
        "total_chunks": request.chunks,
        "received_chunks": set(),
        "destination": request.destination,
        "session_dir": str(session_dir),
        "created_at": datetime.now(timezone.utc).isoformat(),
    }

    return InitUploadResponse(
        upload_id=upload_id,
        chunk_size=5 * 1024 * 1024,  # clients should send 5 MB chunks
        total_chunks=request.chunks,
        message="Upload-Session erstellt"
    )
|
||||
|
||||
|
||||
@router.post("/chunk", response_model=ChunkUploadResponse)
async def upload_chunk(
    chunk: UploadFile = File(...),
    upload_id: str = Form(...),
    chunk_index: int = Form(...)
):
    """
    Upload a single chunk of a file.

    The chunk is stored under its zero-padded index inside the session's
    scratch directory until finalize reassembles the pieces.
    """
    session = _upload_sessions.get(upload_id)
    if session is None:
        raise HTTPException(status_code=404, detail="Upload-Session nicht gefunden")

    if not 0 <= chunk_index < session["total_chunks"]:
        raise HTTPException(
            status_code=400,
            detail=f"Ungueltiger Chunk-Index: {chunk_index}"
        )

    # Persist under a zero-padded name so finalize can reassemble the
    # chunks in order with a simple index loop.
    payload = await chunk.read()
    target = Path(session["session_dir"]) / f"chunk_{chunk_index:05d}"
    target.write_bytes(payload)

    session["received_chunks"].add(chunk_index)

    return ChunkUploadResponse(
        upload_id=upload_id,
        chunk_index=chunk_index,
        received=True,
        chunks_received=len(session["received_chunks"]),
        total_chunks=session["total_chunks"]
    )
|
||||
|
||||
|
||||
@router.post("/finalize", response_model=FinalizeResponse)
async def finalize_upload(upload_id: str = Form(...)):
    """
    Finalize the upload by combining all chunks into a single file.

    Validates that every chunk was received, reassembles the chunks in
    index order while computing a SHA-256 checksum, then removes the
    chunk scratch directory and forgets the session.

    Raises:
        HTTPException 404: unknown upload session.
        HTTPException 400: not all chunks were received.
        HTTPException 500: a tracked chunk file is missing on disk.
    """
    if upload_id not in _upload_sessions:
        raise HTTPException(status_code=404, detail="Upload-Session nicht gefunden")

    session = _upload_sessions[upload_id]

    # All chunks must be present before assembly starts.
    if len(session["received_chunks"]) != session["total_chunks"]:
        missing = session["total_chunks"] - len(session["received_chunks"])
        raise HTTPException(
            status_code=400,
            detail=f"Nicht alle Chunks empfangen. Fehlend: {missing}"
        )

    dest_dir = EH_UPLOAD_DIR if session["destination"] == "rag" else UPLOAD_DIR

    # Build a unique, safe file name.  Path(...).name strips any directory
    # components a malicious client may have embedded in the filename
    # (e.g. "../../x.pdf"), preventing writes outside dest_dir; spaces are
    # normalized to underscores as before.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    safe_filename = Path(session["filename"]).name.replace(" ", "_") or "upload.bin"
    final_filename = f"{timestamp}_{safe_filename}"
    final_path = dest_dir / final_filename

    # Combine chunks, hashing and counting bytes as we stream them out.
    hasher = hashlib.sha256()
    total_size = 0

    with open(final_path, "wb") as outfile:
        for i in range(session["total_chunks"]):
            chunk_path = Path(session["session_dir"]) / f"chunk_{i:05d}"
            if not chunk_path.exists():
                raise HTTPException(
                    status_code=500,
                    detail=f"Chunk {i} nicht gefunden"
                )
            data = chunk_path.read_bytes()
            outfile.write(data)
            hasher.update(data)
            total_size += len(data)

    # Clean up the chunk scratch directory and drop the session record.
    shutil.rmtree(session["session_dir"], ignore_errors=True)
    del _upload_sessions[upload_id]

    return FinalizeResponse(
        upload_id=upload_id,
        filename=final_filename,
        filepath=str(final_path),
        filesize=total_size,
        checksum=hasher.hexdigest(),
        message="Upload erfolgreich abgeschlossen"
    )
|
||||
|
||||
|
||||
@router.post("/simple")
async def simple_upload(
    file: UploadFile = File(...),
    destination: str = Form("klausur")
):
    """
    Simple single-request upload for smaller files (<10MB).

    For larger files, use the chunked upload endpoints.  The body is
    streamed to disk in 1 MB pieces while a SHA-256 checksum is computed.
    """
    dest_dir = EH_UPLOAD_DIR if destination == "rag" else UPLOAD_DIR

    # Path(...).name strips directory components from the client-supplied
    # filename (e.g. "../../x.pdf") so the file cannot be written outside
    # dest_dir; spaces are normalized to underscores as before.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    raw_name = file.filename if file.filename else "upload.pdf"
    safe_filename = Path(raw_name).name.replace(" ", "_") or "upload.pdf"
    final_filename = f"{timestamp}_{safe_filename}"
    final_path = dest_dir / final_filename

    # Calculate checksum while writing.
    hasher = hashlib.sha256()
    total_size = 0

    with open(final_path, "wb") as f:
        while True:
            chunk = await file.read(1024 * 1024)  # Read 1MB at a time
            if not chunk:
                break
            f.write(chunk)
            hasher.update(chunk)
            total_size += len(chunk)

    return {
        "filename": final_filename,
        "filepath": str(final_path),
        "filesize": total_size,
        "checksum": hasher.hexdigest(),
        "message": "Upload erfolgreich"
    }
|
||||
|
||||
|
||||
@router.get("/status/{upload_id}")
async def get_upload_status(upload_id: str):
    """
    Get the status of an ongoing upload.

    Raises:
        HTTPException 404: unknown upload session.
    """
    if upload_id not in _upload_sessions:
        raise HTTPException(status_code=404, detail="Upload-Session nicht gefunden")

    session = _upload_sessions[upload_id]
    total = session["total_chunks"]
    received = len(session["received_chunks"])

    return {
        "upload_id": upload_id,
        "filename": session["filename"],
        "total_chunks": total,
        "received_chunks": received,
        # Guard against total == 0 (a degenerate session) so the status
        # endpoint never raises ZeroDivisionError.
        "progress_percent": round(received / total * 100, 1) if total else 0.0,
        "destination": session["destination"],
        "created_at": session["created_at"]
    }
|
||||
|
||||
|
||||
@router.delete("/cancel/{upload_id}")
async def cancel_upload(upload_id: str):
    """
    Cancel an ongoing upload and clean up temporary files.
    """
    session = _upload_sessions.get(upload_id)
    if session is None:
        raise HTTPException(status_code=404, detail="Upload-Session nicht gefunden")

    # Discard the partially uploaded chunks and drop the session record.
    shutil.rmtree(session["session_dir"], ignore_errors=True)
    del _upload_sessions[upload_id]

    return {"message": "Upload abgebrochen", "upload_id": upload_id}
|
||||
|
||||
|
||||
@router.get("/list")
async def list_uploads(destination: str = "klausur"):
    """
    List all uploaded files in the specified destination.
    """
    dest_dir = EH_UPLOAD_DIR if destination == "rag" else UPLOAD_DIR

    entries = []
    for path in dest_dir.iterdir():
        if not (path.is_file() and path.suffix.lower() == ".pdf"):
            continue
        info = path.stat()
        entries.append({
            "filename": path.name,
            "size": info.st_size,
            "modified": datetime.fromtimestamp(info.st_mtime).isoformat(),
        })

    # Newest first; ISO-8601 timestamps sort chronologically as strings.
    entries.sort(key=lambda e: e["modified"], reverse=True)

    return {
        "destination": destination,
        "count": len(entries),
        "files": entries[:50]  # Limit to 50 most recent
    }
|
||||
Reference in New Issue
Block a user