Files
breakpilot-lehrer/klausur-service/backend/upload/chunked.py
Benjamin Admin d093a4d388
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m23s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 19s
Restructure: Move final 12 root files into packages (klausur-service)
ocr/spell/  (3): smart_spell, core, text
upload/     (3): api, chunked, mobile
crawler/    (3): github, github_core, github_parsers
+ unified_grid → grid/, tesseract_extractor → ocr/engines/, htr_api → ocr/pipeline/

12 shims added. Only main.py, config.py, storage + RAG files remain at root.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 23:19:11 +02:00

321 lines
8.9 KiB
Python

"""
Chunked Upload API — init, chunk, finalize, simple upload, status, cancel, list.
Extracted from upload_api.py for modularity.
DSGVO-konform: Data stays local in WLAN, no external transmission.
"""
import os
import uuid
import shutil
import hashlib
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, Optional
from fastapi import APIRouter, HTTPException, UploadFile, File, Form
from pydantic import BaseModel
# Configuration — all target directories are overridable via environment
# variables so the container layout can differ between deployments.
UPLOAD_DIR = Path(os.getenv("UPLOAD_DIR", "/app/uploads"))        # final destination for "klausur" uploads
CHUNK_DIR = Path(os.getenv("CHUNK_DIR", "/app/chunks"))           # temporary per-session chunk storage
EH_UPLOAD_DIR = Path(os.getenv("EH_UPLOAD_DIR", "/app/eh-uploads"))  # final destination for "rag" uploads
# Ensure directories exist at import time so handlers can write immediately.
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
CHUNK_DIR.mkdir(parents=True, exist_ok=True)
EH_UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
# In-memory storage for upload sessions (for simplicity).
# In production, use Redis or database — sessions are lost on restart and
# this is not safe across multiple worker processes.
_upload_sessions: Dict[str, dict] = {}
router = APIRouter(prefix="/api/v1/upload", tags=["Mobile Upload"])
class InitUploadRequest(BaseModel):
    """Request body for starting a chunked upload session."""
    filename: str   # client-side file name (sanitized server-side before use)
    filesize: int   # total file size in bytes (informational)
    chunks: int     # number of chunks the client intends to send
    destination: str = "klausur"  # "klausur" or "rag"
class InitUploadResponse(BaseModel):
    """Response returned by /init; upload_id keys all subsequent requests."""
    upload_id: str    # UUID identifying this upload session
    chunk_size: int   # suggested chunk size in bytes for the client
    total_chunks: int  # echo of the requested chunk count
    message: str      # human-readable status (German)
class ChunkUploadResponse(BaseModel):
    """Response for a single chunk upload — reports overall progress."""
    upload_id: str      # session this chunk belongs to
    chunk_index: int    # index of the chunk just stored (0-based)
    received: bool      # True when the chunk was written to disk
    chunks_received: int  # distinct chunks received so far
    total_chunks: int     # total expected chunks for the session
class FinalizeResponse(BaseModel):
    """Response for /finalize — describes the assembled file on disk."""
    upload_id: str  # session that was finalized
    filename: str   # final (timestamped, sanitized) file name
    filepath: str   # absolute path of the assembled file
    filesize: int   # bytes written
    checksum: str   # SHA-256 hex digest of the assembled file
    message: str    # human-readable status (German)
@router.post("/init", response_model=InitUploadResponse)
async def init_upload(request: InitUploadRequest):
    """
    Initialize a chunked upload session.

    Returns an upload_id that must be used for subsequent chunk uploads.

    Raises:
        HTTPException 400: when ``chunks`` is less than 1 or ``filesize``
            is negative — a zero-chunk session would otherwise finalize
            "successfully" as an empty file and break progress reporting
            (division by zero in the status endpoint).
    """
    # Reject degenerate sessions up front instead of failing later.
    if request.chunks < 1:
        raise HTTPException(
            status_code=400,
            detail=f"Ungueltige Chunk-Anzahl: {request.chunks}"
        )
    if request.filesize < 0:
        raise HTTPException(
            status_code=400,
            detail=f"Ungueltige Dateigroesse: {request.filesize}"
        )
    upload_id = str(uuid.uuid4())
    # Create a per-session directory that holds the raw chunk files.
    session_dir = CHUNK_DIR / upload_id
    session_dir.mkdir(parents=True, exist_ok=True)
    # Store session info in the in-memory registry.
    _upload_sessions[upload_id] = {
        "filename": request.filename,
        "filesize": request.filesize,
        "total_chunks": request.chunks,
        "received_chunks": set(),  # set: duplicate chunk uploads are idempotent
        "destination": request.destination,
        "session_dir": str(session_dir),
        "created_at": datetime.now(timezone.utc).isoformat(),
    }
    return InitUploadResponse(
        upload_id=upload_id,
        chunk_size=5 * 1024 * 1024,  # 5 MB — advisory size for the client
        total_chunks=request.chunks,
        message="Upload-Session erstellt"
    )
@router.post("/chunk", response_model=ChunkUploadResponse)
async def upload_chunk(
    chunk: UploadFile = File(...),
    upload_id: str = Form(...),
    chunk_index: int = Form(...)
):
    """
    Store one chunk of an in-progress upload.

    Chunks are written to the session's temp directory and assembled
    later by the finalize endpoint.
    """
    session = _upload_sessions.get(upload_id)
    if session is None:
        raise HTTPException(status_code=404, detail="Upload-Session nicht gefunden")
    # The index must fall inside the range declared at /init time.
    if not (0 <= chunk_index < session["total_chunks"]):
        raise HTTPException(
            status_code=400,
            detail=f"Ungueltiger Chunk-Index: {chunk_index}"
        )
    # Persist the chunk under a zero-padded name so lexical order == numeric order.
    payload = await chunk.read()
    target = Path(session["session_dir"]) / f"chunk_{chunk_index:05d}"
    target.write_bytes(payload)
    # Record receipt; the set makes re-uploads of the same index harmless.
    session["received_chunks"].add(chunk_index)
    return ChunkUploadResponse(
        upload_id=upload_id,
        chunk_index=chunk_index,
        received=True,
        chunks_received=len(session["received_chunks"]),
        total_chunks=session["total_chunks"]
    )
@router.post("/finalize", response_model=FinalizeResponse)
async def finalize_upload(upload_id: str = Form(...)):
    """
    Finalize the upload by combining all chunks into a single file.

    Validates that all chunks were received, assembles them in index
    order, computes a SHA-256 checksum, and removes the temp session.

    Raises:
        HTTPException 404: unknown upload_id.
        HTTPException 400: not all declared chunks were received, or the
            client-supplied filename is empty after sanitization.
        HTTPException 500: a tracked chunk file is missing on disk.
    """
    if upload_id not in _upload_sessions:
        raise HTTPException(status_code=404, detail="Upload-Session nicht gefunden")
    session = _upload_sessions[upload_id]
    # Check that every declared chunk has arrived.
    if len(session["received_chunks"]) != session["total_chunks"]:
        missing = session["total_chunks"] - len(session["received_chunks"])
        raise HTTPException(
            status_code=400,
            detail=f"Nicht alle Chunks empfangen. Fehlend: {missing}"
        )
    # Determine destination directory.
    if session["destination"] == "rag":
        dest_dir = EH_UPLOAD_DIR
    else:
        dest_dir = UPLOAD_DIR
    # Sanitize the client-supplied filename: Path(...).name strips any
    # directory components ("../../etc/x" -> "x"), preventing the final
    # path from escaping dest_dir; then normalize spaces as before.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    safe_filename = Path(session["filename"]).name.replace(" ", "_")
    if not safe_filename:
        raise HTTPException(status_code=400, detail="Ungueltiger Dateiname")
    final_filename = f"{timestamp}_{safe_filename}"
    final_path = dest_dir / final_filename
    # Combine chunks in index order, hashing and counting as we stream.
    hasher = hashlib.sha256()
    total_size = 0
    with open(final_path, "wb") as outfile:
        for i in range(session["total_chunks"]):
            chunk_path = Path(session["session_dir"]) / f"chunk_{i:05d}"
            if not chunk_path.exists():
                raise HTTPException(
                    status_code=500,
                    detail=f"Chunk {i} nicht gefunden"
                )
            with open(chunk_path, "rb") as infile:
                data = infile.read()
                outfile.write(data)
                hasher.update(data)
                total_size += len(data)
    # Clean up chunk files and drop the in-memory session.
    shutil.rmtree(session["session_dir"], ignore_errors=True)
    del _upload_sessions[upload_id]
    checksum = hasher.hexdigest()
    return FinalizeResponse(
        upload_id=upload_id,
        filename=final_filename,
        filepath=str(final_path),
        filesize=total_size,
        checksum=checksum,
        message="Upload erfolgreich abgeschlossen"
    )
@router.post("/simple")
async def simple_upload(
    file: UploadFile = File(...),
    destination: str = Form("klausur")
):
    """
    Simple single-request upload for smaller files (<10MB).

    For larger files, use the chunked upload endpoints. Streams the body
    to disk in 1 MB pieces while computing a SHA-256 checksum.
    """
    # Determine destination directory.
    if destination == "rag":
        dest_dir = EH_UPLOAD_DIR
    else:
        dest_dir = UPLOAD_DIR
    # Sanitize the client-supplied filename: Path(...).name strips any
    # directory components so a name like "../../etc/x" cannot place the
    # file outside dest_dir; spaces are normalized as before.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    base_name = Path(file.filename).name.replace(" ", "_") if file.filename else ""
    safe_filename = base_name or "upload.pdf"
    final_filename = f"{timestamp}_{safe_filename}"
    final_path = dest_dir / final_filename
    # Calculate checksum while writing — avoids a second pass over the file.
    hasher = hashlib.sha256()
    total_size = 0
    with open(final_path, "wb") as f:
        while True:
            chunk = await file.read(1024 * 1024)  # Read 1MB at a time
            if not chunk:
                break
            f.write(chunk)
            hasher.update(chunk)
            total_size += len(chunk)
    return {
        "filename": final_filename,
        "filepath": str(final_path),
        "filesize": total_size,
        "checksum": hasher.hexdigest(),
        "message": "Upload erfolgreich"
    }
@router.get("/status/{upload_id}")
async def get_upload_status(upload_id: str):
    """
    Get the status of an ongoing upload.

    Returns chunk counts and a progress percentage for the session.

    Raises:
        HTTPException 404: unknown upload_id.
    """
    if upload_id not in _upload_sessions:
        raise HTTPException(status_code=404, detail="Upload-Session nicht gefunden")
    session = _upload_sessions[upload_id]
    total = session["total_chunks"]
    received = len(session["received_chunks"])
    # Guard against a zero-chunk session to avoid ZeroDivisionError
    # (would surface to the client as an opaque 500).
    progress = round(received / total * 100, 1) if total else 0.0
    return {
        "upload_id": upload_id,
        "filename": session["filename"],
        "total_chunks": total,
        "received_chunks": received,
        "progress_percent": progress,
        "destination": session["destination"],
        "created_at": session["created_at"]
    }
@router.delete("/cancel/{upload_id}")
async def cancel_upload(upload_id: str):
    """
    Cancel an ongoing upload and clean up temporary files.

    Removes both the in-memory session entry and the on-disk chunk
    directory for the given upload_id.
    """
    # pop() removes the session and returns it in one step.
    session = _upload_sessions.pop(upload_id, None)
    if session is None:
        raise HTTPException(status_code=404, detail="Upload-Session nicht gefunden")
    # Best-effort removal of the chunk directory; ignore if already gone.
    shutil.rmtree(session["session_dir"], ignore_errors=True)
    return {"message": "Upload abgebrochen", "upload_id": upload_id}
@router.get("/list")
async def list_uploads(destination: str = "klausur"):
    """
    List uploaded PDF files in the specified destination.

    Returns at most the 50 most recently modified files, newest first.
    """
    dest_dir = EH_UPLOAD_DIR if destination == "rag" else UPLOAD_DIR
    entries = []
    for path in dest_dir.iterdir():
        # Only regular files with a .pdf extension (case-insensitive).
        if not path.is_file() or path.suffix.lower() != ".pdf":
            continue
        info = path.stat()
        entries.append({
            "filename": path.name,
            "size": info.st_size,
            "modified": datetime.fromtimestamp(info.st_mtime).isoformat(),
        })
    # Newest first; ISO timestamps sort lexicographically.
    entries.sort(key=lambda item: item["modified"], reverse=True)
    return {
        "destination": destination,
        "count": len(entries),
        "files": entries[:50]  # Limit to 50 most recent
    }