Restructure: Move final 12 root files into packages (klausur-service)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m23s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 19s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m23s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 19s
ocr/spell/ (3): smart_spell, core, text upload/ (3): api, chunked, mobile crawler/ (3): github, github_core, github_parsers + unified_grid → grid/, tesseract_extractor → ocr/engines/, htr_api → ocr/pipeline/ 12 shims added. Only main.py, config.py, storage + RAG files remain at root. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
320
klausur-service/backend/upload/chunked.py
Normal file
320
klausur-service/backend/upload/chunked.py
Normal file
@@ -0,0 +1,320 @@
|
||||
"""
|
||||
Chunked Upload API — init, chunk, finalize, simple upload, status, cancel, list.
|
||||
|
||||
Extracted from upload_api.py for modularity.
|
||||
|
||||
DSGVO-konform: Data stays local in WLAN, no external transmission.
|
||||
"""
|
||||
|
||||
import os
|
||||
import uuid
|
||||
import shutil
|
||||
import hashlib
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone
|
||||
from typing import Dict, Optional
|
||||
|
||||
from fastapi import APIRouter, HTTPException, UploadFile, File, Form
|
||||
from pydantic import BaseModel
|
||||
|
||||
# Configuration
# Base directories, overridable via environment; defaults match the
# container layout under /app.
UPLOAD_DIR = Path(os.getenv("UPLOAD_DIR", "/app/uploads"))  # final "klausur" uploads
CHUNK_DIR = Path(os.getenv("CHUNK_DIR", "/app/chunks"))  # temporary per-session chunk storage
EH_UPLOAD_DIR = Path(os.getenv("EH_UPLOAD_DIR", "/app/eh-uploads"))  # final "rag" uploads

# Ensure directories exist (import-time side effect).
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
CHUNK_DIR.mkdir(parents=True, exist_ok=True)
EH_UPLOAD_DIR.mkdir(parents=True, exist_ok=True)

# In-memory storage for upload sessions (for simplicity).
# In production, use Redis or database — these sessions are lost on
# restart and are not shared across worker processes.
_upload_sessions: Dict[str, dict] = {}

# All endpoints below are mounted under /api/v1/upload.
router = APIRouter(prefix="/api/v1/upload", tags=["Mobile Upload"])
||||
class InitUploadRequest(BaseModel):
    # Request body for POST /init.
    filename: str  # client-side file name (not yet sanitized here)
    filesize: int  # total size in bytes, client-reported
    chunks: int  # number of chunks the client intends to send
    destination: str = "klausur"  # "klausur" or "rag"
class InitUploadResponse(BaseModel):
    # Response for POST /init.
    upload_id: str  # session id for subsequent /chunk and /finalize calls
    chunk_size: int  # server-suggested chunk size in bytes
    total_chunks: int  # echoed chunk count from the request
    message: str  # human-readable status (German)
class ChunkUploadResponse(BaseModel):
    # Response for POST /chunk.
    upload_id: str  # echoed session id
    chunk_index: int  # echoed zero-based index of the stored chunk
    received: bool  # True when the chunk was written to disk
    chunks_received: int  # distinct chunks received so far
    total_chunks: int  # total chunks expected for this session
class FinalizeResponse(BaseModel):
    # Response for POST /finalize.
    upload_id: str  # echoed session id (session is deleted after finalize)
    filename: str  # server-generated final file name (timestamp-prefixed)
    filepath: str  # absolute path of the assembled file
    filesize: int  # assembled size in bytes
    checksum: str  # SHA-256 hex digest of the assembled file
    message: str  # human-readable status (German)
@router.post("/init", response_model=InitUploadResponse)
async def init_upload(request: InitUploadRequest):
    """
    Initialize a chunked upload session.

    Creates a temporary per-session chunk directory and registers the
    session in the in-memory store. Returns an upload_id that must be
    used for all subsequent /chunk and /finalize calls.

    Raises:
        HTTPException 400: if the chunk count or file size is invalid.
    """
    # Reject impossible sessions up front. A session with 0 chunks could
    # otherwise be finalized into an empty file, and a /status request on
    # it would divide by zero when computing progress.
    if request.chunks < 1:
        raise HTTPException(status_code=400, detail="Ungueltige Chunk-Anzahl")
    if request.filesize < 0:
        raise HTTPException(status_code=400, detail="Ungueltige Dateigroesse")

    upload_id = str(uuid.uuid4())

    # Create the directory that will hold this session's chunks.
    session_dir = CHUNK_DIR / upload_id
    session_dir.mkdir(parents=True, exist_ok=True)

    # Store session info (in-memory only; see module note about Redis/DB).
    _upload_sessions[upload_id] = {
        "filename": request.filename,
        "filesize": request.filesize,
        "total_chunks": request.chunks,
        "received_chunks": set(),
        "destination": request.destination,
        "session_dir": str(session_dir),
        "created_at": datetime.now(timezone.utc).isoformat(),
    }

    return InitUploadResponse(
        upload_id=upload_id,
        chunk_size=5 * 1024 * 1024,  # advisory chunk size: 5 MB
        total_chunks=request.chunks,
        message="Upload-Session erstellt"
    )
@router.post("/chunk", response_model=ChunkUploadResponse)
async def upload_chunk(
    chunk: UploadFile = File(...),
    upload_id: str = Form(...),
    chunk_index: int = Form(...)
):
    """
    Upload a single chunk of a file.

    The chunk body is stored in the session's temporary directory under a
    zero-padded name (chunk_00042) until /finalize assembles the file.
    Raises 404 for an unknown session and 400 for an out-of-range index.
    """
    session = _upload_sessions.get(upload_id)
    if session is None:
        raise HTTPException(status_code=404, detail="Upload-Session nicht gefunden")

    if not 0 <= chunk_index < session["total_chunks"]:
        raise HTTPException(
            status_code=400,
            detail=f"Ungueltiger Chunk-Index: {chunk_index}"
        )

    # Persist the chunk; zero-padding keeps names in lexical == numeric order.
    payload = await chunk.read()
    target = Path(session["session_dir"]) / f"chunk_{chunk_index:05d}"
    target.write_bytes(payload)

    # A set makes re-sending the same chunk idempotent for the counter.
    session["received_chunks"].add(chunk_index)

    return ChunkUploadResponse(
        upload_id=upload_id,
        chunk_index=chunk_index,
        received=True,
        chunks_received=len(session["received_chunks"]),
        total_chunks=session["total_chunks"],
    )
@router.post("/finalize", response_model=FinalizeResponse)
async def finalize_upload(upload_id: str = Form(...)):
    """
    Finalize the upload by combining all chunks into a single file.

    Validates that all chunks were received, assembles them in index
    order while computing a SHA-256 checksum, then deletes the
    temporary chunks and the session.

    Raises:
        HTTPException 404: unknown upload_id.
        HTTPException 400: not all chunks were received.
        HTTPException 500: a tracked chunk file is missing on disk.
    """
    if upload_id not in _upload_sessions:
        raise HTTPException(status_code=404, detail="Upload-Session nicht gefunden")

    session = _upload_sessions[upload_id]

    # Check if all chunks received
    if len(session["received_chunks"]) != session["total_chunks"]:
        missing = session["total_chunks"] - len(session["received_chunks"])
        raise HTTPException(
            status_code=400,
            detail=f"Nicht alle Chunks empfangen. Fehlend: {missing}"
        )

    # Determine destination directory
    if session["destination"] == "rag":
        dest_dir = EH_UPLOAD_DIR
    else:
        dest_dir = UPLOAD_DIR

    # Generate unique filename. Path(...).name strips any directory
    # components from the client-supplied filename (path-traversal fix:
    # a name like "../../etc/x" must not escape dest_dir).
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    safe_filename = Path(session["filename"]).name.replace(" ", "_") or "upload.bin"
    final_filename = f"{timestamp}_{safe_filename}"
    final_path = dest_dir / final_filename

    # Combine chunks in index order, hashing while writing.
    hasher = hashlib.sha256()
    total_size = 0

    try:
        with open(final_path, "wb") as outfile:
            for i in range(session["total_chunks"]):
                chunk_path = Path(session["session_dir"]) / f"chunk_{i:05d}"

                if not chunk_path.exists():
                    # Tracked as received but missing on disk — internal error.
                    raise HTTPException(
                        status_code=500,
                        detail=f"Chunk {i} nicht gefunden"
                    )

                data = chunk_path.read_bytes()
                outfile.write(data)
                hasher.update(data)
                total_size += len(data)
    except HTTPException:
        # Don't leave a partially assembled (corrupt) file behind.
        final_path.unlink(missing_ok=True)
        raise

    # Clean up chunks and drop the session — finalize is one-shot.
    shutil.rmtree(session["session_dir"], ignore_errors=True)
    del _upload_sessions[upload_id]

    checksum = hasher.hexdigest()

    return FinalizeResponse(
        upload_id=upload_id,
        filename=final_filename,
        filepath=str(final_path),
        filesize=total_size,
        checksum=checksum,
        message="Upload erfolgreich abgeschlossen"
    )
@router.post("/simple")
async def simple_upload(
    file: UploadFile = File(...),
    destination: str = Form("klausur")
):
    """
    Simple single-request upload for smaller files (<10MB).

    For larger files, use the chunked upload endpoints. Streams the body
    to disk in 1 MB pieces and returns the file's SHA-256 checksum.
    """
    # Determine destination directory
    if destination == "rag":
        dest_dir = EH_UPLOAD_DIR
    else:
        dest_dir = UPLOAD_DIR

    # Generate unique filename. Path(...).name strips directory parts from
    # the client-controlled filename (path-traversal fix: "../x" must not
    # escape dest_dir).
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    raw_name = file.filename if file.filename else "upload.pdf"
    safe_filename = Path(raw_name).name.replace(" ", "_") or "upload.pdf"
    final_filename = f"{timestamp}_{safe_filename}"
    final_path = dest_dir / final_filename

    # Calculate checksum while writing
    hasher = hashlib.sha256()
    total_size = 0

    with open(final_path, "wb") as f:
        while True:
            chunk = await file.read(1024 * 1024)  # Read 1MB at a time
            if not chunk:
                break
            f.write(chunk)
            hasher.update(chunk)
            total_size += len(chunk)

    return {
        "filename": final_filename,
        "filepath": str(final_path),
        "filesize": total_size,
        "checksum": hasher.hexdigest(),
        "message": "Upload erfolgreich"
    }
@router.get("/status/{upload_id}")
async def get_upload_status(upload_id: str):
    """
    Get the status of an ongoing upload.

    Returns chunk counts and a progress percentage for the session.

    Raises:
        HTTPException 404: unknown upload_id.
    """
    if upload_id not in _upload_sessions:
        raise HTTPException(status_code=404, detail="Upload-Session nicht gefunden")

    session = _upload_sessions[upload_id]
    total = session["total_chunks"]
    received = len(session["received_chunks"])

    # Guard against a degenerate session with 0 chunks so a status
    # request can never fail with a ZeroDivisionError.
    progress = round(received / total * 100, 1) if total else 0.0

    return {
        "upload_id": upload_id,
        "filename": session["filename"],
        "total_chunks": total,
        "received_chunks": received,
        "progress_percent": progress,
        "destination": session["destination"],
        "created_at": session["created_at"]
    }
@router.delete("/cancel/{upload_id}")
async def cancel_upload(upload_id: str):
    """
    Cancel an ongoing upload and clean up temporary files.

    Removes the session's chunk directory and forgets the session.
    Raises 404 for an unknown upload_id.
    """
    session = _upload_sessions.get(upload_id)
    if session is None:
        raise HTTPException(status_code=404, detail="Upload-Session nicht gefunden")

    # Best-effort cleanup of the temporary chunk directory.
    shutil.rmtree(session["session_dir"], ignore_errors=True)
    del _upload_sessions[upload_id]

    return {"message": "Upload abgebrochen", "upload_id": upload_id}
@router.get("/list")
async def list_uploads(destination: str = "klausur"):
    """
    List all uploaded files in the specified destination.

    Only PDF files are reported, newest first, capped at 50 entries.
    """
    dest_dir = EH_UPLOAD_DIR if destination == "rag" else UPLOAD_DIR

    entries = []
    for path in dest_dir.iterdir():
        if not (path.is_file() and path.suffix.lower() == ".pdf"):
            continue
        info = path.stat()
        entries.append({
            "filename": path.name,
            "size": info.st_size,
            "modified": datetime.fromtimestamp(info.st_mtime).isoformat(),
        })

    # ISO-8601 strings sort chronologically, so a string sort is fine here.
    entries.sort(key=lambda entry: entry["modified"], reverse=True)

    return {
        "destination": destination,
        "count": len(entries),
        "files": entries[:50]  # Limit to 50 most recent
    }
|
||||
Reference in New Issue
Block a user