""" OCR Pipeline Reprocess Endpoint. POST /sessions/{session_id}/reprocess — clear downstream + restart from step. Lizenz: Apache 2.0 DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. """ import logging from typing import Any, Dict from fastapi import APIRouter, HTTPException, Request from ocr_pipeline_common import _cache from ocr_pipeline_session_store import get_session_db, update_session_db logger = logging.getLogger(__name__) router = APIRouter(tags=["ocr-pipeline"]) @router.post("/sessions/{session_id}/reprocess") async def reprocess_session(session_id: str, request: Request): """Re-run pipeline from a specific step, clearing downstream data. Body: {"from_step": 5} (1-indexed step number) Pipeline order: Orientation(1) -> Deskew(2) -> Dewarp(3) -> Crop(4) -> Columns(5) -> Rows(6) -> Words(7) -> LLM-Review(8) -> Reconstruction(9) -> Validation(10) Clears downstream results: - from_step <= 1: orientation_result + all downstream - from_step <= 2: deskew_result + all downstream - from_step <= 3: dewarp_result + all downstream - from_step <= 4: crop_result + all downstream - from_step <= 5: column_result, row_result, word_result - from_step <= 6: row_result, word_result - from_step <= 7: word_result (cells, vocab_entries) - from_step <= 8: word_result.llm_review only """ session = await get_session_db(session_id) if not session: raise HTTPException(status_code=404, detail=f"Session {session_id} not found") body = await request.json() from_step = body.get("from_step", 1) if not isinstance(from_step, int) or from_step < 1 or from_step > 10: raise HTTPException(status_code=400, detail="from_step must be between 1 and 10") update_kwargs: Dict[str, Any] = {"current_step": from_step} # Clear downstream data based on from_step # New pipeline order: Orient(2) -> Deskew(3) -> Dewarp(4) -> Crop(5) -> # Columns(6) -> Rows(7) -> Words(8) -> LLM(9) -> Recon(10) -> GT(11) if from_step <= 8: update_kwargs["word_result"] = None elif from_step == 9: # Only clear LLM review from word_result word_result = session.get("word_result") if word_result: word_result.pop("llm_review", None) word_result.pop("llm_corrections", None) update_kwargs["word_result"] = word_result if from_step <= 7: update_kwargs["row_result"] = None if from_step <= 6: update_kwargs["column_result"] = None if from_step <= 4: update_kwargs["crop_result"] = None if from_step <= 3: update_kwargs["dewarp_result"] = None if from_step <= 2: update_kwargs["deskew_result"] = None if from_step <= 1: update_kwargs["orientation_result"] = None await update_session_db(session_id, **update_kwargs) # Also clear cache if session_id in _cache: for key in list(update_kwargs.keys()): if key != "current_step": _cache[session_id][key] = update_kwargs[key] _cache[session_id]["current_step"] = from_step logger.info(f"Session {session_id} reprocessing from step {from_step}") return { "session_id": session_id, "from_step": from_step, "cleared": [k for k in update_kwargs if k != "current_step"], }