# breakpilot-lehrer/klausur-service/backend/ocr_pipeline_reprocess.py

"""
OCR Pipeline Reprocess Endpoint.

POST /sessions/{session_id}/reprocess — clear downstream + restart from step.

License: Apache 2.0
PRIVACY: All processing is performed locally.
"""

import logging
from typing import Any, Dict

from fastapi import APIRouter, HTTPException, Request

from ocr_pipeline_common import _cache
from ocr_pipeline_session_store import get_session_db, update_session_db

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Router on which the reprocess endpoint below is registered; its routes are tagged "ocr-pipeline".
router = APIRouter(tags=["ocr-pipeline"])


@router.post("/sessions/{session_id}/reprocess")
async def reprocess_session(session_id: str, request: Request):
    """Reset a session to an earlier pipeline step and clear stale downstream results.

    Body: {"from_step": 5}  (integer between 1 and 10; defaults to 1 when omitted)

    Results cleared for a given ``from_step`` (thresholds exactly as implemented;
    each line applies in addition to the lines above it):
    - from_step <= 8: word_result
    - from_step <= 7: row_result
    - from_step <= 6: column_result
    - from_step <= 4: crop_result
    - from_step <= 3: dewarp_result
    - from_step <= 2: deskew_result
    - from_step <= 1: orientation_result
    - from_step == 9: only the "llm_review" and "llm_corrections" keys are
      removed from word_result
    - from_step == 10: nothing is cleared, only current_step is updated

    NOTE(review): the word/row/column thresholds are one higher than a 1-indexed
    order Orientation(1) -> Deskew(2) -> Dewarp(3) -> Crop(4) -> Columns(5) ->
    Rows(6) -> Words(7) -> LLM-Review(8) would suggest, while the
    crop/dewarp/deskew/orientation thresholds match that order. As a result
    from_step == 5 clears column/row/word results but keeps crop_result.
    Confirm which step numbering callers send before changing any threshold.

    Returns:
        Dict with "session_id", "from_step" and "cleared" (the names of the
        session fields that were overwritten, excluding "current_step").

    Raises:
        HTTPException 404: the session does not exist.
        HTTPException 400: the body is not a JSON object, or from_step is not
            an integer in the range 1..10.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    # A malformed or empty body must surface as a client error, not a 500.
    # JSON decode errors and invalid UTF-8 are both ValueError subclasses.
    try:
        body = await request.json()
    except ValueError as exc:
        raise HTTPException(status_code=400, detail="Request body must be valid JSON") from exc
    if not isinstance(body, dict):
        raise HTTPException(status_code=400, detail="Request body must be a JSON object")

    from_step = body.get("from_step", 1)
    # bool is a subclass of int; reject it so `true` is not stored as a step.
    if isinstance(from_step, bool) or not isinstance(from_step, int) or not 1 <= from_step <= 10:
        raise HTTPException(status_code=400, detail="from_step must be between 1 and 10")

    update_kwargs: Dict[str, Any] = {"current_step": from_step}

    if from_step <= 8:
        update_kwargs["word_result"] = None
    elif from_step == 9:
        # Keep the recognised words and drop only the LLM review artefacts.
        word_result = session.get("word_result")
        if word_result:
            # Build a filtered copy instead of popping in place, so the fetched
            # session object is not modified before the update is persisted.
            update_kwargs["word_result"] = {
                key: value
                for key, value in word_result.items()
                if key not in ("llm_review", "llm_corrections")
            }

    # (highest from_step that still invalidates the field, field name);
    # the order determines the order of the "cleared" list in the response.
    clear_thresholds = (
        (7, "row_result"),
        (6, "column_result"),
        (4, "crop_result"),
        (3, "dewarp_result"),
        (2, "deskew_result"),
        (1, "orientation_result"),
    )
    for max_step, field in clear_thresholds:
        if from_step <= max_step:
            update_kwargs[field] = None

    await update_session_db(session_id, **update_kwargs)

    # Mirror the persisted changes (including current_step) into the cache entry, if any.
    if session_id in _cache:
        cached_session = _cache[session_id]
        for field, value in update_kwargs.items():
            cached_session[field] = value

    logger.info("Session %s reprocessing from step %s", session_id, from_step)

    return {
        "session_id": session_id,
        "from_step": from_step,
        "cleared": [k for k in update_kwargs if k != "current_step"],
    }