Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m25s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 20s
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
95 lines
3.3 KiB
Python
95 lines
3.3 KiB
Python
"""
|
|
OCR Pipeline Reprocess Endpoint.
|
|
|
|
POST /sessions/{session_id}/reprocess — clear downstream + restart from step.
|
|
|
|
Lizenz: Apache 2.0
|
|
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
|
"""
|
|
|
|
import logging
|
|
from typing import Any, Dict
|
|
|
|
from fastapi import APIRouter, HTTPException, Request
|
|
|
|
from .common import _cache
|
|
from .session_store import get_session_db, update_session_db
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Router on which this module's endpoint is registered; tagged "ocr-pipeline".
router = APIRouter(tags=["ocr-pipeline"])
|
|
|
|
|
|
@router.post("/sessions/{session_id}/reprocess")
async def reprocess_session(session_id: str, request: Request):
    """Reset a session to an earlier pipeline step and clear downstream results.

    Body: {"from_step": 5}  (integer from 1 to 10; defaults to 1 when omitted)

    Results cleared, as implemented below (every matching rule applies, so a
    lower from_step clears everything a higher one does):
    - from_step <= 8: word_result
    - from_step == 9: only the "llm_review" and "llm_corrections" entries of
      word_result (the rest of word_result is kept)
    - from_step <= 7: row_result
    - from_step <= 6: column_result
    - from_step <= 4: crop_result
    - from_step <= 3: dewarp_result
    - from_step <= 2: deskew_result
    - from_step <= 1: orientation_result
    - from_step == 10: nothing is cleared; only current_step is updated

    NOTE(review): the step numbering is inconsistent. The word/row/column
    thresholds follow the "new" order Orient(2) -> Deskew(3) -> Dewarp(4) ->
    Crop(5) -> Columns(6) -> Rows(7) -> Words(8) -> LLM(9) -> Recon(10) ->
    GT(11), while the crop/dewarp/deskew/orientation thresholds and the 1..10
    validation follow the older order Orientation(1) -> ... -> Validation(10).
    The thresholds are left exactly as they were; confirm the intended
    numbering with the frontend before changing them.

    Args:
        session_id: Identifier of the session to reset.
        request: Incoming request whose JSON body may carry "from_step".

    Returns:
        A dict with "session_id", "from_step" and "cleared" (the names of the
        result fields that were reset or rewritten).

    Raises:
        HTTPException: 404 if the session does not exist; 400 if the body is
            not a JSON object or from_step is not an integer in 1..10.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    # A missing or malformed body is a client error, not a server error.
    # JSON decoding failures are ValueError subclasses.
    try:
        body = await request.json()
    except ValueError as exc:
        raise HTTPException(
            status_code=400, detail="Request body must be valid JSON"
        ) from exc
    if not isinstance(body, dict):
        raise HTTPException(status_code=400, detail="Request body must be a JSON object")

    from_step = body.get("from_step", 1)
    # bool is a subclass of int, so JSON true/false must be rejected explicitly.
    if (
        isinstance(from_step, bool)
        or not isinstance(from_step, int)
        or not 1 <= from_step <= 10
    ):
        raise HTTPException(status_code=400, detail="from_step must be between 1 and 10")

    update_kwargs: Dict[str, Any] = {"current_step": from_step}

    if from_step <= 8:
        update_kwargs["word_result"] = None
    elif from_step == 9:
        # Only drop the LLM review data; keep the rest of word_result.
        word_result = session.get("word_result")
        if word_result:
            # Work on a copy so the loaded session is not mutated before the
            # change has been persisted.
            trimmed = dict(word_result)
            trimmed.pop("llm_review", None)
            trimmed.pop("llm_corrections", None)
            update_kwargs["word_result"] = trimmed

    # (highest from_step that still clears the field, field name)
    for threshold, field in (
        (7, "row_result"),
        (6, "column_result"),
        (4, "crop_result"),
        (3, "dewarp_result"),
        (2, "deskew_result"),
        (1, "orientation_result"),
    ):
        if from_step <= threshold:
            update_kwargs[field] = None

    await update_session_db(session_id, **update_kwargs)

    # Mirror the persisted changes into the in-memory cache entry, if any.
    if session_id in _cache:
        cached = _cache[session_id]
        for key, value in update_kwargs.items():
            cached[key] = value

    logger.info("Session %s reprocessing from step %s", session_id, from_step)

    return {
        "session_id": session_id,
        "from_step": from_step,
        "cleared": [k for k in update_kwargs if k != "current_step"],
    }
|