feat(ocr-pipeline): 6 systematic improvements for robustness, performance & UX

1. Unit tests: 76 new parametrized tests for noise filter, phonetic detection, cell text cleaning, and row merging (116 total, all green)
2. Continuation-row merge: detect multi-line vocab entries where text wraps (lowercase EN + empty DE) and merge into previous entry
3. Empty DE fallback: secondary PSM=7 OCR pass for cells missed by PSM=6
4. Batch-OCR: collect empty cells per column, run single Tesseract call on column strip instead of per-cell (~66% fewer calls for 3+ empty cells)
5. StepReconstruction UI: font scaling via naturalHeight, empty EN/DE field highlighting, undo/redo (Ctrl+Z), per-cell reset button
6. Session reprocess: POST /sessions/{id}/reprocess endpoint to re-run from any step, with reprocess button on completed pipeline steps

Also fixes pre-existing dewarp_image tuple unpacking bug in run_cv_pipeline and updates dewarp tests to match current (image, info) return signature.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-02 14:46:38 +01:00
parent c3a924a620
commit e718353d9f
6 changed files with 775 additions and 79 deletions
--- a/klausur-service/backend/ocr_pipeline_api.py
+++ b/klausur-service/backend/ocr_pipeline_api.py
@@ -1623,6 +1623,69 @@ async def save_reconstruction(session_id: str, request: Request):
    }


+@router.post("/sessions/{session_id}/reprocess")
+async def reprocess_session(session_id: str, request: Request):
+    """Re-run pipeline from a specific step, clearing downstream data.
+
+    Body: {"from_step": 5}  (1-indexed step number, 1-7; defaults to 1)
+
+    Clears downstream results:
+    - from_step <= 1: deskew_result, dewarp_result, column_result, row_result, word_result
+    - from_step <= 2: dewarp_result, column_result, row_result, word_result
+    - from_step <= 3: column_result, row_result, word_result
+    - from_step <= 4: row_result, word_result
+    - from_step <= 5: word_result (cells, vocab_entries)
+    - from_step == 6: word_result.llm_review / llm_corrections only
+    - from_step == 7: nothing is cleared, only current_step is reset
+
+    Raises HTTPException 404 for an unknown session, 400 for a bad body.
+    """
+    session = await get_session_db(session_id)
+    if not session:
+        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
+
+    try:
+        body = await request.json()
+    except ValueError:  # malformed or empty JSON body
+        raise HTTPException(status_code=400, detail="Request body must be valid JSON") from None
+    if not isinstance(body, dict):
+        raise HTTPException(status_code=400, detail="Request body must be a JSON object")
+
+    from_step = body.get("from_step", 1)
+    # bool is an int subclass; without this check JSON `true` would wipe everything.
+    if isinstance(from_step, bool) or not isinstance(from_step, int) or not 1 <= from_step <= 7:
+        raise HTTPException(status_code=400, detail="from_step must be between 1 and 7")
+
+    update_kwargs: Dict[str, Any] = {"current_step": from_step}
+    if from_step <= 5:
+        update_kwargs["word_result"] = None
+    elif from_step == 6 and session.get("word_result"):
+        # Keep OCR output, drop only the LLM review; copy so `session` is not mutated.
+        word_result = dict(session["word_result"])
+        word_result.pop("llm_review", None)
+        word_result.pop("llm_corrections", None)
+        update_kwargs["word_result"] = word_result
+    # Restarting at or before a step also invalidates that step's stored result.
+    for max_step, key in ((4, "row_result"), (3, "column_result"), (2, "dewarp_result"), (1, "deskew_result")):
+        if from_step <= max_step:
+            update_kwargs[key] = None
+
+    await update_session_db(session_id, **update_kwargs)
+
+    # Mirror the persisted changes (including current_step) into the in-memory cache.
+    if session_id in _cache:
+        for key, value in update_kwargs.items():
+            _cache[session_id][key] = value
+
+    logger.info(f"Session {session_id} reprocessing from step {from_step}")
+
+    return {
+        "session_id": session_id,
+        "from_step": from_step,
+        "cleared": [k for k in update_kwargs if k != "current_step"],
+    }
+
+
 async def _get_rows_overlay(session_id: str) -> Response:
    """Generate dewarped image with row bands drawn on it."""
    session = await get_session_db(session_id)