[split-required] Split 700-870 LOC files across all services

backend-lehrer (11 files): - llm_gateway/routes/schools.py (867 → 5), recording_api.py (848 → 6) - messenger_api.py (840 → 5), print_generator.py (824 → 5) - unit_analytics_api.py (751 → 5), classroom/routes/context.py (726 → 4) - llm_gateway/routes/edu_search_seeds.py (710 → 4) klausur-service (12 files): - ocr_labeling_api.py (845 → 4), metrics_db.py (833 → 4) - legal_corpus_api.py (790 → 4), page_crop.py (758 → 3) - mail/ai_service.py (747 → 4), github_crawler.py (767 → 3) - trocr_service.py (730 → 4), full_compliance_pipeline.py (723 → 4) - dsfa_rag_api.py (715 → 4), ocr_pipeline_auto.py (705 → 4) website (6 pages): - audit-checklist (867 → 8), content (806 → 6) - screen-flow (790 → 4), scraper (789 → 5) - zeugnisse (776 → 5), modules (745 → 4) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 08:01:18 +02:00
parent b6983ab1dc
commit 34da9f4cda
106 changed files with 16500 additions and 16947 deletions
--- a/klausur-service/backend/ocr_pipeline_reprocess.py
+++ b/klausur-service/backend/ocr_pipeline_reprocess.py
@@ -0,0 +1,94 @@
+"""
+OCR Pipeline Reprocess Endpoint.
+
+POST /sessions/{session_id}/reprocess — clear downstream + restart from step.
+
+Lizenz: Apache 2.0
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+from typing import Any, Dict
+
+from fastapi import APIRouter, HTTPException, Request
+
+from ocr_pipeline_common import _cache
+from ocr_pipeline_session_store import get_session_db, update_session_db
+
+logger = logging.getLogger(__name__)  # logger named after this module
+
+router = APIRouter(tags=["ocr-pipeline"])  # router for the endpoint below, tagged "ocr-pipeline"
+
+
+@router.post("/sessions/{session_id}/reprocess")
+async def reprocess_session(session_id: str, request: Request):
+    """Re-run pipeline from a specific step, clearing downstream data.
+
+    Body: {"from_step": 5}  (1-indexed step number)
+
+    Pipeline order: Orientation(1) -> Deskew(2) -> Dewarp(3) -> Crop(4) -> Columns(5) ->
+                    Rows(6) -> Words(7) -> LLM-Review(8) -> Reconstruction(9) -> Validation(10)
+
+    Clears downstream results:
+    - from_step <= 1: orientation_result + all downstream
+    - from_step <= 2: deskew_result + all downstream
+    - from_step <= 3: dewarp_result + all downstream
+    - from_step <= 4: crop_result + all downstream
+    - from_step <= 5: column_result, row_result, word_result
+    - from_step <= 6: row_result, word_result
+    - from_step <= 7: word_result (cells, vocab_entries)
+    - from_step <= 8: word_result.llm_review only
+    """
+    session = await get_session_db(session_id)
+    if not session:
+        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
+
+    body = await request.json()
+    from_step = body.get("from_step", 1)
+    if not isinstance(from_step, int) or from_step < 1 or from_step > 10:
+        raise HTTPException(status_code=400, detail="from_step must be between 1 and 10")
+
+    update_kwargs: Dict[str, Any] = {"current_step": from_step}
+
+    # Clear downstream data based on from_step
+    # New pipeline order: Orient(2) -> Deskew(3) -> Dewarp(4) -> Crop(5) ->
+    #   Columns(6) -> Rows(7) -> Words(8) -> LLM(9) -> Recon(10) -> GT(11)
+    if from_step <= 8:
+        update_kwargs["word_result"] = None
+    elif from_step == 9:
+        # Only clear LLM review from word_result
+        word_result = session.get("word_result")
+        if word_result:
+            word_result.pop("llm_review", None)
+            word_result.pop("llm_corrections", None)
+            update_kwargs["word_result"] = word_result
+
+    if from_step <= 7:
+        update_kwargs["row_result"] = None
+    if from_step <= 6:
+        update_kwargs["column_result"] = None
+    if from_step <= 4:
+        update_kwargs["crop_result"] = None
+    if from_step <= 3:
+        update_kwargs["dewarp_result"] = None
+    if from_step <= 2:
+        update_kwargs["deskew_result"] = None
+    if from_step <= 1:
+        update_kwargs["orientation_result"] = None
+
+    await update_session_db(session_id, **update_kwargs)
+
+    # Also clear cache
+    if session_id in _cache:
+        for key in list(update_kwargs.keys()):
+            if key != "current_step":
+                _cache[session_id][key] = update_kwargs[key]
+        _cache[session_id]["current_step"] = from_step
+
+    logger.info(f"Session {session_id} reprocessing from step {from_step}")
+
+    return {
+        "session_id": session_id,
+        "from_step": from_step,
+        "cleared": [k for k in update_kwargs if k != "current_step"],
+    }