[split-required] Split 700-870 LOC files across all services
backend-lehrer (11 files): - llm_gateway/routes/schools.py (867 → 5), recording_api.py (848 → 6) - messenger_api.py (840 → 5), print_generator.py (824 → 5) - unit_analytics_api.py (751 → 5), classroom/routes/context.py (726 → 4) - llm_gateway/routes/edu_search_seeds.py (710 → 4) klausur-service (12 files): - ocr_labeling_api.py (845 → 4), metrics_db.py (833 → 4) - legal_corpus_api.py (790 → 4), page_crop.py (758 → 3) - mail/ai_service.py (747 → 4), github_crawler.py (767 → 3) - trocr_service.py (730 → 4), full_compliance_pipeline.py (723 → 4) - dsfa_rag_api.py (715 → 4), ocr_pipeline_auto.py (705 → 4) website (6 pages): - audit-checklist (867 → 8), content (806 → 6) - screen-flow (790 → 4), scraper (789 → 5) - zeugnisse (776 → 5), modules (745 → 4) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
94
klausur-service/backend/ocr_pipeline_reprocess.py
Normal file
94
klausur-service/backend/ocr_pipeline_reprocess.py
Normal file
@@ -0,0 +1,94 @@
|
||||
"""
|
||||
OCR Pipeline Reprocess Endpoint.
|
||||
|
||||
POST /sessions/{session_id}/reprocess — clear downstream + restart from step.
|
||||
|
||||
License: Apache 2.0
|
||||
PRIVACY: All processing is performed locally.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Any, Dict
|
||||
|
||||
from fastapi import APIRouter, HTTPException, Request
|
||||
|
||||
from ocr_pipeline_common import _cache
|
||||
from ocr_pipeline_session_store import get_session_db, update_session_db
|
||||
|
||||
# Module-level logger, named after this module so records can be filtered per file.
logger = logging.getLogger(__name__)
|
||||
|
||||
# Router that carries the reprocess endpoint; routes are tagged "ocr-pipeline".
router = APIRouter(tags=["ocr-pipeline"])
|
||||
|
||||
|
||||
@router.post("/sessions/{session_id}/reprocess")
async def reprocess_session(session_id: str, request: Request):
    """Reset a session to an earlier pipeline step and clear downstream results.

    Request body: a JSON object such as ``{"from_step": 5}``. ``from_step``
    defaults to 1 when the key is absent and must be an integer in 1..10
    (booleans are rejected).

    Stored results that are reset to ``None``, by ``from_step``:

    - 1:    word, row, column, crop, dewarp, deskew, orientation
    - 2:    word, row, column, crop, dewarp, deskew
    - 3:    word, row, column, crop, dewarp
    - 4:    word, row, column, crop
    - 5, 6: word, row, column
    - 7:    word, row
    - 8:    word
    - 9:    nothing is reset; only the ``llm_review`` and ``llm_corrections``
            keys are removed from an existing ``word_result``
    - 10:   nothing

    NOTE(review): the thresholds above mix two step numberings. The upstream
    stages (orientation .. crop) use Orientation(1) .. Crop(4), while the
    downstream stages use Columns(6), Rows(7), Words(8), LLM(9). They are
    kept exactly as they were; confirm the intended numbering with the
    frontend before changing them.

    Returns:
        A dict with ``session_id``, ``from_step`` and ``cleared`` (the names
        of the session fields that were written).

    Raises:
        HTTPException: 404 if the session does not exist; 400 if the body is
            not a JSON object or ``from_step`` is not an integer in 1..10.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    # A malformed or empty body is a client error, not a server fault.
    # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
    try:
        body = await request.json()
    except ValueError:
        raise HTTPException(status_code=400, detail="Request body must be valid JSON") from None
    if not isinstance(body, dict):
        raise HTTPException(status_code=400, detail="Request body must be a JSON object")

    from_step = body.get("from_step", 1)
    # bool is a subclass of int; without the explicit check `true` would be
    # accepted as step 1 and wipe every stored result.
    if (
        isinstance(from_step, bool)
        or not isinstance(from_step, int)
        or not 1 <= from_step <= 10
    ):
        raise HTTPException(status_code=400, detail="from_step must be between 1 and 10")

    update_kwargs: Dict[str, Any] = {"current_step": from_step}

    if from_step <= 8:
        update_kwargs["word_result"] = None
    elif from_step == 9:
        # Keep the recognised words, drop only the LLM review artefacts.
        word_result = session.get("word_result")
        if word_result:
            # Work on a shallow copy so the loaded session is not modified
            # before the database write has succeeded.
            word_result = dict(word_result)
            word_result.pop("llm_review", None)
            word_result.pop("llm_corrections", None)
            update_kwargs["word_result"] = word_result

    # (highest from_step that still clears the field, field name); the order
    # determines the order of the "cleared" list in the response.
    clear_thresholds = (
        (7, "row_result"),
        (6, "column_result"),
        (4, "crop_result"),
        (3, "dewarp_result"),
        (2, "deskew_result"),
        (1, "orientation_result"),
    )
    for threshold, field in clear_thresholds:
        if from_step <= threshold:
            update_kwargs[field] = None

    await update_session_db(session_id, **update_kwargs)

    # Mirror the persisted changes into the in-memory cache, if the session is cached.
    if session_id in _cache:
        _cache[session_id].update(update_kwargs)

    logger.info("Session %s reprocessing from step %s", session_id, from_step)

    return {
        "session_id": session_id,
        "from_step": from_step,
        "cleared": [key for key in update_kwargs if key != "current_step"],
    }
|
||||
Reference in New Issue
Block a user