Files
breakpilot-lehrer/klausur-service/backend/ocr_pipeline_words.py
Benjamin Admin b6983ab1dc [split-required] Split 500-1000 LOC files across all services
backend-lehrer (5 files):
- alerts_agent/db/repository.py (992 → 5), abitur_docs_api.py (956 → 3)
- teacher_dashboard_api.py (951 → 3), services/pdf_service.py (916 → 3)
- mail/mail_db.py (987 → 6)

klausur-service (5 files):
- legal_templates_ingestion.py (942 → 3), ocr_pipeline_postprocess.py (929 → 4)
- ocr_pipeline_words.py (876 → 3), ocr_pipeline_ocr_merge.py (616 → 2)
- KorrekturPage.tsx (956 → 6)

website (5 pages):
- mail (985 → 9), edu-search (958 → 8), mac-mini (950 → 7)
- ocr-labeling (946 → 7), audit-workspace (871 → 4)

studio-v2 (5 files + 1 deleted):
- page.tsx (946 → 5), MessagesContext.tsx (925 → 4)
- korrektur (914 → 6), worksheet-cleanup (899 → 6)
- useVocabWorksheet.ts (888 → 3)
- Deleted dead page-original.tsx (934 LOC)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-24 23:35:37 +02:00

186 lines
6.1 KiB
Python

"""
OCR Pipeline Words — composite router for word detection, PaddleOCR direct,
and ground truth endpoints.
Split into sub-modules:
ocr_pipeline_words_detect — main detect_words endpoint (Step 7)
ocr_pipeline_words_stream — SSE streaming generators
This barrel module contains the PaddleOCR direct endpoint and ground truth
endpoints, and assembles all word-related routers.
Lizenz: Apache 2.0
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
import time
from datetime import datetime
from typing import Any, Dict, List, Optional
import cv2
import numpy as np
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from cv_words_first import build_grid_from_words
from ocr_pipeline_session_store import (
get_session_db,
get_session_image,
update_session_db,
)
from ocr_pipeline_common import (
_cache,
_append_pipeline_log,
)
from ocr_pipeline_words_detect import router as _detect_router
logger = logging.getLogger(__name__)
_local_router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["ocr-pipeline"])
# ---------------------------------------------------------------------------
# Pydantic models
# ---------------------------------------------------------------------------
class WordGroundTruthRequest(BaseModel):
    """Request payload for saving ground-truth feedback on word recognition."""
    # Whether the automatic word-recognition result was fully correct.
    is_correct: bool
    # Manually corrected cell entries; None when no corrections were supplied.
    corrected_entries: Optional[List[Dict[str, Any]]] = None
    # Optional free-text reviewer notes.
    notes: Optional[str] = None
# ---------------------------------------------------------------------------
# PaddleOCR Direct Endpoint
# ---------------------------------------------------------------------------
@_local_router.post("/sessions/{session_id}/paddle-direct")
async def paddle_direct(session_id: str):
    """Run PaddleOCR on the preprocessed image and build a word grid directly."""
    # Prefer the most-processed image variant available for this session:
    # cropped, then dewarped, then the original upload.
    img_png = None
    for variant in ("cropped", "dewarped", "original"):
        img_png = await get_session_image(session_id, variant)
        if img_png:
            break
    if not img_png:
        raise HTTPException(status_code=404, detail="No image found for this session")

    decoded = cv2.imdecode(np.frombuffer(img_png, dtype=np.uint8), cv2.IMREAD_COLOR)
    if decoded is None:
        raise HTTPException(status_code=400, detail="Failed to decode original image")
    img_h, img_w = decoded.shape[:2]

    # Imported lazily so the OCR engine is only loaded when this route runs.
    from cv_ocr_engines import ocr_region_paddle

    started = time.time()
    # region=None runs PaddleOCR over the full image.
    word_dicts = await ocr_region_paddle(decoded, region=None)
    if not word_dicts:
        raise HTTPException(status_code=400, detail="PaddleOCR returned no words")
    cells, columns_meta = build_grid_from_words(word_dicts, img_w, img_h)
    elapsed = time.time() - started

    # Tag every cell with the engine that produced it.
    for cell in cells:
        cell["ocr_engine"] = "paddle_direct"

    row_count = len({c["row_index"] for c in cells}) if cells else 0
    col_count = len(columns_meta)
    # A vocab layout is assumed when EN/DE columns were detected.
    detected_types = {c.get("type") for c in columns_meta}
    is_vocab = bool(detected_types & {"column_en", "column_de"})
    non_empty = sum(1 for c in cells if c.get("text"))

    word_result = {
        "cells": cells,
        "grid_shape": {"rows": row_count, "cols": col_count, "total_cells": len(cells)},
        "columns_used": columns_meta,
        "layout": "vocab" if is_vocab else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(elapsed, 2),
        "ocr_engine": "paddle_direct",
        "grid_method": "paddle_direct",
        "summary": {
            "total_cells": len(cells),
            "non_empty_cells": non_empty,
            "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
        },
    }

    # Persist the result and advance the pipeline to step 8.
    await update_session_db(
        session_id,
        word_result=word_result,
        cropped_png=img_png,
        current_step=8,
    )
    logger.info(
        "paddle_direct session %s: %d cells (%d rows, %d cols) in %.2fs",
        session_id, len(cells), row_count, col_count, elapsed,
    )
    await _append_pipeline_log(session_id, "paddle_direct", {
        "total_cells": len(cells),
        "non_empty_cells": non_empty,
        "ocr_engine": "paddle_direct",
    }, duration_ms=int(elapsed * 1000))
    return {"session_id": session_id, **word_result}
# ---------------------------------------------------------------------------
# Ground Truth Words Endpoints
# ---------------------------------------------------------------------------
@_local_router.post("/sessions/{session_id}/ground-truth/words")
async def save_word_ground_truth(session_id: str, req: WordGroundTruthRequest):
    """Save ground truth feedback for the word recognition step.

    Stores the reviewer's verdict under ``ground_truth["words"]`` together
    with a snapshot of the automatic ``word_result`` it refers to, persists
    it via ``update_session_db``, and mirrors the change into the in-memory
    session cache.

    Raises:
        HTTPException: 404 when the session does not exist.
    """
    # Local import keeps the module-level import block untouched.
    from datetime import timezone

    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    ground_truth = session.get("ground_truth") or {}
    gt = {
        "is_correct": req.is_correct,
        "corrected_entries": req.corrected_entries,
        "notes": req.notes,
        # FIX: datetime.utcnow() is deprecated (Python 3.12+) and returns a
        # naive timestamp. Use an aware UTC timestamp instead; the ISO string
        # now carries an explicit "+00:00" offset.
        "saved_at": datetime.now(timezone.utc).isoformat(),
        # Snapshot of the automatic result this feedback refers to.
        "word_result": session.get("word_result"),
    }
    ground_truth["words"] = gt
    await update_session_db(session_id, ground_truth=ground_truth)
    # Keep the in-memory cache coherent with the persisted state.
    if session_id in _cache:
        _cache[session_id]["ground_truth"] = ground_truth
    return {"session_id": session_id, "ground_truth": gt}
@_local_router.get("/sessions/{session_id}/ground-truth/words")
async def get_word_ground_truth(session_id: str):
    """Retrieve saved ground truth for word recognition.

    Returns both the reviewer-saved ground truth and the automatic
    ``word_result`` it was compared against; 404 when either the session
    or a saved word ground truth is missing.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    # Missing "ground_truth" key and missing "words" entry are both a 404.
    saved = (session.get("ground_truth") or {}).get("words")
    if not saved:
        raise HTTPException(status_code=404, detail="No word ground truth saved")
    return {
        "session_id": session_id,
        "words_gt": saved,
        "words_auto": session.get("word_result"),
    }
# ---------------------------------------------------------------------------
# Composite router
# ---------------------------------------------------------------------------
# Composite router exported by this barrel module: combines the Step-7
# word-detection routes with the PaddleOCR-direct and ground-truth routes
# defined above. Include order is preserved — FastAPI matches routes in
# registration order.
router = APIRouter()
router.include_router(_detect_router)
router.include_router(_local_router)