Configure the lazily created RapidOCR engine for German text.

This patch to klausur-service/backend/cv_vocab_pipeline.py does two things:

1. Inside the existing guarded import of rapidocr, it additionally imports
   LangRec, OCRVersion and ModelType under private aliases. The new import
   sits in the same try block as the RapidOCR import, so an ImportError
   raised by it reaches the same except branch.
2. _get_rapid_engine() now passes a params dict to the RapidOCR constructor
   instead of calling it without arguments:
     - recognition: Latin language type, SERVER model type, PP-OCRv5
     - detection: unclip_ratio 1.3 and box_thresh 0.6
     - global log level "critical"
   The log message emitted after initialization is updated accordingly.

The engine is still created only while the module-level _rapid_engine is
None, and the cached instance is returned on every call.

Review notes (assumptions to confirm; not established by this patch):
- The replaced docstring said that models are downloaded on first use; the
  new one no longer mentions it. Presumably that still holds - confirm, and
  consider keeping the remark.
- Confirm that the installed rapidocr release ships a SERVER variant of the
  PP-OCRv5 Latin recognition model; if only a mobile variant is available,
  engine construction may fail on first use.
- The value 1.3 is repeated as a literal in the log message and can drift
  from the value in the params dict.

diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index 727814c..8ed0b09 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -2196,6 +2196,7 @@ RAPIDOCR_AVAILABLE = False
 
 try:
     from rapidocr import RapidOCR as _RapidOCRClass
+    from rapidocr import LangRec as _LangRec, OCRVersion as _OCRVersion, ModelType as _ModelType
     RAPIDOCR_AVAILABLE = True
     logger.info("RapidOCR available — can be used as alternative to Tesseract")
 except ImportError:
@@ -2203,11 +2204,21 @@ except ImportError:
 
 
 def _get_rapid_engine():
-    """Lazy-init RapidOCR engine (downloads models on first use)."""
+    """Lazy-init RapidOCR engine with PP-OCRv5 Latin model for German support."""
     global _rapid_engine
     if _rapid_engine is None:
-        _rapid_engine = _RapidOCRClass()
-        logger.info("RapidOCR engine initialized")
+        _rapid_engine = _RapidOCRClass(params={
+            # PP-OCRv5 Latin model — supports German umlauts (ä, ö, ü, ß)
+            "Rec.lang_type": _LangRec.LATIN,
+            "Rec.model_type": _ModelType.SERVER,
+            "Rec.ocr_version": _OCRVersion.PPOCRV5,
+            # Tighter detection boxes to reduce word merging
+            "Det.unclip_ratio": 1.3,
+            "Det.box_thresh": 0.6,
+            # Silence verbose logging
+            "Global.log_level": "critical",
+        })
+        logger.info("RapidOCR engine initialized (PP-OCRv5 Latin, unclip_ratio=1.3)")
     return _rapid_engine
 
 