From 859342300e7949dcd4726eb8253ee57db3f566c5 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sat, 28 Feb 2026 18:17:49 +0100 Subject: [PATCH] fix(ocr-pipeline): configure RapidOCR for German + tighter word detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Switch to PP-OCRv5 Latin model (supports ä, ö, ü, ß) - Use SERVER model for better accuracy - Lower Det.unclip_ratio 1.6→1.3 to reduce word merging - Raise Det.box_thresh 0.5→0.6 for stricter detection Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_vocab_pipeline.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 727814c..8ed0b09 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -2196,6 +2196,7 @@ RAPIDOCR_AVAILABLE = False try: from rapidocr import RapidOCR as _RapidOCRClass + from rapidocr import LangRec as _LangRec, OCRVersion as _OCRVersion, ModelType as _ModelType RAPIDOCR_AVAILABLE = True logger.info("RapidOCR available — can be used as alternative to Tesseract") except ImportError: @@ -2203,11 +2204,21 @@ except ImportError: def _get_rapid_engine(): - """Lazy-init RapidOCR engine (downloads models on first use).""" + """Lazy-init RapidOCR engine with PP-OCRv5 Latin model for German support.""" global _rapid_engine if _rapid_engine is None: - _rapid_engine = _RapidOCRClass() - logger.info("RapidOCR engine initialized") + _rapid_engine = _RapidOCRClass(params={ + # PP-OCRv5 Latin model — supports German umlauts (ä, ö, ü, ß) + "Rec.lang_type": _LangRec.LATIN, + "Rec.model_type": _ModelType.SERVER, + "Rec.ocr_version": _OCRVersion.PPOCRV5, + # Tighter detection boxes to reduce word merging + "Det.unclip_ratio": 1.3, + "Det.box_thresh": 0.6, + # Silence verbose logging + "Global.log_level": "critical", + }) + logger.info("RapidOCR engine initialized (PP-OCRv5 Latin, unclip_ratio=1.3)") return _rapid_engine