fix(ocr-pipeline): configure RapidOCR for German + tighter word detection

- Switch to PP-OCRv5 Latin model (supports ä, ö, ü, ß)
- Use SERVER model for better accuracy
- Lower Det.unclip_ratio 1.6→1.3 to reduce word merging
- Raise Det.box_thresh 0.5→0.6 for stricter detection

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-28 18:17:49 +01:00
parent 8c42fefa77
commit 859342300e
1 changed files with 14 additions and 3 deletions
@@ -2196,6 +2196,7 @@ RAPIDOCR_AVAILABLE = False

 try:
    from rapidocr import RapidOCR as _RapidOCRClass
+    from rapidocr import LangRec as _LangRec, OCRVersion as _OCRVersion, ModelType as _ModelType
    RAPIDOCR_AVAILABLE = True
    logger.info("RapidOCR available — can be used as alternative to Tesseract")
 except ImportError:
@@ -2203,11 +2204,21 @@ except ImportError:


 def _get_rapid_engine():
-    """Lazy-init RapidOCR engine (downloads models on first use)."""
+    """Lazy-init RapidOCR engine with PP-OCRv5 Latin model for German support."""
    global _rapid_engine
    if _rapid_engine is None:
-        _rapid_engine = _RapidOCRClass()
-        logger.info("RapidOCR engine initialized")
+        _rapid_engine = _RapidOCRClass(params={
+            # PP-OCRv5 Latin model — supports German umlauts (ä, ö, ü, ß)
+            "Rec.lang_type": _LangRec.LATIN,
+            "Rec.model_type": _ModelType.SERVER,
+            "Rec.ocr_version": _OCRVersion.PPOCRV5,
+            # Tighter detection boxes to reduce word merging
+            "Det.unclip_ratio": 1.3,
+            "Det.box_thresh": 0.6,
+            # Silence verbose logging
+            "Global.log_level": "critical",
+        })
+        logger.info("RapidOCR engine initialized (PP-OCRv5 Latin, unclip_ratio=1.3)")
    return _rapid_engine