fix(ocr-pipeline): configure RapidOCR for German + tighter word detection

- Switch to PP-OCRv5 Latin model (supports ä, ö, ü, ß)
- Use SERVER model for better accuracy
- Lower Det.unclip_ratio 1.6→1.3 to reduce word merging
- Raise Det.box_thresh 0.5→0.6 for stricter detection

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-02-28 18:17:49 +01:00
parent 8c42fefa77
commit 859342300e

View File

@@ -2196,6 +2196,7 @@ RAPIDOCR_AVAILABLE = False
try:
from rapidocr import RapidOCR as _RapidOCRClass
from rapidocr import LangRec as _LangRec, OCRVersion as _OCRVersion, ModelType as _ModelType
RAPIDOCR_AVAILABLE = True
logger.info("RapidOCR available — can be used as alternative to Tesseract")
except ImportError:
@@ -2203,11 +2204,21 @@ except ImportError:
def _get_rapid_engine():
    """Return the shared RapidOCR engine, creating it on first use.

    The engine is built lazily and cached in the module-level
    ``_rapid_engine`` global, so later calls return the same instance.
    It is constructed exactly once, directly with the German-friendly
    configuration (PP-OCRv5 Latin recognition model, server variant) and
    tighter detection settings; no default-configured engine is created
    first.

    Returns:
        The cached ``_RapidOCRClass`` instance.
    """
    global _rapid_engine
    if _rapid_engine is None:
        _rapid_engine = _RapidOCRClass(params={
            # PP-OCRv5 Latin model — supports German umlauts (ä, ö, ü, ß)
            "Rec.lang_type": _LangRec.LATIN,
            "Rec.model_type": _ModelType.SERVER,
            "Rec.ocr_version": _OCRVersion.PPOCRV5,
            # Tighter detection boxes to reduce word merging
            "Det.unclip_ratio": 1.3,
            "Det.box_thresh": 0.6,
            # Silence verbose logging
            "Global.log_level": "critical",
        })
        logger.info("RapidOCR engine initialized (PP-OCRv5 Latin, unclip_ratio=1.3)")
    return _rapid_engine