fix(ocr-pipeline): configure RapidOCR for German + tighter word detection
- Switch to PP-OCRv5 Latin model (supports ä, ö, ü, ß)
- Use SERVER model for better accuracy
- Lower Det.unclip_ratio 1.6→1.3 to reduce word merging
- Raise Det.box_thresh 0.5→0.6 for stricter detection

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2196,6 +2196,7 @@ RAPIDOCR_AVAILABLE = False
|
||||
|
||||
try:
|
||||
from rapidocr import RapidOCR as _RapidOCRClass
|
||||
from rapidocr import LangRec as _LangRec, OCRVersion as _OCRVersion, ModelType as _ModelType
|
||||
RAPIDOCR_AVAILABLE = True
|
||||
logger.info("RapidOCR available — can be used as alternative to Tesseract")
|
||||
except ImportError:
|
||||
@@ -2203,11 +2204,21 @@ except ImportError:
|
||||
|
||||
|
||||
def _get_rapid_engine():
|
||||
"""Lazy-init RapidOCR engine (downloads models on first use)."""
|
||||
"""Lazy-init RapidOCR engine with PP-OCRv5 Latin model for German support."""
|
||||
global _rapid_engine
|
||||
if _rapid_engine is None:
|
||||
_rapid_engine = _RapidOCRClass()
|
||||
logger.info("RapidOCR engine initialized")
|
||||
_rapid_engine = _RapidOCRClass(params={
|
||||
# PP-OCRv5 Latin model — supports German umlauts (ä, ö, ü, ß)
|
||||
"Rec.lang_type": _LangRec.LATIN,
|
||||
"Rec.model_type": _ModelType.SERVER,
|
||||
"Rec.ocr_version": _OCRVersion.PPOCRV5,
|
||||
# Tighter detection boxes to reduce word merging
|
||||
"Det.unclip_ratio": 1.3,
|
||||
"Det.box_thresh": 0.6,
|
||||
# Silence verbose logging
|
||||
"Global.log_level": "critical",
|
||||
})
|
||||
logger.info("RapidOCR engine initialized (PP-OCRv5 Latin, unclip_ratio=1.3)")
|
||||
return _rapid_engine
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user