fix(ocr-pipeline): configure RapidOCR for German + tighter word detection
- Switch to PP-OCRv5 Latin model (supports ä, ö, ü, ß)
- Use SERVER model for better accuracy
- Lower Det.unclip_ratio 1.6→1.3 to reduce word merging
- Raise Det.box_thresh 0.5→0.6 for stricter detection

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2196,6 +2196,7 @@ RAPIDOCR_AVAILABLE = False
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
from rapidocr import RapidOCR as _RapidOCRClass
|
from rapidocr import RapidOCR as _RapidOCRClass
|
||||||
|
from rapidocr import LangRec as _LangRec, OCRVersion as _OCRVersion, ModelType as _ModelType
|
||||||
RAPIDOCR_AVAILABLE = True
|
RAPIDOCR_AVAILABLE = True
|
||||||
logger.info("RapidOCR available — can be used as alternative to Tesseract")
|
logger.info("RapidOCR available — can be used as alternative to Tesseract")
|
||||||
except ImportError:
|
except ImportError:
|
||||||
@@ -2203,11 +2204,21 @@ except ImportError:
|
|||||||
|
|
||||||
|
|
||||||
def _get_rapid_engine():
    """Return the shared RapidOCR engine, creating it on first call.

    The engine is built once and cached in the module-level
    ``_rapid_engine``; later calls return the cached instance. It is
    configured for the PP-OCRv5 Latin recognition model so that German
    text (ä, ö, ü, ß) is supported, with tighter detection settings to
    reduce merging of neighbouring words.

    NOTE(review): the previous docstring said models are downloaded on
    first use — presumably still true for this configuration; confirm.

    Returns:
        The cached RapidOCR engine instance.
    """
    global _rapid_engine

    # Fast path: engine already built by an earlier call.
    if _rapid_engine is not None:
        return _rapid_engine

    engine_params = {
        # Recognition: PP-OCRv5 Latin model (covers German umlauts and ß),
        # using the SERVER variant.
        "Rec.lang_type": _LangRec.LATIN,
        "Rec.model_type": _ModelType.SERVER,
        "Rec.ocr_version": _OCRVersion.PPOCRV5,
        # Detection: tighter boxes so adjacent words are merged less often.
        "Det.unclip_ratio": 1.3,
        "Det.box_thresh": 0.6,
        # Keep RapidOCR's own logging quiet.
        "Global.log_level": "critical",
    }
    _rapid_engine = _RapidOCRClass(params=engine_params)
    logger.info("RapidOCR engine initialized (PP-OCRv5 Latin, unclip_ratio=1.3)")
    return _rapid_engine
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user