feat: use local RapidOCR as default in ocr_region_paddle(), remote as fallback
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 17s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 17s
RapidOCR runs the same PP-OCRv5 ONNX models locally, which avoids the 504 timeouts from the remote PaddleOCR service on large images. Set FORCE_REMOTE_PADDLE=1 to skip the local RapidOCR attempt and send the image straight to the remote service.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -7,6 +7,7 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
|||||||
|
|
||||||
import io
|
import io
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
import re
|
import re
|
||||||
from typing import Any, Dict, List, Optional, Tuple
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
@@ -392,11 +393,32 @@ async def ocr_region_paddle(
|
|||||||
img_bgr: np.ndarray,
|
img_bgr: np.ndarray,
|
||||||
region: Optional["PageRegion"] = None,
|
region: Optional["PageRegion"] = None,
|
||||||
) -> List[Dict[str, Any]]:
|
) -> List[Dict[str, Any]]:
|
||||||
"""Run OCR via remote PaddleOCR service (Hetzner).
|
"""Run OCR via local RapidOCR (default) or remote PaddleOCR (fallback).
|
||||||
|
|
||||||
If *region* is given, crops before sending. Otherwise sends the full image.
|
Uses RapidOCR (same PP-OCRv5 ONNX models) locally for speed and reliability.
|
||||||
Returns word dicts in the standard format (left/top in absolute coords).
|
Falls back to remote PaddleOCR service only if:
|
||||||
|
- env FORCE_REMOTE_PADDLE=1 is set, or
|
||||||
|
- RapidOCR fails or returns no words
|
||||||
"""
|
"""
|
||||||
|
force_remote = os.environ.get("FORCE_REMOTE_PADDLE", "").strip() == "1"
|
||||||
|
|
||||||
|
if not force_remote:
|
||||||
|
try:
|
||||||
|
if region is None:
|
||||||
|
h, w = img_bgr.shape[:2]
|
||||||
|
_region = PageRegion(type="full_page", x=0, y=0, width=w, height=h)
|
||||||
|
else:
|
||||||
|
_region = region
|
||||||
|
|
||||||
|
words = ocr_region_rapid(img_bgr, _region)
|
||||||
|
if words:
|
||||||
|
logger.info("ocr_region_paddle: used local RapidOCR (%d words)", len(words))
|
||||||
|
return words
|
||||||
|
logger.warning("ocr_region_paddle: RapidOCR returned 0 words, trying remote")
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("ocr_region_paddle: RapidOCR failed (%s), trying remote", e)
|
||||||
|
|
||||||
|
# --- Remote PaddleOCR fallback (Hetzner x86_64) ---
|
||||||
from services.paddleocr_remote import ocr_remote_paddle
|
from services.paddleocr_remote import ocr_remote_paddle
|
||||||
|
|
||||||
if region is not None:
|
if region is not None:
|
||||||
@@ -431,6 +453,7 @@ async def ocr_region_paddle(
|
|||||||
return []
|
return []
|
||||||
|
|
||||||
words, _w, _h = await ocr_remote_paddle(jpg_buf.tobytes(), filename="scan.jpg")
|
words, _w, _h = await ocr_remote_paddle(jpg_buf.tobytes(), filename="scan.jpg")
|
||||||
|
logger.info("ocr_region_paddle: used remote PaddleOCR (%d words)", len(words))
|
||||||
|
|
||||||
# Scale coordinates back to original size and shift to absolute image space
|
# Scale coordinates back to original size and shift to absolute image space
|
||||||
inv_scale = 1.0 / scale if scale != 1.0 else 1.0
|
inv_scale = 1.0 / scale if scale != 1.0 else 1.0
|
||||||
|
|||||||
Reference in New Issue
Block a user