feat: PaddleOCR Remote-Engine (PP-OCRv5 Latin auf Hetzner x86_64)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 31s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m7s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 21s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 31s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m7s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 21s
PaddleOCR als neue engine=paddle Option in der OCR-Pipeline. Microservice auf Hetzner (paddleocr-service/), async HTTP-Client (paddleocr_remote.py), Frontend-Dropdown, automatisch words_first. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -385,6 +385,51 @@ def ocr_region_lighton(img_bgr: np.ndarray, region: PageRegion) -> List[Dict[str
|
||||
return []
|
||||
|
||||
|
||||
# --- Remote PaddleOCR (Hetzner x86_64) ---
|
||||
|
||||
|
||||
async def ocr_region_paddle(
    img_bgr: np.ndarray,
    region: Optional["PageRegion"] = None,
) -> List[Dict[str, Any]]:
    """OCR an image (or one region of it) through the remote PaddleOCR service.

    Args:
        img_bgr: Source image array; it is PNG-encoded with ``cv2.imencode``
            before upload.
        region: Optional region to crop to (uses ``x``, ``y``, ``width``,
            ``height`` and ``type``). ``None`` sends the whole image.

    Returns:
        The word dicts returned by the service, with ``left``/``top`` shifted
        by the crop origin so they refer to the full image. When a region is
        given, each word additionally gets ``region_type``. An empty list is
        returned for an empty crop or when PNG encoding fails.
    """
    from services.paddleocr_remote import ocr_remote_paddle

    if region is None:
        # Full-page mode: nothing to crop, so no coordinate shift either.
        patch_img, shift_x, shift_y = img_bgr, 0, 0
    else:
        shift_x, shift_y = region.x, region.y
        patch_img = img_bgr[
            shift_y : shift_y + region.height,
            shift_x : shift_x + region.width,
        ]

    # An empty crop has no pixels to upload.
    if patch_img.size == 0:
        return []

    encoded_ok, png_array = cv2.imencode(".png", patch_img)
    if not encoded_ok:
        logger.error("ocr_region_paddle: cv2.imencode failed")
        return []

    # The service also reports the image dimensions; they are not needed here.
    found_words, _, _ = await ocr_remote_paddle(png_array.tobytes())

    # Word boxes are relative to the uploaded crop; move them back into the
    # coordinate system of the full image.
    for word in found_words:
        word["left"] += shift_x
        word["top"] += shift_y
        if region is not None:
            word["region_type"] = region.type

    return found_words
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Post-Processing: Deterministic Quality Fixes
|
||||
# =============================================================================
|
||||
|
||||
@@ -1865,7 +1865,7 @@ async def detect_words(
|
||||
"""Build word grid from columns × rows, OCR each cell.
|
||||
|
||||
Query params:
|
||||
engine: 'auto' (default), 'tesseract', or 'rapid'
|
||||
engine: 'auto' (default), 'tesseract', 'rapid', or 'paddle'
|
||||
pronunciation: 'british' (default) or 'american' — for IPA dictionary lookup
|
||||
stream: false (default) for JSON response, true for SSE streaming
|
||||
skip_heal_gaps: false (default). When true, cells keep exact row geometry
|
||||
@@ -1874,6 +1874,11 @@ async def detect_words(
|
||||
'v2' uses pre-detected columns/rows (top-down).
|
||||
'words_first' clusters words bottom-up (no column/row detection needed).
|
||||
"""
|
||||
# PaddleOCR is full-page remote OCR → force words_first grid method
|
||||
if engine == "paddle" and grid_method != "words_first":
|
||||
logger.info("detect_words: engine=paddle requires words_first, overriding grid_method=%s", grid_method)
|
||||
grid_method = "words_first"
|
||||
|
||||
if session_id not in _cache:
|
||||
logger.info("detect_words: session %s not in cache, loading from DB", session_id)
|
||||
await _load_session_to_cache(session_id)
|
||||
@@ -1993,33 +1998,43 @@ async def detect_words(
|
||||
t0 = time.time()
|
||||
img_h, img_w = dewarped_bgr.shape[:2]
|
||||
|
||||
# Get word_dicts from cache or run Tesseract full-page
|
||||
wf_word_dicts = cached.get("_word_dicts")
|
||||
if wf_word_dicts is None:
|
||||
ocr_img_tmp = create_ocr_image(dewarped_bgr)
|
||||
geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr)
|
||||
if geo_result is not None:
|
||||
_geoms, left_x, right_x, top_y, bottom_y, wf_word_dicts, inv = geo_result
|
||||
cached["_word_dicts"] = wf_word_dicts
|
||||
cached["_inv"] = inv
|
||||
cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)
|
||||
# For paddle engine: run remote PaddleOCR full-page instead of Tesseract
|
||||
if engine == "paddle":
|
||||
from cv_ocr_engines import ocr_region_paddle
|
||||
|
||||
wf_word_dicts = await ocr_region_paddle(dewarped_bgr, region=None)
|
||||
# PaddleOCR returns absolute coordinates, no content_bounds offset needed
|
||||
cached["_paddle_word_dicts"] = wf_word_dicts
|
||||
else:
|
||||
# Get word_dicts from cache or run Tesseract full-page
|
||||
wf_word_dicts = cached.get("_word_dicts")
|
||||
if wf_word_dicts is None:
|
||||
ocr_img_tmp = create_ocr_image(dewarped_bgr)
|
||||
geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr)
|
||||
if geo_result is not None:
|
||||
_geoms, left_x, right_x, top_y, bottom_y, wf_word_dicts, inv = geo_result
|
||||
cached["_word_dicts"] = wf_word_dicts
|
||||
cached["_inv"] = inv
|
||||
cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)
|
||||
|
||||
if not wf_word_dicts:
|
||||
raise HTTPException(status_code=400, detail="No words detected — cannot build words-first grid")
|
||||
|
||||
# Convert word coordinates to absolute image coordinates if needed
|
||||
# (detect_column_geometry returns words relative to content ROI)
|
||||
content_bounds = cached.get("_content_bounds")
|
||||
if content_bounds:
|
||||
lx, _rx, ty, _by = content_bounds
|
||||
abs_words = []
|
||||
for w in wf_word_dicts:
|
||||
abs_words.append({
|
||||
**w,
|
||||
'left': w['left'] + lx,
|
||||
'top': w['top'] + ty,
|
||||
})
|
||||
wf_word_dicts = abs_words
|
||||
# PaddleOCR already returns absolute coordinates — skip offset.
|
||||
if engine != "paddle":
|
||||
content_bounds = cached.get("_content_bounds")
|
||||
if content_bounds:
|
||||
lx, _rx, ty, _by = content_bounds
|
||||
abs_words = []
|
||||
for w in wf_word_dicts:
|
||||
abs_words.append({
|
||||
**w,
|
||||
'left': w['left'] + lx,
|
||||
'top': w['top'] + ty,
|
||||
})
|
||||
wf_word_dicts = abs_words
|
||||
|
||||
cells, columns_meta = build_grid_from_words(wf_word_dicts, img_w, img_h)
|
||||
duration = time.time() - t0
|
||||
@@ -2035,7 +2050,7 @@ async def detect_words(
|
||||
is_vocab = bool(col_types & {'column_en', 'column_de'})
|
||||
n_rows = len(set(c['row_index'] for c in cells)) if cells else 0
|
||||
n_cols = len(columns_meta)
|
||||
used_engine = "words_first"
|
||||
used_engine = "paddle" if engine == "paddle" else "words_first"
|
||||
|
||||
word_result = {
|
||||
"cells": cells,
|
||||
|
||||
44
klausur-service/backend/services/paddleocr_remote.py
Normal file
44
klausur-service/backend/services/paddleocr_remote.py
Normal file
@@ -0,0 +1,44 @@
|
||||
"""Remote PaddleOCR client — calls Hetzner service."""
|
||||
|
||||
import logging
|
||||
import os
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
import httpx
|
||||
|
||||
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Connection settings are read from the environment once, at import time.
# An unset variable yields the empty string.
PADDLEOCR_REMOTE_URL = os.getenv("PADDLEOCR_REMOTE_URL", "")
PADDLEOCR_API_KEY = os.getenv("PADDLEOCR_API_KEY", "")

# Passed as ``timeout=`` to httpx.AsyncClient.
_TIMEOUT = 30.0
|
||||
|
||||
|
||||
async def ocr_remote_paddle(
    image_bytes: bytes,
    filename: str = "scan.png",
) -> Tuple[List[Dict], int, int]:
    """Send an image to the remote PaddleOCR service and return its result.

    The image is uploaded as multipart field ``file`` (content type
    ``image/png``) via POST to ``<PADDLEOCR_REMOTE_URL>/ocr``. If
    ``PADDLEOCR_API_KEY`` is set it is sent in the ``X-API-Key`` header.

    Args:
        image_bytes: Encoded image payload to upload.
        filename: File name reported in the multipart upload.

    Returns:
        ``(word_dicts, img_w, img_h)`` taken from the ``words``,
        ``image_width`` and ``image_height`` keys of the JSON response.
        Missing keys fall back to ``[]`` and ``0`` respectively.

    Raises:
        RuntimeError: If ``PADDLEOCR_REMOTE_URL`` is not configured.
        httpx.HTTPStatusError: If the service responds with an error status.
            Other httpx errors propagate unchanged.
    """
    if not PADDLEOCR_REMOTE_URL:
        raise RuntimeError("PADDLEOCR_REMOTE_URL not configured")

    headers = {}
    if PADDLEOCR_API_KEY:
        headers["X-API-Key"] = PADDLEOCR_API_KEY

    # NOTE(review): verify=False disables TLS certificate verification, so the
    # API key and the scan are sent to an unauthenticated peer. Kept as-is
    # because the deployment may rely on a self-signed certificate; prefer
    # passing a CA bundle / SSL context via ``verify=`` once one is available.
    async with httpx.AsyncClient(timeout=_TIMEOUT, verify=False) as client:
        resp = await client.post(
            f"{PADDLEOCR_REMOTE_URL.rstrip('/')}/ocr",
            files={"file": (filename, image_bytes, "image/png")},
            headers=headers,
        )
        resp.raise_for_status()
        data = resp.json()

    words = data.get("words", [])
    # Read the dimensions once, with the same defaults for logging and for the
    # return value, so a response without them does not raise KeyError right
    # after having been logged as 0x0.
    img_w = data.get("image_width", 0)
    img_h = data.get("image_height", 0)
    logger.info(
        "PaddleOCR remote returned %d words (img %dx%d)",
        len(words),
        img_w,
        img_h,
    )
    return words, img_w, img_h
|
||||
101
klausur-service/backend/tests/test_paddleocr_remote.py
Normal file
101
klausur-service/backend/tests/test_paddleocr_remote.py
Normal file
@@ -0,0 +1,101 @@
|
||||
"""Tests for the remote PaddleOCR client."""
|
||||
|
||||
import pytest
|
||||
from unittest.mock import AsyncMock, patch, MagicMock
|
||||
|
||||
|
||||
# Canned JSON body used by the tests below as the remote service's reply:
# two recognised words plus the dimensions of the processed image.
SAMPLE_RESPONSE = {
    "words": [
        {
            "text": "Hello",
            "left": 10,
            "top": 20,
            "width": 80,
            "height": 30,
            "conf": 95.2,
        },
        {
            "text": "World",
            "left": 100,
            "top": 20,
            "width": 90,
            "height": 30,
            "conf": 91.0,
        },
    ],
    "image_width": 640,
    "image_height": 480,
}
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_ocr_remote_paddle_success():
    """A successful call yields the word dicts and image size, and sends the API key."""
    fake_resp = MagicMock()
    fake_resp.json.return_value = SAMPLE_RESPONSE
    fake_resp.raise_for_status = MagicMock()

    fake_client = AsyncMock()
    fake_client.post.return_value = fake_resp
    # Let the mock act as ``async with httpx.AsyncClient(...) as client``.
    fake_client.__aenter__ = AsyncMock(return_value=fake_client)
    fake_client.__aexit__ = AsyncMock(return_value=False)

    module_config = patch.multiple(
        "services.paddleocr_remote",
        PADDLEOCR_REMOTE_URL="https://example.com:8095",
        PADDLEOCR_API_KEY="test-key",
    )
    with module_config, patch("httpx.AsyncClient", return_value=fake_client):
        from services.paddleocr_remote import ocr_remote_paddle

        words, width, height = await ocr_remote_paddle(b"fake-png-bytes", "test.png")

    assert [entry["text"] for entry in words] == ["Hello", "World"]
    assert (width, height) == (640, 480)

    # The configured key must have been forwarded as a request header.
    sent_headers = fake_client.post.call_args.kwargs["headers"]
    assert sent_headers["X-API-Key"] == "test-key"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_ocr_remote_paddle_no_url():
    """An empty PADDLEOCR_REMOTE_URL makes the client raise RuntimeError."""
    url_unset = patch("services.paddleocr_remote.PADDLEOCR_REMOTE_URL", "")
    with url_unset:
        from services.paddleocr_remote import ocr_remote_paddle

        expect_config_error = pytest.raises(
            RuntimeError, match="PADDLEOCR_REMOTE_URL not configured"
        )
        with expect_config_error:
            await ocr_remote_paddle(b"fake-png-bytes")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_ocr_remote_paddle_no_api_key():
    """With an empty API key, the request carries no X-API-Key header."""
    fake_resp = MagicMock()
    fake_resp.json.return_value = SAMPLE_RESPONSE
    fake_resp.raise_for_status = MagicMock()

    fake_client = AsyncMock()
    fake_client.post.return_value = fake_resp
    # Let the mock act as ``async with httpx.AsyncClient(...) as client``.
    fake_client.__aenter__ = AsyncMock(return_value=fake_client)
    fake_client.__aexit__ = AsyncMock(return_value=False)

    module_config = patch.multiple(
        "services.paddleocr_remote",
        PADDLEOCR_REMOTE_URL="https://example.com:8095",
        PADDLEOCR_API_KEY="",
    )
    with module_config, patch("httpx.AsyncClient", return_value=fake_client):
        from services.paddleocr_remote import ocr_remote_paddle

        words, _width, _height = await ocr_remote_paddle(b"fake-png-bytes")

    assert len(words) == 2
    sent_headers = fake_client.post.call_args.kwargs["headers"]
    assert "X-API-Key" not in sent_headers
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_ocr_remote_paddle_http_error():
    """An HTTP error status from the service propagates to the caller."""
    import httpx

    status_error = httpx.HTTPStatusError(
        "401 Unauthorized", request=MagicMock(), response=MagicMock()
    )
    fake_resp = MagicMock()
    # Simulate the service rejecting the request.
    fake_resp.raise_for_status.side_effect = status_error

    fake_client = AsyncMock()
    fake_client.post.return_value = fake_resp
    # Let the mock act as ``async with httpx.AsyncClient(...) as client``.
    fake_client.__aenter__ = AsyncMock(return_value=fake_client)
    fake_client.__aexit__ = AsyncMock(return_value=False)

    module_config = patch.multiple(
        "services.paddleocr_remote",
        PADDLEOCR_REMOTE_URL="https://example.com:8095",
        PADDLEOCR_API_KEY="wrong-key",
    )
    with module_config, patch("httpx.AsyncClient", return_value=fake_client):
        from services.paddleocr_remote import ocr_remote_paddle

        with pytest.raises(httpx.HTTPStatusError):
            await ocr_remote_paddle(b"fake-png-bytes")
|
||||
Reference in New Issue
Block a user