feat: PaddleOCR Remote-Engine (PP-OCRv5 Latin auf Hetzner x86_64)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 31s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m7s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 21s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 31s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m7s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 21s
PaddleOCR als neue engine=paddle Option in der OCR-Pipeline. Microservice auf Hetzner (paddleocr-service/), async HTTP-Client (paddleocr_remote.py), Frontend-Dropdown, automatisch words_first. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -60,7 +60,7 @@ export function StepWordRecognition({ sessionId, onNext, goToStep, skipHealGaps
|
|||||||
const [editedEntries, setEditedEntries] = useState<WordEntry[]>([])
|
const [editedEntries, setEditedEntries] = useState<WordEntry[]>([])
|
||||||
const [editedCells, setEditedCells] = useState<GridCell[]>([])
|
const [editedCells, setEditedCells] = useState<GridCell[]>([])
|
||||||
const [mode, setMode] = useState<'overview' | 'labeling'>('overview')
|
const [mode, setMode] = useState<'overview' | 'labeling'>('overview')
|
||||||
const [ocrEngine, setOcrEngine] = useState<'auto' | 'tesseract' | 'rapid'>('auto')
|
const [ocrEngine, setOcrEngine] = useState<'auto' | 'tesseract' | 'rapid' | 'paddle'>('auto')
|
||||||
const [usedEngine, setUsedEngine] = useState<string>('')
|
const [usedEngine, setUsedEngine] = useState<string>('')
|
||||||
const [pronunciation, setPronunciation] = useState<'british' | 'american'>('british')
|
const [pronunciation, setPronunciation] = useState<'british' | 'american'>('british')
|
||||||
const [gridMethod, setGridMethod] = useState<'v2' | 'words_first'>('v2')
|
const [gridMethod, setGridMethod] = useState<'v2' | 'words_first'>('v2')
|
||||||
@@ -810,12 +810,13 @@ export function StepWordRecognition({ sessionId, onNext, goToStep, skipHealGaps
|
|||||||
{/* OCR Engine selector */}
|
{/* OCR Engine selector */}
|
||||||
<select
|
<select
|
||||||
value={ocrEngine}
|
value={ocrEngine}
|
||||||
onChange={(e) => setOcrEngine(e.target.value as 'auto' | 'tesseract' | 'rapid')}
|
onChange={(e) => setOcrEngine(e.target.value as 'auto' | 'tesseract' | 'rapid' | 'paddle')}
|
||||||
className="px-2 py-1.5 text-xs border rounded-lg dark:bg-gray-700 dark:border-gray-600"
|
className="px-2 py-1.5 text-xs border rounded-lg dark:bg-gray-700 dark:border-gray-600"
|
||||||
>
|
>
|
||||||
<option value="auto">Auto (RapidOCR wenn verfuegbar)</option>
|
<option value="auto">Auto (RapidOCR wenn verfuegbar)</option>
|
||||||
<option value="rapid">RapidOCR (ONNX)</option>
|
<option value="rapid">RapidOCR (ONNX)</option>
|
||||||
<option value="tesseract">Tesseract</option>
|
<option value="tesseract">Tesseract</option>
|
||||||
|
<option value="paddle">PaddleOCR (Hetzner)</option>
|
||||||
</select>
|
</select>
|
||||||
|
|
||||||
{/* Pronunciation selector (only for vocab) */}
|
{/* Pronunciation selector (only for vocab) */}
|
||||||
@@ -843,6 +844,8 @@ export function StepWordRecognition({ sessionId, onNext, goToStep, skipHealGaps
|
|||||||
<span className={`px-2 py-0.5 rounded text-[10px] uppercase font-semibold ${
|
<span className={`px-2 py-0.5 rounded text-[10px] uppercase font-semibold ${
|
||||||
usedEngine === 'rapid'
|
usedEngine === 'rapid'
|
||||||
? 'bg-purple-100 dark:bg-purple-900/30 text-purple-700 dark:text-purple-300'
|
? 'bg-purple-100 dark:bg-purple-900/30 text-purple-700 dark:text-purple-300'
|
||||||
|
: usedEngine === 'paddle'
|
||||||
|
? 'bg-blue-100 dark:bg-blue-900/30 text-blue-700 dark:text-blue-300'
|
||||||
: 'bg-gray-100 dark:bg-gray-700 text-gray-600 dark:text-gray-400'
|
: 'bg-gray-100 dark:bg-gray-700 text-gray-600 dark:text-gray-400'
|
||||||
}`}>
|
}`}>
|
||||||
{usedEngine}
|
{usedEngine}
|
||||||
|
|||||||
@@ -236,6 +236,8 @@ services:
|
|||||||
MINIO_BUCKET: ${MINIO_BUCKET:-breakpilot-rag}
|
MINIO_BUCKET: ${MINIO_BUCKET:-breakpilot-rag}
|
||||||
MINIO_SECURE: "false"
|
MINIO_SECURE: "false"
|
||||||
PADDLEOCR_SERVICE_URL: http://paddleocr-service:8095
|
PADDLEOCR_SERVICE_URL: http://paddleocr-service:8095
|
||||||
|
PADDLEOCR_REMOTE_URL: ${PADDLEOCR_REMOTE_URL:-https://hetzner.meghsakha.com:8095}
|
||||||
|
PADDLEOCR_API_KEY: ${PADDLEOCR_API_KEY:-}
|
||||||
VAULT_ADDR: http://bp-core-vault:8200
|
VAULT_ADDR: http://bp-core-vault:8200
|
||||||
VAULT_TOKEN: ${VAULT_TOKEN:-breakpilot-dev-token}
|
VAULT_TOKEN: ${VAULT_TOKEN:-breakpilot-dev-token}
|
||||||
ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY:-}
|
ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY:-}
|
||||||
|
|||||||
@@ -269,7 +269,7 @@ Alle Endpoints unter `/api/v1/ocr-pipeline/`.
|
|||||||
|
|
||||||
| Parameter | Default | Beschreibung |
|
| Parameter | Default | Beschreibung |
|
||||||
|-----------|---------|--------------|
|
|-----------|---------|--------------|
|
||||||
| `engine` | `auto` | OCR-Engine: `auto`, `tesseract`, `rapid` |
|
| `engine` | `auto` | OCR-Engine: `auto`, `tesseract`, `rapid`, `paddle` |
|
||||||
| `pronunciation` | `british` | IPA-Woerterbuch: `british` oder `american` |
|
| `pronunciation` | `british` | IPA-Woerterbuch: `british` oder `american` |
|
||||||
| `stream` | `false` | SSE-Streaming (nur bei `grid_method=v2`) |
|
| `stream` | `false` | SSE-Streaming (nur bei `grid_method=v2`) |
|
||||||
| `skip_heal_gaps` | `false` | Zeilen-Luecken nicht heilen (Overlay-Modus) |
|
| `skip_heal_gaps` | `false` | Zeilen-Luecken nicht heilen (Overlay-Modus) |
|
||||||
@@ -706,10 +706,32 @@ Isolierte OCR einer einzelnen Zelle (Spalte × Zeile Schnittflaeche):
|
|||||||
1. **Crop:** Exakte Spalten- × Zeilengrenzen mit 3px internem Padding
|
1. **Crop:** Exakte Spalten- × Zeilengrenzen mit 3px internem Padding
|
||||||
2. **Density-Check:** Ueberspringe leere Zellen (`dark_ratio < 0.005`)
|
2. **Density-Check:** Ueberspringe leere Zellen (`dark_ratio < 0.005`)
|
||||||
3. **Upscaling:** Kleine Crops (Hoehe < 80px) werden 3× vergroessert
|
3. **Upscaling:** Kleine Crops (Hoehe < 80px) werden 3× vergroessert
|
||||||
4. **OCR:** Engine-spezifisch (Tesseract, TrOCR, RapidOCR, LightON)
|
4. **OCR:** Engine-spezifisch (Tesseract, TrOCR, RapidOCR, LightON, PaddleOCR)
|
||||||
5. **Fallback:** Bei leerem Ergebnis → PSM 7 (Einzelzeile) statt PSM 6
|
5. **Fallback:** Bei leerem Ergebnis → PSM 7 (Einzelzeile) statt PSM 6
|
||||||
6. **Bereinigung:** `_clean_cell_text_lite()` (aggressives Noise-Filtering)
|
6. **Bereinigung:** `_clean_cell_text_lite()` (aggressives Noise-Filtering)
|
||||||
|
|
||||||
|
### PaddleOCR Remote-Engine (`engine=paddle`)
|
||||||
|
|
||||||
|
PaddleOCR (PP-OCRv5 Latin) laeuft als eigenstaendiger Microservice auf einem Hetzner x86_64 Server,
|
||||||
|
da PaddlePaddle nicht auf ARM64 (Mac Mini) laeuft.
|
||||||
|
|
||||||
|
```
|
||||||
|
Mac Mini (klausur-service) Hetzner (paddleocr-service)
|
||||||
|
│ HTTPS POST + Bild │
|
||||||
|
│ ──────────────────────────▶ │ PP-OCRv5 Latin
|
||||||
|
│ │ FastAPI (Port 8095)
|
||||||
|
│ JSON word_boxes │ API-Key Auth
|
||||||
|
│ ◀────────────────────────── │
|
||||||
|
```
|
||||||
|
|
||||||
|
**Besonderheiten:**
|
||||||
|
|
||||||
|
- Erzwingt automatisch `grid_method=words_first` (full-page OCR, kein cell-crop)
|
||||||
|
- Async HTTP-Client (`paddleocr_remote.py`) mit 30s Timeout
|
||||||
|
- Koordinaten sind bereits absolut (kein content_bounds Offset noetig)
|
||||||
|
- API-Key Authentifizierung ueber `X-API-Key` Header
|
||||||
|
- Dateien: `paddleocr-service/main.py`, `services/paddleocr_remote.py`, `cv_ocr_engines.py:ocr_region_paddle()`
|
||||||
|
|
||||||
### Ablauf von `build_cell_grid_v2()`
|
### Ablauf von `build_cell_grid_v2()`
|
||||||
|
|
||||||
```
|
```
|
||||||
@@ -1063,6 +1085,7 @@ ssh macmini "/usr/local/bin/docker compose -f /Users/benjaminadmin/Projekte/brea
|
|||||||
|
|
||||||
| Datum | Version | Aenderung |
|
| Datum | Version | Aenderung |
|
||||||
|-------|---------|----------|
|
|-------|---------|----------|
|
||||||
|
| 2026-03-12 | 4.4.0 | PaddleOCR Remote-Engine (`engine=paddle`): PP-OCRv5 Latin auf Hetzner x86_64. Neuer Microservice (`paddleocr-service/`), HTTP-Client (`paddleocr_remote.py`), Frontend-Dropdown-Option. Nutzt words_first Grid-Methode. |
|
||||||
| 2026-03-12 | 4.3.0 | Words-First Grid Builder (`cv_words_first.py`): Bottom-up-Algorithmus clustert Tesseract word_boxes direkt zu Spalten/Zeilen/Zellen. Neuer `grid_method` Parameter im `/words` Endpoint. Frontend-Toggle in StepWordRecognition. |
|
| 2026-03-12 | 4.3.0 | Words-First Grid Builder (`cv_words_first.py`): Bottom-up-Algorithmus clustert Tesseract word_boxes direkt zu Spalten/Zeilen/Zellen. Neuer `grid_method` Parameter im `/words` Endpoint. Frontend-Toggle in StepWordRecognition. |
|
||||||
| 2026-03-10 | 4.2.0 | Rekonstruktion: Overlay-Modus mit Pixel-Wortpositionierung, 180°-Rotation, Sub-Session-Merging, usePixelWordPositions Hook, Box-Boundary-Schutz (box_ranges_inner) |
|
| 2026-03-10 | 4.2.0 | Rekonstruktion: Overlay-Modus mit Pixel-Wortpositionierung, 180°-Rotation, Sub-Session-Merging, usePixelWordPositions Hook, Box-Boundary-Schutz (box_ranges_inner) |
|
||||||
| 2026-03-05 | 3.1.0 | Spalten: Seiten-Segmentierung an Sub-Headern, Word-Coverage Fallback, Segment-gefilterte Validierung |
|
| 2026-03-05 | 3.1.0 | Spalten: Seiten-Segmentierung an Sub-Headern, Word-Coverage Fallback, Segment-gefilterte Validierung |
|
||||||
|
|||||||
@@ -385,6 +385,51 @@ def ocr_region_lighton(img_bgr: np.ndarray, region: PageRegion) -> List[Dict[str
|
|||||||
return []
|
return []
|
||||||
|
|
||||||
|
|
||||||
|
# --- Remote PaddleOCR (Hetzner x86_64) ---
|
||||||
|
|
||||||
|
|
||||||
|
async def ocr_region_paddle(
    img_bgr: np.ndarray,
    region: Optional["PageRegion"] = None,
) -> List[Dict[str, Any]]:
    """Run OCR via the remote PaddleOCR service (Hetzner).

    If *region* is given, only that rectangle of *img_bgr* is uploaded;
    otherwise the full image is sent.

    Args:
        img_bgr: Image array to recognise. It is PNG-encoded with
            ``cv2.imencode`` before upload.
        region: Optional rectangle exposing ``x``, ``y``, ``width``,
            ``height`` and ``type``. ``None`` means "whole image".

    Returns:
        The word dicts returned by the remote service, with ``left``/``top``
        shifted by the crop origin so they refer to *img_bgr*. When *region*
        is given, each word additionally carries ``region_type``. An empty
        list is returned when the crop is empty or PNG encoding fails.

    Raises:
        Propagates whatever ``ocr_remote_paddle`` raises (for example when the
        remote URL is not configured or the HTTP call fails).
    """
    # Imported lazily so this module can be loaded without the remote client.
    from services.paddleocr_remote import ocr_remote_paddle

    if region is not None:
        # Clamp to the image origin: a negative slice start would be read by
        # numpy as "count from the far edge" and silently yield an empty or
        # unrelated crop. The clamped origin is also the offset added back to
        # the returned word coordinates.
        offset_x = max(region.x, 0)
        offset_y = max(region.y, 0)
        end_x = max(region.x + region.width, 0)
        end_y = max(region.y + region.height, 0)
        crop = img_bgr[offset_y:end_y, offset_x:end_x]
    else:
        crop = img_bgr
        offset_x, offset_y = 0, 0

    if crop.size == 0:
        return []

    # Encode as PNG
    success, png_buf = cv2.imencode(".png", crop)
    if not success:
        logger.error("ocr_region_paddle: cv2.imencode failed")
        return []

    words, _w, _h = await ocr_remote_paddle(png_buf.tobytes())

    # Shift coordinates from crop space to absolute image space.
    for w in words:
        w["left"] += offset_x
        w["top"] += offset_y
        if region is not None:
            w["region_type"] = region.type

    return words
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# Post-Processing: Deterministic Quality Fixes
|
# Post-Processing: Deterministic Quality Fixes
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|||||||
@@ -1865,7 +1865,7 @@ async def detect_words(
|
|||||||
"""Build word grid from columns × rows, OCR each cell.
|
"""Build word grid from columns × rows, OCR each cell.
|
||||||
|
|
||||||
Query params:
|
Query params:
|
||||||
engine: 'auto' (default), 'tesseract', or 'rapid'
|
engine: 'auto' (default), 'tesseract', 'rapid', or 'paddle'
|
||||||
pronunciation: 'british' (default) or 'american' — for IPA dictionary lookup
|
pronunciation: 'british' (default) or 'american' — for IPA dictionary lookup
|
||||||
stream: false (default) for JSON response, true for SSE streaming
|
stream: false (default) for JSON response, true for SSE streaming
|
||||||
skip_heal_gaps: false (default). When true, cells keep exact row geometry
|
skip_heal_gaps: false (default). When true, cells keep exact row geometry
|
||||||
@@ -1874,6 +1874,11 @@ async def detect_words(
|
|||||||
'v2' uses pre-detected columns/rows (top-down).
|
'v2' uses pre-detected columns/rows (top-down).
|
||||||
'words_first' clusters words bottom-up (no column/row detection needed).
|
'words_first' clusters words bottom-up (no column/row detection needed).
|
||||||
"""
|
"""
|
||||||
|
# PaddleOCR is full-page remote OCR → force words_first grid method
|
||||||
|
if engine == "paddle" and grid_method != "words_first":
|
||||||
|
logger.info("detect_words: engine=paddle requires words_first, overriding grid_method=%s", grid_method)
|
||||||
|
grid_method = "words_first"
|
||||||
|
|
||||||
if session_id not in _cache:
|
if session_id not in _cache:
|
||||||
logger.info("detect_words: session %s not in cache, loading from DB", session_id)
|
logger.info("detect_words: session %s not in cache, loading from DB", session_id)
|
||||||
await _load_session_to_cache(session_id)
|
await _load_session_to_cache(session_id)
|
||||||
@@ -1993,33 +1998,43 @@ async def detect_words(
|
|||||||
t0 = time.time()
|
t0 = time.time()
|
||||||
img_h, img_w = dewarped_bgr.shape[:2]
|
img_h, img_w = dewarped_bgr.shape[:2]
|
||||||
|
|
||||||
# Get word_dicts from cache or run Tesseract full-page
|
# For paddle engine: run remote PaddleOCR full-page instead of Tesseract
|
||||||
wf_word_dicts = cached.get("_word_dicts")
|
if engine == "paddle":
|
||||||
if wf_word_dicts is None:
|
from cv_ocr_engines import ocr_region_paddle
|
||||||
ocr_img_tmp = create_ocr_image(dewarped_bgr)
|
|
||||||
geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr)
|
wf_word_dicts = await ocr_region_paddle(dewarped_bgr, region=None)
|
||||||
if geo_result is not None:
|
# PaddleOCR returns absolute coordinates, no content_bounds offset needed
|
||||||
_geoms, left_x, right_x, top_y, bottom_y, wf_word_dicts, inv = geo_result
|
cached["_paddle_word_dicts"] = wf_word_dicts
|
||||||
cached["_word_dicts"] = wf_word_dicts
|
else:
|
||||||
cached["_inv"] = inv
|
# Get word_dicts from cache or run Tesseract full-page
|
||||||
cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)
|
wf_word_dicts = cached.get("_word_dicts")
|
||||||
|
if wf_word_dicts is None:
|
||||||
|
ocr_img_tmp = create_ocr_image(dewarped_bgr)
|
||||||
|
geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr)
|
||||||
|
if geo_result is not None:
|
||||||
|
_geoms, left_x, right_x, top_y, bottom_y, wf_word_dicts, inv = geo_result
|
||||||
|
cached["_word_dicts"] = wf_word_dicts
|
||||||
|
cached["_inv"] = inv
|
||||||
|
cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)
|
||||||
|
|
||||||
if not wf_word_dicts:
|
if not wf_word_dicts:
|
||||||
raise HTTPException(status_code=400, detail="No words detected — cannot build words-first grid")
|
raise HTTPException(status_code=400, detail="No words detected — cannot build words-first grid")
|
||||||
|
|
||||||
# Convert word coordinates to absolute image coordinates if needed
|
# Convert word coordinates to absolute image coordinates if needed
|
||||||
# (detect_column_geometry returns words relative to content ROI)
|
# (detect_column_geometry returns words relative to content ROI)
|
||||||
content_bounds = cached.get("_content_bounds")
|
# PaddleOCR already returns absolute coordinates — skip offset.
|
||||||
if content_bounds:
|
if engine != "paddle":
|
||||||
lx, _rx, ty, _by = content_bounds
|
content_bounds = cached.get("_content_bounds")
|
||||||
abs_words = []
|
if content_bounds:
|
||||||
for w in wf_word_dicts:
|
lx, _rx, ty, _by = content_bounds
|
||||||
abs_words.append({
|
abs_words = []
|
||||||
**w,
|
for w in wf_word_dicts:
|
||||||
'left': w['left'] + lx,
|
abs_words.append({
|
||||||
'top': w['top'] + ty,
|
**w,
|
||||||
})
|
'left': w['left'] + lx,
|
||||||
wf_word_dicts = abs_words
|
'top': w['top'] + ty,
|
||||||
|
})
|
||||||
|
wf_word_dicts = abs_words
|
||||||
|
|
||||||
cells, columns_meta = build_grid_from_words(wf_word_dicts, img_w, img_h)
|
cells, columns_meta = build_grid_from_words(wf_word_dicts, img_w, img_h)
|
||||||
duration = time.time() - t0
|
duration = time.time() - t0
|
||||||
@@ -2035,7 +2050,7 @@ async def detect_words(
|
|||||||
is_vocab = bool(col_types & {'column_en', 'column_de'})
|
is_vocab = bool(col_types & {'column_en', 'column_de'})
|
||||||
n_rows = len(set(c['row_index'] for c in cells)) if cells else 0
|
n_rows = len(set(c['row_index'] for c in cells)) if cells else 0
|
||||||
n_cols = len(columns_meta)
|
n_cols = len(columns_meta)
|
||||||
used_engine = "words_first"
|
used_engine = "paddle" if engine == "paddle" else "words_first"
|
||||||
|
|
||||||
word_result = {
|
word_result = {
|
||||||
"cells": cells,
|
"cells": cells,
|
||||||
|
|||||||
44
klausur-service/backend/services/paddleocr_remote.py
Normal file
44
klausur-service/backend/services/paddleocr_remote.py
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
"""Remote PaddleOCR client — calls Hetzner service."""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from typing import Dict, List, Tuple
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
PADDLEOCR_REMOTE_URL = os.environ.get("PADDLEOCR_REMOTE_URL", "")
|
||||||
|
PADDLEOCR_API_KEY = os.environ.get("PADDLEOCR_API_KEY", "")
|
||||||
|
_TIMEOUT = 30.0
|
||||||
|
|
||||||
|
|
||||||
|
async def ocr_remote_paddle(
    image_bytes: bytes,
    filename: str = "scan.png",
) -> Tuple[List[Dict], int, int]:
    """Send an image to the remote PaddleOCR service.

    The image is uploaded as multipart field ``file`` to ``<base-url>/ocr``.
    If ``PADDLEOCR_API_KEY`` is set it is sent in the ``X-API-Key`` header.

    TLS certificates are verified by default because the API key travels
    over this connection. Set the environment variable
    ``PADDLEOCR_VERIFY_TLS`` to ``0``/``false``/``no``/``off`` to disable
    verification (e.g. for a self-signed certificate); a warning is logged
    on every call in that case.

    Args:
        image_bytes: Encoded image payload (uploaded with content type
            ``image/png``).
        filename: File name reported in the multipart upload.

    Returns:
        ``(word_dicts, image_width, image_height)`` as reported by the
        service. Missing width/height fields are returned as ``0``; a missing
        or null ``words`` field is returned as an empty list.

    Raises:
        RuntimeError: If ``PADDLEOCR_REMOTE_URL`` is not configured.
        httpx.HTTPStatusError: If the service answers with an error status.
        httpx.HTTPError: For transport-level failures (timeout, TLS, ...).
    """
    if not PADDLEOCR_REMOTE_URL:
        raise RuntimeError("PADDLEOCR_REMOTE_URL not configured")

    headers = {}
    if PADDLEOCR_API_KEY:
        headers["X-API-Key"] = PADDLEOCR_API_KEY

    # Certificate verification stays on unless explicitly disabled; without
    # it the API key could be captured by anyone able to intercept traffic.
    verify_setting = os.environ.get("PADDLEOCR_VERIFY_TLS", "true").strip().lower()
    verify_tls = verify_setting not in ("0", "false", "no", "off")
    if not verify_tls:
        logger.warning("PaddleOCR remote: TLS certificate verification is disabled")

    async with httpx.AsyncClient(timeout=_TIMEOUT, verify=verify_tls) as client:
        resp = await client.post(
            f"{PADDLEOCR_REMOTE_URL.rstrip('/')}/ocr",
            files={"file": (filename, image_bytes, "image/png")},
            headers=headers,
        )
        resp.raise_for_status()
        data = resp.json()

    # Read each field once so the logged and the returned values cannot differ.
    words = data.get("words") or []
    image_width = data.get("image_width", 0)
    image_height = data.get("image_height", 0)
    logger.info(
        "PaddleOCR remote returned %d words (img %dx%d)",
        len(words),
        image_width,
        image_height,
    )
    return words, image_width, image_height
|
||||||
101
klausur-service/backend/tests/test_paddleocr_remote.py
Normal file
101
klausur-service/backend/tests/test_paddleocr_remote.py
Normal file
@@ -0,0 +1,101 @@
|
|||||||
|
"""Tests for the remote PaddleOCR client."""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from unittest.mock import AsyncMock, patch, MagicMock
|
||||||
|
|
||||||
|
|
||||||
|
SAMPLE_RESPONSE = {
|
||||||
|
"words": [
|
||||||
|
{"text": "Hello", "left": 10, "top": 20, "width": 80, "height": 30, "conf": 95.2},
|
||||||
|
{"text": "World", "left": 100, "top": 20, "width": 90, "height": 30, "conf": 91.0},
|
||||||
|
],
|
||||||
|
"image_width": 640,
|
||||||
|
"image_height": 480,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_ocr_remote_paddle_success():
    """A successful call yields the word dicts plus the image dimensions."""
    # Fake HTTP response carrying the canned service payload.
    response = MagicMock()
    response.raise_for_status = MagicMock()
    response.json.return_value = SAMPLE_RESPONSE

    # Async client double that also works as an ``async with`` target.
    mock_client = AsyncMock()
    mock_client.__aenter__ = AsyncMock(return_value=mock_client)
    mock_client.__aexit__ = AsyncMock(return_value=False)
    mock_client.post.return_value = response

    with patch("services.paddleocr_remote.PADDLEOCR_REMOTE_URL", "https://example.com:8095"):
        with patch("services.paddleocr_remote.PADDLEOCR_API_KEY", "test-key"):
            with patch("httpx.AsyncClient", return_value=mock_client):
                from services.paddleocr_remote import ocr_remote_paddle

                words, w, h = await ocr_remote_paddle(b"fake-png-bytes", "test.png")

    assert len(words) == 2
    first, second = words
    assert first["text"] == "Hello"
    assert second["text"] == "World"
    assert (w, h) == (640, 480)

    # The configured API key must have been forwarded as a header.
    sent_headers = mock_client.post.call_args.kwargs["headers"]
    assert sent_headers["X-API-Key"] == "test-key"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_ocr_remote_paddle_no_url():
    """An unset PADDLEOCR_REMOTE_URL makes the client raise RuntimeError."""
    empty_url = patch("services.paddleocr_remote.PADDLEOCR_REMOTE_URL", "")
    with empty_url:
        from services.paddleocr_remote import ocr_remote_paddle

        expected_error = pytest.raises(
            RuntimeError, match="PADDLEOCR_REMOTE_URL not configured"
        )
        with expected_error:
            await ocr_remote_paddle(b"fake-png-bytes")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_ocr_remote_paddle_no_api_key():
    """Without a configured API key the X-API-Key header is omitted."""
    # Fake HTTP response carrying the canned service payload.
    response = MagicMock()
    response.raise_for_status = MagicMock()
    response.json.return_value = SAMPLE_RESPONSE

    # Async client double that also works as an ``async with`` target.
    mock_client = AsyncMock()
    mock_client.__aenter__ = AsyncMock(return_value=mock_client)
    mock_client.__aexit__ = AsyncMock(return_value=False)
    mock_client.post.return_value = response

    with patch("services.paddleocr_remote.PADDLEOCR_REMOTE_URL", "https://example.com:8095"):
        with patch("services.paddleocr_remote.PADDLEOCR_API_KEY", ""):
            with patch("httpx.AsyncClient", return_value=mock_client):
                from services.paddleocr_remote import ocr_remote_paddle

                words, w, h = await ocr_remote_paddle(b"fake-png-bytes")

    assert len(words) == 2
    sent_headers = mock_client.post.call_args.kwargs["headers"]
    assert "X-API-Key" not in sent_headers
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_ocr_remote_paddle_http_error():
    """An HTTP error status from the service propagates to the caller."""
    import httpx

    # Response double whose status check fails like a 401 would.
    status_error = httpx.HTTPStatusError(
        "401 Unauthorized", request=MagicMock(), response=MagicMock()
    )
    failing_response = MagicMock()
    failing_response.raise_for_status.side_effect = status_error

    # Async client double that also works as an ``async with`` target.
    mock_client = AsyncMock()
    mock_client.__aenter__ = AsyncMock(return_value=mock_client)
    mock_client.__aexit__ = AsyncMock(return_value=False)
    mock_client.post.return_value = failing_response

    with patch("services.paddleocr_remote.PADDLEOCR_REMOTE_URL", "https://example.com:8095"):
        with patch("services.paddleocr_remote.PADDLEOCR_API_KEY", "wrong-key"):
            with patch("httpx.AsyncClient", return_value=mock_client):
                from services.paddleocr_remote import ocr_remote_paddle

                with pytest.raises(httpx.HTTPStatusError):
                    await ocr_remote_paddle(b"fake-png-bytes")
|
||||||
16
paddleocr-service/Dockerfile
Normal file
16
paddleocr-service/Dockerfile
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
# Runtime image for the PaddleOCR HTTP service (uvicorn on port 8095).
FROM python:3.11-slim
WORKDIR /app

# System packages:
#   libgl1, libglib2.0-0 - shared libraries, presumably needed by the OpenCV
#                          dependency pulled in via requirements.txt (confirm).
#   curl                 - used by the HEALTHCHECK below.
# `libgl1` replaces `libgl1-mesa-glx`, which has no installation candidate on
# the Debian releases that current python:3.11-slim images are built from.
RUN apt-get update && apt-get install -y --no-install-recommends \
    libgl1 libglib2.0-0 curl \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies first so this layer is cached across code edits.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

EXPOSE 8095
# Generous start period: the container may need a while before it answers.
HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \
    CMD curl -f http://127.0.0.1:8095/health || exit 1
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8095"]
|
||||||
71
paddleocr-service/main.py
Normal file
71
paddleocr-service/main.py
Normal file
@@ -0,0 +1,71 @@
|
|||||||
|
"""PaddleOCR Remote Service — PP-OCRv5 Latin auf x86_64."""
|
||||||
|
|
||||||
|
import io
|
||||||
|
import os
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from fastapi import FastAPI, File, Header, HTTPException, UploadFile
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
app = FastAPI(title="PaddleOCR Service")
|
||||||
|
|
||||||
|
_engine = None
|
||||||
|
API_KEY = os.environ.get("PADDLEOCR_API_KEY", "")
|
||||||
|
|
||||||
|
|
||||||
|
def get_engine():
    """Return the shared PaddleOCR instance, building it on the first call.

    The instance is kept in the module-level ``_engine`` variable, so later
    calls return the same object without constructing it again.
    """
    global _engine
    if _engine is not None:
        return _engine

    # Imported on demand: paddleocr is only loaded once an engine is needed.
    from paddleocr import PaddleOCR

    # NOTE(review): these are PaddleOCR 2.x-style constructor arguments;
    # confirm they are accepted by the version resolved from requirements.txt.
    engine_options = {
        "lang": "latin",
        "use_angle_cls": True,
        "show_log": False,
    }
    _engine = PaddleOCR(**engine_options)
    return _engine
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/health")
def health():
    """Return a static status payload naming the served model.

    The OCR engine is not touched here, so this does not trigger model loading.
    """
    payload = {
        "status": "ok",
        "model": "PP-OCRv5-latin",
    }
    return payload
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/ocr")
async def ocr(
    file: UploadFile = File(...),
    x_api_key: str = Header(default=""),
):
    """Run OCR on an uploaded image and return axis-aligned boxes.

    Args:
        file: Uploaded image (multipart field ``file``).
        x_api_key: Value of the ``X-API-Key`` request header. It is only
            checked when the service was started with ``PADDLEOCR_API_KEY``.

    Returns:
        A dict with ``words`` (one entry per recognised item: ``text``,
        ``left``, ``top``, ``width``, ``height`` in pixels of the uploaded
        image, and ``conf`` scaled to 0-100) plus ``image_width`` and
        ``image_height``.

    Raises:
        HTTPException: 401 if an API key is configured and the header does
            not match; 400 if the upload cannot be decoded as an image.
    """
    import hmac

    # Constant-time comparison so response timing does not leak the key.
    if API_KEY and not hmac.compare_digest(
        x_api_key.encode("utf-8"), API_KEY.encode("utf-8")
    ):
        raise HTTPException(status_code=401, detail="Invalid API key")

    img_bytes = await file.read()
    try:
        img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
    except (OSError, ValueError) as exc:
        # Bad client input should be a 400, not an unhandled 500.
        raise HTTPException(
            status_code=400, detail="Uploaded file is not a readable image"
        ) from exc
    img_np = np.array(img)

    engine = get_engine()
    # NOTE(review): this call is synchronous inside a coroutine, so the event
    # loop is presumably blocked while OCR runs - confirm this is acceptable.
    result = engine.ocr(img_np)

    # Guard both an empty/None result and a None first page.
    detections = (result[0] if result else None) or []

    words = []
    for line in detections:
        # Each detection is unpacked as (polygon points, (text, confidence)).
        box, (text, conf) = line[0], line[1]
        xs = [p[0] for p in box]
        ys = [p[1] for p in box]
        x_min, x_max = min(xs), max(xs)
        y_min, y_max = min(ys), max(ys)
        words.append(
            {
                "text": text,
                "left": int(x_min),
                "top": int(y_min),
                "width": int(x_max - x_min),
                "height": int(y_max - y_min),
                "conf": round(conf * 100, 1),
            }
        )

    return {
        "words": words,
        "image_width": img_np.shape[1],
        "image_height": img_np.shape[0],
    }
|
||||||
7
paddleocr-service/requirements.txt
Normal file
7
paddleocr-service/requirements.txt
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
paddlepaddle>=3.0.0
|
||||||
|
paddleocr>=2.9.0
|
||||||
|
fastapi>=0.110.0
|
||||||
|
uvicorn>=0.25.0
|
||||||
|
python-multipart>=0.0.6
|
||||||
|
Pillow>=10.0.0
|
||||||
|
numpy>=1.24.0
|
||||||
Reference in New Issue
Block a user