feat: PaddleOCR Remote-Engine (PP-OCRv5 Latin auf Hetzner x86_64)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 31s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m7s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 21s

PaddleOCR als neue Option engine=paddle in der OCR-Pipeline:
Microservice auf Hetzner (paddleocr-service/), asynchroner HTTP-Client
(paddleocr_remote.py) und Frontend-Dropdown. Bei engine=paddle wird
grid_method automatisch auf words_first gesetzt.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-12 09:31:22 +01:00
parent ced5bb3dd3
commit a6069631cc
10 changed files with 354 additions and 27 deletions

View File

@@ -1865,7 +1865,7 @@ async def detect_words(
"""Build word grid from columns × rows, OCR each cell.
Query params:
engine: 'auto' (default), 'tesseract', or 'rapid'
engine: 'auto' (default), 'tesseract', 'rapid', or 'paddle'
pronunciation: 'british' (default) or 'american' — for IPA dictionary lookup
stream: false (default) for JSON response, true for SSE streaming
skip_heal_gaps: false (default). When true, cells keep exact row geometry
@@ -1874,6 +1874,11 @@ async def detect_words(
'v2' uses pre-detected columns/rows (top-down).
'words_first' clusters words bottom-up (no column/row detection needed).
"""
# PaddleOCR is full-page remote OCR → force words_first grid method
if engine == "paddle" and grid_method != "words_first":
logger.info("detect_words: engine=paddle requires words_first, overriding grid_method=%s", grid_method)
grid_method = "words_first"
if session_id not in _cache:
logger.info("detect_words: session %s not in cache, loading from DB", session_id)
await _load_session_to_cache(session_id)
@@ -1993,33 +1998,43 @@ async def detect_words(
t0 = time.time()
img_h, img_w = dewarped_bgr.shape[:2]
# Get word_dicts from cache or run Tesseract full-page
wf_word_dicts = cached.get("_word_dicts")
if wf_word_dicts is None:
ocr_img_tmp = create_ocr_image(dewarped_bgr)
geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr)
if geo_result is not None:
_geoms, left_x, right_x, top_y, bottom_y, wf_word_dicts, inv = geo_result
cached["_word_dicts"] = wf_word_dicts
cached["_inv"] = inv
cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)
# For paddle engine: run remote PaddleOCR full-page instead of Tesseract
if engine == "paddle":
from cv_ocr_engines import ocr_region_paddle
wf_word_dicts = await ocr_region_paddle(dewarped_bgr, region=None)
# PaddleOCR returns absolute coordinates, no content_bounds offset needed
cached["_paddle_word_dicts"] = wf_word_dicts
else:
# Get word_dicts from cache or run Tesseract full-page
wf_word_dicts = cached.get("_word_dicts")
if wf_word_dicts is None:
ocr_img_tmp = create_ocr_image(dewarped_bgr)
geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr)
if geo_result is not None:
_geoms, left_x, right_x, top_y, bottom_y, wf_word_dicts, inv = geo_result
cached["_word_dicts"] = wf_word_dicts
cached["_inv"] = inv
cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)
if not wf_word_dicts:
raise HTTPException(status_code=400, detail="No words detected — cannot build words-first grid")
# Convert word coordinates to absolute image coordinates if needed
# (detect_column_geometry returns words relative to content ROI)
content_bounds = cached.get("_content_bounds")
if content_bounds:
lx, _rx, ty, _by = content_bounds
abs_words = []
for w in wf_word_dicts:
abs_words.append({
**w,
'left': w['left'] + lx,
'top': w['top'] + ty,
})
wf_word_dicts = abs_words
# PaddleOCR already returns absolute coordinates — skip offset.
if engine != "paddle":
content_bounds = cached.get("_content_bounds")
if content_bounds:
lx, _rx, ty, _by = content_bounds
abs_words = []
for w in wf_word_dicts:
abs_words.append({
**w,
'left': w['left'] + lx,
'top': w['top'] + ty,
})
wf_word_dicts = abs_words
cells, columns_meta = build_grid_from_words(wf_word_dicts, img_w, img_h)
duration = time.time() - t0
@@ -2035,7 +2050,7 @@ async def detect_words(
is_vocab = bool(col_types & {'column_en', 'column_de'})
n_rows = len(set(c['row_index'] for c in cells)) if cells else 0
n_cols = len(columns_meta)
used_engine = "words_first"
used_engine = "paddle" if engine == "paddle" else "words_first"
word_result = {
"cells": cells,