diff --git a/admin-lehrer/components/ocr-pipeline/StepWordRecognition.tsx b/admin-lehrer/components/ocr-pipeline/StepWordRecognition.tsx index e74d655..e1d338a 100644 --- a/admin-lehrer/components/ocr-pipeline/StepWordRecognition.tsx +++ b/admin-lehrer/components/ocr-pipeline/StepWordRecognition.tsx @@ -60,7 +60,7 @@ export function StepWordRecognition({ sessionId, onNext, goToStep, skipHealGaps const [editedEntries, setEditedEntries] = useState([]) const [editedCells, setEditedCells] = useState([]) const [mode, setMode] = useState<'overview' | 'labeling'>('overview') - const [ocrEngine, setOcrEngine] = useState<'auto' | 'tesseract' | 'rapid'>('auto') + const [ocrEngine, setOcrEngine] = useState<'auto' | 'tesseract' | 'rapid' | 'paddle'>('auto') const [usedEngine, setUsedEngine] = useState('') const [pronunciation, setPronunciation] = useState<'british' | 'american'>('british') const [gridMethod, setGridMethod] = useState<'v2' | 'words_first'>('v2') @@ -810,12 +810,13 @@ export function StepWordRecognition({ sessionId, onNext, goToStep, skipHealGaps {/* OCR Engine selector */} {/* Pronunciation selector (only for vocab) */} @@ -843,6 +844,8 @@ export function StepWordRecognition({ sessionId, onNext, goToStep, skipHealGaps {usedEngine} diff --git a/docker-compose.yml b/docker-compose.yml index d4f6af7..6dac132 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -236,6 +236,8 @@ services: MINIO_BUCKET: ${MINIO_BUCKET:-breakpilot-rag} MINIO_SECURE: "false" PADDLEOCR_SERVICE_URL: http://paddleocr-service:8095 + PADDLEOCR_REMOTE_URL: ${PADDLEOCR_REMOTE_URL:-https://hetzner.meghsakha.com:8095} + PADDLEOCR_API_KEY: ${PADDLEOCR_API_KEY:-} VAULT_ADDR: http://bp-core-vault:8200 VAULT_TOKEN: ${VAULT_TOKEN:-breakpilot-dev-token} ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY:-} diff --git a/docs-src/services/klausur-service/OCR-Pipeline.md b/docs-src/services/klausur-service/OCR-Pipeline.md index a05089b..96601ac 100644 --- a/docs-src/services/klausur-service/OCR-Pipeline.md 
+++ b/docs-src/services/klausur-service/OCR-Pipeline.md @@ -269,7 +269,7 @@ Alle Endpoints unter `/api/v1/ocr-pipeline/`. | Parameter | Default | Beschreibung | |-----------|---------|--------------| -| `engine` | `auto` | OCR-Engine: `auto`, `tesseract`, `rapid` | +| `engine` | `auto` | OCR-Engine: `auto`, `tesseract`, `rapid`, `paddle` | | `pronunciation` | `british` | IPA-Woerterbuch: `british` oder `american` | | `stream` | `false` | SSE-Streaming (nur bei `grid_method=v2`) | | `skip_heal_gaps` | `false` | Zeilen-Luecken nicht heilen (Overlay-Modus) | @@ -706,10 +706,32 @@ Isolierte OCR einer einzelnen Zelle (Spalte × Zeile Schnittflaeche): 1. **Crop:** Exakte Spalten- × Zeilengrenzen mit 3px internem Padding 2. **Density-Check:** Ueberspringe leere Zellen (`dark_ratio < 0.005`) 3. **Upscaling:** Kleine Crops (Hoehe < 80px) werden 3× vergroessert -4. **OCR:** Engine-spezifisch (Tesseract, TrOCR, RapidOCR, LightON) +4. **OCR:** Engine-spezifisch (Tesseract, TrOCR, RapidOCR, LightON, PaddleOCR) 5. **Fallback:** Bei leerem Ergebnis → PSM 7 (Einzelzeile) statt PSM 6 6. **Bereinigung:** `_clean_cell_text_lite()` (aggressives Noise-Filtering) +### PaddleOCR Remote-Engine (`engine=paddle`) + +PaddleOCR (PP-OCRv5 Latin) laeuft als eigenstaendiger Microservice auf einem Hetzner x86_64 Server, +da PaddlePaddle nicht auf ARM64 (Mac Mini) laeuft. 
+ +``` +Mac Mini (klausur-service) Hetzner (paddleocr-service) + │ HTTPS POST + Bild │ + │ ──────────────────────────▶ │ PP-OCRv5 Latin + │ │ FastAPI (Port 8095) + │ JSON word_boxes │ API-Key Auth + │ ◀────────────────────────── │ +``` + +**Besonderheiten:** + +- Erzwingt automatisch `grid_method=words_first` (full-page OCR, kein cell-crop) +- Async HTTP-Client (`paddleocr_remote.py`) mit 30s Timeout +- Koordinaten sind bereits absolut (kein content_bounds Offset noetig) +- API-Key Authentifizierung ueber `X-API-Key` Header +- Dateien: `paddleocr-service/main.py`, `services/paddleocr_remote.py`, `cv_ocr_engines.py:ocr_region_paddle()` + ### Ablauf von `build_cell_grid_v2()` ``` @@ -1063,6 +1085,7 @@ ssh macmini "/usr/local/bin/docker compose -f /Users/benjaminadmin/Projekte/brea | Datum | Version | Aenderung | |-------|---------|----------| +| 2026-03-12 | 4.4.0 | PaddleOCR Remote-Engine (`engine=paddle`): PP-OCRv5 Latin auf Hetzner x86_64. Neuer Microservice (`paddleocr-service/`), HTTP-Client (`paddleocr_remote.py`), Frontend-Dropdown-Option. Nutzt words_first Grid-Methode. | | 2026-03-12 | 4.3.0 | Words-First Grid Builder (`cv_words_first.py`): Bottom-up-Algorithmus clustert Tesseract word_boxes direkt zu Spalten/Zeilen/Zellen. Neuer `grid_method` Parameter im `/words` Endpoint. Frontend-Toggle in StepWordRecognition. 
async def ocr_region_paddle(
    img_bgr: np.ndarray,
    region: Optional["PageRegion"] = None,
) -> List[Dict[str, Any]]:
    """Run OCR on an image (or a region of it) via the remote PaddleOCR service.

    Args:
        img_bgr: Source image array; it is PNG-encoded with ``cv2.imencode``
            before upload.
        region: Optional region to crop before sending. Its ``x``, ``y``,
            ``width`` and ``height`` define the crop and its ``type`` is
            copied onto every returned word as ``region_type``. When omitted,
            the full image is sent.

    Returns:
        The word dicts returned by the remote service, with ``left``/``top``
        shifted into the coordinate space of ``img_bgr``. An empty list is
        returned when the crop is empty or PNG encoding fails.

    Raises:
        Whatever ``ocr_remote_paddle`` raises (configuration or HTTP errors);
        nothing is caught here.
    """
    from services.paddleocr_remote import ocr_remote_paddle

    if region is not None:
        img_h, img_w = img_bgr.shape[:2]
        # Clamp the region to the image. A negative start index would wrap
        # around in numpy slicing and silently produce an empty or wrong
        # crop, so the crop origin (and thus the offset) is clipped to >= 0.
        x0 = min(max(region.x, 0), img_w)
        y0 = min(max(region.y, 0), img_h)
        x1 = min(max(region.x + region.width, x0), img_w)
        y1 = min(max(region.y + region.height, y0), img_h)
        crop = img_bgr[y0:y1, x0:x1]
        offset_x, offset_y = x0, y0
    else:
        crop = img_bgr
        offset_x, offset_y = 0, 0

    if crop.size == 0:
        return []

    # Encode as PNG (lossless) for the multipart upload.
    success, png_buf = cv2.imencode(".png", crop)
    if not success:
        logger.error("ocr_region_paddle: cv2.imencode failed")
        return []

    # The reported image dimensions are not needed here.
    words, _w, _h = await ocr_remote_paddle(png_buf.tobytes())

    # Shift crop-relative coordinates back into the space of img_bgr.
    # The dicts returned by the client are updated in place.
    for word in words:
        word["left"] += offset_x
        word["top"] += offset_y
        if region is not None:
            word["region_type"] = region.type

    return words
""" + # PaddleOCR is full-page remote OCR → force words_first grid method + if engine == "paddle" and grid_method != "words_first": + logger.info("detect_words: engine=paddle requires words_first, overriding grid_method=%s", grid_method) + grid_method = "words_first" + if session_id not in _cache: logger.info("detect_words: session %s not in cache, loading from DB", session_id) await _load_session_to_cache(session_id) @@ -1993,33 +1998,43 @@ async def detect_words( t0 = time.time() img_h, img_w = dewarped_bgr.shape[:2] - # Get word_dicts from cache or run Tesseract full-page - wf_word_dicts = cached.get("_word_dicts") - if wf_word_dicts is None: - ocr_img_tmp = create_ocr_image(dewarped_bgr) - geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr) - if geo_result is not None: - _geoms, left_x, right_x, top_y, bottom_y, wf_word_dicts, inv = geo_result - cached["_word_dicts"] = wf_word_dicts - cached["_inv"] = inv - cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y) + # For paddle engine: run remote PaddleOCR full-page instead of Tesseract + if engine == "paddle": + from cv_ocr_engines import ocr_region_paddle + + wf_word_dicts = await ocr_region_paddle(dewarped_bgr, region=None) + # PaddleOCR returns absolute coordinates, no content_bounds offset needed + cached["_paddle_word_dicts"] = wf_word_dicts + else: + # Get word_dicts from cache or run Tesseract full-page + wf_word_dicts = cached.get("_word_dicts") + if wf_word_dicts is None: + ocr_img_tmp = create_ocr_image(dewarped_bgr) + geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr) + if geo_result is not None: + _geoms, left_x, right_x, top_y, bottom_y, wf_word_dicts, inv = geo_result + cached["_word_dicts"] = wf_word_dicts + cached["_inv"] = inv + cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y) if not wf_word_dicts: raise HTTPException(status_code=400, detail="No words detected — cannot build words-first grid") # Convert word coordinates to absolute image coordinates 
async def ocr_remote_paddle(
    image_bytes: bytes,
    filename: str = "scan.png",
) -> Tuple[List[Dict], int, int]:
    """Send an image to the remote PaddleOCR service and return its words.

    Args:
        image_bytes: Encoded image, uploaded as multipart field ``file`` with
            content type ``image/png``.
        filename: File name reported in the multipart upload.

    Returns:
        ``(word_dicts, img_w, img_h)`` taken from the JSON response keys
        ``words``, ``image_width`` and ``image_height``. Missing keys fall
        back to an empty list / ``0`` so the returned values always match
        what is logged.

    Raises:
        RuntimeError: If ``PADDLEOCR_REMOTE_URL`` is not configured.
        httpx.HTTPStatusError: If the service answers with an error status.
        httpx.HTTPError: On transport failures (timeout, TLS, connection).
    """
    if not PADDLEOCR_REMOTE_URL:
        raise RuntimeError("PADDLEOCR_REMOTE_URL not configured")

    headers = {}
    if PADDLEOCR_API_KEY:
        headers["X-API-Key"] = PADDLEOCR_API_KEY

    # SECURITY: the API key and the scanned page travel over this connection,
    # so the server certificate is verified by default. Verification can be
    # switched off explicitly (e.g. for a self-signed test certificate) by
    # setting PADDLEOCR_TLS_VERIFY to 0/false/no/off.
    tls_setting = os.environ.get("PADDLEOCR_TLS_VERIFY", "true")
    verify_tls = tls_setting.strip().lower() not in {"0", "false", "no", "off"}
    if not verify_tls:
        logger.warning(
            "PaddleOCR remote: TLS certificate verification is DISABLED "
            "(PADDLEOCR_TLS_VERIFY=%s)",
            tls_setting,
        )

    async with httpx.AsyncClient(timeout=_TIMEOUT, verify=verify_tls) as client:
        resp = await client.post(
            f"{PADDLEOCR_REMOTE_URL.rstrip('/')}/ocr",
            files={"file": (filename, image_bytes, "image/png")},
            headers=headers,
        )
        resp.raise_for_status()
        data = resp.json()

    words = data.get("words", [])
    # Read the dimensions once so the log line and the return value agree;
    # indexing data[...] here would raise KeyError after logging zeros.
    img_w = data.get("image_width", 0)
    img_h = data.get("image_height", 0)
    logger.info(
        "PaddleOCR remote returned %d words (img %dx%d)",
        len(words),
        img_w,
        img_h,
    )
    return words, img_w, img_h
ocr_remote_paddle(b"fake-png-bytes", "test.png") + + assert len(words) == 2 + assert words[0]["text"] == "Hello" + assert words[1]["text"] == "World" + assert w == 640 + assert h == 480 + + # Verify API key was sent + call_kwargs = mock_client.post.call_args + assert call_kwargs.kwargs["headers"]["X-API-Key"] == "test-key" + + +@pytest.mark.asyncio +async def test_ocr_remote_paddle_no_url(): + """Raises RuntimeError when PADDLEOCR_REMOTE_URL is not configured.""" + with patch("services.paddleocr_remote.PADDLEOCR_REMOTE_URL", ""): + from services.paddleocr_remote import ocr_remote_paddle + with pytest.raises(RuntimeError, match="PADDLEOCR_REMOTE_URL not configured"): + await ocr_remote_paddle(b"fake-png-bytes") + + +@pytest.mark.asyncio +async def test_ocr_remote_paddle_no_api_key(): + """When no API key is set, no X-API-Key header is sent.""" + mock_response = MagicMock() + mock_response.json.return_value = SAMPLE_RESPONSE + mock_response.raise_for_status = MagicMock() + + mock_client = AsyncMock() + mock_client.post.return_value = mock_response + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=False) + + with patch("services.paddleocr_remote.PADDLEOCR_REMOTE_URL", "https://example.com:8095"), \ + patch("services.paddleocr_remote.PADDLEOCR_API_KEY", ""), \ + patch("httpx.AsyncClient", return_value=mock_client): + + from services.paddleocr_remote import ocr_remote_paddle + words, w, h = await ocr_remote_paddle(b"fake-png-bytes") + + assert len(words) == 2 + call_kwargs = mock_client.post.call_args + assert "X-API-Key" not in call_kwargs.kwargs["headers"] + + +@pytest.mark.asyncio +async def test_ocr_remote_paddle_http_error(): + """HTTP errors are raised to the caller.""" + import httpx + + mock_response = MagicMock() + mock_response.raise_for_status.side_effect = httpx.HTTPStatusError( + "401 Unauthorized", request=MagicMock(), response=MagicMock() + ) + + mock_client = AsyncMock() + 
mock_client.post.return_value = mock_response + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=False) + + with patch("services.paddleocr_remote.PADDLEOCR_REMOTE_URL", "https://example.com:8095"), \ + patch("services.paddleocr_remote.PADDLEOCR_API_KEY", "wrong-key"), \ + patch("httpx.AsyncClient", return_value=mock_client): + + from services.paddleocr_remote import ocr_remote_paddle + with pytest.raises(httpx.HTTPStatusError): + await ocr_remote_paddle(b"fake-png-bytes") diff --git a/paddleocr-service/Dockerfile b/paddleocr-service/Dockerfile new file mode 100644 index 0000000..e10f22c --- /dev/null +++ b/paddleocr-service/Dockerfile @@ -0,0 +1,16 @@ +FROM python:3.11-slim +WORKDIR /app + +RUN apt-get update && apt-get install -y --no-install-recommends \ + libgl1-mesa-glx libglib2.0-0 curl \ + && rm -rf /var/lib/apt/lists/* + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY . . + +EXPOSE 8095 +HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \ + CMD curl -f http://127.0.0.1:8095/health || exit 1 +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8095"] diff --git a/paddleocr-service/main.py b/paddleocr-service/main.py new file mode 100644 index 0000000..1195c5e --- /dev/null +++ b/paddleocr-service/main.py @@ -0,0 +1,71 @@ +"""PaddleOCR Remote Service — PP-OCRv5 Latin auf x86_64.""" + +import io +import os + +import numpy as np +from fastapi import FastAPI, File, Header, HTTPException, UploadFile +from PIL import Image + +app = FastAPI(title="PaddleOCR Service") + +_engine = None +API_KEY = os.environ.get("PADDLEOCR_API_KEY", "") + + +def get_engine(): + global _engine + if _engine is None: + from paddleocr import PaddleOCR + + _engine = PaddleOCR( + lang="latin", + use_angle_cls=True, + show_log=False, + ) + return _engine + + +@app.get("/health") +def health(): + return {"status": "ok", "model": "PP-OCRv5-latin"} + + 
@app.post("/ocr")
async def ocr(
    file: UploadFile = File(...),
    x_api_key: str = Header(default=""),
):
    """Run PaddleOCR on an uploaded image and return word boxes.

    Args:
        file: Uploaded image (multipart field ``file``).
        x_api_key: Value of the ``X-API-Key`` request header. It is only
            checked when the service was started with a non-empty
            ``PADDLEOCR_API_KEY``.

    Returns:
        A dict with ``words`` (one entry per recognized text line with
        ``text``, ``left``, ``top``, ``width``, ``height`` and ``conf`` in
        percent) plus ``image_width`` and ``image_height`` of the decoded
        image.

    Raises:
        HTTPException: 401 if the API key does not match, 400 if the upload
            cannot be decoded as an image.
    """
    import hmac

    # Constant-time comparison so the key cannot be probed via response
    # timing. Both sides are encoded because compare_digest rejects
    # non-ASCII str arguments.
    if API_KEY and not hmac.compare_digest(
        x_api_key.encode("utf-8"), API_KEY.encode("utf-8")
    ):
        raise HTTPException(status_code=401, detail="Invalid API key")

    img_bytes = await file.read()
    try:
        img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
    except OSError as exc:
        # Pillow reports unreadable/truncated data as OSError subclasses;
        # that is a client error, not a server failure.
        raise HTTPException(
            status_code=400, detail="Invalid or unsupported image"
        ) from exc
    img_np = np.array(img)

    engine = get_engine()
    # NOTE(review): this call is synchronous inside an async handler, so the
    # event loop is blocked for the duration of the inference — confirm this
    # is acceptable (it also serializes access to the shared engine).
    result = engine.ocr(img_np)

    # NOTE(review): the parsing below assumes one page whose entries look
    # like [box, (text, conf)]; confirm against the installed PaddleOCR
    # version. An empty or None result means "no text found".
    lines = (result[0] if result else None) or []

    words = []
    for line in lines:
        box, (text, conf) = line[0], line[1]
        xs = [p[0] for p in box]
        ys = [p[1] for p in box]
        x_min, x_max = min(xs), max(xs)
        y_min, y_max = min(ys), max(ys)
        words.append(
            {
                "text": text,
                "left": int(x_min),
                "top": int(y_min),
                "width": int(x_max - x_min),
                "height": int(y_max - y_min),
                # float() keeps the value a plain Python number for JSON.
                "conf": round(float(conf) * 100, 1),
            }
        )

    return {
        "words": words,
        "image_width": img_np.shape[1],
        "image_height": img_np.shape[0],
    }