From 61c8169f9ee30bbcbd4001a6cd19a0fc230c3856 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 12 Mar 2026 20:18:46 +0100 Subject: [PATCH] docs+test: add Kombi-Modus tests (19 passing) and MkDocs documentation - test_paddle_kombi.py: 6 IoU tests, 11 merge tests, 2 bullet-point tests - OCR-Pipeline.md: new "OCR Overlay" section with Paddle Direct/Kombi docs, merge algorithm flowchart, dateistruktur update, changelog v4.5.0 Co-Authored-By: Claude Opus 4.6 --- .../services/klausur-service/OCR-Pipeline.md | 135 +++++++++++ .../backend/tests/test_paddle_kombi.py | 213 ++++++++++++++++++ 2 files changed, 348 insertions(+) create mode 100644 klausur-service/backend/tests/test_paddle_kombi.py diff --git a/docs-src/services/klausur-service/OCR-Pipeline.md b/docs-src/services/klausur-service/OCR-Pipeline.md index 96601ac..9a8233f 100644 --- a/docs-src/services/klausur-service/OCR-Pipeline.md +++ b/docs-src/services/klausur-service/OCR-Pipeline.md @@ -162,6 +162,12 @@ admin-lehrer/ ├── app/(admin)/ai/ocr-pipeline/ │ ├── page.tsx # Haupt-Page mit Session-Management │ └── types.ts # TypeScript Interfaces +├── app/(admin)/ai/ocr-overlay/ +│ ├── page.tsx # OCR Overlay: 3 Modi (Pipeline/Paddle/Kombi) +│ └── types.ts # OVERLAY_/PADDLE_DIRECT_/KOMBI_STEPS +├── components/ocr-overlay/ +│ ├── PaddleDirectStep.tsx # Wiederverwendbar fuer Paddle Direct + Kombi +│ └── OverlayReconstruction.tsx # Overlay-Anzeige auf Bildhintergrund └── components/ocr-pipeline/ ├── PipelineStepper.tsx # Fortschritts-Stepper ├── StepOrientation.tsx # Schritt 1: Orientierung @@ -1081,10 +1087,139 @@ ssh macmini "/usr/local/bin/docker compose -f /Users/benjaminadmin/Projekte/brea --- +## OCR Overlay — Alternative Pipelines + +**URL:** https://macmini:3002/ai/ocr-overlay + +Neben der vollen 10-Schritt-Pipeline gibt es die **OCR Overlay**-Seite mit +vereinfachten Pfaden fuer schnelle Ergebnisse. Alle drei Modi teilen die +gleichen Vorverarbeitungsschritte (Orient → Deskew → Dewarp → Crop).
+ +### Modus-Uebersicht + +| Modus | Schritte | Engine | Endpoint | Beschreibung | +|-------|----------|--------|----------|--------------| +| **Pipeline** | 7 | Tesseract | `/words` (SSE) | Volle Pipeline: Zeilen + Woerter + Overlay | +| **Paddle Direct** | 5 | PaddleOCR | `/paddle-direct` | PaddleOCR ersetzt Zeilen + Woerter + Overlay | +| **Kombi** | 5 | PaddleOCR + Tesseract | `/paddle-kombi` | Beide Engines, Ergebnisse gemittelt | + +### Flussdiagramm + +``` +┌──────────────────────────────────────────────────────────────┐ +│ GEMEINSAME VORVERARBEITUNG (alle 3 Modi) │ +│ │ +│ Schritt 1: Orientierung │ +│ Schritt 2: Deskew │ +│ Schritt 3: Dewarp │ +│ Schritt 4: Crop │ +└──────────────────┬────────────────────┬───────────────────────┘ + │ │ + ┌───────────┼────────────────────┼────────────────┐ + ▼ ▼ ▼ ▼ + PIPELINE PADDLE DIRECT KOMBI-MODUS + (7 Schritte) (5 Schritte) (5 Schritte) + │ │ │ + Zeilen- PaddleOCR PaddleOCR + erkennung word_boxes + Tesseract + │ │ parallel + Woerter- build_grid_ │ + erkennung from_words() _merge_paddle_ + │ │ tesseract() + Overlay Overlay │ + │ │ build_grid_ + ▼ ▼ from_words() + Ergebnis Ergebnis │ + Overlay + │ + Ergebnis +``` + +### Paddle Direct + +PaddleOCR laeuft auf dem vorverarbeiteten Bild und erkennt Woerter direkt. + +**Endpoint:** `POST /api/v1/ocr-pipeline/sessions/{id}/paddle-direct` + +**Ablauf:** + +1. Cropped/dewarped Bild laden (Prioritaet: cropped > dewarped > original) +2. `ocr_region_paddle(img_bgr, region=None)` aufrufen +3. `build_grid_from_words(word_dicts, img_w, img_h)` fuer Grid-Erstellung +4. Cells mit `ocr_engine="paddle_direct"` taggen +5. In DB speichern (`current_step=8`) + +**Frontend:** `PaddleDirectStep.tsx` — wiederverwendbare Komponente mit konfigurierbaren Props. + +### Kombi-Modus (PaddleOCR + Tesseract) + +!!! info "Motivation" + PaddleOCR liefert gute Texterkennung, positioniert Woerter aber manchmal falsch + (z.B. `!Betonung` als ein Wort, Bullet Points nicht erkannt). 
Tesseract erkennt + Sonderzeichen besser und liefert feinere Word-Level-Boxen. Der Kombi-Modus + nutzt beide Engines und mittelt die Koordinaten. + +**Endpoint:** `POST /api/v1/ocr-pipeline/sessions/{id}/paddle-kombi` + +**Ablauf:** + +1. Cropped/dewarped Bild laden +2. **Parallel** beide Engines aufrufen: + - `ocr_region_paddle(img_bgr, region=None)` → `paddle_words` + - `pytesseract.image_to_data(pil_img, lang='eng+deu', config='--psm 6 --oem 3')` → `tess_words` +3. **Merge:** `_merge_paddle_tesseract(paddle_words, tess_words)` +4. `build_grid_from_words(merged_words, img_w, img_h)` fuer Grid +5. Cells mit `ocr_engine="kombi"` taggen +6. In DB speichern + +#### Merge-Algorithmus + +```mermaid +flowchart TD + A[Paddle-Wort] --> B{Tesseract-Match<br/>IoU > 0.3?} + B -->|Ja| C[Koordinaten mitteln<br/>gewichtet nach Confidence] + B -->|Nein| D[Paddle-Wort behalten] + E[Ungematchte<br/>Tesseract-Woerter] --> F{Confidence >= 40?} + F -->|Ja| G[Hinzufuegen<br/>Bullet Points, Symbole] + F -->|Nein| H[Verwerfen] +``` + +**Koordinaten-Mittelung:** + +``` +merged_left = (paddle_left × paddle_conf + tess_left × tess_conf) / (paddle_conf + tess_conf) +``` + +Gleiches Prinzip fuer `top`, `width`, `height`. Der Text kommt immer von PaddleOCR (bessere Texterkennung). + +#### Dateien + +| Datei | Aenderung | +|-------|-----------| +| `ocr_pipeline_api.py` | `_box_iou()`, `_merge_paddle_tesseract()`, `/paddle-kombi` Endpoint | +| `admin-lehrer/.../ocr-overlay/types.ts` | `KOMBI_STEPS` Konstante | +| `admin-lehrer/.../PaddleDirectStep.tsx` | Wiederverwendbar mit `endpoint`/`engineKey` Props | +| `admin-lehrer/.../ocr-overlay/page.tsx` | 3er-Toggle: Pipeline / Paddle Direct / Kombi | + +#### Tests + +```bash +cd klausur-service/backend && pytest tests/test_paddle_kombi.py -v +``` + +| Testklasse | Tests | Beschreibung | +|------------|-------|--------------| +| `TestBoxIoU` | 6 | IoU-Berechnung: identisch, kein Overlap, teilweise, enthalten, Kante, Null-Flaeche | +| `TestMergePaddleTesseract` | 11 | Merge: Match-Averaging, kein Match, Low-Conf-Drop, leer, IoU-Schwelle, Text-Praeferenz, Zero-Conf | +| `TestMergePaddleTesseractBulletPoints` | 2 | Bullet-Points und Sonderzeichen von Tesseract | + +--- + ## Aenderungshistorie | Datum | Version | Aenderung | |-------|---------|----------| +| 2026-03-12 | 4.5.0 | Kombi-Modus (PaddleOCR + Tesseract): Beide Engines laufen parallel, Koordinaten werden IoU-basiert gematcht und confidence-gewichtet gemittelt. Ungematchte Tesseract-Woerter (Bullets, Symbole) werden hinzugefuegt. 3er-Toggle in OCR Overlay. | | 2026-03-12 | 4.4.0 | PaddleOCR Remote-Engine (`engine=paddle`): PP-OCRv5 Latin auf Hetzner x86_64. Neuer Microservice (`paddleocr-service/`), HTTP-Client (`paddleocr_remote.py`), Frontend-Dropdown-Option. Nutzt words_first Grid-Methode.
| | 2026-03-12 | 4.3.0 | Words-First Grid Builder (`cv_words_first.py`): Bottom-up-Algorithmus clustert Tesseract word_boxes direkt zu Spalten/Zeilen/Zellen. Neuer `grid_method` Parameter im `/words` Endpoint. Frontend-Toggle in StepWordRecognition. | | 2026-03-10 | 4.2.0 | Rekonstruktion: Overlay-Modus mit Pixel-Wortpositionierung, 180°-Rotation, Sub-Session-Merging, usePixelWordPositions Hook, Box-Boundary-Schutz (box_ranges_inner) | diff --git a/klausur-service/backend/tests/test_paddle_kombi.py b/klausur-service/backend/tests/test_paddle_kombi.py new file mode 100644 index 0000000..d030f75 --- /dev/null +++ b/klausur-service/backend/tests/test_paddle_kombi.py @@ -0,0 +1,213 @@ +"""Tests for the Kombi-Modus merge algorithm (_box_iou, _merge_paddle_tesseract). + +These functions live in ocr_pipeline_api.py and merge PaddleOCR + Tesseract +word boxes by IoU matching and confidence-weighted coordinate averaging. +""" + +import pytest +import sys +import os + +# Add backend to path so we can import from ocr_pipeline_api +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +from ocr_pipeline_api import _box_iou, _merge_paddle_tesseract + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _word(text: str, left: int, top: int, width: int = 60, height: int = 20, conf: int = 80): + """Create a synthetic word dict.""" + return { + "text": text, + "left": left, + "top": top, + "width": width, + "height": height, + "conf": conf, + } + + +# --------------------------------------------------------------------------- +# _box_iou +# --------------------------------------------------------------------------- + +class TestBoxIoU: + + def test_identical_boxes(self): + """Identical boxes have IoU = 1.0.""" + a = _word("hello", 10, 10, 100, 20) + assert _box_iou(a, a) == pytest.approx(1.0) + + def test_no_overlap(self): + """Non-overlapping 
boxes have IoU = 0.0.""" + a = _word("a", 0, 0, 50, 20) + b = _word("b", 200, 200, 50, 20) + assert _box_iou(a, b) == 0.0 + + def test_partial_overlap(self): + """Partially overlapping boxes have 0 < IoU < 1.""" + a = _word("a", 0, 0, 100, 20) + b = _word("b", 50, 0, 100, 20) + # Intersection: x=[50,100], y=[0,20] → 50×20 = 1000 + # Union: 100×20 + 100×20 - 1000 = 3000 + assert _box_iou(a, b) == pytest.approx(1000 / 3000, abs=0.01) + + def test_contained_box(self): + """Small box inside large box.""" + big = _word("big", 0, 0, 200, 40) + small = _word("small", 50, 10, 30, 10) + # Intersection = 30×10 = 300, Union = 200×40 + 30×10 - 300 = 8000 + assert _box_iou(big, small) == pytest.approx(300 / 8000, abs=0.01) + + def test_touching_edges(self): + """Boxes that share an edge but don't overlap have IoU = 0.""" + a = _word("a", 0, 0, 50, 20) + b = _word("b", 50, 0, 50, 20) + assert _box_iou(a, b) == 0.0 + + def test_zero_area_box(self): + """Zero-area box returns IoU = 0.""" + a = _word("a", 10, 10, 0, 0) + b = _word("b", 10, 10, 50, 20) + assert _box_iou(a, b) == 0.0 + + +# --------------------------------------------------------------------------- +# _merge_paddle_tesseract +# --------------------------------------------------------------------------- + +class TestMergePaddleTesseract: + + def test_perfect_match_averages_coords(self): + """When paddle and tesseract have the same word at same position, + coordinates are averaged by confidence.""" + pw = [_word("hello", 100, 50, 80, 20, conf=90)] + tw = [_word("hello", 110, 55, 70, 18, conf=60)] + merged = _merge_paddle_tesseract(pw, tw) + assert len(merged) == 1 + m = merged[0] + assert m["text"] == "hello" # Paddle text preferred + # Weighted avg: (100*90 + 110*60) / 150 = 15600/150 = 104 + assert m["left"] == 104 + assert m["conf"] == 90 # max(90, 60) + + def test_no_match_keeps_both(self): + """Non-overlapping words: both kept.""" + pw = [_word("hello", 10, 10)] + tw = [_word("bullet", 500, 500, conf=50)] + merged 
= _merge_paddle_tesseract(pw, tw) + assert len(merged) == 2 + texts = {m["text"] for m in merged} + assert texts == {"hello", "bullet"} + + def test_low_conf_tesseract_dropped(self): + """Unmatched Tesseract words with conf < 40 are dropped.""" + pw = [_word("hello", 10, 10)] + tw = [_word("noise", 500, 500, conf=20)] + merged = _merge_paddle_tesseract(pw, tw) + assert len(merged) == 1 + assert merged[0]["text"] == "hello" + + def test_empty_paddle(self): + """Only Tesseract words with sufficient confidence are kept.""" + pw = [] + tw = [ + _word("bullet", 10, 10, conf=80), + _word("noise", 200, 200, conf=10), + ] + merged = _merge_paddle_tesseract(pw, tw) + assert len(merged) == 1 + assert merged[0]["text"] == "bullet" + + def test_empty_tesseract(self): + """All Paddle words kept when Tesseract is empty.""" + pw = [_word("a", 10, 10), _word("b", 200, 10)] + tw = [] + merged = _merge_paddle_tesseract(pw, tw) + assert len(merged) == 2 + + def test_both_empty(self): + """Empty inputs return empty list.""" + assert _merge_paddle_tesseract([], []) == [] + + def test_one_to_one_matching(self): + """Each Tesseract word matches at most one Paddle word.""" + # Two paddle words at different X positions, one tesseract word overlaps first + pw = [ + _word("cat", 10, 10, 60, 20, conf=80), + _word("dog", 200, 10, 60, 20, conf=80), + ] + tw = [_word("cat", 15, 12, 55, 18, conf=70)] + merged = _merge_paddle_tesseract(pw, tw) + assert len(merged) == 2 # cat (merged) + dog (unmatched paddle) + + def test_iou_threshold(self): + """Match requires IoU > 0.3, not just any overlap.""" + pw = [_word("hello", 0, 0, 100, 20, conf=80)] + # Tiny overlap — IoU well below 0.3 + tw = [_word("world", 95, 0, 100, 20, conf=70)] + # Intersection: x=[95,100]=5px width, y=[0,20]=20px → 100 + # Union: 2000 + 2000 - 100 = 3900 → IoU ≈ 0.026 + merged = _merge_paddle_tesseract(pw, tw) + assert len(merged) == 2 # No match, both kept separately + + def test_paddle_text_preferred(self): + """Merged word 
uses Paddle's text, not Tesseract's.""" + pw = [_word("Betonung", 100, 50, 80, 20, conf=85)] + tw = [_word("Betonung!", 100, 50, 80, 20, conf=60)] + merged = _merge_paddle_tesseract(pw, tw) + assert len(merged) == 1 + assert merged[0]["text"] == "Betonung" + + def test_confidence_weighted_positions(self): + """Equal confidence → simple average of coordinates.""" + # Boxes must overlap enough for IoU > 0.3 + pw = [_word("x", 100, 200, 60, 20, conf=50)] + tw = [_word("x", 110, 200, 60, 20, conf=50)] + merged = _merge_paddle_tesseract(pw, tw) + assert len(merged) == 1 + m = merged[0] + assert m["left"] == 105 # (100+110)/2 + assert m["top"] == 200 # (200+200)/2 + assert m["width"] == 60 # (60+60)/2 + assert m["height"] == 20 # (20+20)/2 + + def test_zero_confidence_no_division_error(self): + """Words with conf=0 don't cause division by zero.""" + pw = [_word("a", 100, 50, 80, 20, conf=0)] + tw = [_word("a", 100, 50, 80, 20, conf=0)] + merged = _merge_paddle_tesseract(pw, tw) + assert len(merged) == 1 # Should not raise + + +class TestMergePaddleTesseractBulletPoints: + """Test the key use case: Tesseract catches bullet points / symbols + that PaddleOCR misses or merges with adjacent text.""" + + def test_bullet_added_from_tesseract(self): + """A bullet character recognized by Tesseract but not Paddle is added.""" + pw = [_word("Betonung", 60, 10, 80, 20)] + tw = [ + _word("•", 10, 10, 15, 15, conf=65), # bullet + _word("Betonung", 60, 10, 80, 20, conf=50), # overlaps paddle + ] + merged = _merge_paddle_tesseract(pw, tw) + texts = [m["text"] for m in merged] + assert "•" in texts + assert "Betonung" in texts + assert len(merged) == 2 + + def test_exclamation_added_from_tesseract(self): + """An exclamation mark recognized separately by Tesseract is added.""" + pw = [_word("important", 60, 10, 100, 20)] + tw = [ + _word("!", 40, 10, 12, 20, conf=70), + _word("important", 60, 10, 100, 20, conf=55), + ] + merged = _merge_paddle_tesseract(pw, tw) + texts = [m["text"] for m 
in merged] + assert "!" in texts + assert len(merged) == 2