diff --git a/admin-lehrer/components/ocr-pipeline/__tests__/useSlideWordPositions.test.ts b/admin-lehrer/components/ocr-pipeline/__tests__/useSlideWordPositions.test.ts new file mode 100644 index 0000000..d8aca95 --- /dev/null +++ b/admin-lehrer/components/ocr-pipeline/__tests__/useSlideWordPositions.test.ts @@ -0,0 +1,176 @@ +/** + * Tests for useSlideWordPositions hook. + * + * The hook computes word positions from OCR word_boxes or pixel projection. + * Since Canvas/Image are not available in jsdom, we test the pure computation + * logic via local copies defined below; the hook itself is never imported in this file, so drift in the real implementation goes undetected by these tests. + */ +import { describe, it, expect } from 'vitest' + +// --------------------------------------------------------------------------- +// WordPosition interface (mirrored from useSlideWordPositions.ts) +// --------------------------------------------------------------------------- + +interface WordPosition { + xPct: number + wPct: number + yPct: number + hPct: number + text: string + fontRatio: number +} + +// --------------------------------------------------------------------------- +// Pure computation functions, re-implemented locally to mirror the hook (not imported from it) +// --------------------------------------------------------------------------- + +/** + * Word-box path: convert one OCR word_box from pixels to percentages (left and width relative to imgW, top and height relative to imgH); text is passed through and fontRatio is fixed at 1.0. + * Replicates the word_boxes.map() logic in useSlideWordPositions. This copy has no guard for imgW or imgH being 0. + */ +function wordBoxToPosition( + box: { text: string; left: number; top: number; width: number; height: number }, + imgW: number, + imgH: number, +): WordPosition { + return { + xPct: (box.left / imgW) * 100, + wPct: (box.width / imgW) * 100, + yPct: (box.top / imgH) * 100, + hPct: (box.height / imgH) * 100, + text: box.text, + fontRatio: 1.0, + } +} + +/** + * Fallback path (no word_boxes): each token gets an equal slice of the cell width (bboxPct.w divided by tokens.length), placed left to right; yPct and hPct are copied from the cell bbox and fontRatio is 1.0. + * Replicates the fallback logic in useSlideWordPositions. 
+ */ +function fallbackPositions( + tokens: string[], + bboxPct: { x: number; y: number; w: number; h: number }, +): WordPosition[] { + const fallbackW = bboxPct.w / tokens.length + return tokens.map((t, i) => ({ + xPct: bboxPct.x + i * fallbackW, + wPct: fallbackW, + yPct: bboxPct.y, + hPct: bboxPct.h, + text: t, + fontRatio: 1.0, + })) +} + +// --------------------------------------------------------------------------- +// Tests (run against the local copies above) +// --------------------------------------------------------------------------- + +describe('wordBoxToPosition (word-box path)', () => { + it('should compute percentage positions from pixel coordinates', () => { + const box = { text: 'hello', left: 100, top: 200, width: 80, height: 20 } + const wp = wordBoxToPosition(box, 1000, 2000) + + expect(wp.xPct).toBeCloseTo(10, 1) // 100/1000 * 100 + expect(wp.wPct).toBeCloseTo(8, 1) // 80/1000 * 100 + expect(wp.yPct).toBeCloseTo(10, 1) // 200/2000 * 100 + expect(wp.hPct).toBeCloseTo(1, 1) // 20/2000 * 100 + expect(wp.text).toBe('hello') + expect(wp.fontRatio).toBe(1.0) + }) + + it('should produce different yPct for words on different lines', () => { + const imgW = 1000, imgH = 2000 + const word1 = wordBoxToPosition({ text: 'line1', left: 50, top: 100, width: 60, height: 20 }, imgW, imgH) + const word2 = wordBoxToPosition({ text: 'line2', left: 50, top: 130, width: 60, height: 20 }, imgW, imgH) + + expect(word1.yPct).not.toEqual(word2.yPct) + expect(word2.yPct).toBeGreaterThan(word1.yPct) + }) + + it('should handle word at origin', () => { + const wp = wordBoxToPosition({ text: 'a', left: 0, top: 0, width: 50, height: 25 }, 500, 500) + expect(wp.xPct).toBe(0) + expect(wp.yPct).toBe(0) + expect(wp.wPct).toBeCloseTo(10, 1) + expect(wp.hPct).toBeCloseTo(5, 1) + }) + + it('should handle word at bottom-right corner', () => { + const wp = wordBoxToPosition({ text: 'z', left: 900, top: 1900, width: 100, height: 100 }, 1000, 2000) + expect(wp.xPct).toBe(90) // NOTE(review): exact equality on a computed float; sibling tests use toBeCloseTo for computed percentages + expect(wp.yPct).toBe(95) + 
expect(wp.wPct).toBe(10) + expect(wp.hPct).toBe(5) + }) +}) + + +describe('fallbackPositions (no word_boxes)', () => { + it('should spread tokens evenly across cell width', () => { + const bbox = { x: 10, y: 20, w: 60, h: 5 } + const positions = fallbackPositions(['apple', 'Apfel'], bbox) + + expect(positions.length).toBe(2) + expect(positions[0].xPct).toBeCloseTo(10, 1) + expect(positions[1].xPct).toBeCloseTo(40, 1) // 10 + 30 + expect(positions[0].wPct).toBeCloseTo(30, 1) + expect(positions[1].wPct).toBeCloseTo(30, 1) + }) + + it('should use cell bbox for Y position (all words same Y)', () => { + const bbox = { x: 5, y: 30, w: 80, h: 4 } + const positions = fallbackPositions(['a', 'b', 'c'], bbox) + + for (const wp of positions) { + expect(wp.yPct).toBe(30) + expect(wp.hPct).toBe(4) + } + }) + + it('should handle single token', () => { + const bbox = { x: 15, y: 25, w: 50, h: 6 } + const positions = fallbackPositions(['word'], bbox) + + expect(positions.length).toBe(1) + expect(positions[0].xPct).toBe(15) + expect(positions[0].wPct).toBe(50) + expect(positions[0].yPct).toBe(25) + expect(positions[0].hPct).toBe(6) + }) +}) + + +describe('WordPosition yPct/hPct contract', () => { + it('word-box path: yPct comes from box.top, not cell bbox', () => { + // This is the key fix: multi-line cells should NOT stack words at cell center + const cellBbox = { x: 10, y: 20, w: 60, h: 10 } // cell spans y=20% to y=30% + const imgW = 1000, imgH = 1000 + + // Two words on different lines within the same cell + const word1 = wordBoxToPosition({ text: 'line1', left: 100, top: 200, width: 80, height: 20 }, imgW, imgH) + const word2 = wordBoxToPosition({ text: 'line2', left: 100, top: 260, width: 80, height: 20 }, imgW, imgH) + + // word1 should be at y=20%, word2 at y=26% — NOT both at cellBbox.y (20%) + expect(word1.yPct).toBeCloseTo(20, 1) + expect(word2.yPct).toBeCloseTo(26, 1) + expect(word1.yPct).not.toEqual(word2.yPct) + + // Both should have individual heights from their box, 
not cell height + expect(word1.hPct).toBeCloseTo(2, 1) + expect(word2.hPct).toBeCloseTo(2, 1) + // Cell height would be 10% — word height is 2%, confirming per-word sizing + expect(word1.hPct).toBeLessThan(cellBbox.h) + }) + + it('fallback path: yPct equals cell bbox.y (no per-word data)', () => { + const bbox = { x: 10, y: 45, w: 30, h: 8 } + const positions = fallbackPositions(['a', 'b'], bbox) + + // Without word_boxes, all words use cell bbox Y — expected behavior + expect(positions[0].yPct).toBe(bbox.y) + expect(positions[1].yPct).toBe(bbox.y) + expect(positions[0].hPct).toBe(bbox.h) + expect(positions[1].hPct).toBe(bbox.h) + }) +}) diff --git a/docs-src/services/klausur-service/OCR-Pipeline.md b/docs-src/services/klausur-service/OCR-Pipeline.md index 9a8233f..0796a3d 100644 --- a/docs-src/services/klausur-service/OCR-Pipeline.md +++ b/docs-src/services/klausur-service/OCR-Pipeline.md @@ -937,7 +937,13 @@ function usePixelWordPositions( 6. Auto-Schriftgroesse per `measureText()` + `fontRatio` 7. Mode-Normalisierung: Haeufigste `fontRatio` (gerundet auf 0.02) auf alle anwenden -**Rueckgabe:** `Map` mit `xPct`, `wPct`, `text`, `fontRatio` +**Rueckgabe:** `Map` mit `xPct`, `wPct`, `yPct`, `hPct`, `text`, `fontRatio` + +!!! note "Per-Word Y-Positionierung (v4.3.1)" + `WordPosition` enthaelt seit v4.3.1 auch `yPct` und `hPct`. Dadurch rendert jedes + Wort an seiner tatsaechlichen vertikalen Position, statt alle Woerter einer Zelle + auf der Zell-Mitte zu stapeln. Bei Zellen ohne `word_boxes` (Fallback) werden + `yPct`/`hPct` aus `cell.bbox_pct` uebernommen. ### Fabric.js Editor @@ -1172,16 +1178,32 @@ PaddleOCR laeuft auf dem vorverarbeiteten Bild und erkennt Woerter direkt. 5. Cells mit `ocr_engine="kombi"` taggen 6. In DB speichern -#### Merge-Algorithmus +#### Merge-Algorithmus (v2: Row-Based Sequence Alignment) + +!!! info "Rewrite (v4.3)" + Der Merge wurde von IoU-basiertem Matching auf **Row-Based Sequence Alignment** umgestellt. 
+ Multi-Word Paddle-Boxen werden vor dem Merge in Einzelwoerter aufgeteilt + (`_split_paddle_multi_words`). + +**Ablauf:** + +1. **Row Grouping:** Woerter beider Engines nach Y-Position in Zeilen gruppieren (12px Toleranz) +2. **Row Matching:** Paddle- und Tesseract-Zeilen ueber vertikale Naehe zuordnen +3. **Sequence Alignment:** Innerhalb jeder gematchten Zeile links-nach-rechts durchlaufen: + - **Gleicher Text** oder **Substring-Match:** Zusammenfuehren (Paddle-Text, gemittelte Koordinaten) + - **Raeumlicher Overlap >= 50%:** Auch bei unterschiedlichem Text als Duplikat behandeln + - **Nur bei einer Engine:** Wort beibehalten (falls Confidence >= 30) +4. **Ungematchte Zeilen:** Paddle-Zeilen behalten, Tesseract-Zeilen nur mit Confidence >= 40 ```mermaid flowchart TD - A[Paddle-Wort] --> B{Tesseract-Match
IoU > 0.3?} - B -->|Ja| C[Koordinaten mitteln
gewichtet nach Confidence] - B -->|Nein| D[Paddle-Wort behalten] - E[Ungematchte
Tesseract-Woerter] --> F{Confidence >= 40?} - F -->|Ja| G[Hinzufuegen
Bullet Points, Symbole] - F -->|Nein| H[Verwerfen] + A[Beide Engines] --> B[Row Grouping
Y-Toleranz 12px] + B --> C[Row Matching
vertikale Naehe] + C --> D{Gleicher Text
oder Overlap >= 50%?} + D -->|Ja| E[Deduplizieren:
Paddle-Text + gemittelte Coords] + D -->|Nein| F{Wort nur bei
einer Engine?} + F -->|Ja| G[Beibehalten
falls conf >= 30] + F -->|Nein| H[Beide behalten
verschiedene Positionen] ``` **Koordinaten-Mittelung:** @@ -1192,21 +1214,43 @@ merged_left = (paddle_left × paddle_conf + tess_left × tess_conf) / (paddle_co Gleiches Prinzip fuer `top`, `width`, `height`. Der Text kommt immer von PaddleOCR (bessere Texterkennung). +**Raeumlicher Overlap-Check (v4.3.1):** + +Wenn zwei Woerter >= 50% horizontal ueberlappen, werden sie als dasselbe physische Wort behandelt — +unabhaengig davon, ob die OCR-Texte unterschiedlich sind (z.B. "hello" vs "helo"). +Dies verhindert, dass leicht unterschiedliche Erkennungen als separate Woerter uebereinander +im Overlay erscheinen. + #### Dateien | Datei | Aenderung | |-------|-----------| -| `ocr_pipeline_api.py` | `_box_iou()`, `_merge_paddle_tesseract()`, `/paddle-kombi` Endpoint | +| `ocr_pipeline_api.py` | `_split_paddle_multi_words()`, `_group_words_into_rows()`, `_merge_row_sequences()`, `_merge_paddle_tesseract()`, `/paddle-kombi` Endpoint | | `admin-lehrer/.../ocr-overlay/types.ts` | `KOMBI_STEPS` Konstante | +| `admin-lehrer/.../ocr-overlay/useSlideWordPositions.ts` | Slide-Positionierung mit `yPct`/`hPct` | +| `admin-lehrer/.../ocr-overlay/usePixelWordPositions.ts` | Pixel-Cluster-Positionierung mit `yPct`/`hPct` | +| `admin-lehrer/.../ocr-overlay/OverlayReconstruction.tsx` | Rendering mit per-Word Y-Positionen | | `admin-lehrer/.../PaddleDirectStep.tsx` | Wiederverwendbar mit `endpoint`/`engineKey` Props | | `admin-lehrer/.../ocr-overlay/page.tsx` | 3er-Toggle: Pipeline / Paddle Direct / Kombi | #### Tests ```bash -cd klausur-service/backend && pytest tests/test_paddle_kombi.py -v +cd klausur-service/backend && pytest tests/test_paddle_kombi.py -v # 36 Tests ``` +**Test-Klassen:** + +| Klasse | Tests | Beschreibung | +|--------|-------|--------------| +| `TestSplitPaddleMultiWords` | 7 | Multi-Word-Box-Splitting | +| `TestGroupWordsIntoRows` | 5 | Y-Position Row Clustering | +| `TestMergeRowSequences` | 10 | Sequence Alignment innerhalb einer Zeile | +| `TestMergePaddleTesseract` 
| 8 | Vollstaendiger Merge mit Row-Matching | +| `TestMergeRealWorldRegression` | 1 | Regression mit Echtdaten | +| `TestSpatialOverlapDedup` | 4 | Raeumliche Overlap-Deduplizierung | +| `TestSplitThenMerge` | 1 | Split + Merge End-to-End | + | Testklasse | Tests | Beschreibung | |------------|-------|--------------| | `TestBoxIoU` | 6 | IoU-Berechnung: identisch, kein Overlap, teilweise, enthalten, Kante, Null-Flaeche |