Files
breakpilot-lehrer/klausur-service/backend/tests/test_cv_words_first.py
Benjamin Admin ced5bb3dd3
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 54s
CI / test-go-edu-search (push) Successful in 47s
CI / test-python-klausur (push) Failing after 2m31s
CI / test-python-agent-core (push) Successful in 23s
CI / test-nodejs-website (push) Successful in 32s
feat: Words-First Grid Builder (bottom-up alternative zu cell_grid_v2)
Neuer Algorithmus in cv_words_first.py: Clustert Tesseract word_boxes
direkt zu Spalten (X-Gap) und Zeilen (Y-Proximity), baut Zellen an
Schnittpunkten. Keine Spalten-/Zeilenerkennung noetig.

- cv_words_first.py: _cluster_columns, _cluster_rows, _build_cells, build_grid_from_words
- ocr_pipeline_api.py: grid_method Parameter (v2|words_first) im /words Endpoint
- StepWordRecognition.tsx: Dropdown Toggle fuer Grid-Methode
- OCR-Pipeline.md: Doku v4.3.0 mit Words-First Algorithmus
- 15 Unit-Tests fuer cv_words_first

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-12 06:46:05 +01:00

215 lines
6.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Tests for cv_words_first.py — Words-First Grid Builder."""
import pytest
from cv_words_first import (
_assign_word_to_column,
_assign_word_to_row,
_build_cells,
_cluster_columns,
_cluster_rows,
build_grid_from_words,
)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _word(text: str, left: int, top: int, width: int = 60, height: int = 20, conf: int = 90) -> dict:
    """Create a synthetic word dict for use as test input.

    Args:
        text: Stored under the ``'text'`` key.
        left: Stored under the ``'left'`` key.
        top: Stored under the ``'top'`` key.
        width: Stored under the ``'width'`` key (default 60).
        height: Stored under the ``'height'`` key (default 20).
        conf: Stored under the ``'conf'`` key (default 90).

    Returns:
        A new dict with exactly the keys ``text``, ``left``, ``top``,
        ``width``, ``height`` and ``conf``.
    """
    # NOTE(review): presumably mirrors the word-box dicts the grid builder
    # receives from OCR (pixel coordinates) -- confirm against cv_words_first.
    return {
        'text': text,
        'left': left,
        'top': top,
        'width': width,
        'height': height,
        'conf': conf,
    }
# ---------------------------------------------------------------------------
# _cluster_columns
# ---------------------------------------------------------------------------
class TestClusterColumns:
    """Checks for ``_cluster_columns``."""

    def test_single_column_freetext(self):
        """Evenly spread words are expected to form one ``column_text`` column."""
        layout = [
            ("Hello", 50, 10),
            ("world", 120, 10),
            ("this", 50, 40),
            ("is", 120, 40),
            ("text", 190, 40),
        ]
        columns = _cluster_columns([_word(*spec) for spec in layout], img_w=400)
        assert len(columns) == 1
        assert columns[0]['type'] == 'column_text'

    def test_two_columns(self):
        """Two word groups split by a large X-gap are expected to give 2 columns."""
        layout = [
            ("apple", 20, 10),
            ("Apfel", 300, 10),
            ("dog", 20, 40),
            ("Hund", 300, 40),
        ]
        columns = _cluster_columns([_word(*spec) for spec in layout], img_w=500)
        assert len(columns) == 2
        # Columns are expected to be labelled in order of appearance.
        for position, expected_type in enumerate(('column_1', 'column_2')):
            assert columns[position]['type'] == expected_type

    def test_three_columns(self):
        """Three groups separated by wide gaps are expected to give 3 columns."""
        # Fourth tuple element, where present, is the word width.
        layout = [
            ("1", 10, 10, 20),
            ("apple", 100, 10),
            ("Apfel", 400, 10),
            ("2", 10, 40, 20),
            ("dog", 100, 40),
            ("Hund", 400, 40),
        ]
        columns = _cluster_columns([_word(*spec) for spec in layout], img_w=600)
        assert len(columns) == 3

    def test_empty_words(self):
        """An empty word list must produce an empty result."""
        result = _cluster_columns([], img_w=500)
        assert result == []
# ---------------------------------------------------------------------------
# _cluster_rows
# ---------------------------------------------------------------------------
class TestClusterRows:
    """Checks for ``_cluster_rows``."""

    def test_two_rows(self):
        """Words at two Y-levels are expected to give 2 rows, upper row first."""
        upper_line = [_word("hello", 10, 20), _word("world", 100, 25)]
        lower_line = [_word("foo", 10, 80), _word("bar", 100, 82)]
        clustered = _cluster_rows(upper_line + lower_line)
        assert len(clustered) == 2
        first_row, second_row = clustered
        assert first_row['y_min'] < second_row['y_min']

    def test_single_row(self):
        """Words at (nearly) the same Y are expected to merge into 1 row."""
        layout = [("a", 10, 50), ("b", 80, 52), ("c", 150, 51)]
        clustered = _cluster_rows([_word(*spec) for spec in layout])
        assert len(clustered) == 1

    def test_empty(self):
        """An empty word list must produce an empty result."""
        result = _cluster_rows([])
        assert result == []
# ---------------------------------------------------------------------------
# build_grid_from_words (integration)
# ---------------------------------------------------------------------------
class TestBuildGridFromWords:
    """Integration checks for ``build_grid_from_words``.

    Every test unpacks the return value as a ``(cells, columns_meta)`` pair.
    """

    def test_two_column_vocab(self):
        """Simulate a 2-column vocabulary page with 3 rows."""
        words = [
            _word("apple", 50, 20),
            _word("Apfel", 400, 22),
            _word("dog", 50, 60),
            _word("Hund", 400, 62),
            _word("cat", 50, 100),
            _word("Katze", 400, 102),
        ]
        cells, cols_meta = build_grid_from_words(words, img_w=600, img_h=200)
        assert len(cols_meta) == 2
        assert len(cells) == 6  # 3 rows × 2 cols
        # Check cell_id format
        cell_ids = {c['cell_id'] for c in cells}
        assert 'R00_C0' in cell_ids
        assert 'R00_C1' in cell_ids

    def test_single_column_freetext(self):
        """Single-column text → 1 column, multiple rows."""
        words = [
            _word("Hello", 50, 20),
            _word("world", 120, 22),
            _word("Second", 50, 60),
            _word("line", 120, 62),
        ]
        cells, cols_meta = build_grid_from_words(words, img_w=300, img_h=150)
        assert len(cols_meta) == 1
        assert cols_meta[0]['type'] == 'column_text'
        assert len(cells) == 2  # 2 rows, 1 column each

    def test_empty_input(self):
        """No words → both the cell list and the column metadata are empty."""
        cells, cols = build_grid_from_words([], img_w=500, img_h=500)
        assert cells == []
        assert cols == []

    def test_low_confidence_filtered(self):
        """Words below min_confidence are excluded."""
        words = [
            _word("good", 50, 20, conf=90),
            _word("bad", 200, 20, conf=10),
        ]
        cells, cols = build_grid_from_words(words, img_w=400, img_h=100, min_confidence=30)
        # Only the good word should produce a cell
        assert len(cells) == 1
        assert cells[0]['text'] == 'good'

    def test_bbox_pct_correct(self):
        """Check that bbox_pct is correctly computed from pixel coords."""
        words = [_word("test", 200, 100, width=100, height=30)]
        cells, _ = build_grid_from_words(words, img_w=1000, img_h=500)
        assert len(cells) == 1
        bp = cells[0]['bbox_pct']
        # Percentages come out of float division; compare with a tolerance
        # so the test does not depend on the exact rounding of the result.
        assert bp['x'] == pytest.approx(20.0)  # 200/1000*100
        assert bp['y'] == pytest.approx(20.0)  # 100/500*100
        assert bp['w'] == pytest.approx(10.0)  # 100/1000*100
        assert bp['h'] == pytest.approx(6.0)   # 30/500*100

    def test_columns_meta_format(self):
        """columns_meta has same keys as build_cell_grid_v2 output."""
        words = [
            _word("a", 50, 20),
            _word("b", 400, 20),
        ]
        _, cols_meta = build_grid_from_words(words, img_w=600, img_h=100)
        # Guard against a vacuous pass: the key checks below run per column,
        # so an empty list would otherwise satisfy the test.
        assert cols_meta, "expected at least one column for non-empty input"
        for col in cols_meta:
            assert 'index' in col
            assert 'type' in col
            assert 'x' in col
            assert 'width' in col

    def test_word_boxes_included(self):
        """Each cell should carry word_boxes entries with left/top/text keys."""
        words = [
            _word("hello", 50, 20),
            _word("world", 120, 22),
        ]
        cells, _ = build_grid_from_words(words, img_w=300, img_h=100)
        assert len(cells) == 1  # single row, single column
        wb = cells[0].get('word_boxes', [])
        assert len(wb) == 2
        for w in wb:
            assert 'left' in w
            assert 'top' in w
            assert 'text' in w

    def test_all_whitespace_filtered(self):
        """Words with only whitespace text are filtered out."""
        words = [
            _word(" ", 50, 20, conf=90),
            _word("hello", 200, 20, conf=90),
        ]
        cells, _ = build_grid_from_words(words, img_w=400, img_h=100)
        assert len(cells) == 1
        assert cells[0]['text'] == 'hello'