feat: Words-First Grid Builder (bottom-up alternative zu cell_grid_v2)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 54s
CI / test-go-edu-search (push) Successful in 47s
CI / test-python-klausur (push) Failing after 2m31s
CI / test-python-agent-core (push) Successful in 23s
CI / test-nodejs-website (push) Successful in 32s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 54s
CI / test-go-edu-search (push) Successful in 47s
CI / test-python-klausur (push) Failing after 2m31s
CI / test-python-agent-core (push) Successful in 23s
CI / test-nodejs-website (push) Successful in 32s
Neuer Algorithmus in cv_words_first.py: Clustert Tesseract word_boxes direkt zu Spalten (X-Gap) und Zeilen (Y-Proximity), baut Zellen an Schnittpunkten. Keine Spalten-/Zeilenerkennung noetig. - cv_words_first.py: _cluster_columns, _cluster_rows, _build_cells, build_grid_from_words - ocr_pipeline_api.py: grid_method Parameter (v2|words_first) im /words Endpoint - StepWordRecognition.tsx: Dropdown Toggle fuer Grid-Methode - OCR-Pipeline.md: Doku v4.3.0 mit Words-First Algorithmus - 15 Unit-Tests fuer cv_words_first Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
214
klausur-service/backend/tests/test_cv_words_first.py
Normal file
214
klausur-service/backend/tests/test_cv_words_first.py
Normal file
@@ -0,0 +1,214 @@
|
||||
"""Tests for cv_words_first.py — Words-First Grid Builder."""
|
||||
|
||||
import pytest
|
||||
from cv_words_first import (
|
||||
_assign_word_to_column,
|
||||
_assign_word_to_row,
|
||||
_build_cells,
|
||||
_cluster_columns,
|
||||
_cluster_rows,
|
||||
build_grid_from_words,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _word(text: str, left: int, top: int, width: int = 60, height: int = 20, conf: int = 90):
|
||||
"""Create a synthetic word dict."""
|
||||
return {
|
||||
'text': text,
|
||||
'left': left,
|
||||
'top': top,
|
||||
'width': width,
|
||||
'height': height,
|
||||
'conf': conf,
|
||||
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _cluster_columns
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestClusterColumns:
    """Tests for ``_cluster_columns`` using synthetic word boxes."""

    def test_single_column_freetext(self):
        """Closely spaced words on two lines are expected to form one 'column_text' column."""
        layout = [
            ("Hello", 50, 10),
            ("world", 120, 10),
            ("this", 50, 40),
            ("is", 120, 40),
            ("text", 190, 40),
        ]
        words = [_word(text, left, top) for text, left, top in layout]

        cols = _cluster_columns(words, img_w=400)

        assert len(cols) == 1
        assert cols[0]['type'] == 'column_text'

    def test_two_columns(self):
        """Two word groups separated by a large X-gap are expected to give 2 columns."""
        pairs = [("apple", "Apfel", 10), ("dog", "Hund", 40)]
        words = []
        for english, german, top in pairs:
            words.append(_word(english, 20, top))
            words.append(_word(german, 300, top))

        cols = _cluster_columns(words, img_w=500)

        assert len(cols) == 2
        first, second = cols
        assert first['type'] == 'column_1'
        assert second['type'] == 'column_2'

    def test_three_columns(self):
        """Three groups separated by wide gaps are expected to give 3 columns."""
        lines = [("1", "apple", "Apfel", 10), ("2", "dog", "Hund", 40)]
        words = []
        for number, english, german, top in lines:
            # The leading number is narrower than the default word width.
            words.append(_word(number, 10, top, width=20))
            words.append(_word(english, 100, top))
            words.append(_word(german, 400, top))

        cols = _cluster_columns(words, img_w=600)

        assert len(cols) == 3

    def test_empty_words(self):
        """An empty word list is expected to produce an empty result."""
        cols = _cluster_columns([], img_w=500)

        assert cols == []
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _cluster_rows
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestClusterRows:
    """Tests for ``_cluster_rows`` using synthetic word boxes."""

    def test_two_rows(self):
        """Words at two distinct Y-levels are expected to give 2 rows, ordered top to bottom."""
        layout = [
            ("hello", 10, 20),
            ("world", 100, 25),
            ("foo", 10, 80),
            ("bar", 100, 82),
        ]
        words = [_word(text, left, top) for text, left, top in layout]

        rows = _cluster_rows(words)

        assert len(rows) == 2
        upper, lower = rows
        assert upper['y_min'] < lower['y_min']

    def test_single_row(self):
        """Words at nearly the same Y are expected to give 1 row."""
        layout = [("a", 10, 50), ("b", 80, 52), ("c", 150, 51)]
        words = [_word(text, left, top) for text, left, top in layout]

        rows = _cluster_rows(words)

        assert len(rows) == 1

    def test_empty(self):
        """An empty word list is expected to produce an empty result."""
        rows = _cluster_rows([])

        assert rows == []
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# build_grid_from_words (integration)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestBuildGridFromWords:
    """Integration tests for ``build_grid_from_words``.

    Each test passes synthetic word dicts (built with ``_word``) plus the
    image dimensions to the builder and checks the returned
    ``(cells, columns_meta)`` pair.
    """

    def test_two_column_vocab(self):
        """Simulate a 2-column vocabulary page with 3 rows."""
        words = [
            _word("apple", 50, 20),
            _word("Apfel", 400, 22),
            _word("dog", 50, 60),
            _word("Hund", 400, 62),
            _word("cat", 50, 100),
            _word("Katze", 400, 102),
        ]
        cells, cols_meta = build_grid_from_words(words, img_w=600, img_h=200)

        assert len(cols_meta) == 2
        assert len(cells) == 6  # 3 rows × 2 cols
        # Spot-check the cell_id naming for the first row.
        cell_ids = {c['cell_id'] for c in cells}
        assert 'R00_C0' in cell_ids
        assert 'R00_C1' in cell_ids

    def test_single_column_freetext(self):
        """Single-column text is expected to give 1 column and one cell per row."""
        words = [
            _word("Hello", 50, 20),
            _word("world", 120, 22),
            _word("Second", 50, 60),
            _word("line", 120, 62),
        ]
        cells, cols_meta = build_grid_from_words(words, img_w=300, img_h=150)

        assert len(cols_meta) == 1
        assert cols_meta[0]['type'] == 'column_text'
        assert len(cells) == 2  # 2 rows, 1 column each

    def test_empty_input(self):
        """No words are expected to give empty cells and empty column metadata."""
        cells, cols = build_grid_from_words([], img_w=500, img_h=500)

        assert cells == []
        assert cols == []

    def test_low_confidence_filtered(self):
        """Words below min_confidence are excluded."""
        words = [
            _word("good", 50, 20, conf=90),
            _word("bad", 200, 20, conf=10),
        ]
        cells, cols = build_grid_from_words(words, img_w=400, img_h=100, min_confidence=30)

        # Only the good word should produce a cell
        assert len(cells) == 1
        assert cells[0]['text'] == 'good'

    def test_bbox_pct_correct(self):
        """Check that bbox_pct is correctly computed from pixel coords."""
        words = [_word("test", 200, 100, width=100, height=30)]
        cells, _ = build_grid_from_words(words, img_w=1000, img_h=500)

        assert len(cells) == 1
        bp = cells[0]['bbox_pct']
        # The percentages are floats; compare with a tolerance rather than
        # exact equality so tiny rounding differences do not fail the test.
        assert bp['x'] == pytest.approx(20.0)  # 200/1000*100
        assert bp['y'] == pytest.approx(20.0)  # 100/500*100
        assert bp['w'] == pytest.approx(10.0)  # 100/1000*100
        assert bp['h'] == pytest.approx(6.0)   # 30/500*100

    def test_columns_meta_format(self):
        """columns_meta has same keys as build_cell_grid_v2 output."""
        words = [
            _word("a", 50, 20),
            _word("b", 400, 20),
        ]
        _, cols_meta = build_grid_from_words(words, img_w=600, img_h=100)

        # Guard against a vacuous pass: the loop below checks nothing if
        # no column metadata is returned.
        assert cols_meta
        for col in cols_meta:
            assert 'index' in col
            assert 'type' in col
            assert 'x' in col
            assert 'width' in col

    def test_word_boxes_included(self):
        """Each cell should contain word_boxes with 'left', 'top' and 'text' entries."""
        words = [
            _word("hello", 50, 20),
            _word("world", 120, 22),
        ]
        cells, _ = build_grid_from_words(words, img_w=300, img_h=100)

        assert len(cells) == 1  # single row, single column
        wb = cells[0].get('word_boxes', [])
        assert len(wb) == 2
        for w in wb:
            assert 'left' in w
            assert 'top' in w
            assert 'text' in w

    def test_all_whitespace_filtered(self):
        """Words with only whitespace text are filtered out."""
        words = [
            _word(" ", 50, 20, conf=90),
            _word("hello", 200, 20, conf=90),
        ]
        cells, _ = build_grid_from_words(words, img_w=400, img_h=100)

        assert len(cells) == 1
        assert cells[0]['text'] == 'hello'
|
||||
Reference in New Issue
Block a user