Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 54s
CI / test-go-edu-search (push) Successful in 47s
CI / test-python-klausur (push) Failing after 2m31s
CI / test-python-agent-core (push) Successful in 23s
CI / test-nodejs-website (push) Successful in 32s
Neuer Algorithmus in cv_words_first.py: Clustert Tesseract word_boxes direkt zu Spalten (X-Gap) und Zeilen (Y-Proximity), baut Zellen an Schnittpunkten. Keine Spalten-/Zeilenerkennung noetig. - cv_words_first.py: _cluster_columns, _cluster_rows, _build_cells, build_grid_from_words - ocr_pipeline_api.py: grid_method Parameter (v2|words_first) im /words Endpoint - StepWordRecognition.tsx: Dropdown Toggle fuer Grid-Methode - OCR-Pipeline.md: Doku v4.3.0 mit Words-First Algorithmus - 15 Unit-Tests fuer cv_words_first Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
215 lines
6.9 KiB
Python
215 lines
6.9 KiB
Python
"""Tests for cv_words_first.py — Words-First Grid Builder."""
|
||
|
||
import pytest
|
||
from cv_words_first import (
|
||
_assign_word_to_column,
|
||
_assign_word_to_row,
|
||
_build_cells,
|
||
_cluster_columns,
|
||
_cluster_rows,
|
||
build_grid_from_words,
|
||
)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Helpers
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _word(text: str, left: int, top: int, width: int = 60, height: int = 20, conf: int = 90):
|
||
"""Create a synthetic word dict."""
|
||
return {
|
||
'text': text,
|
||
'left': left,
|
||
'top': top,
|
||
'width': width,
|
||
'height': height,
|
||
'conf': conf,
|
||
}
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# _cluster_columns
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestClusterColumns:
    """Checks for ``_cluster_columns`` on small synthetic word layouts."""

    def test_single_column_freetext(self):
        """Five closely spaced words on two lines give one ``column_text`` column."""
        layout = (
            ("Hello", 50, 10),
            ("world", 120, 10),
            ("this", 50, 40),
            ("is", 120, 40),
            ("text", 190, 40),
        )
        boxes = [_word(text, x, y) for text, x, y in layout]

        columns = _cluster_columns(boxes, img_w=400)

        assert len(columns) == 1
        assert columns[0]['type'] == 'column_text'

    def test_two_columns(self):
        """Word pairs at x=20 and x=300 give ``column_1`` and ``column_2``."""
        boxes = []
        for y, (english, german) in ((10, ("apple", "Apfel")), (40, ("dog", "Hund"))):
            boxes.append(_word(english, 20, y))
            boxes.append(_word(german, 300, y))

        columns = _cluster_columns(boxes, img_w=500)

        assert len(columns) == 2
        first, second = columns
        assert first['type'] == 'column_1'
        assert second['type'] == 'column_2'

    def test_three_columns(self):
        """A narrow number column plus two word columns give three columns."""
        boxes = []
        for y, (number, english, german) in ((10, ("1", "apple", "Apfel")),
                                             (40, ("2", "dog", "Hund"))):
            boxes.extend([
                _word(number, 10, y, width=20),
                _word(english, 100, y),
                _word(german, 400, y),
            ])

        columns = _cluster_columns(boxes, img_w=600)

        assert len(columns) == 3

    def test_empty_words(self):
        """An empty word list must come back as an empty column list."""
        columns = _cluster_columns([], img_w=500)
        assert columns == []
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# _cluster_rows
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestClusterRows:
    """Checks for ``_cluster_rows`` on small synthetic word layouts."""

    def test_two_rows(self):
        """Words around y=20 and y=80 form two rows, ordered top to bottom."""
        layout = (
            ("hello", 10, 20),
            ("world", 100, 25),
            ("foo", 10, 80),
            ("bar", 100, 82),
        )
        boxes = [_word(text, x, y) for text, x, y in layout]

        found = _cluster_rows(boxes)

        assert len(found) == 2
        upper, lower = found
        assert upper['y_min'] < lower['y_min']

    def test_single_row(self):
        """Words whose tops differ by at most 2 collapse into one row."""
        boxes = [_word(text, x, y) for text, x, y in (("a", 10, 50),
                                                      ("b", 80, 52),
                                                      ("c", 150, 51))]

        found = _cluster_rows(boxes)

        assert len(found) == 1

    def test_empty(self):
        """An empty word list must come back as an empty row list."""
        found = _cluster_rows([])
        assert found == []
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# build_grid_from_words (integration)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestBuildGridFromWords:
    """Integration checks for ``build_grid_from_words``.

    Every test feeds synthetic word dicts (see ``_word``) into the builder
    and inspects the returned ``(cells, columns_meta)`` pair.
    """

    def test_two_column_vocab(self):
        """Simulate a 2-column vocabulary page with 3 rows."""
        words = [
            _word("apple", 50, 20),
            _word("Apfel", 400, 22),
            _word("dog", 50, 60),
            _word("Hund", 400, 62),
            _word("cat", 50, 100),
            _word("Katze", 400, 102),
        ]

        cells, cols_meta = build_grid_from_words(words, img_w=600, img_h=200)

        assert len(cols_meta) == 2
        assert len(cells) == 6  # 3 rows × 2 cols
        # Check cell_id format
        cell_ids = {c['cell_id'] for c in cells}
        assert 'R00_C0' in cell_ids
        assert 'R00_C1' in cell_ids

    def test_single_column_freetext(self):
        """Single-column text → 1 column, multiple rows."""
        words = [
            _word("Hello", 50, 20),
            _word("world", 120, 22),
            _word("Second", 50, 60),
            _word("line", 120, 62),
        ]

        cells, cols_meta = build_grid_from_words(words, img_w=300, img_h=150)

        assert len(cols_meta) == 1
        assert cols_meta[0]['type'] == 'column_text'
        assert len(cells) == 2  # 2 rows, 1 column each

    def test_empty_input(self):
        """No words → no cells and no column metadata."""
        cells, cols = build_grid_from_words([], img_w=500, img_h=500)

        assert cells == []
        assert cols == []

    def test_low_confidence_filtered(self):
        """Words below min_confidence are excluded."""
        words = [
            _word("good", 50, 20, conf=90),
            _word("bad", 200, 20, conf=10),
        ]

        cells, cols = build_grid_from_words(words, img_w=400, img_h=100, min_confidence=30)

        # Only the good word should produce a cell
        assert len(cells) == 1
        assert cells[0]['text'] == 'good'

    def test_bbox_pct_correct(self):
        """Check that bbox_pct is correctly computed from pixel coords."""
        words = [_word("test", 200, 100, width=100, height=30)]

        cells, _ = build_grid_from_words(words, img_w=1000, img_h=500)

        assert len(cells) == 1
        bp = cells[0]['bbox_pct']
        # The percentages are computed floats; compare with a tolerance so the
        # test does not depend on the exact rounding of the division/multiply.
        assert bp['x'] == pytest.approx(20.0)  # 200/1000*100
        assert bp['y'] == pytest.approx(20.0)  # 100/500*100
        assert bp['w'] == pytest.approx(10.0)  # 100/1000*100
        assert bp['h'] == pytest.approx(6.0)   # 30/500*100

    def test_columns_meta_format(self):
        """columns_meta has same keys as build_cell_grid_v2 output."""
        words = [
            _word("a", 50, 20),
            _word("b", 400, 20),
        ]

        _, cols_meta = build_grid_from_words(words, img_w=600, img_h=100)

        # Guard against a vacuous pass: the key checks below run once per
        # column, so an empty list would otherwise satisfy the test.
        assert cols_meta, "expected at least one column for non-empty input"
        for col in cols_meta:
            for key in ('index', 'type', 'x', 'width'):
                assert key in col

    def test_word_boxes_included(self):
        """Each cell should expose its source words under ``word_boxes``."""
        words = [
            _word("hello", 50, 20),
            _word("world", 120, 22),
        ]

        cells, _ = build_grid_from_words(words, img_w=300, img_h=100)

        assert len(cells) == 1  # single row, single column
        wb = cells[0].get('word_boxes', [])
        assert len(wb) == 2
        for w in wb:
            for key in ('left', 'top', 'text'):
                assert key in w

    def test_all_whitespace_filtered(self):
        """Words with only whitespace text are filtered out."""
        words = [
            _word(" ", 50, 20, conf=90),
            _word("hello", 200, 20, conf=90),
        ]

        cells, _ = build_grid_from_words(words, img_w=400, img_h=100)

        assert len(cells) == 1
        assert cells[0]['text'] == 'hello'
|