feat: Words-First Grid Builder (bottom-up alternative zu cell_grid_v2)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 54s
CI / test-go-edu-search (push) Successful in 47s
CI / test-python-klausur (push) Failing after 2m31s
CI / test-python-agent-core (push) Successful in 23s
CI / test-nodejs-website (push) Successful in 32s

Neuer Algorithmus in cv_words_first.py: Clustert Tesseract word_boxes
direkt zu Spalten (X-Gap) und Zeilen (Y-Proximity), baut Zellen an
Schnittpunkten. Keine Spalten-/Zeilenerkennung noetig.

- cv_words_first.py: _cluster_columns, _cluster_rows, _build_cells, build_grid_from_words
- ocr_pipeline_api.py: grid_method Parameter (v2|words_first) im /words Endpoint
- StepWordRecognition.tsx: Dropdown Toggle fuer Grid-Methode
- OCR-Pipeline.md: Doku v4.3.0 mit Words-First Algorithmus
- 15 Unit-Tests fuer cv_words_first

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-12 06:46:05 +01:00
parent 2fdf3ff868
commit ced5bb3dd3
6 changed files with 854 additions and 34 deletions

View File

@@ -0,0 +1,214 @@
"""Tests for cv_words_first.py — Words-First Grid Builder."""
import pytest
from cv_words_first import (
_assign_word_to_column,
_assign_word_to_row,
_build_cells,
_cluster_columns,
_cluster_rows,
build_grid_from_words,
)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _word(text: str, left: int, top: int, width: int = 60, height: int = 20, conf: int = 90):
"""Create a synthetic word dict."""
return {
'text': text,
'left': left,
'top': top,
'width': width,
'height': height,
'conf': conf,
}
# ---------------------------------------------------------------------------
# _cluster_columns
# ---------------------------------------------------------------------------
class TestClusterColumns:
    """Tests for _cluster_columns (grouping word boxes into columns)."""

    def test_single_column_freetext(self):
        """Evenly spread words yield exactly one column of type 'column_text'."""
        layout = [
            ("Hello", 50, 10),
            ("world", 120, 10),
            ("this", 50, 40),
            ("is", 120, 40),
            ("text", 190, 40),
        ]
        boxes = [_word(text, x, y) for text, x, y in layout]

        columns = _cluster_columns(boxes, img_w=400)

        assert len(columns) == 1
        assert columns[0]['type'] == 'column_text'

    def test_two_columns(self):
        """Two word groups separated by a large X-gap yield two columns."""
        layout = [
            ("apple", 20, 10),
            ("Apfel", 300, 10),
            ("dog", 20, 40),
            ("Hund", 300, 40),
        ]
        boxes = [_word(text, x, y) for text, x, y in layout]

        columns = _cluster_columns(boxes, img_w=500)

        assert len(columns) == 2
        first, second = columns
        assert first['type'] == 'column_1'
        assert second['type'] == 'column_2'

    def test_three_columns(self):
        """Three word groups separated by wide gaps yield three columns."""
        boxes = []
        # Each line: narrow index marker, then two regular-width words.
        for marker, english, german, y in (
            ("1", "apple", "Apfel", 10),
            ("2", "dog", "Hund", 40),
        ):
            boxes.append(_word(marker, 10, y, width=20))
            boxes.append(_word(english, 100, y))
            boxes.append(_word(german, 400, y))

        columns = _cluster_columns(boxes, img_w=600)

        assert len(columns) == 3

    def test_empty_words(self):
        """An empty word list produces an empty column list."""
        columns = _cluster_columns([], img_w=500)
        assert columns == []
# ---------------------------------------------------------------------------
# _cluster_rows
# ---------------------------------------------------------------------------
class TestClusterRows:
    """Tests for _cluster_rows (grouping word boxes into rows)."""

    def test_two_rows(self):
        """Words at two distinct Y-levels yield two rows, ordered by y_min."""
        layout = [
            ("hello", 10, 20),
            ("world", 100, 25),
            ("foo", 10, 80),
            ("bar", 100, 82),
        ]
        boxes = [_word(text, x, y) for text, x, y in layout]

        row_clusters = _cluster_rows(boxes)

        assert len(row_clusters) == 2
        upper, lower = row_clusters[0], row_clusters[1]
        assert upper['y_min'] < lower['y_min']

    def test_single_row(self):
        """Words at (nearly) the same Y yield a single row."""
        layout = [("a", 10, 50), ("b", 80, 52), ("c", 150, 51)]
        boxes = [_word(text, x, y) for text, x, y in layout]

        row_clusters = _cluster_rows(boxes)

        assert len(row_clusters) == 1

    def test_empty(self):
        """An empty word list produces an empty row list."""
        row_clusters = _cluster_rows([])
        assert row_clusters == []
# ---------------------------------------------------------------------------
# build_grid_from_words (integration)
# ---------------------------------------------------------------------------
class TestBuildGridFromWords:
    """Integration tests for build_grid_from_words.

    Each test feeds synthetic word boxes into build_grid_from_words and
    checks the returned ``(cells, columns_meta)`` pair.
    """

    def test_two_column_vocab(self):
        """Simulate a 2-column vocabulary page with 3 rows."""
        words = [
            _word("apple", 50, 20),
            _word("Apfel", 400, 22),
            _word("dog", 50, 60),
            _word("Hund", 400, 62),
            _word("cat", 50, 100),
            _word("Katze", 400, 102),
        ]
        cells, cols_meta = build_grid_from_words(words, img_w=600, img_h=200)
        assert len(cols_meta) == 2
        assert len(cells) == 6  # 3 rows × 2 cols
        # Check cell_id format
        cell_ids = {c['cell_id'] for c in cells}
        assert 'R00_C0' in cell_ids
        assert 'R00_C1' in cell_ids

    def test_single_column_freetext(self):
        """Single-column text → 1 column, multiple rows."""
        words = [
            _word("Hello", 50, 20),
            _word("world", 120, 22),
            _word("Second", 50, 60),
            _word("line", 120, 62),
        ]
        cells, cols_meta = build_grid_from_words(words, img_w=300, img_h=150)
        assert len(cols_meta) == 1
        assert cols_meta[0]['type'] == 'column_text'
        assert len(cells) == 2  # 2 rows, 1 column each

    def test_empty_input(self):
        """No words → no cells and no columns."""
        cells, cols = build_grid_from_words([], img_w=500, img_h=500)
        assert cells == []
        assert cols == []

    def test_low_confidence_filtered(self):
        """Words below min_confidence are excluded."""
        words = [
            _word("good", 50, 20, conf=90),
            _word("bad", 200, 20, conf=10),
        ]
        # Column metadata is irrelevant here, so it is discarded.
        cells, _ = build_grid_from_words(words, img_w=400, img_h=100, min_confidence=30)
        # Only the good word should produce a cell
        assert len(cells) == 1
        assert cells[0]['text'] == 'good'

    def test_bbox_pct_correct(self):
        """Check that bbox_pct is correctly computed from pixel coords."""
        words = [_word("test", 200, 100, width=100, height=30)]
        cells, _ = build_grid_from_words(words, img_w=1000, img_h=500)
        assert len(cells) == 1
        bp = cells[0]['bbox_pct']
        # Percentages are derived via float division; compare with a tolerance
        # instead of exact equality so rounding noise cannot break the test.
        assert bp['x'] == pytest.approx(20.0)  # 200/1000*100
        assert bp['y'] == pytest.approx(20.0)  # 100/500*100
        assert bp['w'] == pytest.approx(10.0)  # 100/1000*100
        assert bp['h'] == pytest.approx(6.0)   # 30/500*100

    def test_columns_meta_format(self):
        """Every columns_meta entry carries 'index', 'type', 'x' and 'width'.

        Per the original test description these are meant to match the keys
        of build_cell_grid_v2 output (not verifiable from this file).
        """
        words = [
            _word("a", 50, 20),
            _word("b", 400, 20),
        ]
        _, cols_meta = build_grid_from_words(words, img_w=600, img_h=100)
        # Guard against a vacuous pass: the loop below checks nothing when
        # no column metadata is returned.
        assert cols_meta
        for col in cols_meta:
            for key in ('index', 'type', 'x', 'width'):
                assert key in col

    def test_word_boxes_included(self):
        """Each cell should contain word_boxes with 'left', 'top' and 'text'."""
        words = [
            _word("hello", 50, 20),
            _word("world", 120, 22),
        ]
        cells, _ = build_grid_from_words(words, img_w=300, img_h=100)
        assert len(cells) == 1  # single row, single column
        wb = cells[0].get('word_boxes', [])
        assert len(wb) == 2
        for w in wb:
            assert 'left' in w
            assert 'top' in w
            assert 'text' in w

    def test_all_whitespace_filtered(self):
        """Words with only whitespace text are filtered out."""
        words = [
            _word(" ", 50, 20, conf=90),
            _word("hello", 200, 20, conf=90),
        ]
        cells, _ = build_grid_from_words(words, img_w=400, img_h=100)
        assert len(cells) == 1
        assert cells[0]['text'] == 'hello'