Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m50s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 18s
Thin black lines (1-5px) at page edges from scanning were incorrectly detected as content, shifting content bounds and creating spurious IGNORE columns. This filters narrow projection runs (<1% of image dimension) and introduces explicit margin_left/margin_right regions for downstream page reconstruction. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
998 lines
38 KiB
Python
998 lines
38 KiB
Python
"""
|
||
Unit Tests for CV Vocab Pipeline (cv_vocab_pipeline.py)
|
||
|
||
Tests cover:
|
||
- Data classes (PageRegion, VocabRow, PipelineResult)
|
||
- Stage 2: Deskew image
|
||
- Stage 3: Dewarp (pass-through)
|
||
- Stage 4: Image preparation (OCR + Layout images)
|
||
- Stage 5: Layout analysis (content bounds, projection profiles, column detection)
|
||
- Stage 6: Multi-pass OCR region handling
|
||
- Stage 7: Line grouping and vocabulary matching
|
||
- Noise filter functions (_is_noise_tail_token, _clean_cell_text)
|
||
- Phonetic detection (_is_phonetic_only_text)
|
||
- Phonetic & continuation row merging
|
||
- Orchestrator (run_cv_pipeline)
|
||
|
||
DSGVO Note: All tests run locally with synthetic data. No external API calls.
|
||
"""
|
||
|
||
import pytest
|
||
import numpy as np
|
||
from unittest.mock import AsyncMock, MagicMock, patch, PropertyMock
|
||
from dataclasses import asdict
|
||
|
||
# Import module under test
|
||
from cv_vocab_pipeline import (
|
||
PageRegion,
|
||
VocabRow,
|
||
PipelineResult,
|
||
deskew_image,
|
||
dewarp_image,
|
||
create_ocr_image,
|
||
create_layout_image,
|
||
_find_content_bounds,
|
||
_filter_narrow_runs,
|
||
_build_margin_regions,
|
||
analyze_layout,
|
||
_group_words_into_lines,
|
||
match_lines_to_vocab,
|
||
run_cv_pipeline,
|
||
CV2_AVAILABLE,
|
||
TESSERACT_AVAILABLE,
|
||
CV_PIPELINE_AVAILABLE,
|
||
_is_noise_tail_token,
|
||
_clean_cell_text,
|
||
_is_phonetic_only_text,
|
||
_merge_phonetic_continuation_rows,
|
||
_merge_continuation_rows,
|
||
)
|
||
|
||
|
||
# =============================================
|
||
# FIXTURES
|
||
# =============================================
|
||
|
||
@pytest.fixture
def white_image():
    """A plain all-white BGR test image, 200 rows x 300 columns."""
    return np.full((200, 300, 3), 255, dtype=np.uint8)
|
||
|
||
|
||
@pytest.fixture
def text_like_image():
    """A 400x600 white BGR image with dark text-like rows in 3 columns.

    Column x-spans (EN / DE / Example) are separated by white gaps at
    x=170..210 and x=370..410 so that projection-profile column
    detection has clear valleys to find.
    """
    img = np.full((400, 600, 3), 255, dtype=np.uint8)
    # (x_start, x_end) for the EN, DE and Example columns.
    column_spans = [(30, 160), (220, 360), (420, 570)]
    for x0, x1 in column_spans:
        # Dark 15px-tall text rows every 30px.
        for row_top in range(50, 350, 30):
            img[row_top:row_top + 15, x0:x1, :] = 30
    return img
|
||
|
||
|
||
@pytest.fixture
def binary_image():
    """A single-channel binary image for OCR tests.

    White (255) background with black (0) text-like bands every 30px.
    """
    canvas = np.full((400, 600), 255, dtype=np.uint8)
    for band_top in range(50, 350, 30):
        canvas[band_top:band_top + 15, 30:570] = 0
    return canvas
|
||
|
||
|
||
@pytest.fixture
def sample_words_column_en():
    """Sample OCR word dicts for the English column."""
    def en_word(text, top, conf):
        # All EN words share left/width/height; only text, y and conf vary.
        return {'text': text, 'left': 30, 'top': top, 'width': 80,
                'height': 15, 'conf': conf, 'region_type': 'column_en'}
    return [
        en_word('achieve', 50, 90),
        en_word('improve', 80, 85),
        en_word('success', 110, 92),
    ]
|
||
|
||
|
||
@pytest.fixture
def sample_words_column_de():
    """Sample OCR word dicts for the German column.

    The `top` values are offset by 2px from the EN column to exercise
    the y-tolerance matching in match_lines_to_vocab.
    """
    def de_word(text, top, conf):
        return {'text': text, 'left': 220, 'top': top, 'width': 100,
                'height': 15, 'conf': conf, 'region_type': 'column_de'}
    return [
        de_word('erreichen', 52, 88),
        de_word('verbessern', 82, 80),
        de_word('Erfolg', 112, 95),
    ]
|
||
|
||
|
||
@pytest.fixture
def sample_words_column_ex():
    """Sample OCR word dicts for the Example column (one sentence, one line)."""
    def ex_word(text, left, top, width, conf):
        return {'text': text, 'left': left, 'top': top, 'width': width,
                'height': 15, 'conf': conf, 'region_type': 'column_example'}
    return [
        ex_word('She', 420, 50, 30, 85),
        ex_word('achieved', 455, 50, 70, 80),
        ex_word('her', 530, 50, 30, 90),
        ex_word('goals.', 420, 52, 50, 75),
    ]
|
||
|
||
|
||
@pytest.fixture
def sample_regions():
    """A 3-column PageRegion layout matching the synthetic word fixtures."""
    # (type, x, width); y and height are shared by all three columns.
    specs = [
        ('column_en', 0, 190),
        ('column_de', 210, 160),
        ('column_example', 410, 190),
    ]
    return [PageRegion(type=t, x=x, y=50, width=w, height=300)
            for t, x, w in specs]
|
||
|
||
|
||
# =============================================
|
||
# DATA CLASS TESTS
|
||
# =============================================
|
||
|
||
class TestDataClasses:
    """Verify field round-trips and defaults of the pipeline data classes."""

    def test_page_region_creation(self):
        pr = PageRegion(type='column_en', x=10, y=20, width=100, height=200)
        assert pr.type == 'column_en'
        assert (pr.x, pr.y) == (10, 20)
        assert (pr.width, pr.height) == (100, 200)

    def test_vocab_row_defaults(self):
        # A bare VocabRow must start fully empty.
        empty_row = VocabRow()
        assert empty_row.english == ""
        assert empty_row.german == ""
        assert empty_row.example == ""
        assert empty_row.confidence == 0.0
        assert empty_row.y_position == 0

    def test_vocab_row_with_values(self):
        filled = VocabRow(english="test", german="Test", example="A test.",
                          confidence=85.5, y_position=100)
        assert filled.english == "test"
        assert filled.german == "Test"
        assert filled.confidence == 85.5

    def test_pipeline_result_defaults(self):
        fresh = PipelineResult()
        assert fresh.vocabulary == []
        assert fresh.word_count == 0
        assert fresh.columns_detected == 0
        assert fresh.duration_seconds == 0.0
        assert fresh.stages == {}
        assert fresh.error is None

    def test_pipeline_result_error(self):
        failed = PipelineResult(error="Something went wrong")
        assert failed.error == "Something went wrong"
|
||
|
||
|
||
# =============================================
|
||
# STAGE 2: DESKEW TESTS
|
||
# =============================================
|
||
|
||
@pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available")
class TestDeskew:
    """Deskew stage: rotation correction must be a near no-op on straight input."""

    def test_deskew_straight_image(self, white_image):
        """A perfectly straight image should not be rotated."""
        rotated, detected_angle = deskew_image(white_image)
        assert abs(detected_angle) < 0.1
        assert rotated.shape == white_image.shape

    def test_deskew_returns_tuple(self, white_image):
        """deskew_image must return an (image, angle) pair."""
        outcome = deskew_image(white_image)
        assert isinstance(outcome, tuple)
        assert len(outcome) == 2
        img_part, angle_part = outcome
        assert isinstance(img_part, np.ndarray)
        assert isinstance(angle_part, float)

    def test_deskew_preserves_shape(self, text_like_image):
        """Output image should keep the input's shape."""
        rotated, _angle = deskew_image(text_like_image)
        assert rotated.shape == text_like_image.shape
|
||
|
||
|
||
# =============================================
|
||
# STAGE 3: DEWARP TESTS
|
||
# =============================================
|
||
|
||
@pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available")
class TestDewarp:
    """Dewarp stage: returns (image, info) with a 'shear_degrees' entry."""

    def test_dewarp_returns_tuple(self, white_image):
        """dewarp_image must return (image, dewarp_info) tuple."""
        outcome = dewarp_image(white_image)
        assert isinstance(outcome, tuple)
        assert len(outcome) == 2
        dewarped, details = outcome
        assert isinstance(dewarped, np.ndarray)
        assert isinstance(details, dict)
        assert "shear_degrees" in details

    def test_dewarp_preserves_shape(self, text_like_image):
        """Output image should have same shape as input."""
        dewarped, _details = dewarp_image(text_like_image)
        assert dewarped.shape == text_like_image.shape

    def test_dewarp_white_image_no_correction(self, white_image):
        """A uniform white image should get no shear correction."""
        dewarped, details = dewarp_image(white_image)
        assert abs(details["shear_degrees"]) < 0.5
        assert dewarped.shape == white_image.shape
|
||
|
||
|
||
# =============================================
|
||
# STAGE 4: IMAGE PREPARATION TESTS
|
||
# =============================================
|
||
|
||
@pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available")
class TestImagePreparation:
    """Test OCR and layout image creation.

    Stage 4 derives two single-channel images from the BGR input: a
    binarized one for Tesseract (create_ocr_image) and an enhanced one
    for layout analysis (create_layout_image).
    """

    def test_create_ocr_image_returns_grayscale(self, text_like_image):
        """OCR image should be single-channel (binarized)."""
        ocr_img = create_ocr_image(text_like_image)
        assert len(ocr_img.shape) == 2  # Single channel
        assert ocr_img.dtype == np.uint8

    def test_create_ocr_image_is_binary(self, text_like_image):
        """OCR image should contain only 0 and 255 values."""
        ocr_img = create_ocr_image(text_like_image)
        unique_vals = np.unique(ocr_img)
        assert all(v in [0, 255] for v in unique_vals)

    def test_create_layout_image_returns_grayscale(self, text_like_image):
        """Layout image should be single-channel (CLAHE enhanced)."""
        layout_img = create_layout_image(text_like_image)
        assert len(layout_img.shape) == 2
        assert layout_img.dtype == np.uint8

    def test_create_layout_image_enhanced_contrast(self, text_like_image):
        """Layout image should have different histogram than simple grayscale."""
        import cv2
        gray = cv2.cvtColor(text_like_image, cv2.COLOR_BGR2GRAY)
        layout_img = create_layout_image(text_like_image)
        # CLAHE should change the histogram
        # NOTE(review): only shape equality is asserted below, so the
        # histogram-change claim in the docstring is not actually verified;
        # consider comparing np.histogram(gray) vs np.histogram(layout_img)
        # if CLAHE output is guaranteed to differ on this fixture.
        assert layout_img.shape == gray.shape
|
||
|
||
|
||
# =============================================
|
||
# STAGE 5: LAYOUT ANALYSIS TESTS
|
||
# =============================================
|
||
|
||
@pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available")
class TestContentBounds:
    """Test _find_content_bounds helper.

    _find_content_bounds receives an *inverted* binary image (content is
    255 on a 0 background) and returns (left, right, top, bottom) pixel
    bounds of the detected content area.
    """

    def test_empty_image(self):
        """Fully white (inverted = black) image should return full bounds."""
        inv = np.zeros((200, 300), dtype=np.uint8)
        left, right, top, bottom = _find_content_bounds(inv)
        # With no content, bounds should span the image
        # NOTE(review): these assertions hold for any in-range values and
        # so only guard against out-of-bounds results, not against a wrong
        # (e.g. collapsed) bounding box — consider asserting the exact
        # full-span values if the contract guarantees them.
        assert left >= 0
        assert right <= 300
        assert top >= 0
        assert bottom <= 200

    def test_centered_content(self):
        """Content in center should give tight bounds."""
        inv = np.zeros((400, 600), dtype=np.uint8)
        # Add content block in center
        inv[100:300, 50:550] = 255
        left, right, top, bottom = _find_content_bounds(inv)
        assert left <= 52  # ~50 with 2px margin
        assert right >= 548  # ~550 with 2px margin
        assert top <= 102
        assert bottom >= 298
|
||
|
||
|
||
@pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available")
class TestLayoutAnalysis:
    """analyze_layout: column detection on synthetic page images."""

    @staticmethod
    def _run_layout(bgr_img):
        # Single place that builds both stage-4 inputs and runs stage 5.
        binarized = create_ocr_image(bgr_img)
        enhanced = create_layout_image(bgr_img)
        return analyze_layout(enhanced, binarized)

    def test_returns_list_of_regions(self, text_like_image):
        """analyze_layout should return a list of PageRegion."""
        regions = self._run_layout(text_like_image)
        assert isinstance(regions, list)
        assert all(isinstance(r, PageRegion) for r in regions)

    def test_detects_columns(self, text_like_image):
        """With clear 3-column image, should detect at least 1 column."""
        columns = [r for r in self._run_layout(text_like_image)
                   if r.type.startswith('column')]
        assert len(columns) >= 1

    def test_single_column_fallback(self):
        """Image with no clear columns should fall back to single column."""
        # Uniform full-width text rows — no projection valleys.
        page = np.full((400, 600, 3), 255, dtype=np.uint8)
        for top in range(50, 350, 20):
            page[top:top + 10, 20:580, :] = 30
        columns = [r for r in self._run_layout(page)
                   if r.type.startswith('column')]
        # Full-page fallback guarantees at least one column region.
        assert len(columns) >= 1

    def test_region_types_are_valid(self, text_like_image):
        """All region types should be from the expected set."""
        allowed = {'column_en', 'column_de', 'column_example', 'header', 'footer'}
        for region in self._run_layout(text_like_image):
            assert region.type in allowed, f"Unexpected region type: {region.type}"
|
||
|
||
|
||
# =============================================
|
||
# STAGE 7: LINE GROUPING TESTS
|
||
# =============================================
|
||
|
||
class TestLineGrouping:
    """_group_words_into_lines: cluster OCR word dicts into lines by Y."""

    @staticmethod
    def _word(text, left, top, conf):
        # All words in this class share width/height.
        return {'text': text, 'left': left, 'top': top,
                'width': 50, 'height': 15, 'conf': conf}

    def test_empty_input(self):
        """Empty word list should return empty lines."""
        assert _group_words_into_lines([]) == []

    def test_single_word(self):
        """Single word should return one line with one word."""
        lines = _group_words_into_lines([self._word('hello', 10, 50, 90)])
        assert len(lines) == 1
        assert len(lines[0]) == 1
        assert lines[0][0]['text'] == 'hello'

    def test_words_on_same_line(self):
        """Words close in Y should be grouped into one line."""
        nearby = [self._word('hello', 10, 50, 90),
                  self._word('world', 70, 52, 85)]
        lines = _group_words_into_lines(nearby, y_tolerance_px=10)
        assert len(lines) == 1
        assert len(lines[0]) == 2

    def test_words_on_different_lines(self):
        """Words far apart in Y should be on different lines."""
        spread = [self._word('line1', 10, 50, 90),
                  self._word('line2', 10, 100, 85),
                  self._word('line3', 10, 150, 88)]
        assert len(_group_words_into_lines(spread, y_tolerance_px=20)) == 3

    def test_words_sorted_by_x_within_line(self):
        """Words within a line should be sorted by X position."""
        # Deliberately supplied right-to-left.
        reversed_order = [self._word('world', 100, 50, 85),
                          self._word('hello', 10, 52, 90)]
        lines = _group_words_into_lines(reversed_order, y_tolerance_px=10)
        assert len(lines) == 1
        assert [w['text'] for w in lines[0]] == ['hello', 'world']
|
||
|
||
|
||
# =============================================
|
||
# STAGE 7: VOCABULARY MATCHING TESTS
|
||
# =============================================
|
||
|
||
class TestVocabMatching:
    """match_lines_to_vocab: Y-coordinate matching of per-column OCR words."""

    def test_empty_results(self, sample_regions):
        """Empty OCR results should return empty vocab."""
        assert match_lines_to_vocab({}, sample_regions) == []

    def test_en_only(self, sample_words_column_en, sample_regions):
        """Only EN words should create entries with empty DE/example."""
        rows = match_lines_to_vocab({'column_en': sample_words_column_en},
                                    sample_regions)
        assert len(rows) == 3
        for entry in rows:
            assert entry.english != ""
            assert entry.german == ""

    def test_en_de_matching(self, sample_words_column_en, sample_words_column_de, sample_regions):
        """EN and DE words on same Y should be matched."""
        rows = match_lines_to_vocab(
            {'column_en': sample_words_column_en,
             'column_de': sample_words_column_de},
            sample_regions, y_tolerance_px=25)
        assert len(rows) == 3
        # First entry should match achieve <-> erreichen
        assert rows[0].english == 'achieve'
        assert rows[0].german == 'erreichen'

    def test_full_3_column_matching(self, sample_words_column_en, sample_words_column_de,
                                    sample_words_column_ex, sample_regions):
        """All 3 columns should be matched by Y coordinate."""
        rows = match_lines_to_vocab(
            {'column_en': sample_words_column_en,
             'column_de': sample_words_column_de,
             'column_example': sample_words_column_ex},
            sample_regions, y_tolerance_px=25)
        assert len(rows) >= 1
        # First entry should carry the example sentence text.
        assert rows[0].english == 'achieve'
        assert rows[0].example != ""

    def test_sorted_by_y_position(self, sample_words_column_en, sample_regions):
        """Result should be sorted by Y position."""
        rows = match_lines_to_vocab({'column_en': sample_words_column_en},
                                    sample_regions)
        y_values = [entry.y_position for entry in rows]
        assert y_values == sorted(y_values)

    def test_skips_short_entries(self, sample_regions):
        """Very short text (< 2 chars) should be skipped."""
        words = [
            {'text': 'a', 'left': 30, 'top': 50, 'width': 10, 'height': 15, 'conf': 90, 'region_type': 'column_en'},
            {'text': 'valid', 'left': 30, 'top': 80, 'width': 50, 'height': 15, 'conf': 90, 'region_type': 'column_en'},
        ]
        rows = match_lines_to_vocab({'column_en': words}, sample_regions)
        assert len(rows) == 1
        assert rows[0].english == 'valid'

    def test_confidence_calculation(self, sample_words_column_en, sample_words_column_de, sample_regions):
        """Confidence should be the average of matched columns."""
        rows = match_lines_to_vocab(
            {'column_en': sample_words_column_en,
             'column_de': sample_words_column_de},
            sample_regions, y_tolerance_px=25)
        # First entry: EN conf=90, DE conf=88 → avg=89
        assert rows[0].confidence > 0
        assert rows[0].confidence == pytest.approx(89.0, abs=1.0)
|
||
|
||
|
||
# =============================================
|
||
# ORCHESTRATOR TESTS
|
||
# =============================================
|
||
|
||
class TestOrchestrator:
    """Test run_cv_pipeline orchestrator.

    These tests drive the async entry point with either no input, a
    disabled pipeline flag, or a small synthetic PNG. pytesseract is
    patched to return empty OCR results, so no real OCR runs and no
    data leaves the process.
    """

    @pytest.mark.asyncio
    async def test_no_input_returns_error(self):
        """Pipeline without input should return error."""
        result = await run_cv_pipeline()
        assert result.error is not None
        assert "No input data" in result.error

    @pytest.mark.asyncio
    async def test_pipeline_unavailable(self):
        """When CV_PIPELINE_AVAILABLE is False, should return error."""
        # Patch the module-level availability flag so the guard path runs.
        with patch('cv_vocab_pipeline.CV_PIPELINE_AVAILABLE', False):
            result = await run_cv_pipeline(pdf_data=b"fake")
            assert result.error is not None
            assert "not available" in result.error

    @pytest.mark.asyncio
    @pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available")
    async def test_pipeline_with_image_data(self):
        """Pipeline with a real synthetic image should run without errors."""
        import cv2
        # Create a simple test image (white with some text-like black bars)
        img = np.ones((200, 300, 3), dtype=np.uint8) * 255
        for y in range(30, 170, 25):
            img[y:y+12, 20:280, :] = 30
        _, img_bytes = cv2.imencode('.png', img)
        image_data = img_bytes.tobytes()

        with patch('cv_vocab_pipeline.pytesseract') as mock_tess:
            # Mock Tesseract to return empty results
            mock_tess.image_to_data.return_value = {
                'text': [], 'conf': [], 'left': [], 'top': [],
                'width': [], 'height': [],
            }
            mock_tess.Output.DICT = 'dict'

            result = await run_cv_pipeline(image_data=image_data)
            assert result.error is None
            # Decoded dimensions must match the synthetic 300x200 PNG.
            assert result.image_width == 300
            assert result.image_height == 200
            # Early stages must have recorded timing entries.
            assert 'render' in result.stages
            assert 'deskew' in result.stages

    @pytest.mark.asyncio
    @pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available")
    async def test_pipeline_records_timing(self):
        """Pipeline should record timing for each stage."""
        import cv2
        img = np.ones((100, 150, 3), dtype=np.uint8) * 255
        _, img_bytes = cv2.imencode('.png', img)

        with patch('cv_vocab_pipeline.pytesseract') as mock_tess:
            mock_tess.image_to_data.return_value = {
                'text': [], 'conf': [], 'left': [], 'top': [],
                'width': [], 'height': [],
            }
            mock_tess.Output.DICT = 'dict'

            result = await run_cv_pipeline(image_data=img_bytes.tobytes())
            assert result.duration_seconds >= 0
            # Every recorded stage duration must be non-negative.
            assert all(v >= 0 for v in result.stages.values())

    @pytest.mark.asyncio
    async def test_pipeline_result_format(self):
        """PipelineResult vocabulary should be list of dicts with expected keys."""
        result = PipelineResult()
        result.vocabulary = [
            {"english": "test", "german": "Test", "example": "A test.", "confidence": 90.0}
        ]
        assert len(result.vocabulary) == 1
        entry = result.vocabulary[0]
        assert "english" in entry
        assert "german" in entry
        assert "example" in entry
        assert "confidence" in entry
|
||
|
||
|
||
# =============================================
|
||
# INTEGRATION-STYLE TESTS (with mocked Tesseract)
|
||
# =============================================
|
||
|
||
@pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available")
class TestStageIntegration:
    """Chain adjacent stages together (unit-level; OCR itself never runs)."""

    def test_image_prep_to_layout(self, text_like_image):
        """Stages 4→5: image prep feeds layout analysis correctly."""
        binarized = create_ocr_image(text_like_image)
        enhanced = create_layout_image(text_like_image)

        assert binarized.shape[:2] == text_like_image.shape[:2]
        assert enhanced.shape[:2] == text_like_image.shape[:2]

        detected = analyze_layout(enhanced, binarized)
        assert len(detected) >= 1

    def test_deskew_to_image_prep(self, text_like_image):
        """Stages 2→4: deskew output can be processed by image prep."""
        straightened, _angle = deskew_image(text_like_image)
        binarized = create_ocr_image(straightened)
        enhanced = create_layout_image(straightened)
        assert binarized.shape[:2] == straightened.shape[:2]
        assert enhanced.shape[:2] == straightened.shape[:2]
|
||
|
||
|
||
# =============================================
|
||
# NOISE FILTER TESTS
|
||
# =============================================
|
||
|
||
class TestNoiseFilter:
    """Test _is_noise_tail_token for trailing OCR noise detection.

    The classifier must KEEP legitimate dictionary artifacts (hyphenated
    compounds, parenthesized alternatives, phonetic brackets, trailing
    punctuation, abbreviations, short real words) and FILTER only genuine
    OCR junk (non-alpha fragments, isolated letters, empty strings).
    """

    # --- Tokens that should be KEPT (return False) ---

    @pytest.mark.parametrize("token", [
        # Compound words with hyphens
        "money-saver",
        "under-",
        "well-known",
        # Words with parenthesized parts (dictionary entries)
        "Schild(chen)",
        "(Salat-)Gurke",
        "(auf)",
        "(on)",
        "selbst)",
        "(wir",
        "Tanz(veranstaltung)",
        "(zer)brechen",
        # Phonetic brackets
        "serva]",
        "['mani",
        "[eg]",
        "[maus]",
        # Words with trailing punctuation
        "cupcakes.",
        "sister.",
        "mice",
        # Abbreviations
        "e.g.",
        "sth.",
        "usw.",
        "adj.",
        # Ellipsis
        "...",
        "\u2026",
        # Regular words
        "the",
        "cat",
        "big",
        "run",
        "set",
        "ago",
    ])
    def test_keep_real_tokens(self, token):
        """Real words, dictionary punctuation, and phonetic brackets are kept."""
        assert _is_noise_tail_token(token) is False, f"Should keep {token!r}"

    # --- Tokens that should be FILTERED (return True) ---

    @pytest.mark.parametrize("token", [
        # Pure non-alpha
        "B|",
        "3d",
        "x7",
        ")",
        "|",
        "@",
        "3",
        # Very short non-dictionary fragments
        "ee",
        "k",
        "zz",
        "qq",
        # Empty
        "",
        " ",
    ])
    def test_filter_noise_tokens(self, token):
        """OCR noise fragments are filtered."""
        assert _is_noise_tail_token(token) is True, f"Should filter {token!r}"
|
||
|
||
|
||
class TestCleanCellText:
    """_clean_cell_text integration: raw cell text in, cleaned text out."""

    def test_empty_returns_empty(self):
        assert _clean_cell_text("") == ""
        assert _clean_cell_text(" ") == ""

    def test_real_word_unchanged(self):
        assert _clean_cell_text("cupcakes") == "cupcakes"

    def test_strips_trailing_noise(self):
        """A trailing noise token is stripped from the cell."""
        assert _clean_cell_text("cupcakes B|") == "cupcakes"

    def test_keeps_trailing_real_word(self):
        """A trailing real word survives cleaning."""
        assert _clean_cell_text("big cat") == "big cat"

    def test_abbreviation_kept(self):
        """Known abbreviations must not be cleared."""
        assert _clean_cell_text("e.g.") == "e.g."

    def test_pure_garbage_cleared(self):
        """OCR garbage with no real words collapses to an empty string."""
        assert _clean_cell_text("3d |x") == ""

    def test_compound_word_preserved(self):
        """Hyphenated compound words pass through untouched."""
        assert _clean_cell_text("money-saver") == "money-saver"

    def test_parenthesized_word_preserved(self):
        assert _clean_cell_text("(Salat-)Gurke") == "(Salat-)Gurke"

    def test_multiple_trailing_noise(self):
        """Several consecutive trailing noise tokens are all removed."""
        assert _clean_cell_text("achieve 3 |") == "achieve"
|
||
|
||
|
||
class TestPhoneticOnlyText:
    """Test _is_phonetic_only_text for phonetic transcription detection.

    Per the cases below: bracketed IPA-style transcriptions count as
    phonetic-only, while any text that retains a plain word (with or
    without an attached transcription) does not.
    """

    @pytest.mark.parametrize("text,expected", [
        # Phonetic-only patterns → True
        ("['mani serva]", True),
        ("[dɑːns]", True),
        ("[\"a:mand]", True),
        ("['wɜːkʃɒp]", True),
        # serva] has 5 alpha chars after bracket removal → NOT phonetic-only
        ("serva]", False),
        # NOT phonetic-only → False
        ("almond ['a:mand]", False),
        ("Mandel", False),
        ("cupcakes", False),
        ("", False),
        ("achieve", False),
        ("money-saver ['mani]", False),
    ])
    def test_phonetic_detection(self, text, expected):
        assert _is_phonetic_only_text(text) is expected, \
            f"_is_phonetic_only_text({text!r}) should be {expected}"
|
||
|
||
|
||
class TestMergePhoneticContinuationRows:
    """_merge_phonetic_continuation_rows: fold phonetic-only rows upward."""

    @staticmethod
    def _row(en, de, ex, idx=None):
        entry = {"english": en, "german": de, "example": ex}
        if idx is not None:
            entry["row_index"] = idx
        return entry

    def test_empty_list(self):
        assert _merge_phonetic_continuation_rows([]) == []

    def test_single_entry(self):
        merged = _merge_phonetic_continuation_rows(
            [self._row("cat", "Katze", "")])
        assert len(merged) == 1
        assert merged[0]["english"] == "cat"

    def test_merges_phonetic_row(self):
        """A phonetic-only row is appended to the previous entry's EN text."""
        merged = _merge_phonetic_continuation_rows([
            self._row("money-saver", "Sparfuchs", "", 0),
            self._row("['mani serva]", "", "", 1),
        ])
        assert len(merged) == 1
        assert merged[0]["english"] == "money-saver ['mani serva]"
        assert merged[0]["german"] == "Sparfuchs"

    def test_no_merge_when_de_present(self):
        """A row carrying DE text stays separate even if EN looks phonetic."""
        merged = _merge_phonetic_continuation_rows([
            self._row("cat", "Katze", ""),
            self._row("[kæt]", "some text", ""),
        ])
        assert len(merged) == 2

    def test_no_merge_regular_rows(self):
        """Ordinary vocab rows remain unmerged."""
        merged = _merge_phonetic_continuation_rows([
            self._row("cat", "Katze", ""),
            self._row("dog", "Hund", ""),
        ])
        assert len(merged) == 2

    def test_merges_example_too(self):
        """Example text on a phonetic row follows it into the previous entry."""
        merged = _merge_phonetic_continuation_rows([
            self._row("dance", "tanzen", "", 0),
            self._row("[dɑːns]", "", "Let's dance.", 1),
        ])
        assert len(merged) == 1
        assert merged[0]["english"] == "dance [dɑːns]"
        assert merged[0]["example"] == "Let's dance."
|
||
|
||
|
||
class TestMergeContinuationRows:
    """_merge_continuation_rows: join entries wrapped over multiple lines."""

    @staticmethod
    def _row(en, de, idx):
        return {"english": en, "german": de, "example": "", "row_index": idx}

    def test_empty_list(self):
        assert _merge_continuation_rows([]) == []

    def test_no_merge_independent_rows(self):
        """Rows that each carry both EN and DE stay separate."""
        merged = _merge_continuation_rows([
            self._row("cat", "Katze", 0),
            self._row("dog", "Hund", 1),
        ])
        assert len(merged) == 2

    def test_merge_lowercase_continuation(self):
        """Lowercase EN with empty DE folds into the previous entry."""
        merged = _merge_continuation_rows([
            self._row("to put up", "aufstellen", 0),
            self._row("with sth.", "", 1),
        ])
        assert len(merged) == 1
        assert merged[0]["english"] == "to put up with sth."
        assert merged[0]["german"] == "aufstellen"

    def test_no_merge_uppercase_start(self):
        """Uppercase EN with empty DE is its own entry, not a continuation."""
        merged = _merge_continuation_rows([
            self._row("cat", "Katze", 0),
            self._row("Dog", "", 1),
        ])
        assert len(merged) == 2

    def test_no_merge_when_previous_ends_with_period(self):
        """A sentence-terminated previous entry blocks merging."""
        merged = _merge_continuation_rows([
            self._row("That's great.", "Das ist toll.", 0),
            self._row("really nice", "", 1),
        ])
        assert len(merged) == 2

    def test_no_merge_long_text(self):
        """Four-plus words look like an example sentence, not a continuation."""
        merged = _merge_continuation_rows([
            self._row("achieve", "erreichen", 0),
            self._row("she achieved her goals", "", 1),
        ])
        assert len(merged) == 2

    def test_first_entry_not_merged(self):
        """An empty-DE first entry must not crash (there is no previous row)."""
        merged = _merge_continuation_rows([
            self._row("something", "", 0),
            self._row("cat", "Katze", 1),
        ])
        assert len(merged) == 2
|
||
|
||
|
||
# =============================================
# Test: Content-Bounds Scan-Artifact Filtering
# =============================================

class TestContentBoundsFiltering:
    """Test that _find_content_bounds filters narrow scan artifacts."""

    @staticmethod
    def _blank(h=400, w=600):
        # Inverted binary page: 0 = background, 255 = ink.
        return np.zeros((h, w), dtype=np.uint8)

    def test_thin_vertical_line_ignored(self):
        """A 2px black line at the left edge should not pull left_x leftward."""
        img = self._blank()
        img[50:350, 100:550] = 255  # main content block in the middle
        img[50:350, 5:7] = 255      # 2px thin vertical scan artifact at x=5..6
        left, right, top, bottom = _find_content_bounds(img)
        # left_x must be near 100 (the real content), not near 5
        assert left >= 90, f"left_x={left} should be >=90 (near real content, not artifact)"

    def test_thick_content_preserved(self):
        """A 50px wide text block is real content and must not be filtered."""
        img = self._blank()
        img[50:350, 80:130] = 255   # 50px wide block
        img[50:350, 200:500] = 255  # wider block
        left, right, top, bottom = _find_content_bounds(img)
        assert left <= 82, f"left_x={left} should be <=82 (50px block is real content)"

    def test_no_artifacts_unchanged(self):
        """Normal image without artifacts: bounds should match content."""
        img = self._blank()
        img[100:300, 50:550] = 255
        left, right, top, bottom = _find_content_bounds(img)
        assert left <= 52
        assert right >= 548
        assert top <= 105
        assert bottom >= 295

    def test_right_edge_artifact_ignored(self):
        """A thin vertical line at the right edge should not pull right_x rightward."""
        img = self._blank()
        img[50:350, 50:500] = 255    # real content
        img[50:350, 595:598] = 255   # 3px artifact at right edge
        left, right, top, bottom = _find_content_bounds(img)
        assert right <= 510, f"right_x={right} should be <=510, ignoring right-edge artifact"

    def test_horizontal_line_ignored(self):
        """A thin horizontal line at the top should not pull top_y upward."""
        img = self._blank()
        img[100:350, 50:550] = 255  # real content
        img[2:4, 50:550] = 255      # 2px horizontal artifact at top
        left, right, top, bottom = _find_content_bounds(img)
        assert top >= 90, f"top_y={top} should be >=90 (ignoring thin top line)"
class TestFilterNarrowRuns:
    """Exercise the _filter_narrow_runs helper on hand-built boolean masks."""

    def test_removes_short_run(self):
        mask = np.array([0, 1, 1, 0, 1, 1, 1, 1, 1, 0], dtype=bool)
        filtered = _filter_narrow_runs(mask, min_width=3)
        # The 2-wide run at indices 1-2 falls below min_width and is dropped...
        assert not filtered[1]
        assert not filtered[2]
        # ...while the 5-wide run at indices 4-8 survives.
        assert filtered[4]
        assert filtered[8]

    def test_keeps_wide_run(self):
        mask = np.ones(10, dtype=bool)
        filtered = _filter_narrow_runs(mask, min_width=5)
        assert all(filtered)

    def test_all_narrow(self):
        mask = np.array([1, 1, 0, 1, 0], dtype=bool)
        filtered = _filter_narrow_runs(mask, min_width=3)
        assert not any(filtered)
# =============================================
# Test: Margin Regions
# =============================================

class TestMarginRegions:
    """Test _build_margin_regions and margin integration."""

    def test_margin_left_created(self):
        """When left_x > 5, a margin_left region should be created."""
        columns = [
            PageRegion(type='column_en', x=100, y=50, width=200, height=300),
            PageRegion(type='column_de', x=320, y=50, width=200, height=300),
        ]
        regions = _build_margin_regions(columns, left_x=100, right_x=520,
                                        img_w=600, top_y=50, content_h=300)
        lefts = [r for r in regions if r.type == 'margin_left']
        assert len(lefts) == 1
        assert lefts[0].x == 0
        assert lefts[0].width == 100

    def test_margin_right_created(self):
        """When there's space after the last column, margin_right should be created."""
        columns = [
            PageRegion(type='column_en', x=50, y=50, width=200, height=300),
            PageRegion(type='column_de', x=260, y=50, width=200, height=300),
        ]
        # last column ends at 260 + 200 = 460; img_w = 600 leaves a 140px gap
        regions = _build_margin_regions(columns, left_x=50, right_x=460,
                                        img_w=600, top_y=50, content_h=300)
        rights = [r for r in regions if r.type == 'margin_right']
        assert len(rights) == 1
        assert rights[0].x == 460
        assert rights[0].width == 140

    def test_no_margin_when_flush(self):
        """When columns are flush with the image edges, no margins should appear."""
        columns = [
            PageRegion(type='column_en', x=0, y=0, width=300, height=400),
            PageRegion(type='column_de', x=300, y=0, width=300, height=400),
        ]
        regions = _build_margin_regions(columns, left_x=0, right_x=600,
                                        img_w=600, top_y=0, content_h=400)
        assert len(regions) == 0

    def test_margins_in_skip_types(self):
        """Verify margin types are in the skip set used by build_cell_grid."""
        # NOTE(review): this set is a local copy, so the asserts are tautological —
        # consider importing the real skip set from the pipeline module instead.
        skip = {'column_ignore', 'header', 'footer', 'page_ref', 'margin_left', 'margin_right'}
        assert 'margin_left' in skip
        assert 'margin_right' in skip

    def test_margin_confidence_and_method(self):
        """Margin regions should have confidence 1.0 and method 'content_bounds'."""
        columns = [PageRegion(type='column_en', x=80, y=20, width=400, height=500)]
        regions = _build_margin_regions(columns, left_x=80, right_x=480,
                                        img_w=600, top_y=20, content_h=500)
        for region in regions:
            assert region.classification_confidence == 1.0
            assert region.classification_method == 'content_bounds'
# =============================================
# RUN TESTS
# =============================================

# Allow running this module directly (delegates to pytest with verbose output).
if __name__ == "__main__":
    pytest.main([__file__, "-v"])