"""Tests for dictionary/Wörterbuch page detection. Tests the _score_dictionary_signals() function and _classify_dictionary_columns() from cv_layout.py. """ import sys import os # Add backend to path for imports sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) from cv_vocab_types import ColumnGeometry from cv_layout import _score_dictionary_signals, _classify_dictionary_columns, _score_language def _make_words(texts, start_y=0, y_step=30, x=100, conf=80): """Create a list of word dicts from text strings.""" return [ { "text": t, "conf": conf, "top": start_y + i * y_step, "left": x, "height": 20, "width": len(t) * 10, } for i, t in enumerate(texts) ] def _make_geom(index, words, x=0, width=200, width_ratio=0.15): """Create a ColumnGeometry with given words.""" return ColumnGeometry( index=index, x=x, y=0, width=width, height=1000, word_count=len(words), words=words, width_ratio=width_ratio, ) class TestDictionarySignals: """Test _score_dictionary_signals with synthetic data.""" def test_alphabetical_column_detected(self): """A column with alphabetically ordered words should score high.""" # Simulate a dictionary headword column: Z words headwords = _make_words([ "Zahl", "Zahn", "zart", "Zauber", "Zaun", "Zeichen", "zeigen", "Zeit", "Zelt", "Zentrum", "zerbrechen", "Zeug", "Ziel", "Zimmer", "Zitrone", "Zoll", "Zone", "Zoo", "Zucker", "Zug", ]) # Article column articles = _make_words( ["die", "der", "das", "der", "der", "das", "die", "die", "das", "das", "der", "das", "das", "das", "die", "der", "die", "der", "der", "der"], x=0, ) # Translation column translations = _make_words( ["number", "tooth", "tender", "magic", "fence", "sign", "to show", "time", "tent", "centre", "to break", "stuff", "goal", "room", "lemon", "customs", "zone", "zoo", "sugar", "train"], x=400, ) geoms = [ _make_geom(0, articles, x=0, width=60, width_ratio=0.05), _make_geom(1, headwords, x=80, width=200, width_ratio=0.15), _make_geom(2, translations, x=400, width=200, width_ratio=0.15), ] result = _score_dictionary_signals(geoms) assert result["signals"]["alphabetical_score"] >= 0.80, ( f"Expected alphabetical_score >= 0.80, got {result['signals']['alphabetical_score']}" ) assert result["signals"]["article_density"] >= 0.80, ( f"Expected article_density >= 0.80, got {result['signals']['article_density']}" ) assert result["signals"]["first_letter_uniformity"] >= 0.60, ( f"Expected first_letter_uniformity >= 0.60, got {result['signals']['first_letter_uniformity']}" ) assert result["is_dictionary"] is True assert result["confidence"] >= 0.40 def test_non_dictionary_vocab_table(self): """A normal vocab table (topic-grouped, no alphabetical order) should NOT be detected.""" en_words = _make_words([ "school", "teacher", "homework", "pencil", "break", "lunch", "friend", "computer", "book", "bag", ]) de_words = _make_words([ "Schule", "Lehrer", "Hausaufgaben", "Bleistift", "Pause", "Mittagessen", "Freund", "Computer", "Buch", "Tasche", ], x=300) geoms = [ _make_geom(0, en_words, x=0, width=200, width_ratio=0.20), _make_geom(1, de_words, x=300, width=200, width_ratio=0.20), ] result = _score_dictionary_signals(geoms) # Alphabetical score should be moderate at best (random order) assert result["is_dictionary"] is False, ( f"Normal vocab table should NOT be detected as dictionary, " f"confidence={result['confidence']}" ) def test_article_column_detection(self): """A narrow column with mostly articles should be identified.""" articles = _make_words( ["der", "die", "das", "der", "die", "das", "der", "die", "das", "der"], x=0, ) headwords = _make_words( ["Apfel", "Birne", "Dose", "Eis", "Fisch", "Gabel", "Haus", "Igel", "Jacke", "Kuchen"], ) translations = _make_words( ["apple", "pear", "can", "ice", "fish", "fork", "house", "hedgehog", "jacket", "cake"], x=400, ) geoms = [ _make_geom(0, articles, x=0, width=50, width_ratio=0.04), _make_geom(1, headwords, x=80, width=200, width_ratio=0.15), _make_geom(2, translations, x=400, width=200, width_ratio=0.15), ] result = _score_dictionary_signals(geoms) assert result["signals"]["article_density"] >= 0.80 assert result["signals"]["article_col"] == 0 def test_first_letter_uniformity(self): """Words all starting with same letter should have high uniformity.""" z_words = _make_words([ "Zahl", "Zahn", "zart", "Zauber", "Zaun", "Zeichen", "zeigen", "Zeit", "Zelt", "Zentrum", ]) other = _make_words( ["number", "tooth", "tender", "magic", "fence", "sign", "to show", "time", "tent", "centre"], x=300, ) geoms = [ _make_geom(0, z_words, x=0, width=200, width_ratio=0.15), _make_geom(1, other, x=300, width=200, width_ratio=0.15), ] result = _score_dictionary_signals(geoms) assert result["signals"]["first_letter_uniformity"] >= 0.80 def test_letter_transition_detected(self): """Words transitioning from one letter to next (A→B) should be detected.""" words = _make_words([ "Apfel", "Arm", "Auto", "Auge", "Abend", "Ball", "Baum", "Berg", "Blume", "Boot", ]) other = _make_words( ["apple", "arm", "car", "eye", "evening", "ball", "tree", "mountain", "flower", "boat"], x=300, ) geoms = [ _make_geom(0, words, x=0, width=200, width_ratio=0.15), _make_geom(1, other, x=300, width=200, width_ratio=0.15), ] result = _score_dictionary_signals(geoms) assert result["signals"]["has_letter_transition"] is True def test_category_boost(self): """document_category='woerterbuch' should boost confidence.""" # Weak signals that normally wouldn't trigger dictionary detection words_a = _make_words(["cat", "dog", "fish", "hat", "map"], x=0) words_b = _make_words(["Katze", "Hund", "Fisch", "Hut", "Karte"], x=300) geoms = [ _make_geom(0, words_a, x=0, width=200, width_ratio=0.15), _make_geom(1, words_b, x=300, width=200, width_ratio=0.15), ] without_boost = _score_dictionary_signals(geoms) with_boost = _score_dictionary_signals(geoms, document_category="woerterbuch") assert with_boost["confidence"] > without_boost["confidence"] assert with_boost["confidence"] - without_boost["confidence"] >= 0.19 # ~0.20 boost def test_margin_strip_signal(self): """margin_strip_detected=True should contribute to confidence.""" words_a = _make_words(["Apfel", "Arm", "Auto", "Auge", "Abend"], x=0) words_b = _make_words(["apple", "arm", "car", "eye", "evening"], x=300) geoms = [ _make_geom(0, words_a, x=0, width=200, width_ratio=0.15), _make_geom(1, words_b, x=300, width=200, width_ratio=0.15), ] without = _score_dictionary_signals(geoms, margin_strip_detected=False) with_strip = _score_dictionary_signals(geoms, margin_strip_detected=True) assert with_strip["confidence"] > without["confidence"] assert with_strip["signals"]["margin_strip_detected"] is True def test_too_few_columns(self): """Single column should return is_dictionary=False.""" words = _make_words(["Zahl", "Zahn", "zart", "Zauber", "Zaun"]) geoms = [_make_geom(0, words)] result = _score_dictionary_signals(geoms) assert result["is_dictionary"] is False def test_empty_words(self): """Columns with no words should return is_dictionary=False.""" geoms = [ _make_geom(0, [], x=0), _make_geom(1, [], x=300), ] result = _score_dictionary_signals(geoms) assert result["is_dictionary"] is False class TestClassifyDictionaryColumns: """Test _classify_dictionary_columns with dictionary-detected data.""" def test_assigns_article_and_headword(self): """When dictionary detected, assigns column_article and column_headword.""" articles = _make_words( ["der", "die", "das", "der", "die", "das", "der", "die", "das", "der"], x=0, ) headwords = _make_words([ "Zahl", "Zahn", "zart", "Zauber", "Zaun", "Zeichen", "zeigen", "Zeit", "Zelt", "Zentrum", ]) translations = _make_words( ["number", "tooth", "tender", "magic", "fence", "sign", "to show", "time", "tent", "centre"], x=400, ) geoms = [ _make_geom(0, articles, x=0, width=50, width_ratio=0.04), _make_geom(1, headwords, x=80, width=200, width_ratio=0.15), _make_geom(2, translations, x=400, width=200, width_ratio=0.15), ] dict_signals = _score_dictionary_signals(geoms) assert dict_signals["is_dictionary"] is True lang_scores = [_score_language(g.words) for g in geoms] regions = _classify_dictionary_columns(geoms, dict_signals, lang_scores, 1000) assert regions is not None types = [r.type for r in regions] assert "column_article" in types, f"Expected column_article in {types}" assert "column_headword" in types, f"Expected column_headword in {types}" # All regions should have classification_method='dictionary' for r in regions: assert r.classification_method == "dictionary" def test_returns_none_when_not_dictionary(self): """Should return None when dict_signals says not a dictionary.""" geoms = [ _make_geom(0, _make_words(["cat", "dog"]), x=0), _make_geom(1, _make_words(["Katze", "Hund"]), x=300), ] dict_signals = {"is_dictionary": False, "confidence": 0.1} lang_scores = [_score_language(g.words) for g in geoms] result = _classify_dictionary_columns(geoms, dict_signals, lang_scores, 1000) assert result is None