Deleted pages:
- /ai/model-management (mock data only, no real backend)
- /ai/ocr-compare (old /vocab/ backend, replaced by ocr-kombi)
- /ai/ocr-pipeline (minimal session browser, redundant)
- /ai/ocr-overlay (legacy monolith, redundant)
- /ai/gpu (vast.ai GPU management, no longer used)
- /infrastructure/gpu (same)
- /communication/video-chat (moved to core)
- /communication/matrix (moved to core)

Deleted backends:
- backend-lehrer/infra/vast_client.py + vast_power.py
- backend-lehrer/meetings_api.py + jitsi_api.py
- website/app/api/admin/gpu/
- edu-search-service/scripts/vast_ai_extractor.py

Total: ~7,800 LOC removed. All code preserved in git history.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
286 lines · 11 KiB · Python
"""Tests for dictionary/Wörterbuch page detection.
|
|
|
|
Tests the _score_dictionary_signals() function and _classify_dictionary_columns()
|
|
from cv_layout.py.
|
|
"""
|
|
|
|
import sys
|
|
import os
|
|
|
|
# Add backend to path for imports
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
|
|
|
from cv_vocab_types import ColumnGeometry
|
|
from cv_layout import _score_dictionary_signals, _classify_dictionary_columns, _score_language
|
|
|
|
|
|
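
# Result shape of _score_dictionary_signals() as exercised by the tests below
# (illustrative sketch only; the real function may return additional keys):
#
#   {
#       "is_dictionary": bool,
#       "confidence": float,              # 0.0 .. 1.0
#       "signals": {
#           "alphabetical_score": float,
#           "article_density": float,
#           "first_letter_uniformity": float,
#           "article_col": int,           # index of the der/die/das column
#           "has_letter_transition": bool,
#           "margin_strip_detected": bool,
#       },
#   }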


def _make_words(texts, start_y=0, y_step=30, x=100, conf=80):
    """Create a list of word dicts from text strings."""
    return [
        {
            "text": t,
            "conf": conf,
            "top": start_y + i * y_step,
            "left": x,
            "height": 20,
            "width": len(t) * 10,
        }
        for i, t in enumerate(texts)
    ]
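
# Example (illustrative): _make_words(["Zahl", "Zahn"]) yields two pseudo-OCR
# word dicts stacked 30 px apart; the first one is
#   {"text": "Zahl", "conf": 80, "top": 0, "left": 100, "height": 20, "width": 40}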


def _make_geom(index, words, x=0, width=200, width_ratio=0.15):
    """Create a ColumnGeometry with given words."""
    return ColumnGeometry(
        index=index,
        x=x,
        y=0,
        width=width,
        height=1000,
        word_count=len(words),
        words=words,
        width_ratio=width_ratio,
    )
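
# Example (illustrative): _make_geom(0, _make_words(["Zahl", "Zahn"])) builds a
# ColumnGeometry spanning x=0..200 with word_count=2, i.e. one synthetic column
# of an OCR'd page.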


class TestDictionarySignals:
    """Test _score_dictionary_signals with synthetic data."""

    def test_alphabetical_column_detected(self):
        """A column with alphabetically ordered words should score high."""
        # Simulate a dictionary headword column: Z words
        headwords = _make_words([
            "Zahl", "Zahn", "zart", "Zauber", "Zaun",
            "Zeichen", "zeigen", "Zeit", "Zelt", "Zentrum",
            "zerbrechen", "Zeug", "Ziel", "Zimmer", "Zitrone",
            "Zoll", "Zone", "Zoo", "Zucker", "Zug",
        ])
        # Article column
        articles = _make_words(
            ["die", "der", "das", "der", "der",
             "das", "die", "die", "das", "das",
             "der", "das", "das", "das", "die",
             "der", "die", "der", "der", "der"],
            x=0,
        )
        # Translation column
        translations = _make_words(
            ["number", "tooth", "tender", "magic", "fence",
             "sign", "to show", "time", "tent", "centre",
             "to break", "stuff", "goal", "room", "lemon",
             "customs", "zone", "zoo", "sugar", "train"],
            x=400,
        )

        geoms = [
            _make_geom(0, articles, x=0, width=60, width_ratio=0.05),
            _make_geom(1, headwords, x=80, width=200, width_ratio=0.15),
            _make_geom(2, translations, x=400, width=200, width_ratio=0.15),
        ]

        result = _score_dictionary_signals(geoms)

        assert result["signals"]["alphabetical_score"] >= 0.80, (
            f"Expected alphabetical_score >= 0.80, got {result['signals']['alphabetical_score']}"
        )
        assert result["signals"]["article_density"] >= 0.80, (
            f"Expected article_density >= 0.80, got {result['signals']['article_density']}"
        )
        assert result["signals"]["first_letter_uniformity"] >= 0.60, (
            f"Expected first_letter_uniformity >= 0.60, got {result['signals']['first_letter_uniformity']}"
        )
        assert result["is_dictionary"] is True
        assert result["confidence"] >= 0.40

    def test_non_dictionary_vocab_table(self):
        """A normal vocab table (topic-grouped, no alphabetical order) should NOT be detected."""
        en_words = _make_words([
            "school", "teacher", "homework", "pencil", "break",
            "lunch", "friend", "computer", "book", "bag",
        ])
        de_words = _make_words([
            "Schule", "Lehrer", "Hausaufgaben", "Bleistift", "Pause",
            "Mittagessen", "Freund", "Computer", "Buch", "Tasche",
        ], x=300)

        geoms = [
            _make_geom(0, en_words, x=0, width=200, width_ratio=0.20),
            _make_geom(1, de_words, x=300, width=200, width_ratio=0.20),
        ]

        result = _score_dictionary_signals(geoms)

        # Alphabetical score should be moderate at best (random order)
        assert result["is_dictionary"] is False, (
            f"Normal vocab table should NOT be detected as dictionary, "
            f"confidence={result['confidence']}"
        )

    def test_article_column_detection(self):
        """A narrow column with mostly articles should be identified."""
        articles = _make_words(
            ["der", "die", "das", "der", "die", "das", "der", "die", "das", "der"],
            x=0,
        )
        headwords = _make_words(
            ["Apfel", "Birne", "Dose", "Eis", "Fisch",
             "Gabel", "Haus", "Igel", "Jacke", "Kuchen"],
        )
        translations = _make_words(
            ["apple", "pear", "can", "ice", "fish",
             "fork", "house", "hedgehog", "jacket", "cake"],
            x=400,
        )

        geoms = [
            _make_geom(0, articles, x=0, width=50, width_ratio=0.04),
            _make_geom(1, headwords, x=80, width=200, width_ratio=0.15),
            _make_geom(2, translations, x=400, width=200, width_ratio=0.15),
        ]

        result = _score_dictionary_signals(geoms)

        assert result["signals"]["article_density"] >= 0.80
        assert result["signals"]["article_col"] == 0

    def test_first_letter_uniformity(self):
        """Words all starting with same letter should have high uniformity."""
        z_words = _make_words([
            "Zahl", "Zahn", "zart", "Zauber", "Zaun",
            "Zeichen", "zeigen", "Zeit", "Zelt", "Zentrum",
        ])
        other = _make_words(
            ["number", "tooth", "tender", "magic", "fence",
             "sign", "to show", "time", "tent", "centre"],
            x=300,
        )

        geoms = [
            _make_geom(0, z_words, x=0, width=200, width_ratio=0.15),
            _make_geom(1, other, x=300, width=200, width_ratio=0.15),
        ]

        result = _score_dictionary_signals(geoms)
        assert result["signals"]["first_letter_uniformity"] >= 0.80

    def test_letter_transition_detected(self):
        """Words transitioning from one letter to next (A→B) should be detected."""
        words = _make_words([
            "Apfel", "Arm", "Auto", "Auge", "Abend",
            "Ball", "Baum", "Berg", "Blume", "Boot",
        ])
        other = _make_words(
            ["apple", "arm", "car", "eye", "evening",
             "ball", "tree", "mountain", "flower", "boat"],
            x=300,
        )

        geoms = [
            _make_geom(0, words, x=0, width=200, width_ratio=0.15),
            _make_geom(1, other, x=300, width=200, width_ratio=0.15),
        ]

        result = _score_dictionary_signals(geoms)
        assert result["signals"]["has_letter_transition"] is True

    def test_category_boost(self):
        """document_category='woerterbuch' should boost confidence."""
        # Weak signals that normally wouldn't trigger dictionary detection
        words_a = _make_words(["cat", "dog", "fish", "hat", "map"], x=0)
        words_b = _make_words(["Katze", "Hund", "Fisch", "Hut", "Karte"], x=300)

        geoms = [
            _make_geom(0, words_a, x=0, width=200, width_ratio=0.15),
            _make_geom(1, words_b, x=300, width=200, width_ratio=0.15),
        ]

        without_boost = _score_dictionary_signals(geoms)
        with_boost = _score_dictionary_signals(geoms, document_category="woerterbuch")

        assert with_boost["confidence"] > without_boost["confidence"]
        assert with_boost["confidence"] - without_boost["confidence"] >= 0.19  # ~0.20 boost

    def test_margin_strip_signal(self):
        """margin_strip_detected=True should contribute to confidence."""
        words_a = _make_words(["Apfel", "Arm", "Auto", "Auge", "Abend"], x=0)
        words_b = _make_words(["apple", "arm", "car", "eye", "evening"], x=300)

        geoms = [
            _make_geom(0, words_a, x=0, width=200, width_ratio=0.15),
            _make_geom(1, words_b, x=300, width=200, width_ratio=0.15),
        ]

        without = _score_dictionary_signals(geoms, margin_strip_detected=False)
        with_strip = _score_dictionary_signals(geoms, margin_strip_detected=True)

        assert with_strip["confidence"] > without["confidence"]
        assert with_strip["signals"]["margin_strip_detected"] is True

    def test_too_few_columns(self):
        """Single column should return is_dictionary=False."""
        words = _make_words(["Zahl", "Zahn", "zart", "Zauber", "Zaun"])
        geoms = [_make_geom(0, words)]

        result = _score_dictionary_signals(geoms)
        assert result["is_dictionary"] is False

    def test_empty_words(self):
        """Columns with no words should return is_dictionary=False."""
        geoms = [
            _make_geom(0, [], x=0),
            _make_geom(1, [], x=300),
        ]
        result = _score_dictionary_signals(geoms)
        assert result["is_dictionary"] is False


class TestClassifyDictionaryColumns:
    """Test _classify_dictionary_columns with dictionary-detected data."""
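
    # The two cases below cover the happy path (article/headword/translation
    # columns mapped to column_article and column_headword regions, each tagged
    # classification_method='dictionary') and the None fallback when the
    # upstream signals say the page is not a dictionary.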

    def test_assigns_article_and_headword(self):
        """When dictionary detected, assigns column_article and column_headword."""
        articles = _make_words(
            ["der", "die", "das", "der", "die", "das", "der", "die", "das", "der"],
            x=0,
        )
        headwords = _make_words([
            "Zahl", "Zahn", "zart", "Zauber", "Zaun",
            "Zeichen", "zeigen", "Zeit", "Zelt", "Zentrum",
        ])
        translations = _make_words(
            ["number", "tooth", "tender", "magic", "fence",
             "sign", "to show", "time", "tent", "centre"],
            x=400,
        )

        geoms = [
            _make_geom(0, articles, x=0, width=50, width_ratio=0.04),
            _make_geom(1, headwords, x=80, width=200, width_ratio=0.15),
            _make_geom(2, translations, x=400, width=200, width_ratio=0.15),
        ]

        dict_signals = _score_dictionary_signals(geoms)
        assert dict_signals["is_dictionary"] is True

        lang_scores = [_score_language(g.words) for g in geoms]
        regions = _classify_dictionary_columns(geoms, dict_signals, lang_scores, 1000)

        assert regions is not None
        types = [r.type for r in regions]
        assert "column_article" in types, f"Expected column_article in {types}"
        assert "column_headword" in types, f"Expected column_headword in {types}"
        # All regions should have classification_method='dictionary'
        for r in regions:
            assert r.classification_method == "dictionary"

    def test_returns_none_when_not_dictionary(self):
        """Should return None when dict_signals says not a dictionary."""
        geoms = [
            _make_geom(0, _make_words(["cat", "dog"]), x=0),
            _make_geom(1, _make_words(["Katze", "Hund"]), x=300),
        ]
        dict_signals = {"is_dictionary": False, "confidence": 0.1}
        lang_scores = [_score_language(g.words) for g in geoms]
        result = _classify_dictionary_columns(geoms, dict_signals, lang_scores, 1000)
        assert result is None
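

if __name__ == "__main__":
    # Convenience entry point: run only this module's tests. Assumes pytest is
    # installed; normally the file is collected by the backend's pytest suite.
    import pytest

    sys.exit(pytest.main([__file__, "-v"]))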