Files
breakpilot-lehrer/klausur-service/backend/tests/test_dictionary_detection.py
Benjamin Admin f39cbe9283 refactor: remove unused pages and backends (model-management, OCR legacy, GPU/vast.ai, video-chat, matrix)
Deleted pages:
- /ai/model-management (mock data only, no real backend)
- /ai/ocr-compare (old /vocab/ backend, replaced by ocr-kombi)
- /ai/ocr-pipeline (minimal session browser, redundant)
- /ai/ocr-overlay (legacy monolith, redundant)
- /ai/gpu (vast.ai GPU management, no longer used)
- /infrastructure/gpu (same)
- /communication/video-chat (moved to core)
- /communication/matrix (moved to core)

Deleted backends:
- backend-lehrer/infra/vast_client.py + vast_power.py
- backend-lehrer/meetings_api.py + jitsi_api.py
- website/app/api/admin/gpu/
- edu-search-service/scripts/vast_ai_extractor.py

Total: ~7,800 LOC removed. All code preserved in git history.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-23 13:14:12 +02:00

286 lines
11 KiB
Python

"""Tests for dictionary/Wörterbuch page detection.
Tests the _score_dictionary_signals() function and _classify_dictionary_columns()
from cv_layout.py.
"""
import sys
import os
# Add backend to path for imports
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from cv_vocab_types import ColumnGeometry
from cv_layout import _score_dictionary_signals, _classify_dictionary_columns, _score_language
def _make_words(texts, start_y=0, y_step=30, x=100, conf=80):
"""Create a list of word dicts from text strings."""
return [
{
"text": t,
"conf": conf,
"top": start_y + i * y_step,
"left": x,
"height": 20,
"width": len(t) * 10,
}
for i, t in enumerate(texts)
]
def _make_geom(index, words, x=0, width=200, width_ratio=0.15):
    """Wrap a word list in a ColumnGeometry spanning the full page height."""
    geometry = ColumnGeometry(
        index=index,
        x=x,
        y=0,
        width=width,
        height=1000,
        word_count=len(words),
        words=words,
        width_ratio=width_ratio,
    )
    return geometry
class TestDictionarySignals:
    """Exercise _score_dictionary_signals() on hand-built synthetic columns."""

    def test_alphabetical_column_detected(self):
        """A column with alphabetically ordered words should score high."""
        # Headword column of a dictionary page: Z-section entries in order.
        headword_col = _make_words([
            "Zahl", "Zahn", "zart", "Zauber", "Zaun",
            "Zeichen", "zeigen", "Zeit", "Zelt", "Zentrum",
            "zerbrechen", "Zeug", "Ziel", "Zimmer", "Zitrone",
            "Zoll", "Zone", "Zoo", "Zucker", "Zug",
        ])
        # Narrow grammatical-article column to the left of the headwords.
        article_col = _make_words(
            ["die", "der", "das", "der", "der",
             "das", "die", "die", "das", "das",
             "der", "das", "das", "das", "die",
             "der", "die", "der", "der", "der"],
            x=0,
        )
        # English translation column on the right.
        translation_col = _make_words(
            ["number", "tooth", "tender", "magic", "fence",
             "sign", "to show", "time", "tent", "centre",
             "to break", "stuff", "goal", "room", "lemon",
             "customs", "zone", "zoo", "sugar", "train"],
            x=400,
        )
        result = _score_dictionary_signals([
            _make_geom(0, article_col, x=0, width=60, width_ratio=0.05),
            _make_geom(1, headword_col, x=80, width=200, width_ratio=0.15),
            _make_geom(2, translation_col, x=400, width=200, width_ratio=0.15),
        ])
        signals = result["signals"]
        assert signals["alphabetical_score"] >= 0.80, (
            f"Expected alphabetical_score >= 0.80, got {result['signals']['alphabetical_score']}"
        )
        assert signals["article_density"] >= 0.80, (
            f"Expected article_density >= 0.80, got {result['signals']['article_density']}"
        )
        assert signals["first_letter_uniformity"] >= 0.60, (
            f"Expected first_letter_uniformity >= 0.60, got {result['signals']['first_letter_uniformity']}"
        )
        assert result["is_dictionary"] is True
        assert result["confidence"] >= 0.40

    def test_non_dictionary_vocab_table(self):
        """A normal vocab table (topic-grouped, no alphabetical order) should NOT be detected."""
        english = _make_words([
            "school", "teacher", "homework", "pencil", "break",
            "lunch", "friend", "computer", "book", "bag",
        ])
        german = _make_words([
            "Schule", "Lehrer", "Hausaufgaben", "Bleistift", "Pause",
            "Mittagessen", "Freund", "Computer", "Buch", "Tasche",
        ], x=300)
        result = _score_dictionary_signals([
            _make_geom(0, english, x=0, width=200, width_ratio=0.20),
            _make_geom(1, german, x=300, width=200, width_ratio=0.20),
        ])
        # Topic-grouped words are effectively in random order, so the
        # combined signals must stay below the detection threshold.
        assert result["is_dictionary"] is False, (
            f"Normal vocab table should NOT be detected as dictionary, "
            f"confidence={result['confidence']}"
        )

    def test_article_column_detection(self):
        """A narrow column with mostly articles should be identified."""
        article_col = _make_words(
            ["der", "die", "das", "der", "die", "das", "der", "die", "das", "der"],
            x=0,
        )
        headword_col = _make_words(
            ["Apfel", "Birne", "Dose", "Eis", "Fisch",
             "Gabel", "Haus", "Igel", "Jacke", "Kuchen"],
        )
        translation_col = _make_words(
            ["apple", "pear", "can", "ice", "fish",
             "fork", "house", "hedgehog", "jacket", "cake"],
            x=400,
        )
        result = _score_dictionary_signals([
            _make_geom(0, article_col, x=0, width=50, width_ratio=0.04),
            _make_geom(1, headword_col, x=80, width=200, width_ratio=0.15),
            _make_geom(2, translation_col, x=400, width=200, width_ratio=0.15),
        ])
        assert result["signals"]["article_density"] >= 0.80
        # The article column is the first one (index 0).
        assert result["signals"]["article_col"] == 0

    def test_first_letter_uniformity(self):
        """Words all starting with same letter should have high uniformity."""
        z_col = _make_words([
            "Zahl", "Zahn", "zart", "Zauber", "Zaun",
            "Zeichen", "zeigen", "Zeit", "Zelt", "Zentrum",
        ])
        other_col = _make_words(
            ["number", "tooth", "tender", "magic", "fence",
             "sign", "to show", "time", "tent", "centre"],
            x=300,
        )
        result = _score_dictionary_signals([
            _make_geom(0, z_col, x=0, width=200, width_ratio=0.15),
            _make_geom(1, other_col, x=300, width=200, width_ratio=0.15),
        ])
        assert result["signals"]["first_letter_uniformity"] >= 0.80

    def test_letter_transition_detected(self):
        """Words transitioning from one letter to next (A→B) should be detected."""
        ab_col = _make_words([
            "Apfel", "Arm", "Auto", "Auge", "Abend",
            "Ball", "Baum", "Berg", "Blume", "Boot",
        ])
        other_col = _make_words(
            ["apple", "arm", "car", "eye", "evening",
             "ball", "tree", "mountain", "flower", "boat"],
            x=300,
        )
        result = _score_dictionary_signals([
            _make_geom(0, ab_col, x=0, width=200, width_ratio=0.15),
            _make_geom(1, other_col, x=300, width=200, width_ratio=0.15),
        ])
        assert result["signals"]["has_letter_transition"] is True

    def test_category_boost(self):
        """document_category='woerterbuch' should boost confidence."""
        # Deliberately weak signals that would not trigger detection alone.
        left = _make_words(["cat", "dog", "fish", "hat", "map"], x=0)
        right = _make_words(["Katze", "Hund", "Fisch", "Hut", "Karte"], x=300)
        geoms = [
            _make_geom(0, left, x=0, width=200, width_ratio=0.15),
            _make_geom(1, right, x=300, width=200, width_ratio=0.15),
        ]
        baseline = _score_dictionary_signals(geoms)
        boosted = _score_dictionary_signals(geoms, document_category="woerterbuch")
        assert boosted["confidence"] > baseline["confidence"]
        # The category hint is worth roughly +0.20 confidence.
        assert boosted["confidence"] - baseline["confidence"] >= 0.19

    def test_margin_strip_signal(self):
        """margin_strip_detected=True should contribute to confidence."""
        left = _make_words(["Apfel", "Arm", "Auto", "Auge", "Abend"], x=0)
        right = _make_words(["apple", "arm", "car", "eye", "evening"], x=300)
        geoms = [
            _make_geom(0, left, x=0, width=200, width_ratio=0.15),
            _make_geom(1, right, x=300, width=200, width_ratio=0.15),
        ]
        plain = _score_dictionary_signals(geoms, margin_strip_detected=False)
        flagged = _score_dictionary_signals(geoms, margin_strip_detected=True)
        assert flagged["confidence"] > plain["confidence"]
        assert flagged["signals"]["margin_strip_detected"] is True

    def test_too_few_columns(self):
        """Single column should return is_dictionary=False."""
        solo = _make_words(["Zahl", "Zahn", "zart", "Zauber", "Zaun"])
        result = _score_dictionary_signals([_make_geom(0, solo)])
        assert result["is_dictionary"] is False

    def test_empty_words(self):
        """Columns with no words should return is_dictionary=False."""
        result = _score_dictionary_signals([
            _make_geom(0, [], x=0),
            _make_geom(1, [], x=300),
        ])
        assert result["is_dictionary"] is False
class TestClassifyDictionaryColumns:
    """Exercise _classify_dictionary_columns() on dictionary-detected data."""

    def test_assigns_article_and_headword(self):
        """When dictionary detected, assigns column_article and column_headword."""
        article_col = _make_words(
            ["der", "die", "das", "der", "die", "das", "der", "die", "das", "der"],
            x=0,
        )
        headword_col = _make_words([
            "Zahl", "Zahn", "zart", "Zauber", "Zaun",
            "Zeichen", "zeigen", "Zeit", "Zelt", "Zentrum",
        ])
        translation_col = _make_words(
            ["number", "tooth", "tender", "magic", "fence",
             "sign", "to show", "time", "tent", "centre"],
            x=400,
        )
        geoms = [
            _make_geom(0, article_col, x=0, width=50, width_ratio=0.04),
            _make_geom(1, headword_col, x=80, width=200, width_ratio=0.15),
            _make_geom(2, translation_col, x=400, width=200, width_ratio=0.15),
        ]
        dict_signals = _score_dictionary_signals(geoms)
        # Precondition: the synthetic page must actually trip detection,
        # otherwise the classification under test never runs.
        assert dict_signals["is_dictionary"] is True
        lang_scores = [_score_language(g.words) for g in geoms]
        regions = _classify_dictionary_columns(geoms, dict_signals, lang_scores, 1000)
        assert regions is not None
        types = [r.type for r in regions]
        assert "column_article" in types, f"Expected column_article in {types}"
        assert "column_headword" in types, f"Expected column_headword in {types}"
        # Every region from this path must be tagged with the dictionary method.
        for region in regions:
            assert region.classification_method == "dictionary"

    def test_returns_none_when_not_dictionary(self):
        """Should return None when dict_signals says not a dictionary."""
        geoms = [
            _make_geom(0, _make_words(["cat", "dog"]), x=0),
            _make_geom(1, _make_words(["Katze", "Hund"]), x=300),
        ]
        dict_signals = {"is_dictionary": False, "confidence": 0.1}
        lang_scores = [_score_language(g.words) for g in geoms]
        assert _classify_dictionary_columns(geoms, dict_signals, lang_scores, 1000) is None