Deleted pages:
- /ai/model-management (mock data only, no real backend)
- /ai/ocr-compare (old /vocab/ backend, replaced by ocr-kombi)
- /ai/ocr-pipeline (minimal session browser, redundant)
- /ai/ocr-overlay (legacy monolith, redundant)
- /ai/gpu (vast.ai GPU management, no longer used)
- /infrastructure/gpu (same)
- /communication/video-chat (moved to core)
- /communication/matrix (moved to core)

Deleted backends:
- backend-lehrer/infra/vast_client.py + vast_power.py
- backend-lehrer/meetings_api.py + jitsi_api.py
- website/app/api/admin/gpu/
- edu-search-service/scripts/vast_ai_extractor.py

Total: ~7,800 LOC removed. All code preserved in git history.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
286 lines · 11 KiB · Python
"""Tests for dictionary/Wörterbuch page detection.
|
|
|
|
Tests the _score_dictionary_signals() function and _classify_dictionary_columns()
|
|
from cv_layout.py.
|
|
"""
|
|
|
|
import sys
|
|
import os
|
|
|
|
# Add backend to path for imports
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
|
|
|
from cv_vocab_types import ColumnGeometry
|
|
from cv_layout import _score_dictionary_signals, _classify_dictionary_columns, _score_language
|
|
|
|
|
|
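
# Result shape of _score_dictionary_signals() as exercised by the tests below
# (illustrative sketch only; the real function may return additional keys):
#
#   {
#       "is_dictionary": bool,
#       "confidence": float,              # 0.0 .. 1.0
#       "signals": {
#           "alphabetical_score": float,
#           "article_density": float,
#           "first_letter_uniformity": float,
#           "article_col": int,           # index of the der/die/das column
#           "has_letter_transition": bool,
#           "margin_strip_detected": bool,
#       },
#   }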


def _make_words(texts, start_y=0, y_step=30, x=100, conf=80):
    """Create a list of word dicts from text strings."""
    return [
        {
            "text": t,
            "conf": conf,
            "top": start_y + i * y_step,
            "left": x,
            "height": 20,
            "width": len(t) * 10,
        }
        for i, t in enumerate(texts)
    ]
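
# Example (illustrative): _make_words(["Zahl", "Zahn"]) yields two pseudo-OCR
# word dicts stacked 30 px apart; the first one is
#   {"text": "Zahl", "conf": 80, "top": 0, "left": 100, "height": 20, "width": 40}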


def _make_geom(index, words, x=0, width=200, width_ratio=0.15):
    """Create a ColumnGeometry with given words."""
    return ColumnGeometry(
        index=index,
        x=x,
        y=0,
        width=width,
        height=1000,
        word_count=len(words),
        words=words,
        width_ratio=width_ratio,
    )
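
# Example (illustrative): _make_geom(0, _make_words(["Zahl", "Zahn"])) builds a
# ColumnGeometry spanning x=0..200 with word_count=2, i.e. one synthetic column
# of an OCR'd page.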


class TestDictionarySignals:
    """Test _score_dictionary_signals with synthetic data."""

    def test_alphabetical_column_detected(self):
        """A column with alphabetically ordered words should score high."""
        # Simulate a dictionary headword column: Z words
        headwords = _make_words([
            "Zahl", "Zahn", "zart", "Zauber", "Zaun",
            "Zeichen", "zeigen", "Zeit", "Zelt", "Zentrum",
            "zerbrechen", "Zeug", "Ziel", "Zimmer", "Zitrone",
            "Zoll", "Zone", "Zoo", "Zucker", "Zug",
        ])
        # Article column
        articles = _make_words(
            ["die", "der", "das", "der", "der",
             "das", "die", "die", "das", "das",
             "der", "das", "das", "das", "die",
             "der", "die", "der", "der", "der"],
            x=0,
        )
        # Translation column
        translations = _make_words(
            ["number", "tooth", "tender", "magic", "fence",
             "sign", "to show", "time", "tent", "centre",
             "to break", "stuff", "goal", "room", "lemon",
             "customs", "zone", "zoo", "sugar", "train"],
            x=400,
        )

        geoms = [
            _make_geom(0, articles, x=0, width=60, width_ratio=0.05),
            _make_geom(1, headwords, x=80, width=200, width_ratio=0.15),
            _make_geom(2, translations, x=400, width=200, width_ratio=0.15),
        ]

        result = _score_dictionary_signals(geoms)

        assert result["signals"]["alphabetical_score"] >= 0.80, (
            f"Expected alphabetical_score >= 0.80, got {result['signals']['alphabetical_score']}"
        )
        assert result["signals"]["article_density"] >= 0.80, (
            f"Expected article_density >= 0.80, got {result['signals']['article_density']}"
        )
        assert result["signals"]["first_letter_uniformity"] >= 0.60, (
            f"Expected first_letter_uniformity >= 0.60, got {result['signals']['first_letter_uniformity']}"
        )
        assert result["is_dictionary"] is True
        assert result["confidence"] >= 0.40

    def test_non_dictionary_vocab_table(self):
        """A normal vocab table (topic-grouped, no alphabetical order) should NOT be detected."""
        en_words = _make_words([
            "school", "teacher", "homework", "pencil", "break",
            "lunch", "friend", "computer", "book", "bag",
        ])
        de_words = _make_words([
            "Schule", "Lehrer", "Hausaufgaben", "Bleistift", "Pause",
            "Mittagessen", "Freund", "Computer", "Buch", "Tasche",
        ], x=300)

        geoms = [
            _make_geom(0, en_words, x=0, width=200, width_ratio=0.20),
            _make_geom(1, de_words, x=300, width=200, width_ratio=0.20),
        ]

        result = _score_dictionary_signals(geoms)

        # Alphabetical score should be moderate at best (random order)
        assert result["is_dictionary"] is False, (
            f"Normal vocab table should NOT be detected as dictionary, "
            f"confidence={result['confidence']}"
        )

    def test_article_column_detection(self):
        """A narrow column with mostly articles should be identified."""
        articles = _make_words(
            ["der", "die", "das", "der", "die", "das", "der", "die", "das", "der"],
            x=0,
        )
        headwords = _make_words(
            ["Apfel", "Birne", "Dose", "Eis", "Fisch",
             "Gabel", "Haus", "Igel", "Jacke", "Kuchen"],
        )
        translations = _make_words(
            ["apple", "pear", "can", "ice", "fish",
             "fork", "house", "hedgehog", "jacket", "cake"],
            x=400,
        )

        geoms = [
            _make_geom(0, articles, x=0, width=50, width_ratio=0.04),
            _make_geom(1, headwords, x=80, width=200, width_ratio=0.15),
            _make_geom(2, translations, x=400, width=200, width_ratio=0.15),
        ]

        result = _score_dictionary_signals(geoms)

        assert result["signals"]["article_density"] >= 0.80
        assert result["signals"]["article_col"] == 0

    def test_first_letter_uniformity(self):
        """Words all starting with same letter should have high uniformity."""
        z_words = _make_words([
            "Zahl", "Zahn", "zart", "Zauber", "Zaun",
            "Zeichen", "zeigen", "Zeit", "Zelt", "Zentrum",
        ])
        other = _make_words(
            ["number", "tooth", "tender", "magic", "fence",
             "sign", "to show", "time", "tent", "centre"],
            x=300,
        )

        geoms = [
            _make_geom(0, z_words, x=0, width=200, width_ratio=0.15),
            _make_geom(1, other, x=300, width=200, width_ratio=0.15),
        ]

        result = _score_dictionary_signals(geoms)
        assert result["signals"]["first_letter_uniformity"] >= 0.80

    def test_letter_transition_detected(self):
        """Words transitioning from one letter to next (A→B) should be detected."""
        words = _make_words([
            "Apfel", "Arm", "Auto", "Auge", "Abend",
            "Ball", "Baum", "Berg", "Blume", "Boot",
        ])
        other = _make_words(
            ["apple", "arm", "car", "eye", "evening",
             "ball", "tree", "mountain", "flower", "boat"],
            x=300,
        )

        geoms = [
            _make_geom(0, words, x=0, width=200, width_ratio=0.15),
            _make_geom(1, other, x=300, width=200, width_ratio=0.15),
        ]

        result = _score_dictionary_signals(geoms)
        assert result["signals"]["has_letter_transition"] is True

    def test_category_boost(self):
        """document_category='woerterbuch' should boost confidence."""
        # Weak signals that normally wouldn't trigger dictionary detection
        words_a = _make_words(["cat", "dog", "fish", "hat", "map"], x=0)
        words_b = _make_words(["Katze", "Hund", "Fisch", "Hut", "Karte"], x=300)

        geoms = [
            _make_geom(0, words_a, x=0, width=200, width_ratio=0.15),
            _make_geom(1, words_b, x=300, width=200, width_ratio=0.15),
        ]

        without_boost = _score_dictionary_signals(geoms)
        with_boost = _score_dictionary_signals(geoms, document_category="woerterbuch")

        assert with_boost["confidence"] > without_boost["confidence"]
        assert with_boost["confidence"] - without_boost["confidence"] >= 0.19  # ~0.20 boost

    def test_margin_strip_signal(self):
        """margin_strip_detected=True should contribute to confidence."""
        words_a = _make_words(["Apfel", "Arm", "Auto", "Auge", "Abend"], x=0)
        words_b = _make_words(["apple", "arm", "car", "eye", "evening"], x=300)

        geoms = [
            _make_geom(0, words_a, x=0, width=200, width_ratio=0.15),
            _make_geom(1, words_b, x=300, width=200, width_ratio=0.15),
        ]

        without = _score_dictionary_signals(geoms, margin_strip_detected=False)
        with_strip = _score_dictionary_signals(geoms, margin_strip_detected=True)

        assert with_strip["confidence"] > without["confidence"]
        assert with_strip["signals"]["margin_strip_detected"] is True

    def test_too_few_columns(self):
        """Single column should return is_dictionary=False."""
        words = _make_words(["Zahl", "Zahn", "zart", "Zauber", "Zaun"])
        geoms = [_make_geom(0, words)]

        result = _score_dictionary_signals(geoms)
        assert result["is_dictionary"] is False

    def test_empty_words(self):
        """Columns with no words should return is_dictionary=False."""
        geoms = [
            _make_geom(0, [], x=0),
            _make_geom(1, [], x=300),
        ]
        result = _score_dictionary_signals(geoms)
        assert result["is_dictionary"] is False


class TestClassifyDictionaryColumns:
    """Test _classify_dictionary_columns with dictionary-detected data."""
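
    # The two cases below cover the happy path (article/headword/translation
    # columns mapped to column_article and column_headword regions, each tagged
    # classification_method='dictionary') and the None fallback when the
    # upstream signals say the page is not a dictionary.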

    def test_assigns_article_and_headword(self):
        """When dictionary detected, assigns column_article and column_headword."""
        articles = _make_words(
            ["der", "die", "das", "der", "die", "das", "der", "die", "das", "der"],
            x=0,
        )
        headwords = _make_words([
            "Zahl", "Zahn", "zart", "Zauber", "Zaun",
            "Zeichen", "zeigen", "Zeit", "Zelt", "Zentrum",
        ])
        translations = _make_words(
            ["number", "tooth", "tender", "magic", "fence",
             "sign", "to show", "time", "tent", "centre"],
            x=400,
        )

        geoms = [
            _make_geom(0, articles, x=0, width=50, width_ratio=0.04),
            _make_geom(1, headwords, x=80, width=200, width_ratio=0.15),
            _make_geom(2, translations, x=400, width=200, width_ratio=0.15),
        ]

        dict_signals = _score_dictionary_signals(geoms)
        assert dict_signals["is_dictionary"] is True

        lang_scores = [_score_language(g.words) for g in geoms]
        regions = _classify_dictionary_columns(geoms, dict_signals, lang_scores, 1000)

        assert regions is not None
        types = [r.type for r in regions]
        assert "column_article" in types, f"Expected column_article in {types}"
        assert "column_headword" in types, f"Expected column_headword in {types}"
        # All regions should have classification_method='dictionary'
        for r in regions:
            assert r.classification_method == "dictionary"

    def test_returns_none_when_not_dictionary(self):
        """Should return None when dict_signals says not a dictionary."""
        geoms = [
            _make_geom(0, _make_words(["cat", "dog"]), x=0),
            _make_geom(1, _make_words(["Katze", "Hund"]), x=300),
        ]
        dict_signals = {"is_dictionary": False, "confidence": 0.1}
        lang_scores = [_score_language(g.words) for g in geoms]
        result = _classify_dictionary_columns(geoms, dict_signals, lang_scores, 1000)
        assert result is None
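

if __name__ == "__main__":
    # Convenience entry point: run only this module's tests. Assumes pytest is
    # installed; normally the file is collected by the backend's pytest suite.
    import pytest

    sys.exit(pytest.main([__file__, "-v"]))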