refactor: remove unused pages and backends (model-management, OCR legacy, GPU/vast.ai, video-chat, matrix)
Deleted pages: - /ai/model-management (mock data only, no real backend) - /ai/ocr-compare (old /vocab/ backend, replaced by ocr-kombi) - /ai/ocr-pipeline (minimal session browser, redundant) - /ai/ocr-overlay (legacy monolith, redundant) - /ai/gpu (vast.ai GPU management, no longer used) - /infrastructure/gpu (same) - /communication/video-chat (moved to core) - /communication/matrix (moved to core) Deleted backends: - backend-lehrer/infra/vast_client.py + vast_power.py - backend-lehrer/meetings_api.py + jitsi_api.py - website/app/api/admin/gpu/ - edu-search-service/scripts/vast_ai_extractor.py Total: ~7,800 LOC removed. All code preserved in git history. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
100
klausur-service/backend/tests/debug_shear.py
Normal file
100
klausur-service/backend/tests/debug_shear.py
Normal file
@@ -0,0 +1,100 @@
|
||||
#!/usr/bin/env python3
"""Debug script: analyze text line slopes on deskewed image to determine true residual shear."""
import sys
import math
import asyncio

# Make the backend package importable when run inside the container.
sys.path.insert(0, "/app/backend")

import cv2
import numpy as np
import pytesseract

from ocr_pipeline_session_store import get_session_db

# Session whose deskewed page image we want to inspect.
SESSION_ID = "3dcb1897-09a6-4b80-91b5-7e4207980bf3"


async def main():
    """Fetch the session's deskewed PNG, fit a straight line through each OCR
    text row, and print residual-slope statistics plus the output of the four
    dewarp shear detectors from cv_vocab_pipeline."""
    session = await get_session_db(SESSION_ID)
    if not session:
        print("Session not found")
        return

    deskewed_png = session.get("deskewed_png")
    if not deskewed_png:
        print("No deskewed_png stored")
        return

    arr = np.frombuffer(deskewed_png, dtype=np.uint8)
    img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
    h, w = img.shape[:2]
    print(f"Deskewed image: {w}x{h}")

    # Detect text line slopes using Tesseract word positions
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    data = pytesseract.image_to_data(gray, output_type=pytesseract.Output.DICT, config="--psm 6")

    # Group word centers by (block, paragraph, line) so each text row can be fitted.
    lines = {}
    for i in range(len(data["text"])):
        txt = (data["text"][i] or "").strip()
        if len(txt) < 2 or data["conf"][i] < 30:
            continue  # skip noise: tiny fragments and low-confidence words
        key = (data["block_num"][i], data["par_num"][i], data["line_num"][i])
        cx = data["left"][i] + data["width"][i] / 2
        cy = data["top"][i] + data["height"][i] / 2
        lines.setdefault(key, []).append((cx, cy))

    # Least-squares fit y = m*x + b per line; only lines spanning >= 20% of the
    # image width give a meaningful slope estimate.
    slopes = []
    for key, pts in lines.items():
        if len(pts) < 3:
            continue
        pts.sort(key=lambda p: p[0])
        xs = np.array([p[0] for p in pts])
        ys = np.array([p[1] for p in pts])
        if xs[-1] - xs[0] < w * 0.2:
            continue
        A = np.vstack([xs, np.ones(len(xs))]).T
        result = np.linalg.lstsq(A, ys, rcond=None)
        slope = result[0][0]
        angle_deg = math.degrees(math.atan(slope))
        slopes.append(angle_deg)

    if not slopes:
        print("No text lines detected")
        return

    median_slope = sorted(slopes)[len(slopes) // 2]
    mean_slope = sum(slopes) / len(slopes)
    print(f"Text lines found: {len(slopes)}")
    print(f"Median slope: {median_slope:.4f} deg")
    print(f"Mean slope: {mean_slope:.4f} deg")
    print(f"Range: [{min(slopes):.4f}, {max(slopes):.4f}]")
    print()
    print("Individual line slopes:")
    # BUGFIX: the original reused `s` here, shadowing the session variable above.
    for angle in sorted(slopes):
        print(f" {angle:+.4f}")

    # Also test the 4 dewarp methods directly
    print("\n--- Dewarp method results on deskewed image ---")
    from cv_vocab_pipeline import (
        _detect_shear_angle, _detect_shear_by_projection,
        _detect_shear_by_hough, _detect_shear_by_text_lines,
    )
    for name, fn in [
        ("vertical_edge", _detect_shear_angle),
        ("projection", _detect_shear_by_projection),
        ("hough_lines", _detect_shear_by_hough),
        ("text_lines", _detect_shear_by_text_lines),
    ]:
        r = fn(img)
        print(f" {name}: shear={r['shear_degrees']:.4f} conf={r['confidence']:.3f}")

    # The user says "right side needs to come down 3mm"
    # For a ~85mm wide image (1002px at ~300DPI), 3mm ~ 35px
    # shear angle = atan(35 / 1556) ~ 1.29 degrees
    # Let's check: what does the image look like if we apply 0.5, 1.0, 1.5 deg shear?
    print("\n--- Pixel shift at right edge for various shear angles ---")
    for deg in [0.5, 0.8, 1.0, 1.3, 1.5, 2.0]:
        shift_px = h * math.tan(math.radians(deg))
        shift_mm = shift_px / (w / 85.0)  # approximate mm
        print(f" {deg:.1f} deg -> {shift_px:.0f}px shift -> ~{shift_mm:.1f}mm")


# Guard so importing this module (e.g. from tests) doesn't trigger the DB fetch.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
256
klausur-service/backend/tests/test_box_boundary_rows.py
Normal file
256
klausur-service/backend/tests/test_box_boundary_rows.py
Normal file
@@ -0,0 +1,256 @@
|
||||
"""
|
||||
Tests for box boundary row filtering logic (box_ranges_inner).
|
||||
|
||||
Verifies that rows at the border of box zones are NOT excluded during
|
||||
row detection and word filtering. This prevents the last row above a
|
||||
box from being clipped by the box's border pixels.
|
||||
|
||||
Related fix in ocr_pipeline_api.py: detect_rows() and detect_words()
|
||||
use box_ranges_inner (shrunk by border_thickness, min 5px) instead of
|
||||
full box_ranges for row exclusion.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import numpy as np
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Simulate the box_ranges_inner calculation from ocr_pipeline_api.py
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def compute_box_ranges(zones: list[dict]) -> tuple[list, list]:
    """
    Replicates the box_ranges / box_ranges_inner calculation
    from detect_rows() in ocr_pipeline_api.py.

    Returns (full_ranges, inner_ranges): the raw (y_start, y_end) extents of
    every box zone, and the same extents shrunk by the border thickness
    (at least 5px) on each side.
    """
    full_ranges: list = []
    inner_ranges: list = []
    boxes = (z["box"] for z in zones if z.get("zone_type") == "box" and z.get("box"))
    for box in boxes:
        top = box["y"]
        bottom = box["y"] + box["height"]
        margin = max(box.get("border_thickness", 0), 5)  # minimum 5px margin
        full_ranges.append((top, bottom))
        inner_ranges.append((top + margin, bottom - margin))
    return full_ranges, inner_ranges
|
||||
|
||||
|
||||
def build_content_strips(box_ranges_inner: list, top_y: int, bottom_y: int) -> list:
    """
    Replicates the content_strips calculation from detect_rows() in ocr_pipeline_api.py.

    Walks top-to-bottom, emitting the vertical gaps between the (sorted) box
    inner ranges as content strips; strips shorter than 20px are dropped.
    """
    strips = []
    cursor = top_y
    for box_top, box_bottom in sorted(box_ranges_inner, key=lambda r: r[0]):
        if box_top > cursor:
            strips.append((cursor, box_top))
        if box_bottom > cursor:
            cursor = box_bottom
    if cursor < bottom_y:
        strips.append((cursor, bottom_y))
    # Discard degenerate strips (< 20px tall).
    return [strip for strip in strips if strip[1] - strip[0] >= 20]
|
||||
|
||||
|
||||
def row_in_box(row_y: int, row_height: int, box_ranges_inner: list) -> bool:
    """
    Replicates the _row_in_box filter from detect_words() in ocr_pipeline_api.py.

    A row counts as "inside" a box when its vertical center falls within any
    of the half-open inner ranges [start, end).
    """
    mid_y = row_y + row_height / 2
    for range_start, range_end in box_ranges_inner:
        if range_start <= mid_y < range_end:
            return True
    return False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestBoxRangesInner:
    """Tests for box_ranges_inner calculation."""

    def test_border_thickness_shrinks_inner_range(self):
        """Inner range should be shrunk by border_thickness."""
        box = {"x": 50, "y": 500, "width": 1100, "height": 200, "border_thickness": 10}
        box_ranges, inner = compute_box_ranges([{"zone_type": "box", "box": box}])

        # Full range is the raw box extent; inner loses 10px at top and bottom.
        assert box_ranges == [(500, 700)]
        assert inner == [(510, 690)]

    def test_minimum_5px_margin(self):
        """Even with border_thickness=0, minimum 5px margin should apply."""
        box = {"x": 50, "y": 500, "width": 1100, "height": 200, "border_thickness": 0}
        _, inner = compute_box_ranges([{"zone_type": "box", "box": box}])

        assert inner == [(505, 695)]  # minimum 5px applied

    def test_no_box_zones_returns_empty(self):
        """Without box zones, both ranges should be empty."""
        content_only = [{"zone_type": "content", "y": 0, "height": 500}]
        box_ranges, inner = compute_box_ranges(content_only)

        assert box_ranges == []
        assert inner == []

    def test_multiple_boxes(self):
        """Multiple boxes should each get their own inner range."""
        zones = [
            {"zone_type": "box", "box": {"x": 50, "y": 300, "width": 1100, "height": 150, "border_thickness": 8}},
            {"zone_type": "box", "box": {"x": 50, "y": 700, "width": 1100, "height": 150, "border_thickness": 3}},
        ]
        box_ranges, inner = compute_box_ranges(zones)

        assert len(box_ranges) == 2
        assert len(inner) == 2
        assert inner[0] == (308, 442)  # 300+8 to 450-8
        assert inner[1] == (705, 845)  # 5px minimum overrides border_thickness=3
|
||||
|
||||
|
||||
class TestContentStrips:
    """Tests for content strip building with box_ranges_inner."""

    def test_single_box_creates_two_strips(self):
        """A single box in the middle should create two content strips."""
        strips = build_content_strips([(505, 695)], top_y=100, bottom_y=1700)

        # One strip above the box, one below it.
        assert strips == [(100, 505), (695, 1700)]

    def test_content_strip_includes_box_border_area(self):
        """Content strips should INCLUDE the box border area (not just stop at box outer edge)."""
        # Box at y=500, height=200, border=10 -> inner=(510, 690)
        strips = build_content_strips([(510, 690)], top_y=100, bottom_y=1700)

        # Strip above extends to 510 (not 500), including border area
        assert strips[0] == (100, 510)
        # Strip below starts at 690 (not 700), including border area
        assert strips[1] == (690, 1700)

    def test_row_at_box_border_is_in_content_strip(self):
        """A row at y=495 (just above box at y=500) should be in the content strip."""
        # Box at y=500, height=200, border=10 -> inner=(510, 690)
        strips = build_content_strips([(510, 690)], top_y=100, bottom_y=1700)

        # Row at y=495, height=30 -> center exactly at the strip edge (510).
        center = 495 + 15  # = 510
        assert strips[0][0] <= center <= strips[0][1]

    def test_no_boxes_single_strip(self):
        """Without boxes, a single strip covering the full content should be returned."""
        strips = build_content_strips([], top_y=100, bottom_y=1700)

        assert strips == [(100, 1700)]
|
||||
|
||||
|
||||
class TestRowInBoxFilter:
    """Tests for the _row_in_box filter using box_ranges_inner."""

    def test_row_inside_box_is_excluded(self):
        """A row clearly inside the box inner range should be excluded."""
        # Row at y=550, height=30 -> center 565, well inside (510, 690).
        assert row_in_box(550, 30, [(510, 690)]) is True

    def test_row_above_box_not_excluded(self):
        """A row above the box (at the border area) should NOT be excluded."""
        # Row at y=490, height=30 -> center 505, below the inner start (510).
        assert row_in_box(490, 30, [(510, 690)]) is False

    def test_row_below_box_not_excluded(self):
        """A row below the box (at the border area) should NOT be excluded."""
        # Row at y=695, height=30 -> center 710, past the inner end (690).
        assert row_in_box(695, 30, [(510, 690)]) is False

    def test_row_at_box_border_not_excluded(self):
        """A row overlapping with the box border should NOT be excluded.

        This is the key fix: previously, box_ranges (not inner) was used,
        which would exclude this row because its center (505) falls within
        the full box range (500-700).
        """
        # Full box range: (500, 700), inner: (510, 690); row center = 505.
        # With box_ranges (500, 700): 500 <= 505 < 700 -> excluded (BUG!)
        # With box_ranges_inner (510, 690): 505 < 510 -> not excluded (FIXED!)
        assert row_in_box(490, 30, [(510, 690)]) is False

    def test_row_at_bottom_border_not_excluded(self):
        """A row overlapping with the bottom box border should NOT be excluded."""
        # Row at y=685, height=30 -> center 700, outside inner (510, 690).
        assert row_in_box(685, 30, [(510, 690)]) is False

    def test_no_boxes_nothing_excluded(self):
        """Without box zones, no rows should be excluded."""
        assert row_in_box(500, 30, []) is False
|
||||
|
||||
|
||||
class TestBoxBoundaryIntegration:
    """Integration test: simulate the full row -> content strip -> filter pipeline."""

    def test_boundary_row_preserved_with_inner_ranges(self):
        """
        End-to-end: A row at the box boundary is preserved in content strips
        and not filtered out by _row_in_box.

        Simulates the real scenario: page with a box at y=500..700,
        border_thickness=10. Row at y=488..518 (center=503) sits just
        above the box border.
        """
        zones = [{
            "zone_type": "box",
            "box": {"x": 50, "y": 500, "width": 1100, "height": 200, "border_thickness": 10},
        }]

        # Step 1: inner ranges are shrunk by the border thickness.
        box_ranges, inner = compute_box_ranges(zones)
        assert inner == [(510, 690)]

        # Step 2: the first content strip runs all the way to 510, so the
        # border area (500-510) stays inside the strip.
        strips = build_content_strips(inner, top_y=20, bottom_y=2400)
        assert len(strips) == 2
        assert strips[0] == (20, 510)

        # Step 3: the boundary row is not swallowed by the box filter.
        row_y, row_h = 488, 30  # center = 503
        assert row_in_box(row_y, row_h, inner) is False

        # Step 4: and its center lies inside one of the content strips.
        row_center = row_y + row_h / 2  # 503
        covered = any(ys <= row_center < ye for ys, ye in strips)
        assert covered, f"Row center {row_center} should be in content strips {strips}"

    def test_boundary_row_would_be_lost_with_full_ranges(self):
        """
        Demonstrates the bug: using full box_ranges (not inner) WOULD
        exclude the boundary row.
        """
        zones = [{
            "zone_type": "box",
            "box": {"x": 50, "y": 500, "width": 1100, "height": 200, "border_thickness": 10},
        }]
        box_ranges, _ = compute_box_ranges(zones)

        # The full range is (500, 700); the row center 503 falls inside it,
        # so the old logic would have dropped this row.
        row_center = 488 + 30 / 2  # 503
        in_box_full = any(by_s <= row_center < by_e for by_s, by_e in box_ranges)
        assert in_box_full is True, "Full range SHOULD incorrectly exclude this row"
|
||||
285
klausur-service/backend/tests/test_dictionary_detection.py
Normal file
285
klausur-service/backend/tests/test_dictionary_detection.py
Normal file
@@ -0,0 +1,285 @@
|
||||
"""Tests for dictionary/Wörterbuch page detection.
|
||||
|
||||
Tests the _score_dictionary_signals() function and _classify_dictionary_columns()
|
||||
from cv_layout.py.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Add backend to path for imports
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
|
||||
from cv_vocab_types import ColumnGeometry
|
||||
from cv_layout import _score_dictionary_signals, _classify_dictionary_columns, _score_language
|
||||
|
||||
|
||||
def _make_words(texts, start_y=0, y_step=30, x=100, conf=80):
|
||||
"""Create a list of word dicts from text strings."""
|
||||
return [
|
||||
{
|
||||
"text": t,
|
||||
"conf": conf,
|
||||
"top": start_y + i * y_step,
|
||||
"left": x,
|
||||
"height": 20,
|
||||
"width": len(t) * 10,
|
||||
}
|
||||
for i, t in enumerate(texts)
|
||||
]
|
||||
|
||||
|
||||
def _make_geom(index, words, x=0, width=200, width_ratio=0.15):
    """Create a ColumnGeometry with given words.

    y, height, and word_count are fixed/derived; only index, position,
    width, and width_ratio vary per test.
    """
    return ColumnGeometry(
        index=index,
        x=x,
        y=0,
        height=1000,
        width=width,
        width_ratio=width_ratio,
        words=words,
        word_count=len(words),
    )
|
||||
|
||||
|
||||
class TestDictionarySignals:
    """Test _score_dictionary_signals with synthetic data."""

    def test_alphabetical_column_detected(self):
        """A column with alphabetically ordered words should score high."""
        # Headword column: alphabetically sorted Z-words, as in a dictionary.
        z_headwords = _make_words([
            "Zahl", "Zahn", "zart", "Zauber", "Zaun",
            "Zeichen", "zeigen", "Zeit", "Zelt", "Zentrum",
            "zerbrechen", "Zeug", "Ziel", "Zimmer", "Zitrone",
            "Zoll", "Zone", "Zoo", "Zucker", "Zug",
        ])
        # Narrow der/die/das article column.
        article_words = _make_words(
            ["die", "der", "das", "der", "der",
             "das", "die", "die", "das", "das",
             "der", "das", "das", "das", "die",
             "der", "die", "der", "der", "der"],
            x=0,
        )
        # English translation column.
        translation_words = _make_words(
            ["number", "tooth", "tender", "magic", "fence",
             "sign", "to show", "time", "tent", "centre",
             "to break", "stuff", "goal", "room", "lemon",
             "customs", "zone", "zoo", "sugar", "train"],
            x=400,
        )

        result = _score_dictionary_signals([
            _make_geom(0, article_words, x=0, width=60, width_ratio=0.05),
            _make_geom(1, z_headwords, x=80, width=200, width_ratio=0.15),
            _make_geom(2, translation_words, x=400, width=200, width_ratio=0.15),
        ])

        assert result["signals"]["alphabetical_score"] >= 0.80, (
            f"Expected alphabetical_score >= 0.80, got {result['signals']['alphabetical_score']}"
        )
        assert result["signals"]["article_density"] >= 0.80, (
            f"Expected article_density >= 0.80, got {result['signals']['article_density']}"
        )
        assert result["signals"]["first_letter_uniformity"] >= 0.60, (
            f"Expected first_letter_uniformity >= 0.60, got {result['signals']['first_letter_uniformity']}"
        )
        assert result["is_dictionary"] is True
        assert result["confidence"] >= 0.40

    def test_non_dictionary_vocab_table(self):
        """A normal vocab table (topic-grouped, no alphabetical order) should NOT be detected."""
        english_words = _make_words([
            "school", "teacher", "homework", "pencil", "break",
            "lunch", "friend", "computer", "book", "bag",
        ])
        german_words = _make_words([
            "Schule", "Lehrer", "Hausaufgaben", "Bleistift", "Pause",
            "Mittagessen", "Freund", "Computer", "Buch", "Tasche",
        ], x=300)

        result = _score_dictionary_signals([
            _make_geom(0, english_words, x=0, width=200, width_ratio=0.20),
            _make_geom(1, german_words, x=300, width=200, width_ratio=0.20),
        ])

        # Topic-ordered words give at best a moderate alphabetical score.
        assert result["is_dictionary"] is False, (
            f"Normal vocab table should NOT be detected as dictionary, "
            f"confidence={result['confidence']}"
        )

    def test_article_column_detection(self):
        """A narrow column with mostly articles should be identified."""
        article_words = _make_words(
            ["der", "die", "das", "der", "die", "das", "der", "die", "das", "der"],
            x=0,
        )
        headword_words = _make_words(
            ["Apfel", "Birne", "Dose", "Eis", "Fisch",
             "Gabel", "Haus", "Igel", "Jacke", "Kuchen"],
        )
        translation_words = _make_words(
            ["apple", "pear", "can", "ice", "fish",
             "fork", "house", "hedgehog", "jacket", "cake"],
            x=400,
        )

        result = _score_dictionary_signals([
            _make_geom(0, article_words, x=0, width=50, width_ratio=0.04),
            _make_geom(1, headword_words, x=80, width=200, width_ratio=0.15),
            _make_geom(2, translation_words, x=400, width=200, width_ratio=0.15),
        ])

        assert result["signals"]["article_density"] >= 0.80
        assert result["signals"]["article_col"] == 0

    def test_first_letter_uniformity(self):
        """Words all starting with same letter should have high uniformity."""
        z_column = _make_words([
            "Zahl", "Zahn", "zart", "Zauber", "Zaun",
            "Zeichen", "zeigen", "Zeit", "Zelt", "Zentrum",
        ])
        english_column = _make_words(
            ["number", "tooth", "tender", "magic", "fence",
             "sign", "to show", "time", "tent", "centre"],
            x=300,
        )

        result = _score_dictionary_signals([
            _make_geom(0, z_column, x=0, width=200, width_ratio=0.15),
            _make_geom(1, english_column, x=300, width=200, width_ratio=0.15),
        ])

        assert result["signals"]["first_letter_uniformity"] >= 0.80

    def test_letter_transition_detected(self):
        """Words transitioning from one letter to next (A→B) should be detected."""
        ab_column = _make_words([
            "Apfel", "Arm", "Auto", "Auge", "Abend",
            "Ball", "Baum", "Berg", "Blume", "Boot",
        ])
        english_column = _make_words(
            ["apple", "arm", "car", "eye", "evening",
             "ball", "tree", "mountain", "flower", "boat"],
            x=300,
        )

        result = _score_dictionary_signals([
            _make_geom(0, ab_column, x=0, width=200, width_ratio=0.15),
            _make_geom(1, english_column, x=300, width=200, width_ratio=0.15),
        ])

        assert result["signals"]["has_letter_transition"] is True

    def test_category_boost(self):
        """document_category='woerterbuch' should boost confidence."""
        # Weak signals that normally wouldn't trigger dictionary detection.
        left = _make_words(["cat", "dog", "fish", "hat", "map"], x=0)
        right = _make_words(["Katze", "Hund", "Fisch", "Hut", "Karte"], x=300)
        geoms = [
            _make_geom(0, left, x=0, width=200, width_ratio=0.15),
            _make_geom(1, right, x=300, width=200, width_ratio=0.15),
        ]

        plain = _score_dictionary_signals(geoms)
        boosted = _score_dictionary_signals(geoms, document_category="woerterbuch")

        assert boosted["confidence"] > plain["confidence"]
        assert boosted["confidence"] - plain["confidence"] >= 0.19  # ~0.20 boost

    def test_margin_strip_signal(self):
        """margin_strip_detected=True should contribute to confidence."""
        left = _make_words(["Apfel", "Arm", "Auto", "Auge", "Abend"], x=0)
        right = _make_words(["apple", "arm", "car", "eye", "evening"], x=300)
        geoms = [
            _make_geom(0, left, x=0, width=200, width_ratio=0.15),
            _make_geom(1, right, x=300, width=200, width_ratio=0.15),
        ]

        plain = _score_dictionary_signals(geoms, margin_strip_detected=False)
        with_strip = _score_dictionary_signals(geoms, margin_strip_detected=True)

        assert with_strip["confidence"] > plain["confidence"]
        assert with_strip["signals"]["margin_strip_detected"] is True

    def test_too_few_columns(self):
        """Single column should return is_dictionary=False."""
        only_column = _make_geom(0, _make_words(["Zahl", "Zahn", "zart", "Zauber", "Zaun"]))

        result = _score_dictionary_signals([only_column])
        assert result["is_dictionary"] is False

    def test_empty_words(self):
        """Columns with no words should return is_dictionary=False."""
        result = _score_dictionary_signals([
            _make_geom(0, [], x=0),
            _make_geom(1, [], x=300),
        ])
        assert result["is_dictionary"] is False
|
||||
|
||||
|
||||
class TestClassifyDictionaryColumns:
    """Test _classify_dictionary_columns with dictionary-detected data."""

    def test_assigns_article_and_headword(self):
        """When dictionary detected, assigns column_article and column_headword."""
        article_words = _make_words(
            ["der", "die", "das", "der", "die", "das", "der", "die", "das", "der"],
            x=0,
        )
        headword_words = _make_words([
            "Zahl", "Zahn", "zart", "Zauber", "Zaun",
            "Zeichen", "zeigen", "Zeit", "Zelt", "Zentrum",
        ])
        translation_words = _make_words(
            ["number", "tooth", "tender", "magic", "fence",
             "sign", "to show", "time", "tent", "centre"],
            x=400,
        )
        geoms = [
            _make_geom(0, article_words, x=0, width=50, width_ratio=0.04),
            _make_geom(1, headword_words, x=80, width=200, width_ratio=0.15),
            _make_geom(2, translation_words, x=400, width=200, width_ratio=0.15),
        ]

        # Precondition: these columns must be recognized as a dictionary page.
        dict_signals = _score_dictionary_signals(geoms)
        assert dict_signals["is_dictionary"] is True

        lang_scores = [_score_language(g.words) for g in geoms]
        regions = _classify_dictionary_columns(geoms, dict_signals, lang_scores, 1000)

        assert regions is not None
        types = [r.type for r in regions]
        assert "column_article" in types, f"Expected column_article in {types}"
        assert "column_headword" in types, f"Expected column_headword in {types}"
        # Every region must be tagged with classification_method='dictionary'.
        for region in regions:
            assert region.classification_method == "dictionary"

    def test_returns_none_when_not_dictionary(self):
        """Should return None when dict_signals says not a dictionary."""
        geoms = [
            _make_geom(0, _make_words(["cat", "dog"]), x=0),
            _make_geom(1, _make_words(["Katze", "Hund"]), x=300),
        ]
        lang_scores = [_score_language(g.words) for g in geoms]
        negative_signals = {"is_dictionary": False, "confidence": 0.1}

        result = _classify_dictionary_columns(geoms, negative_signals, lang_scores, 1000)
        assert result is None
|
||||
Reference in New Issue
Block a user