diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index 6c8eeaa..4e25f67 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -1312,9 +1312,24 @@ def detect_row_geometry(
         logger.warning("detect_row_geometry: content area too small")
         return []
 
-    # --- Step 1: Horizontal projection profile ---
+    # --- Step 1: Horizontal projection profile (text-only, images masked out) ---
     content_strip = inv[top_y:bottom_y, left_x:right_x]
-    h_proj = np.sum(content_strip, axis=1).astype(float)
+
+    # Build a word-coverage mask so that image regions (high ink density but no
+    # Tesseract words) are ignored. Only pixels within/near word bounding boxes
+    # contribute to the projection. This prevents large illustrations from
+    # merging multiple vocabulary rows into one.
+    WORD_PAD_Y = max(4, content_h // 300)  # small vertical padding around words
+    word_mask = np.zeros((content_h, content_w), dtype=np.uint8)
+    for wd in word_dicts:
+        y1 = max(0, wd['top'] - WORD_PAD_Y)
+        y2 = min(content_h, wd['top'] + wd['height'] + WORD_PAD_Y)
+        x1 = max(0, wd['left'])
+        x2 = min(content_w, wd['left'] + wd['width'])
+        word_mask[y1:y2, x1:x2] = 255
+
+    masked_strip = cv2.bitwise_and(content_strip, word_mask)
+    h_proj = np.sum(masked_strip, axis=1).astype(float)
     h_proj_norm = h_proj / (content_w * 255) if content_w > 0 else h_proj
 
     # --- Step 2: Smoothing + threshold ---