From f3d61a939434b2e77ff4bc725419cc99960e6a38 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Tue, 3 Mar 2026 17:08:03 +0100
Subject: [PATCH] fix: extend initial Tesseract scan to full image width for
 word detection
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

content_roi was cropped to [left_x:right_x] — the detected content boundary.
Words at the right edge of the last column (beyond right_x) were never
found in the initial scan, so they remained missing even after the column
geometry was extended to full image width (w).

Fix: crop to [left_x:w] so that all words, including those near the right
margin, are detected and assigned correctly to the last column.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/cv_vocab_pipeline.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index 0613c95..e7806e9 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -1460,7 +1460,10 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
                 f"y=[{top_y}..{bottom_y}] ({content_h}px)")
 
     # --- Step 2: Get word bounding boxes from Tesseract ---
-    content_roi = dewarped_bgr[top_y:bottom_y, left_x:right_x]
+    # Crop from left_x to full image width (not right_x) so words at the right
+    # edge of the last column are included even if they extend past the detected
+    # content boundary (right_x).
+    content_roi = dewarped_bgr[top_y:bottom_y, left_x:w]
     pil_img = Image.fromarray(cv2.cvtColor(content_roi, cv2.COLOR_BGR2RGB))
 
     try: