From 3904ddb4933f444bf3443d02f78c24b27c303320 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Mon, 2 Mar 2026 19:16:13 +0100
Subject: [PATCH] fix(sub-columns): convert relative word positions to absolute
 coords for split

Word 'left' values in ColumnGeometry.words are relative to the left edge
of the content ROI (left_x), but geo.x is in absolute image coordinates.
The split position was computed from relative word positions and then
compared against absolute geo.x, resulting in negative widths and no
splits on real data. Pass left_x through to _detect_sub_columns and add
it to the relative split position, so that the split and geo.x are both
in absolute coordinates. left_x defaults to 0, so callers that do not
pass it keep the previous behaviour.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/cv_vocab_pipeline.py  | 23 +++++++++++++------
 klausur-service/backend/ocr_pipeline_api.py   |  2 +-
 .../backend/tests/test_cv_vocab_pipeline.py   | 23 +++++++++++++++++++
 3 files changed, 40 insertions(+), 8 deletions(-)

diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index ae91187..155dc6e 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -1037,6 +1037,7 @@ def _detect_columns_by_clustering(
 def _detect_sub_columns(
     geometries: List[ColumnGeometry],
     content_w: int,
+    left_x: int = 0,
     _edge_tolerance: int = 8,
     _min_col_start_ratio: float = 0.10,
 ) -> List[ColumnGeometry]:
@@ -1048,6 +1049,10 @@ def _detect_sub_columns(
     start.  Any words to the left of that bin form a sub-column, provided they
     number >= 2 and < 35 % of total.
 
+    Word ``left`` values are relative to the content ROI (offset by *left_x*),
+    while ``ColumnGeometry.x`` is in absolute image coordinates.  *left_x*
+    bridges the two coordinate systems.
+
     Returns a new list of ColumnGeometry — potentially longer than the input.
     """
     if content_w <= 0:
@@ -1101,13 +1106,16 @@ def _detect_sub_columns(
             continue
 
         # --- Build two sub-column geometries ---
+        # Word 'left' values are relative to left_x; geo.x is absolute.
+        # Convert the split position from relative to absolute coordinates.
         max_sub_left = max(w['left'] for w in sub_words)
-        split_x = (max_sub_left + col_start_bin[2]) // 2
+        split_rel = (max_sub_left + col_start_bin[2]) // 2
+        split_abs = split_rel + left_x
 
         sub_x = geo.x
-        sub_width = split_x - geo.x
-        main_x = split_x
-        main_width = (geo.x + geo.width) - split_x
+        sub_width = split_abs - geo.x
+        main_x = split_abs
+        main_width = (geo.x + geo.width) - split_abs
 
         if sub_width <= 0 or main_width <= 0:
             result.append(geo)
@@ -1138,8 +1146,9 @@ def _detect_sub_columns(
         result.append(main_geo)
 
         logger.info(
-            f"SubColumnSplit: column idx={geo.index} split at x={split_x}, "
-            f"sub={len(sub_words)} words (left), main={len(main_words)} words, "
+            f"SubColumnSplit: column idx={geo.index} split at abs_x={split_abs} "
+            f"(rel={split_rel}), sub={len(sub_words)} words, "
+            f"main={len(main_words)} words, "
             f"col_start_bin=({col_start_bin[0]}, n={col_start_bin[1]})"
         )
 
@@ -2846,7 +2855,7 @@ def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Li
     content_w = right_x - left_x
 
     # Split sub-columns (e.g. page references) before classification
-    geometries = _detect_sub_columns(geometries, content_w)
+    geometries = _detect_sub_columns(geometries, content_w, left_x=left_x)
 
     # Phase B: Content-based classification
     regions = classify_column_types(geometries, content_w, top_y, w, h, bottom_y,
diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py
index e5f83d2..2dff162 100644
--- a/klausur-service/backend/ocr_pipeline_api.py
+++ b/klausur-service/backend/ocr_pipeline_api.py
@@ -700,7 +700,7 @@ async def detect_columns(session_id: str):
         cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)
 
         # Split sub-columns (e.g. page references) before classification
-        geometries = _detect_sub_columns(geometries, content_w)
+        geometries = _detect_sub_columns(geometries, content_w, left_x=left_x)
 
         # Phase B: Content-based classification
         regions = classify_column_types(geometries, content_w, top_y, w, h, bottom_y,
diff --git a/klausur-service/backend/tests/test_cv_vocab_pipeline.py b/klausur-service/backend/tests/test_cv_vocab_pipeline.py
index b95164b..1752334 100644
--- a/klausur-service/backend/tests/test_cv_vocab_pipeline.py
+++ b/klausur-service/backend/tests/test_cv_vocab_pipeline.py
@@ -1307,6 +1307,29 @@ class TestSubColumnDetection:
 
         assert len(result) == 1
 
+    def test_sub_column_split_with_left_x_offset(self):
+        """Word 'left' values are relative to left_x; geo.x is absolute.
+
+        Real-world scenario: left_x=195, EN column at geo.x=310.
+        Page refs at relative left=115-157, vocab words at relative left=216.
+        Without left_x, split_x would be ~202 (< geo.x=310) → negative width → no split.
+        With left_x=195, split_abs = 202 + 195 = 397, which is between geo.x(310)
+        and geo.x+geo.width(748) → valid split.
+        """
+        content_w = 1469
+        left_x = 195
+        page_refs = [self._make_word(115, "p.59"), self._make_word(157, "p.60"),
+                     self._make_word(157, "p.61")]
+        vocab = [self._make_word(216, f"word{i}") for i in range(40)]
+        all_words = page_refs + vocab
+        geo = self._make_geo(x=310, width=438, words=all_words, content_w=content_w)
+
+        result = _detect_sub_columns([geo], content_w, left_x=left_x)
+
+        assert len(result) == 2, f"Expected 2 columns, got {len(result)}"
+        assert result[0].word_count == 3
+        assert result[1].word_count == 40
+
 
 class TestCellsToVocabEntriesPageRef:
     """Test that page_ref cells are mapped to source_page field."""