diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index ae91187..155dc6e 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -1037,6 +1037,7 @@ def _detect_columns_by_clustering( def _detect_sub_columns( geometries: List[ColumnGeometry], content_w: int, + left_x: int = 0, _edge_tolerance: int = 8, _min_col_start_ratio: float = 0.10, ) -> List[ColumnGeometry]: @@ -1048,6 +1049,10 @@ def _detect_sub_columns( start. Any words to the left of that bin form a sub-column, provided they number >= 2 and < 35 % of total. + Word ``left`` values are relative to the content ROI (offset by *left_x*), + while ``ColumnGeometry.x`` is in absolute image coordinates. *left_x* + bridges the two coordinate systems. + Returns a new list of ColumnGeometry — potentially longer than the input. """ if content_w <= 0: @@ -1101,13 +1106,16 @@ def _detect_sub_columns( continue # --- Build two sub-column geometries --- + # Word 'left' values are relative to left_x; geo.x is absolute. + # Convert the split position from relative to absolute coordinates. max_sub_left = max(w['left'] for w in sub_words) - split_x = (max_sub_left + col_start_bin[2]) // 2 + split_rel = (max_sub_left + col_start_bin[2]) // 2 + split_abs = split_rel + left_x sub_x = geo.x - sub_width = split_x - geo.x - main_x = split_x - main_width = (geo.x + geo.width) - split_x + sub_width = split_abs - geo.x + main_x = split_abs + main_width = (geo.x + geo.width) - split_abs if sub_width <= 0 or main_width <= 0: result.append(geo) @@ -1138,8 +1146,9 @@ def _detect_sub_columns( result.append(main_geo) logger.info( - f"SubColumnSplit: column idx={geo.index} split at x={split_x}, " - f"sub={len(sub_words)} words (left), main={len(main_words)} words, " + f"SubColumnSplit: column idx={geo.index} split at abs_x={split_abs} " + f"(rel={split_rel}), sub={len(sub_words)} words, " + f"main={len(main_words)} words, " f"col_start_bin=({col_start_bin[0]}, n={col_start_bin[1]})" ) @@ -2846,7 +2855,7 @@ def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Li content_w = right_x - left_x # Split sub-columns (e.g. page references) before classification - geometries = _detect_sub_columns(geometries, content_w) + geometries = _detect_sub_columns(geometries, content_w, left_x=left_x) # Phase B: Content-based classification regions = classify_column_types(geometries, content_w, top_y, w, h, bottom_y, diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py index e5f83d2..2dff162 100644 --- a/klausur-service/backend/ocr_pipeline_api.py +++ b/klausur-service/backend/ocr_pipeline_api.py @@ -700,7 +700,7 @@ async def detect_columns(session_id: str): cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y) # Split sub-columns (e.g. page references) before classification - geometries = _detect_sub_columns(geometries, content_w) + geometries = _detect_sub_columns(geometries, content_w, left_x=left_x) # Phase B: Content-based classification regions = classify_column_types(geometries, content_w, top_y, w, h, bottom_y, diff --git a/klausur-service/backend/tests/test_cv_vocab_pipeline.py b/klausur-service/backend/tests/test_cv_vocab_pipeline.py index b95164b..1752334 100644 --- a/klausur-service/backend/tests/test_cv_vocab_pipeline.py +++ b/klausur-service/backend/tests/test_cv_vocab_pipeline.py @@ -1307,6 +1307,29 @@ class TestSubColumnDetection: assert len(result) == 1 + def test_sub_column_split_with_left_x_offset(self): + """Word 'left' values are relative to left_x; geo.x is absolute. + + Real-world scenario: left_x=195, EN column at geo.x=310. + Page refs at relative left=115-157, vocab words at relative left=216. + Without left_x, split_x would be ~202 (< geo.x=310) → negative width → no split. + With left_x=195, split_abs = 202 + 195 = 397, which is between geo.x(310) + and geo.x+geo.width(748) → valid split. + """ + content_w = 1469 + left_x = 195 + page_refs = [self._make_word(115, "p.59"), self._make_word(157, "p.60"), + self._make_word(157, "p.61")] + vocab = [self._make_word(216, f"word{i}") for i in range(40)] + all_words = page_refs + vocab + geo = self._make_geo(x=310, width=438, words=all_words, content_w=content_w) + + result = _detect_sub_columns([geo], content_w, left_x=left_x) + + assert len(result) == 2, f"Expected 2 columns, got {len(result)}" + assert result[0].word_count == 3 + assert result[1].word_count == 40 + class TestCellsToVocabEntriesPageRef: """Test that page_ref cells are mapped to source_page field."""