From 9dd77ab54a3fc503db97460f35acd6925f69d31c Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Wed, 4 Mar 2026 10:07:40 +0100 Subject: [PATCH] fix: move column expansion AFTER sub-column split The narrow column expansion was running inside detect_column_geometry() on the 4 main columns, but the narrowest columns (marker ~14px, page_ref ~93px) are created AFTERWARDS by _detect_sub_columns(). Extracted expand_narrow_columns() as standalone function and call it after sub-column splitting in the columns API endpoint. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_vocab_pipeline.py | 145 ++++++++++--------- klausur-service/backend/ocr_pipeline_api.py | 4 + 2 files changed, 84 insertions(+), 65 deletions(-) diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 38ab170..c8161df 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -1883,74 +1883,89 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt logger.info(f"ColumnGeometry: {len(geometries)} columns after phantom filter: " f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}") - # --- Step 10: Expand narrow columns into adjacent gaps --- - # Narrow columns (marker, page_ref, < 10% width) often lose content at - # image edges due to residual shear. Expand them into the gap toward - # the neighbouring column, but never past 40 % of the gap or past the - # nearest word in the neighbour. 
- _NARROW_THRESHOLD_PCT = 10.0 # columns below this % of content_w are "narrow" - _GAP_CLAIM_RATIO = 0.40 # narrow col may claim up to 40 % of the gap - _MIN_WORD_MARGIN = 4 # always keep 4 px between col edge and nearest word - - if len(geometries) >= 2: - for i, g in enumerate(geometries): - col_pct = g.width / content_w * 100 if content_w > 0 else 100 - if col_pct >= _NARROW_THRESHOLD_PCT: - continue # not narrow — skip - - expanded = False - - # --- try expanding to the LEFT (into gap with left neighbor) --- - if i > 0: - left_nb = geometries[i - 1] - gap_left = g.x - (left_nb.x + left_nb.width) - if gap_left > _MIN_WORD_MARGIN * 2: - # Find nearest word in left neighbor (right edge) - nb_right_rel = (left_nb.x + left_nb.width) - left_x - nb_words_right = [wd['left'] + wd.get('width', 0) - for wd in left_nb.words] - max_word_right = max(nb_words_right) if nb_words_right else (nb_right_rel - 20) - # max_word_right is relative to left_x - safe_left_abs = left_x + max_word_right + _MIN_WORD_MARGIN - max_expand = int(gap_left * _GAP_CLAIM_RATIO) - new_x = max(safe_left_abs, g.x - max_expand) - if new_x < g.x: - delta = g.x - new_x - g.width += delta - g.x = new_x - expanded = True - - # --- try expanding to the RIGHT (into gap with right neighbor) --- - if i + 1 < len(geometries): - right_nb = geometries[i + 1] - gap_right = right_nb.x - (g.x + g.width) - if gap_right > _MIN_WORD_MARGIN * 2: - # Find nearest word in right neighbor (left edge) - nb_words_left = [wd['left'] for wd in right_nb.words] - min_word_left_rel = min(nb_words_left) if nb_words_left else ((right_nb.x - left_x) + 20) - safe_right_abs = left_x + min_word_left_rel - _MIN_WORD_MARGIN - max_expand = int(gap_right * _GAP_CLAIM_RATIO) - new_right = min(safe_right_abs, g.x + g.width + max_expand) - if new_right > g.x + g.width: - g.width = new_right - g.x - expanded = True - - if expanded: - # Re-assign words to this expanded column - col_left_rel = g.x - left_x - col_right_rel = col_left_rel + 
def expand_narrow_columns(
    geometries: List[ColumnGeometry],
    content_w: int,
    left_x: int,
    word_dicts: List[Dict],
) -> List[ColumnGeometry]:
    """Widen narrow columns into the whitespace that separates them from neighbours.

    A column counts as narrow when its width is below 10% of ``content_w``.
    Such columns (marker, page_ref) often lose content at image edges due to
    residual shear, so each one is grown toward its left and right neighbour.
    Growth on either side is limited by two bounds, whichever is tighter:

    * at most 40% of the gap to that neighbour, and
    * a 4 px margin from the neighbour's nearest word (or from the
      neighbour's own edge when it holds no words).

    Gaps of 8 px or less are left untouched.

    Must be called AFTER _detect_sub_columns() so that sub-column splits
    (which create the narrowest columns) have already happened.

    Args:
        geometries: Columns in left-to-right order. Entries are modified in
            place (``x``, ``width``, ``words``, ``word_count``,
            ``width_ratio``).
        content_w: Width of the content area; the basis for percentages.
        left_x: Absolute x offset of the content area. Word ``'left'``
            values are treated as relative to this offset.
        word_dicts: All word boxes; each needs a ``'left'`` key, and
            ``'width'`` is optional (missing means 0).

    Returns:
        The same ``geometries`` list that was passed in.
    """
    narrow_pct_limit = 10.0   # columns below this share of content_w are "narrow"
    gap_share = 0.40          # fraction of a gap a narrow column may claim
    word_margin = 4           # px kept clear between column edge and nearest word
    min_gap = word_margin * 2

    column_count = len(geometries)
    if column_count < 2:
        # A lone column has no neighbouring gap to grow into.
        return geometries

    for idx, col in enumerate(geometries):
        if content_w > 0:
            pct_before = col.width / content_w * 100
        else:
            # Degenerate content width: treat every column as wide.
            pct_before = 100
        if pct_before >= narrow_pct_limit:
            continue

        grew = False

        # Grow leftward, into the gap shared with the previous column.
        if idx > 0:
            prev_col = geometries[idx - 1]
            prev_right = prev_col.x + prev_col.width
            left_gap = col.x - prev_right
            if left_gap > min_gap:
                right_edges = [w['left'] + w.get('width', 0)
                               for w in prev_col.words]
                if right_edges:
                    # Word coordinates are relative to left_x.
                    floor_x = left_x + max(right_edges) + word_margin
                else:
                    floor_x = prev_right + word_margin
                candidate_x = max(floor_x, col.x - int(left_gap * gap_share))
                if candidate_x < col.x:
                    col.width += col.x - candidate_x
                    col.x = candidate_x
                    grew = True

        # Grow rightward, into the gap shared with the next column.
        if idx + 1 < column_count:
            next_col = geometries[idx + 1]
            col_right = col.x + col.width
            right_gap = next_col.x - col_right
            if right_gap > min_gap:
                left_edges = [w['left'] for w in next_col.words]
                if left_edges:
                    ceiling_x = left_x + min(left_edges) - word_margin
                else:
                    ceiling_x = next_col.x - word_margin
                candidate_right = min(ceiling_x,
                                      col_right + int(right_gap * gap_share))
                if candidate_right > col_right:
                    col.width = candidate_right - col.x
                    grew = True

        if not grew:
            continue

        # Re-select this column's words from the full word list, keyed on
        # each word's left edge falling inside the new bounds.
        rel_start = col.x - left_x
        rel_end = rel_start + col.width
        col.words = [w for w in word_dicts
                     if rel_start <= w['left'] < rel_end]
        col.word_count = len(col.words)
        col.width_ratio = col.width / content_w if content_w > 0 else 0.0
        logger.info(
            "ExpandNarrowCols: col %d (%.1f%% → %.1f%%) x=%d w=%d words=%d",
            idx, pct_before, col.width / content_w * 100,
            col.x, col.width, col.word_count)

    return geometries
classify_column_types(geometries, content_w, top_y, w, h, bottom_y, left_x=left_x, right_x=right_x, inv=inv)