fix: move column expansion AFTER sub-column split

The narrow-column expansion was running inside detect_column_geometry() on the 4 main columns, but the narrowest columns (marker ~14px, page_ref ~93px) are only created afterwards by _detect_sub_columns(). Extract expand_narrow_columns() as a standalone function and call it after sub-column splitting in the columns API endpoint.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-04 10:07:40 +01:00
parent e426de937c
commit 9dd77ab54a
2 changed files with 84 additions and 65 deletions
@@ -1883,74 +1883,89 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
        logger.info(f"ColumnGeometry: {len(geometries)} columns after phantom filter: "
                    f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")

-    # --- Step 10: Expand narrow columns into adjacent gaps ---
-    # Narrow columns (marker, page_ref, < 10% width) often lose content at
-    # image edges due to residual shear.  Expand them into the gap toward
-    # the neighbouring column, but never past 40 % of the gap or past the
-    # nearest word in the neighbour.
-    _NARROW_THRESHOLD_PCT = 10.0  # columns below this % of content_w are "narrow"
-    _GAP_CLAIM_RATIO = 0.40       # narrow col may claim up to 40 % of the gap
-    _MIN_WORD_MARGIN = 4          # always keep 4 px between col edge and nearest word
-
-    if len(geometries) >= 2:
-        for i, g in enumerate(geometries):
-            col_pct = g.width / content_w * 100 if content_w > 0 else 100
-            if col_pct >= _NARROW_THRESHOLD_PCT:
-                continue  # not narrow — skip
-
-            expanded = False
-
-            # --- try expanding to the LEFT (into gap with left neighbor) ---
-            if i > 0:
-                left_nb = geometries[i - 1]
-                gap_left = g.x - (left_nb.x + left_nb.width)
-                if gap_left > _MIN_WORD_MARGIN * 2:
-                    # Find nearest word in left neighbor (right edge)
-                    nb_right_rel = (left_nb.x + left_nb.width) - left_x
-                    nb_words_right = [wd['left'] + wd.get('width', 0)
-                                      for wd in left_nb.words]
-                    max_word_right = max(nb_words_right) if nb_words_right else (nb_right_rel - 20)
-                    # max_word_right is relative to left_x
-                    safe_left_abs = left_x + max_word_right + _MIN_WORD_MARGIN
-                    max_expand = int(gap_left * _GAP_CLAIM_RATIO)
-                    new_x = max(safe_left_abs, g.x - max_expand)
-                    if new_x < g.x:
-                        delta = g.x - new_x
-                        g.width += delta
-                        g.x = new_x
-                        expanded = True
-
-            # --- try expanding to the RIGHT (into gap with right neighbor) ---
-            if i + 1 < len(geometries):
-                right_nb = geometries[i + 1]
-                gap_right = right_nb.x - (g.x + g.width)
-                if gap_right > _MIN_WORD_MARGIN * 2:
-                    # Find nearest word in right neighbor (left edge)
-                    nb_words_left = [wd['left'] for wd in right_nb.words]
-                    min_word_left_rel = min(nb_words_left) if nb_words_left else ((right_nb.x - left_x) + 20)
-                    safe_right_abs = left_x + min_word_left_rel - _MIN_WORD_MARGIN
-                    max_expand = int(gap_right * _GAP_CLAIM_RATIO)
-                    new_right = min(safe_right_abs, g.x + g.width + max_expand)
-                    if new_right > g.x + g.width:
-                        g.width = new_right - g.x
-                        expanded = True
-
-            if expanded:
-                # Re-assign words to this expanded column
-                col_left_rel = g.x - left_x
-                col_right_rel = col_left_rel + g.width
-                g.words = [wd for wd in word_dicts
-                           if col_left_rel <= wd['left'] < col_right_rel]
-                g.word_count = len(g.words)
-                g.width_ratio = g.width / content_w if content_w > 0 else 0.0
-                logger.info(
-                    "ColumnGeometry: expanded narrow col %d "
-                    "(%.1f%% → %.1f%%) x=%d w=%d",
-                    i, col_pct, g.width / content_w * 100, g.x, g.width)
-
    return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)


+def expand_narrow_columns(
+    geometries: List[ColumnGeometry],
+    content_w: int,
+    left_x: int,
+    word_dicts: List[Dict],
+) -> List[ColumnGeometry]:
+    """Expand narrow columns into adjacent whitespace gaps.
+
+    Narrow columns (marker, page_ref, < 10% content width) often lose
+    content at image edges due to residual shear.  This expands them toward
+    the neighbouring column, but never past 40% of the gap or past the
+    nearest word in the neighbour.
+
+    Must be called AFTER _detect_sub_columns() so that sub-column splits
+    (which create the narrowest columns) have already happened.
+
+    Each column's ``x``/``width`` is compared against ``left_x + wd['left']``,
+    i.e. the code treats column positions as absolute and word ``left``
+    values as offsets from ``left_x``.  Neighbours are taken to be the
+    entries at ``i - 1`` and ``i + 1``, so the list is presumably ordered
+    left-to-right -- NOTE(review): confirm against the caller.
+
+    Args:
+        geometries: Columns to inspect.  Narrow entries are modified IN
+            PLACE (``x``, ``width``, ``words``, ``word_count``,
+            ``width_ratio``); wide entries are left untouched.
+        content_w: Width used as the 100% reference for the narrow test.
+            If it is <= 0 every column is treated as 100% wide, so
+            nothing is expanded.
+        left_x: Offset added to word ``left`` values to compare them with
+            column ``x`` positions.
+        word_dicts: Full word list from which an expanded column's words
+            are re-collected.  Each dict must have a ``'left'`` key;
+            ``'width'`` is optional and defaults to 0.
+
+    Returns:
+        The same ``geometries`` list object that was passed in.
+    """
+    _NARROW_THRESHOLD_PCT = 10.0  # columns below this % of content_w are "narrow"
+    _GAP_CLAIM_RATIO = 0.40       # a narrow column may claim at most 40% of a gap
+    _MIN_WORD_MARGIN = 4          # keep 4 px between column edge and nearest neighbour word
+
+    # With fewer than two columns there is no neighbour, hence no gap to claim.
+    if len(geometries) < 2:
+        return geometries
+
+    # Columns are mutated in place, so a later column measures its gaps
+    # against neighbours that may already have been expanded in this loop.
+    for i, g in enumerate(geometries):
+        # A non-positive content_w yields 100%, which fails the narrow test
+        # below and also keeps the later divisions by content_w unreachable.
+        col_pct = g.width / content_w * 100 if content_w > 0 else 100
+        if col_pct >= _NARROW_THRESHOLD_PCT:
+            continue
+
+        expanded = False
+        orig_pct = col_pct  # remembered only for the log line below
+
+        # --- try expanding to the LEFT ---
+        if i > 0:
+            left_nb = geometries[i - 1]
+            gap_left = g.x - (left_nb.x + left_nb.width)
+            # Only bother when the gap is wider than a margin on both sides.
+            if gap_left > _MIN_WORD_MARGIN * 2:
+                # Right edges of the left neighbour's words (relative to left_x).
+                nb_words_right = [wd['left'] + wd.get('width', 0)
+                                  for wd in left_nb.words]
+                if nb_words_right:
+                    safe_left_abs = left_x + max(nb_words_right) + _MIN_WORD_MARGIN
+                else:
+                    # No words in the neighbour: fall back to its geometric right edge.
+                    safe_left_abs = left_nb.x + left_nb.width + _MIN_WORD_MARGIN
+                max_expand = int(gap_left * _GAP_CLAIM_RATIO)
+                # The larger (right-most) limit wins: stay clear of the
+                # neighbour's words AND within the gap-claim budget.
+                new_x = max(safe_left_abs, g.x - max_expand)
+                if new_x < g.x:
+                    # Grow the width by the same amount so the right edge stays put.
+                    delta = g.x - new_x
+                    g.width += delta
+                    g.x = new_x
+                    expanded = True
+
+        # --- try expanding to the RIGHT ---
+        # Uses g.x / g.width as possibly updated by the left expansion; the
+        # right edge itself is unchanged by that step.
+        if i + 1 < len(geometries):
+            right_nb = geometries[i + 1]
+            gap_right = right_nb.x - (g.x + g.width)
+            if gap_right > _MIN_WORD_MARGIN * 2:
+                # Left edges of the right neighbour's words (relative to left_x).
+                nb_words_left = [wd['left'] for wd in right_nb.words]
+                if nb_words_left:
+                    safe_right_abs = left_x + min(nb_words_left) - _MIN_WORD_MARGIN
+                else:
+                    # No words in the neighbour: fall back to its geometric left edge.
+                    safe_right_abs = right_nb.x - _MIN_WORD_MARGIN
+                max_expand = int(gap_right * _GAP_CLAIM_RATIO)
+                # The smaller (left-most) limit wins, mirroring the left case.
+                new_right = min(safe_right_abs, g.x + g.width + max_expand)
+                if new_right > g.x + g.width:
+                    g.width = new_right - g.x
+                    expanded = True
+
+        if expanded:
+            # Re-collect this column's words from the full word list using the
+            # new horizontal span.  Membership is decided by the word's left
+            # edge only; the neighbours' word lists are not modified.
+            col_left_rel = g.x - left_x
+            col_right_rel = col_left_rel + g.width
+            g.words = [wd for wd in word_dicts
+                       if col_left_rel <= wd['left'] < col_right_rel]
+            g.word_count = len(g.words)
+            g.width_ratio = g.width / content_w if content_w > 0 else 0.0
+            logger.info(
+                "ExpandNarrowCols: col %d (%.1f%% → %.1f%%) x=%d w=%d words=%d",
+                i, orig_pct, g.width / content_w * 100, g.x, g.width, g.word_count)
+
+    return geometries
+
+
 # =============================================================================
 # Row Geometry Detection (horizontal whitespace-gap analysis)
 # =============================================================================
@@ -51,6 +51,7 @@ from cv_vocab_pipeline import (
    deskew_image_by_word_alignment,
    detect_column_geometry,
    detect_row_geometry,
+    expand_narrow_columns,
    _apply_shear,
    dewarp_image,
    dewarp_image_manual,
@@ -802,6 +803,9 @@ async def detect_columns(session_id: str):
        geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
                                          top_y=top_y, header_y=header_y, footer_y=footer_y)

+        # Expand narrow columns (sub-columns are often very narrow)
+        geometries = expand_narrow_columns(geometries, content_w, left_x, word_dicts)
+
        # Phase B: Content-based classification
        regions = classify_column_types(geometries, content_w, top_y, w, h, bottom_y,
                                        left_x=left_x, right_x=right_x, inv=inv)