refactor(ocr-pipeline): use left-edge alignment approach for sub-column detection

Replace gap-based splitting with an alignment-bin approach: cluster word left-edges within an 8px tolerance, find the leftmost bin holding >= 10% of the column's confident words (and at least 3 words) as the true column start, and split off any words to its left as a sub-column, provided there are at least 2 such words and they make up < 35% of the column's words. This correctly handles both page references ("p.59") and misread exclamation marks ("!" → "I") even when the pixel gap is small.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-02 18:56:38 +01:00
parent f13116345b
commit 7252f9a956
2 changed files with 87 additions and 97 deletions
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -1037,12 +1037,16 @@ def _detect_columns_by_clustering(
 def _detect_sub_columns(
    geometries: List[ColumnGeometry],
    content_w: int,
+    _edge_tolerance: int = 8,
+    _min_col_start_ratio: float = 0.10,
 ) -> List[ColumnGeometry]:
-    """Split columns that contain internal sub-columns based on left-edge clustering.
+    """Split columns that contain internal sub-columns based on left-edge alignment.

-    Detects cases where a minority of words in a column are left-aligned at a
-    different position than the majority (e.g. page references "p.59" next to
-    vocabulary words).
+    For each column, clusters word left-edges into alignment bins (within
+    ``_edge_tolerance`` px).  The leftmost bin whose word count reaches
+    ``_min_col_start_ratio`` of the column total is treated as the true column
+    start.  Any words to the left of that bin form a sub-column, provided they
+    number >= 2 and < 35 % of total.

    Returns a new list of ColumnGeometry — potentially longer than the input.
    """
@@ -1057,114 +1061,86 @@ def _detect_sub_columns(
            continue

        # Collect left-edges of confident words
-        left_edges: List[int] = []
-        for w in geo.words:
-            if w.get('conf', 0) >= 30:
-                left_edges.append(w['left'])
-
-        if len(left_edges) < 3:
+        confident = [w for w in geo.words if w.get('conf', 0) >= 30]
+        if len(confident) < 3:
            result.append(geo)
            continue

-        # Sort and find the largest gap between consecutive left-edge values
-        sorted_edges = sorted(left_edges)
-        best_gap = 0
-        best_gap_pos = 0  # split point: values <= best_gap_pos go left
-        for i in range(len(sorted_edges) - 1):
-            gap = sorted_edges[i + 1] - sorted_edges[i]
-            if gap > best_gap:
-                best_gap = gap
-                best_gap_pos = (sorted_edges[i] + sorted_edges[i + 1]) // 2
+        # --- Cluster left-edges into alignment bins ---
+        sorted_edges = sorted(w['left'] for w in confident)
+        bins: List[Tuple[int, int, int, int]] = []  # (center, count, min_edge, max_edge)
+        cur = [sorted_edges[0]]
+        for i in range(1, len(sorted_edges)):
+            if sorted_edges[i] - cur[-1] <= _edge_tolerance:
+                cur.append(sorted_edges[i])
+            else:
+                bins.append((sum(cur) // len(cur), len(cur), min(cur), max(cur)))
+                cur = [sorted_edges[i]]
+        bins.append((sum(cur) // len(cur), len(cur), min(cur), max(cur)))

-        # Gap must be significant relative to column width
-        min_gap = max(15, int(geo.width * 0.08))
-        if best_gap < min_gap:
+        # --- Find the leftmost bin qualifying as a real column start ---
+        total = len(confident)
+        min_count = max(3, int(total * _min_col_start_ratio))
+        col_start_bin = None
+        for b in bins:
+            if b[1] >= min_count:
+                col_start_bin = b
+                break
+
+        if col_start_bin is None:
            result.append(geo)
            continue

-        # Split words into left (minority candidate) and right groups
-        left_words = [w for w in geo.words if w.get('conf', 0) >= 30 and w['left'] <= best_gap_pos]
-        right_words = [w for w in geo.words if w.get('conf', 0) >= 30 and w['left'] > best_gap_pos]
+        # Words to the left of the column-start bin are sub-column candidates
+        split_threshold = col_start_bin[2] - _edge_tolerance
+        sub_words = [w for w in geo.words if w['left'] < split_threshold]
+        main_words = [w for w in geo.words if w['left'] >= split_threshold]

-        # Also include low-conf words by position
-        for w in geo.words:
-            if w.get('conf', 0) < 30:
-                if w['left'] <= best_gap_pos:
-                    left_words.append(w)
-                else:
-                    right_words.append(w)
-
-        total = len(left_words) + len(right_words)
-        if total == 0:
+        if len(sub_words) < 2 or len(sub_words) / len(geo.words) >= 0.35:
            result.append(geo)
            continue

-        # Determine minority/majority
-        if len(left_words) <= len(right_words):
-            minority, majority = left_words, right_words
-            minority_is_left = True
-        else:
-            minority, majority = right_words, left_words
-            minority_is_left = False
+        # --- Build two sub-column geometries ---
+        max_sub_left = max(w['left'] for w in sub_words)
+        split_x = (max_sub_left + col_start_bin[2]) // 2

-        # Check minority constraints
-        minority_ratio = len(minority) / total
-        if minority_ratio >= 0.35 or len(minority) < 2:
-            result.append(geo)
-            continue
+        sub_x = geo.x
+        sub_width = split_x - geo.x
+        main_x = split_x
+        main_width = (geo.x + geo.width) - split_x

-        # Build two sub-column geometries
-        if minority_is_left:
-            # Minority is left sub-column, majority is right
-            sub_x = geo.x
-            sub_width = best_gap_pos - geo.x
-            main_x = best_gap_pos
-            main_width = (geo.x + geo.width) - best_gap_pos
-        else:
-            # Minority is right sub-column, majority is left
-            main_x = geo.x
-            main_width = best_gap_pos - geo.x
-            sub_x = best_gap_pos
-            sub_width = (geo.x + geo.width) - best_gap_pos
-
-        # Sanity check widths
        if sub_width <= 0 or main_width <= 0:
            result.append(geo)
            continue

        sub_geo = ColumnGeometry(
-            index=0,  # will be re-indexed below
+            index=0,
            x=sub_x,
            y=geo.y,
            width=sub_width,
            height=geo.height,
-            word_count=len(minority),
-            words=minority,
+            word_count=len(sub_words),
+            words=sub_words,
            width_ratio=sub_width / content_w if content_w > 0 else 0.0,
        )
        main_geo = ColumnGeometry(
-            index=0,  # will be re-indexed below
+            index=0,
            x=main_x,
            y=geo.y,
            width=main_width,
            height=geo.height,
-            word_count=len(majority),
-            words=majority,
+            word_count=len(main_words),
+            words=main_words,
            width_ratio=main_width / content_w if content_w > 0 else 0.0,
        )

-        # Insert in left-to-right order
-        if sub_x < main_x:
-            result.append(sub_geo)
-            result.append(main_geo)
-        else:
-            result.append(main_geo)
-            result.append(sub_geo)
+        result.append(sub_geo)
+        result.append(main_geo)

        logger.info(
-            f"SubColumnSplit: column idx={geo.index} split at gap={best_gap}px, "
-            f"minority={len(minority)} words (left={minority_is_left}), "
-            f"majority={len(majority)} words"
+            f"SubColumnSplit: column idx={geo.index} split at x={split_x}, "
+            f"sub={len(sub_words)} words (left), main={len(main_words)} words, "
+            f"col_start_bin=({col_start_bin[0]}, n={col_start_bin[1]})"
        )

    # Re-index by left-to-right order