From 1162eac7b48442a5b638c3673982ca4aa41002c4 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Sun, 15 Mar 2026 00:10:29 +0100
Subject: [PATCH] fix: use group-start positions for column detection, not all
 word left-edges

Only cluster left-edges of words that begin a new group within their row
(first word or preceded by a large gap). This filters out mid-phrase
word positions (IPA transcriptions, second words in multi-word entries)
that were causing too many false columns.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/grid_editor_api.py | 86 ++++++++++++++++------
 1 file changed, 63 insertions(+), 23 deletions(-)

diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py
index 4e1173f..0898b19 100644
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -43,16 +43,17 @@ def _cluster_columns_by_alignment(
 ) -> List[Dict[str, Any]]:
     """Detect columns by clustering left-edge alignment across rows.
 
-    Algorithm (adapted from cv_layout._detect_columns_by_clustering):
-      1. Tag each word with its row index
-      2. Cluster word left-edges by X-proximity
-      3. Count distinct rows per cluster (Y-coverage)
-      4. Keep clusters with sufficient row coverage
-      5. Merge nearby clusters
-      6. Build column boundaries
+    Hybrid approach:
+      1. Group words by row, find "group start" positions within each row
+         (words preceded by a large gap or first word in row)
+      2. Cluster group-start left-edges by X-proximity across rows
+      3. Filter by row coverage (how many rows have a group start here)
+      4. Merge nearby clusters
+      5. Build column boundaries
 
-    With real OCR words (from Kombi mode) this is more reliable than the
-    original ink-based version because left-edge positions are precise.
+    This filters out mid-phrase word positions (e.g. IPA transcriptions,
+    second words in multi-word entries) by only considering positions
+    where a new word group begins within a row.
     """
     if not words or not rows:
         return []
@@ -61,26 +62,65 @@ def _cluster_columns_by_alignment(
     if total_rows == 0:
         return []
 
-    # --- Tag each word with its row index ---
-    row_of: Dict[int, int] = {}
+    # --- Group words by row ---
+    row_words: Dict[int, List[Dict]] = {}
     for w in words:
         y_center = w["top"] + w["height"] / 2
         best = min(rows, key=lambda r: abs(r["y_center"] - y_center))
-        row_of[id(w)] = best["index"]
+        row_words.setdefault(best["index"], []).append(w)
 
-    # --- Collect and sort left-edges ---
-    edge_data = sorted(
-        ((w["left"], row_of[id(w)]) for w in words),
-        key=lambda x: x[0],
+    # --- Compute adaptive gap threshold for group-start detection ---
+    all_gaps: List[float] = []
+    for ri, rw_list in row_words.items():
+        sorted_rw = sorted(rw_list, key=lambda w: w["left"])
+        for i in range(len(sorted_rw) - 1):
+            right = sorted_rw[i]["left"] + sorted_rw[i]["width"]
+            gap = sorted_rw[i + 1]["left"] - right
+            if gap > 0:
+                all_gaps.append(gap)
+
+    if all_gaps:
+        sorted_gaps = sorted(all_gaps)
+        median_gap = sorted_gaps[len(sorted_gaps) // 2]
+        heights = [w["height"] for w in words if w.get("height", 0) > 0]
+        median_h = sorted(heights)[len(heights) // 2] if heights else 25
+        # Column boundary: gap must reach max(3× median gap, 1.5× median word height, 30)
+        gap_threshold = max(median_gap * 3, median_h * 1.5, 30)
+    else:
+        gap_threshold = 50
+
+    # --- Find group-start positions (left-edges that begin a new column) ---
+    start_positions: List[tuple] = []  # (left_edge, row_index)
+    for ri, rw_list in row_words.items():
+        sorted_rw = sorted(rw_list, key=lambda w: w["left"])
+        # First word in row is always a group start
+        start_positions.append((sorted_rw[0]["left"], ri))
+        for i in range(1, len(sorted_rw)):
+            right_prev = sorted_rw[i - 1]["left"] + sorted_rw[i - 1]["width"]
+            gap = sorted_rw[i]["left"] - right_prev
+            if gap >= gap_threshold:
+                start_positions.append((sorted_rw[i]["left"], ri))
+
+    start_positions.sort(key=lambda x: x[0])
+
+    logger.info(
+        "alignment columns: %d group-start positions from %d words "
+        "(gap_threshold=%.0f, %d rows)",
+        len(start_positions), len(words), gap_threshold, total_rows,
     )
 
-    # --- Cluster by X-proximity ---
+    if not start_positions:
+        x_min = min(w["left"] for w in words)
+        x_max = max(w["left"] + w["width"] for w in words)
+        return [{"index": 0, "type": "column_text", "x_min": x_min, "x_max": x_max}]
+
+    # --- Cluster group-start positions by X-proximity ---
     tolerance = max(10, int(zone_w * 0.01))
     clusters: List[Dict[str, Any]] = []
-    cur_edges = [edge_data[0][0]]
-    cur_rows = {edge_data[0][1]}
+    cur_edges = [start_positions[0][0]]
+    cur_rows = {start_positions[0][1]}
 
-    for left, row_idx in edge_data[1:]:
+    for left, row_idx in start_positions[1:]:
         if left - cur_edges[-1] <= tolerance:
             cur_edges.append(left)
             cur_rows.add(row_idx)
@@ -105,8 +145,8 @@ def _cluster_columns_by_alignment(
     })
 
     # --- Filter by row coverage ---
-    MIN_COVERAGE_PRIMARY = 0.15
-    MIN_COVERAGE_SECONDARY = 0.08
+    MIN_COVERAGE_PRIMARY = 0.20
+    MIN_COVERAGE_SECONDARY = 0.12
     MIN_WORDS_SECONDARY = 3
     MIN_DISTINCT_ROWS = 2
 
@@ -126,7 +166,7 @@ def _cluster_columns_by_alignment(
     significant = sorted(primary + secondary, key=lambda c: c["mean_x"])
 
     logger.info(
-        "alignment columns: %d clusters total, %d primary, %d secondary → %d significant",
+        "alignment columns: %d clusters, %d primary, %d secondary → %d significant",
         len(clusters), len(primary), len(secondary), len(significant),
     )