From e4aff2b27e386c4c358a4df99b5413de54897607 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Wed, 4 Mar 2026 10:31:19 +0100
Subject: [PATCH] fix: rewrite Method D to measure vertical column drift
 instead of text-line slope
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After deskew, horizontal text lines are already straight (~0° slope).
Method D was measuring this (always ~0°) instead of the actual vertical
shear (column edge drift). This caused it to report 0.112° with 0.96
confidence, overwhelming Method A's correct detection of negative shear.

New Method D groups words by X-position into vertical columns, then
measures how left-edge X drifts with Y position via linear regression.
dx/dy = tan(shear_angle), directly measuring column tilt.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/cv_vocab_pipeline.py | 103 ++++++++++---------
 1 file changed, 56 insertions(+), 47 deletions(-)

diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index c8161df..722cb67 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -627,12 +627,12 @@ def _detect_shear_by_text_lines(img: np.ndarray) -> Dict[str, Any]:
     """Detect shear by measuring text-line straightness (Method D).
 
     Runs a quick Tesseract scan (PSM 11, 50% downscale) to locate word
-    bounding boxes, groups them into horizontal lines by Y-proximity,
-    fits a linear regression to each line, and takes the median slope
-    as the shear angle.
+    bounding boxes, groups them into vertical columns by X-proximity,
+    and measures how the left-edge X position drifts with Y (vertical
+    position).  The drift dx/dy is the tangent of the shear angle.
 
-    This is the most robust method because it measures actual text content
-    rather than relying on edges, projections, or printed lines.
+    This directly measures vertical shear (column tilt) rather than
+    horizontal text-line slope, which is already corrected by deskew.
 
     Returns:
         Dict with keys: method, shear_degrees, confidence.
@@ -656,71 +656,80 @@ def _detect_shear_by_text_lines(img: np.ndarray) -> Dict[str, Any]:
     except Exception:
         return result
 
-    # Collect word centres
+    # Collect word left-edges (x) and vertical centres (y)
     words = []
     for i in range(len(data['text'])):
         text = data['text'][i].strip()
         conf = int(data['conf'][i])
         if not text or conf < 20 or len(text) < 2:
             continue
-        cx = data['left'][i] + data['width'][i] / 2.0
+        left_x = float(data['left'][i])
         cy = data['top'][i] + data['height'][i] / 2.0
-        words.append((cx, cy, data['height'][i]))
+        word_w = float(data['width'][i])
+        words.append((left_x, cy, word_w))
 
-    if len(words) < 10:
+    if len(words) < 15:
         return result
 
-    # Group words into lines by Y-proximity
-    avg_h = sum(wh for _, _, wh in words) / len(words)
-    y_tol = max(avg_h * 0.6, 8)
-    words_sorted = sorted(words, key=lambda w: w[1])
+    # --- Group words into vertical columns by left-edge X proximity ---
+    # Sort by x, then cluster words whose left-edges are within x_tol
+    avg_w = sum(ww for _, _, ww in words) / len(words)
+    x_tol = max(avg_w * 0.4, 8)  # tolerance for "same column"
 
-    lines: List[List[Tuple[float, float]]] = []
-    current_line: List[Tuple[float, float]] = [(words_sorted[0][0], words_sorted[0][1])]
-    current_y = words_sorted[0][1]
+    words_by_x = sorted(words, key=lambda w: w[0])
+    columns: List[List[Tuple[float, float]]] = []  # each: [(left_x, cy), ...]
+    cur_col: List[Tuple[float, float]] = [(words_by_x[0][0], words_by_x[0][1])]
+    cur_x = words_by_x[0][0]
 
-    for cx, cy, _ in words_sorted[1:]:
-        if abs(cy - current_y) <= y_tol:
-            current_line.append((cx, cy))
+    for lx, cy, _ in words_by_x[1:]:
+        if abs(lx - cur_x) <= x_tol:
+            cur_col.append((lx, cy))
+            # Update running x as an exponentially weighted average (0.8 old, 0.2 new)
+            cur_x = cur_x * 0.8 + lx * 0.2
         else:
-            if len(current_line) >= 3:
-                lines.append(current_line)
-            current_line = [(cx, cy)]
-            current_y = cy
-    if len(current_line) >= 3:
-        lines.append(current_line)
+            if len(cur_col) >= 5:
+                columns.append(cur_col)
+            cur_col = [(lx, cy)]
+            cur_x = lx
+    if len(cur_col) >= 5:
+        columns.append(cur_col)
 
-    if len(lines) < 3:
+    if len(columns) < 2:
         return result
 
-    # Linear regression per line → slope (dy/dx)
-    slopes = []
-    for line in lines:
-        xs = np.array([p[0] for p in line])
-        ys = np.array([p[1] for p in line])
-        x_range = xs.max() - xs.min()
-        if x_range < 20:
-            continue
-        coeffs = np.polyfit(xs, ys, 1)
-        slopes.append(coeffs[0])  # dy/dx
+    # --- For each column, measure X-drift as a function of Y ---
+    # Fit: left_x = a * cy + b  →  a = dx/dy = tan(shear_angle)
+    drifts = []
+    for col in columns:
+        ys = np.array([p[1] for p in col])
+        xs = np.array([p[0] for p in col])
+        y_range = ys.max() - ys.min()
+        if y_range < h * scale * 0.3:
+            continue  # column must span at least 30% of image height
+        # Linear regression: x = a*y + b
+        coeffs = np.polyfit(ys, xs, 1)
+        drifts.append(coeffs[0])  # dx/dy
 
-    if len(slopes) < 3:
+    if len(drifts) < 2:
         return result
 
-    # Median slope → shear angle
-    # dy/dx of horizontal text lines = tan(shear_angle)
-    # Positive slope means text tilts down-right → vertical columns lean right
-    median_slope = float(np.median(slopes))
-    shear_degrees = math.degrees(math.atan(median_slope))
+    # Median dx/dy → shear angle
+    # dx/dy > 0 means left-edges move RIGHT as we go DOWN → columns lean right
+    median_drift = float(np.median(drifts))
+    shear_degrees = math.degrees(math.atan(median_drift))
 
-    # Confidence from line count + slope consistency
-    slope_std = float(np.std(slopes))
-    consistency = max(0.0, 1.0 - slope_std * 20)  # penalise high variance
-    count_factor = min(1.0, len(slopes) / 8.0)
-    confidence = count_factor * 0.6 + consistency * 0.4
+    # Confidence from column count + drift consistency
+    drift_std = float(np.std(drifts))
+    consistency = max(0.0, 1.0 - drift_std * 50)  # tighter penalty for drift variance
+    count_factor = min(1.0, len(drifts) / 4.0)
+    confidence = count_factor * 0.5 + consistency * 0.5
 
     result["shear_degrees"] = round(shear_degrees, 3)
     result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
+    logger.info("text_lines(v2): %d columns, %d drifts, median=%.4f, "
+                "shear=%.3f°, conf=%.2f",
+                len(columns), len(drifts), median_drift,
+                shear_degrees, confidence)
     return result