fix: rewrite Method D to measure vertical column drift instead of text-line slope
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m56s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 17s

After deskew, horizontal text lines are already straight (~0° slope).
Method D was measuring this (always ~0°) instead of the actual vertical
shear (column edge drift). This caused it to report 0.112° with 0.96
confidence, overwhelming Method A's correct detection of negative shear.

The new Method D groups words by X-position into vertical columns, then
uses linear regression to measure how the left-edge X coordinate drifts
with Y position. The fitted slope dx/dy equals tan(shear_angle), so it
directly measures column tilt.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-04 10:31:19 +01:00
parent 9dd77ab54a
commit e4aff2b27e

View File

@@ -627,12 +627,12 @@ def _detect_shear_by_text_lines(img: np.ndarray) -> Dict[str, Any]:
"""Detect shear by measuring text-line straightness (Method D).
Runs a quick Tesseract scan (PSM 11, 50% downscale) to locate word
bounding boxes, groups them into horizontal lines by Y-proximity,
fits a linear regression to each line, and takes the median slope
as the shear angle.
bounding boxes, groups them into vertical columns by X-proximity,
and measures how the left-edge X position drifts with Y (vertical
position). The drift dx/dy is the tangent of the shear angle.
This is the most robust method because it measures actual text content
rather than relying on edges, projections, or printed lines.
This directly measures vertical shear (column tilt) rather than
horizontal text-line slope, which is already corrected by deskew.
Returns:
Dict with keys: method, shear_degrees, confidence.
@@ -656,71 +656,80 @@ def _detect_shear_by_text_lines(img: np.ndarray) -> Dict[str, Any]:
except Exception:
return result
# Collect word centres
# Collect word left-edges (x) and vertical centres (y)
words = []
for i in range(len(data['text'])):
text = data['text'][i].strip()
conf = int(data['conf'][i])
if not text or conf < 20 or len(text) < 2:
continue
cx = data['left'][i] + data['width'][i] / 2.0
left_x = float(data['left'][i])
cy = data['top'][i] + data['height'][i] / 2.0
words.append((cx, cy, data['height'][i]))
word_w = float(data['width'][i])
words.append((left_x, cy, word_w))
if len(words) < 10:
if len(words) < 15:
return result
# Group words into lines by Y-proximity
avg_h = sum(wh for _, _, wh in words) / len(words)
y_tol = max(avg_h * 0.6, 8)
words_sorted = sorted(words, key=lambda w: w[1])
# --- Group words into vertical columns by left-edge X proximity ---
# Sort by x, then cluster words whose left-edges are within x_tol
avg_w = sum(ww for _, _, ww in words) / len(words)
x_tol = max(avg_w * 0.4, 8) # tolerance for "same column"
lines: List[List[Tuple[float, float]]] = []
current_line: List[Tuple[float, float]] = [(words_sorted[0][0], words_sorted[0][1])]
current_y = words_sorted[0][1]
words_by_x = sorted(words, key=lambda w: w[0])
columns: List[List[Tuple[float, float]]] = [] # each: [(left_x, cy), ...]
cur_col: List[Tuple[float, float]] = [(words_by_x[0][0], words_by_x[0][1])]
cur_x = words_by_x[0][0]
for cx, cy, _ in words_sorted[1:]:
if abs(cy - current_y) <= y_tol:
current_line.append((cx, cy))
for lx, cy, _ in words_by_x[1:]:
if abs(lx - cur_x) <= x_tol:
cur_col.append((lx, cy))
# Update running x as median of cluster
cur_x = cur_x * 0.8 + lx * 0.2
else:
if len(current_line) >= 3:
lines.append(current_line)
current_line = [(cx, cy)]
current_y = cy
if len(current_line) >= 3:
lines.append(current_line)
if len(cur_col) >= 5:
columns.append(cur_col)
cur_col = [(lx, cy)]
cur_x = lx
if len(cur_col) >= 5:
columns.append(cur_col)
if len(lines) < 3:
if len(columns) < 2:
return result
# Linear regression per line → slope (dy/dx)
slopes = []
for line in lines:
xs = np.array([p[0] for p in line])
ys = np.array([p[1] for p in line])
x_range = xs.max() - xs.min()
if x_range < 20:
continue
coeffs = np.polyfit(xs, ys, 1)
slopes.append(coeffs[0]) # dy/dx
# --- For each column, measure X-drift as a function of Y ---
# Fit: left_x = a * cy + b → a = dx/dy = tan(shear_angle)
drifts = []
for col in columns:
ys = np.array([p[1] for p in col])
xs = np.array([p[0] for p in col])
y_range = ys.max() - ys.min()
if y_range < h * scale * 0.3:
continue # column must span at least 30% of image height
# Linear regression: x = a*y + b
coeffs = np.polyfit(ys, xs, 1)
drifts.append(coeffs[0]) # dx/dy
if len(slopes) < 3:
if len(drifts) < 2:
return result
# Median slope → shear angle
# dy/dx of horizontal text lines = tan(shear_angle)
# Positive slope means text tilts down-right → vertical columns lean right
median_slope = float(np.median(slopes))
shear_degrees = math.degrees(math.atan(median_slope))
# Median dx/dy → shear angle
# dx/dy > 0 means left-edges move RIGHT as we go DOWN → columns lean right
median_drift = float(np.median(drifts))
shear_degrees = math.degrees(math.atan(median_drift))
# Confidence from line count + slope consistency
slope_std = float(np.std(slopes))
consistency = max(0.0, 1.0 - slope_std * 20) # penalise high variance
count_factor = min(1.0, len(slopes) / 8.0)
confidence = count_factor * 0.6 + consistency * 0.4
# Confidence from column count + drift consistency
drift_std = float(np.std(drifts))
consistency = max(0.0, 1.0 - drift_std * 50) # tighter penalty for drift variance
count_factor = min(1.0, len(drifts) / 4.0)
confidence = count_factor * 0.5 + consistency * 0.5
result["shear_degrees"] = round(shear_degrees, 3)
result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
logger.info("text_lines(v2): %d columns, %d drifts, median=%.4f, "
"shear=%.3f°, conf=%.2f",
len(columns), len(drifts), median_drift,
shear_degrees, confidence)
return result