From e4aff2b27e386c4c358a4df99b5413de54897607 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Wed, 4 Mar 2026 10:31:19 +0100 Subject: [PATCH] fix: rewrite Method D to measure vertical column drift instead of text-line slope MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After deskew, horizontal text lines are already straight (~0° slope). Method D was measuring this (always ~0°) instead of the actual vertical shear (column edge drift). This caused it to report 0.112° with 0.96 confidence, overwhelming Method A's correct detection of negative shear. New Method D groups words by X-position into vertical columns, then measures how left-edge X drifts with Y position via linear regression. dx/dy = tan(shear_angle), directly measuring column tilt. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_vocab_pipeline.py | 103 ++++++++++--------- 1 file changed, 56 insertions(+), 47 deletions(-) diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index c8161df..722cb67 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -627,12 +627,12 @@ def _detect_shear_by_text_lines(img: np.ndarray) -> Dict[str, Any]: """Detect shear by measuring text-line straightness (Method D). Runs a quick Tesseract scan (PSM 11, 50% downscale) to locate word - bounding boxes, groups them into horizontal lines by Y-proximity, - fits a linear regression to each line, and takes the median slope - as the shear angle. + bounding boxes, groups them into vertical columns by X-proximity, + and measures how the left-edge X position drifts with Y (vertical + position). The drift dx/dy is the tangent of the shear angle. - This is the most robust method because it measures actual text content - rather than relying on edges, projections, or printed lines. 
+ This directly measures vertical shear (column tilt) rather than + horizontal text-line slope, which is already corrected by deskew. Returns: Dict with keys: method, shear_degrees, confidence. @@ -656,71 +656,80 @@ def _detect_shear_by_text_lines(img: np.ndarray) -> Dict[str, Any]: except Exception: return result - # Collect word centres + # Collect word left-edges (x) and vertical centres (y) words = [] for i in range(len(data['text'])): text = data['text'][i].strip() conf = int(data['conf'][i]) if not text or conf < 20 or len(text) < 2: continue - cx = data['left'][i] + data['width'][i] / 2.0 + left_x = float(data['left'][i]) cy = data['top'][i] + data['height'][i] / 2.0 - words.append((cx, cy, data['height'][i])) + word_w = float(data['width'][i]) + words.append((left_x, cy, word_w)) - if len(words) < 10: + if len(words) < 15: return result - # Group words into lines by Y-proximity - avg_h = sum(wh for _, _, wh in words) / len(words) - y_tol = max(avg_h * 0.6, 8) - words_sorted = sorted(words, key=lambda w: w[1]) + # --- Group words into vertical columns by left-edge X proximity --- + # Sort by x, then cluster words whose left-edges are within x_tol + avg_w = sum(ww for _, _, ww in words) / len(words) + x_tol = max(avg_w * 0.4, 8) # tolerance for "same column" - lines: List[List[Tuple[float, float]]] = [] - current_line: List[Tuple[float, float]] = [(words_sorted[0][0], words_sorted[0][1])] - current_y = words_sorted[0][1] + words_by_x = sorted(words, key=lambda w: w[0]) + columns: List[List[Tuple[float, float]]] = [] # each: [(left_x, cy), ...] 
+ cur_col: List[Tuple[float, float]] = [(words_by_x[0][0], words_by_x[0][1])] + cur_x = words_by_x[0][0] - for cx, cy, _ in words_sorted[1:]: - if abs(cy - current_y) <= y_tol: - current_line.append((cx, cy)) + for lx, cy, _ in words_by_x[1:]: + if abs(lx - cur_x) <= x_tol: + cur_col.append((lx, cy)) + # Update running x as a weighted running average (80% previous value, 20% new left-edge) + cur_x = cur_x * 0.8 + lx * 0.2 else: - if len(current_line) >= 3: - lines.append(current_line) - current_line = [(cx, cy)] - current_y = cy - if len(current_line) >= 3: - lines.append(current_line) + if len(cur_col) >= 5: + columns.append(cur_col) + cur_col = [(lx, cy)] + cur_x = lx + if len(cur_col) >= 5: + columns.append(cur_col) - if len(lines) < 3: + if len(columns) < 2: return result - # Linear regression per line → slope (dy/dx) - slopes = [] - for line in lines: - xs = np.array([p[0] for p in line]) - ys = np.array([p[1] for p in line]) - x_range = xs.max() - xs.min() - if x_range < 20: - continue - coeffs = np.polyfit(xs, ys, 1) - slopes.append(coeffs[0]) # dy/dx + # --- For each column, measure X-drift as a function of Y --- + # Fit: left_x = a * cy + b → a = dx/dy = tan(shear_angle) + drifts = [] + for col in columns: + ys = np.array([p[1] for p in col]) + xs = np.array([p[0] for p in col]) + y_range = ys.max() - ys.min() + if y_range < h * scale * 0.3: + continue # column must span at least 30% of image height + # Linear regression: x = a*y + b + coeffs = np.polyfit(ys, xs, 1) + drifts.append(coeffs[0]) # dx/dy - if len(slopes) < 3: + if len(drifts) < 2: return result - # Median slope → shear angle - # dy/dx of horizontal text lines = tan(shear_angle) - # Positive slope means text tilts down-right → vertical columns lean right - median_slope = float(np.median(slopes)) - shear_degrees = math.degrees(math.atan(median_slope)) + # Median dx/dy → shear angle + # dx/dy > 0 means left-edges move RIGHT as we go DOWN → columns lean right + median_drift = float(np.median(drifts)) + shear_degrees = 
math.degrees(math.atan(median_drift)) - # Confidence from line count + slope consistency - slope_std = float(np.std(slopes)) - consistency = max(0.0, 1.0 - slope_std * 20) # penalise high variance - count_factor = min(1.0, len(slopes) / 8.0) - confidence = count_factor * 0.6 + consistency * 0.4 + # Confidence from column count + drift consistency + drift_std = float(np.std(drifts)) + consistency = max(0.0, 1.0 - drift_std * 50) # tighter penalty for drift variance + count_factor = min(1.0, len(drifts) / 4.0) + confidence = count_factor * 0.5 + consistency * 0.5 result["shear_degrees"] = round(shear_degrees, 3) result["confidence"] = round(max(0.0, min(1.0, confidence)), 2) + logger.info("text_lines(v2): %d columns, %d drifts, median=%.4f, " + "shear=%.3f°, conf=%.2f", + len(columns), len(drifts), median_drift, + shear_degrees, confidence) return result