fix: rewrite Method D to measure vertical column drift instead of text-line slope
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m56s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 17s

After deskew, horizontal text lines are already straight (~0° slope).
Method D was measuring this (always ~0°) instead of the actual vertical
shear (column edge drift). This caused it to report 0.112° with 0.96
confidence, overwhelming Method A's correct detection of negative shear.

New Method D groups words by X-position into vertical columns, then
measures how left-edge X drifts with Y position via linear regression.
dx/dy = tan(shear_angle), directly measuring column tilt.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-04 10:31:19 +01:00
parent 9dd77ab54a
commit e4aff2b27e

View File

@@ -627,12 +627,12 @@ def _detect_shear_by_text_lines(img: np.ndarray) -> Dict[str, Any]:
     """Detect shear by measuring text-line straightness (Method D).

     Runs a quick Tesseract scan (PSM 11, 50% downscale) to locate word
-    bounding boxes, groups them into horizontal lines by Y-proximity,
-    fits a linear regression to each line, and takes the median slope
-    as the shear angle.
+    bounding boxes, groups them into vertical columns by X-proximity,
+    and measures how the left-edge X position drifts with Y (vertical
+    position). The drift dx/dy is the tangent of the shear angle.

-    This is the most robust method because it measures actual text content
-    rather than relying on edges, projections, or printed lines.
+    This directly measures vertical shear (column tilt) rather than
+    horizontal text-line slope, which is already corrected by deskew.

     Returns:
         Dict with keys: method, shear_degrees, confidence.
@@ -656,71 +656,80 @@ def _detect_shear_by_text_lines(img: np.ndarray) -> Dict[str, Any]:
     except Exception:
         return result

-    # Collect word centres
+    # Collect word left-edges (x) and vertical centres (y)
    words = []
    for i in range(len(data['text'])):
        text = data['text'][i].strip()
        conf = int(data['conf'][i])
        if not text or conf < 20 or len(text) < 2:
            continue
-        cx = data['left'][i] + data['width'][i] / 2.0
+        left_x = float(data['left'][i])
        cy = data['top'][i] + data['height'][i] / 2.0
-        words.append((cx, cy, data['height'][i]))
+        word_w = float(data['width'][i])
+        words.append((left_x, cy, word_w))

-    if len(words) < 10:
+    if len(words) < 15:
        return result

-    # Group words into lines by Y-proximity
-    avg_h = sum(wh for _, _, wh in words) / len(words)
-    y_tol = max(avg_h * 0.6, 8)
-    words_sorted = sorted(words, key=lambda w: w[1])
-    lines: List[List[Tuple[float, float]]] = []
-    current_line: List[Tuple[float, float]] = [(words_sorted[0][0], words_sorted[0][1])]
-    current_y = words_sorted[0][1]
+    # --- Group words into vertical columns by left-edge X proximity ---
+    # Sort by x, then cluster words whose left-edges are within x_tol
+    avg_w = sum(ww for _, _, ww in words) / len(words)
+    x_tol = max(avg_w * 0.4, 8)  # tolerance for "same column"
+    words_by_x = sorted(words, key=lambda w: w[0])
+    columns: List[List[Tuple[float, float]]] = []  # each: [(left_x, cy), ...]
+    cur_col: List[Tuple[float, float]] = [(words_by_x[0][0], words_by_x[0][1])]
+    cur_x = words_by_x[0][0]

-    for cx, cy, _ in words_sorted[1:]:
-        if abs(cy - current_y) <= y_tol:
-            current_line.append((cx, cy))
+    for lx, cy, _ in words_by_x[1:]:
+        if abs(lx - cur_x) <= x_tol:
+            cur_col.append((lx, cy))
+            # Update running x as median of cluster
+            cur_x = cur_x * 0.8 + lx * 0.2
        else:
-            if len(current_line) >= 3:
-                lines.append(current_line)
-            current_line = [(cx, cy)]
-            current_y = cy
-    if len(current_line) >= 3:
-        lines.append(current_line)
+            if len(cur_col) >= 5:
+                columns.append(cur_col)
+            cur_col = [(lx, cy)]
+            cur_x = lx
+    if len(cur_col) >= 5:
+        columns.append(cur_col)

-    if len(lines) < 3:
+    if len(columns) < 2:
        return result

-    # Linear regression per line → slope (dy/dx)
-    slopes = []
-    for line in lines:
-        xs = np.array([p[0] for p in line])
-        ys = np.array([p[1] for p in line])
-        x_range = xs.max() - xs.min()
-        if x_range < 20:
-            continue
-        coeffs = np.polyfit(xs, ys, 1)
-        slopes.append(coeffs[0])  # dy/dx
+    # --- For each column, measure X-drift as a function of Y ---
+    # Fit: left_x = a * cy + b  →  a = dx/dy = tan(shear_angle)
+    drifts = []
+    for col in columns:
+        ys = np.array([p[1] for p in col])
+        xs = np.array([p[0] for p in col])
+        y_range = ys.max() - ys.min()
+        if y_range < h * scale * 0.3:
+            continue  # column must span at least 30% of image height
+        # Linear regression: x = a*y + b
+        coeffs = np.polyfit(ys, xs, 1)
+        drifts.append(coeffs[0])  # dx/dy

-    if len(slopes) < 3:
+    if len(drifts) < 2:
        return result

-    # Median slope → shear angle
-    # dy/dx of horizontal text lines = tan(shear_angle)
-    # Positive slope means text tilts down-right → vertical columns lean right
-    median_slope = float(np.median(slopes))
-    shear_degrees = math.degrees(math.atan(median_slope))
+    # Median dx/dy → shear angle
+    # dx/dy > 0 means left-edges move RIGHT as we go DOWN → columns lean right
+    median_drift = float(np.median(drifts))
+    shear_degrees = math.degrees(math.atan(median_drift))

-    # Confidence from line count + slope consistency
-    slope_std = float(np.std(slopes))
-    consistency = max(0.0, 1.0 - slope_std * 20)  # penalise high variance
-    count_factor = min(1.0, len(slopes) / 8.0)
-    confidence = count_factor * 0.6 + consistency * 0.4
+    # Confidence from column count + drift consistency
+    drift_std = float(np.std(drifts))
+    consistency = max(0.0, 1.0 - drift_std * 50)  # tighter penalty for drift variance
+    count_factor = min(1.0, len(drifts) / 4.0)
+    confidence = count_factor * 0.5 + consistency * 0.5

    result["shear_degrees"] = round(shear_degrees, 3)
    result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
+    logger.info("text_lines(v2): %d columns, %d drifts, median=%.4f, "
+                "shear=%.3f°, conf=%.2f",
+                len(columns), len(drifts), median_drift,
+                shear_degrees, confidence)
    return result