fix: rewrite Method D to measure vertical column drift instead of text-line slope
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m56s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 17s

After deskew, horizontal text lines are already straight (~0° slope).
Method D was measuring this (always ~0°) instead of the actual vertical
shear (column edge drift). This caused it to report 0.112° with 0.96
confidence, overwhelming Method A's correct detection of negative shear.

The new Method D groups words by X-position into vertical columns, then
uses linear regression to measure how the left-edge X coordinate drifts
with Y position. The fitted slope dx/dy equals tan(shear_angle), so it
directly measures column tilt.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-04 10:31:19 +01:00
parent 9dd77ab54a
commit e4aff2b27e

View File

@@ -627,12 +627,12 @@ def _detect_shear_by_text_lines(img: np.ndarray) -> Dict[str, Any]:
"""Detect shear by measuring text-line straightness (Method D).
Runs a quick Tesseract scan (PSM 11, 50% downscale) to locate word
bounding boxes, groups them into horizontal lines by Y-proximity,
fits a linear regression to each line, and takes the median slope
as the shear angle.
bounding boxes, groups them into vertical columns by X-proximity,
and measures how the left-edge X position drifts with Y (vertical
position). The drift dx/dy is the tangent of the shear angle.
This is the most robust method because it measures actual text content
rather than relying on edges, projections, or printed lines.
This directly measures vertical shear (column tilt) rather than
horizontal text-line slope, which is already corrected by deskew.
Returns:
Dict with keys: method, shear_degrees, confidence.
@@ -656,71 +656,80 @@ def _detect_shear_by_text_lines(img: np.ndarray) -> Dict[str, Any]:
except Exception:
return result
# Collect word centres
# Collect word left-edges (x) and vertical centres (y)
words = []
for i in range(len(data['text'])):
text = data['text'][i].strip()
conf = int(data['conf'][i])
if not text or conf < 20 or len(text) < 2:
continue
cx = data['left'][i] + data['width'][i] / 2.0
left_x = float(data['left'][i])
cy = data['top'][i] + data['height'][i] / 2.0
words.append((cx, cy, data['height'][i]))
word_w = float(data['width'][i])
words.append((left_x, cy, word_w))
if len(words) < 10:
if len(words) < 15:
return result
# Group words into lines by Y-proximity
avg_h = sum(wh for _, _, wh in words) / len(words)
y_tol = max(avg_h * 0.6, 8)
words_sorted = sorted(words, key=lambda w: w[1])
# --- Group words into vertical columns by left-edge X proximity ---
# Sort by x, then cluster words whose left-edges are within x_tol
avg_w = sum(ww for _, _, ww in words) / len(words)
x_tol = max(avg_w * 0.4, 8) # tolerance for "same column"
lines: List[List[Tuple[float, float]]] = []
current_line: List[Tuple[float, float]] = [(words_sorted[0][0], words_sorted[0][1])]
current_y = words_sorted[0][1]
words_by_x = sorted(words, key=lambda w: w[0])
columns: List[List[Tuple[float, float]]] = [] # each: [(left_x, cy), ...]
cur_col: List[Tuple[float, float]] = [(words_by_x[0][0], words_by_x[0][1])]
cur_x = words_by_x[0][0]
for cx, cy, _ in words_sorted[1:]:
if abs(cy - current_y) <= y_tol:
current_line.append((cx, cy))
for lx, cy, _ in words_by_x[1:]:
if abs(lx - cur_x) <= x_tol:
cur_col.append((lx, cy))
# Update running x as median of cluster
cur_x = cur_x * 0.8 + lx * 0.2
else:
if len(current_line) >= 3:
lines.append(current_line)
current_line = [(cx, cy)]
current_y = cy
if len(current_line) >= 3:
lines.append(current_line)
if len(cur_col) >= 5:
columns.append(cur_col)
cur_col = [(lx, cy)]
cur_x = lx
if len(cur_col) >= 5:
columns.append(cur_col)
if len(lines) < 3:
if len(columns) < 2:
return result
# Linear regression per line → slope (dy/dx)
slopes = []
for line in lines:
xs = np.array([p[0] for p in line])
ys = np.array([p[1] for p in line])
x_range = xs.max() - xs.min()
if x_range < 20:
continue
coeffs = np.polyfit(xs, ys, 1)
slopes.append(coeffs[0]) # dy/dx
# --- For each column, measure X-drift as a function of Y ---
# Fit: left_x = a * cy + b → a = dx/dy = tan(shear_angle)
drifts = []
for col in columns:
ys = np.array([p[1] for p in col])
xs = np.array([p[0] for p in col])
y_range = ys.max() - ys.min()
if y_range < h * scale * 0.3:
continue # column must span at least 30% of image height
# Linear regression: x = a*y + b
coeffs = np.polyfit(ys, xs, 1)
drifts.append(coeffs[0]) # dx/dy
if len(slopes) < 3:
if len(drifts) < 2:
return result
# Median slope → shear angle
# dy/dx of horizontal text lines = tan(shear_angle)
# Positive slope means text tilts down-right → vertical columns lean right
median_slope = float(np.median(slopes))
shear_degrees = math.degrees(math.atan(median_slope))
# Median dx/dy → shear angle
# dx/dy > 0 means left-edges move RIGHT as we go DOWN → columns lean right
median_drift = float(np.median(drifts))
shear_degrees = math.degrees(math.atan(median_drift))
# Confidence from line count + slope consistency
slope_std = float(np.std(slopes))
consistency = max(0.0, 1.0 - slope_std * 20) # penalise high variance
count_factor = min(1.0, len(slopes) / 8.0)
confidence = count_factor * 0.6 + consistency * 0.4
# Confidence from column count + drift consistency
drift_std = float(np.std(drifts))
consistency = max(0.0, 1.0 - drift_std * 50) # tighter penalty for drift variance
count_factor = min(1.0, len(drifts) / 4.0)
confidence = count_factor * 0.5 + consistency * 0.5
result["shear_degrees"] = round(shear_degrees, 3)
result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
logger.info("text_lines(v2): %d columns, %d drifts, median=%.4f, "
"shear=%.3f°, conf=%.2f",
len(columns), len(drifts), median_drift,
shear_degrees, confidence)
return result