fix: rewrite Method D to measure vertical column drift instead of text-line slope
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m56s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 17s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m56s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 17s
After deskew, horizontal text lines are already straight (~0° slope). Method D was measuring this (always ~0°) instead of the actual vertical shear (column edge drift). This caused it to report 0.112° with 0.96 confidence, overwhelming Method A's correct detection of negative shear.

New Method D groups words by X-position into vertical columns, then measures how left-edge X drifts with Y position via linear regression. dx/dy = tan(shear_angle), directly measuring column tilt.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -627,12 +627,12 @@ def _detect_shear_by_text_lines(img: np.ndarray) -> Dict[str, Any]:
|
||||
"""Detect shear by measuring vertical column drift of text (Method D).
|
||||
|
||||
Runs a quick Tesseract scan (PSM 11, 50% downscale) to locate word
|
||||
bounding boxes, groups them into horizontal lines by Y-proximity,
|
||||
fits a linear regression to each line, and takes the median slope
|
||||
as the shear angle.
|
||||
bounding boxes, groups them into vertical columns by X-proximity,
|
||||
and measures how the left-edge X position drifts with Y (vertical
|
||||
position). The drift dx/dy is the tangent of the shear angle.
|
||||
|
||||
This is the most robust method because it measures actual text content
|
||||
rather than relying on edges, projections, or printed lines.
|
||||
This directly measures vertical shear (column tilt) rather than
|
||||
horizontal text-line slope, which is already corrected by deskew.
|
||||
|
||||
Returns:
|
||||
Dict with keys: method, shear_degrees, confidence.
|
||||
@@ -656,71 +656,80 @@ def _detect_shear_by_text_lines(img: np.ndarray) -> Dict[str, Any]:
|
||||
except Exception:
|
||||
return result
|
||||
|
||||
# Collect word centres
|
||||
# Collect word left-edges (x) and vertical centres (y)
|
||||
words = []
|
||||
for i in range(len(data['text'])):
|
||||
text = data['text'][i].strip()
|
||||
conf = int(data['conf'][i])
|
||||
if not text or conf < 20 or len(text) < 2:
|
||||
continue
|
||||
cx = data['left'][i] + data['width'][i] / 2.0
|
||||
left_x = float(data['left'][i])
|
||||
cy = data['top'][i] + data['height'][i] / 2.0
|
||||
words.append((cx, cy, data['height'][i]))
|
||||
word_w = float(data['width'][i])
|
||||
words.append((left_x, cy, word_w))
|
||||
|
||||
if len(words) < 10:
|
||||
if len(words) < 15:
|
||||
return result
|
||||
|
||||
# Group words into lines by Y-proximity
|
||||
avg_h = sum(wh for _, _, wh in words) / len(words)
|
||||
y_tol = max(avg_h * 0.6, 8)
|
||||
words_sorted = sorted(words, key=lambda w: w[1])
|
||||
# --- Group words into vertical columns by left-edge X proximity ---
|
||||
# Sort by x, then cluster words whose left-edges are within x_tol
|
||||
avg_w = sum(ww for _, _, ww in words) / len(words)
|
||||
x_tol = max(avg_w * 0.4, 8) # tolerance for "same column"
|
||||
|
||||
lines: List[List[Tuple[float, float]]] = []
|
||||
current_line: List[Tuple[float, float]] = [(words_sorted[0][0], words_sorted[0][1])]
|
||||
current_y = words_sorted[0][1]
|
||||
words_by_x = sorted(words, key=lambda w: w[0])
|
||||
columns: List[List[Tuple[float, float]]] = [] # each: [(left_x, cy), ...]
|
||||
cur_col: List[Tuple[float, float]] = [(words_by_x[0][0], words_by_x[0][1])]
|
||||
cur_x = words_by_x[0][0]
|
||||
|
||||
for cx, cy, _ in words_sorted[1:]:
|
||||
if abs(cy - current_y) <= y_tol:
|
||||
current_line.append((cx, cy))
|
||||
for lx, cy, _ in words_by_x[1:]:
|
||||
if abs(lx - cur_x) <= x_tol:
|
||||
cur_col.append((lx, cy))
|
||||
# Update running x as an exponentially weighted average (80% old, 20% new)
|
||||
cur_x = cur_x * 0.8 + lx * 0.2
|
||||
else:
|
||||
if len(current_line) >= 3:
|
||||
lines.append(current_line)
|
||||
current_line = [(cx, cy)]
|
||||
current_y = cy
|
||||
if len(current_line) >= 3:
|
||||
lines.append(current_line)
|
||||
if len(cur_col) >= 5:
|
||||
columns.append(cur_col)
|
||||
cur_col = [(lx, cy)]
|
||||
cur_x = lx
|
||||
if len(cur_col) >= 5:
|
||||
columns.append(cur_col)
|
||||
|
||||
if len(lines) < 3:
|
||||
if len(columns) < 2:
|
||||
return result
|
||||
|
||||
# Linear regression per line → slope (dy/dx)
|
||||
slopes = []
|
||||
for line in lines:
|
||||
xs = np.array([p[0] for p in line])
|
||||
ys = np.array([p[1] for p in line])
|
||||
x_range = xs.max() - xs.min()
|
||||
if x_range < 20:
|
||||
continue
|
||||
coeffs = np.polyfit(xs, ys, 1)
|
||||
slopes.append(coeffs[0]) # dy/dx
|
||||
# --- For each column, measure X-drift as a function of Y ---
|
||||
# Fit: left_x = a * cy + b → a = dx/dy = tan(shear_angle)
|
||||
drifts = []
|
||||
for col in columns:
|
||||
ys = np.array([p[1] for p in col])
|
||||
xs = np.array([p[0] for p in col])
|
||||
y_range = ys.max() - ys.min()
|
||||
if y_range < h * scale * 0.3:
|
||||
continue # column must span at least 30% of image height
|
||||
# Linear regression: x = a*y + b
|
||||
coeffs = np.polyfit(ys, xs, 1)
|
||||
drifts.append(coeffs[0]) # dx/dy
|
||||
|
||||
if len(slopes) < 3:
|
||||
if len(drifts) < 2:
|
||||
return result
|
||||
|
||||
# Median slope → shear angle
|
||||
# dy/dx of horizontal text lines = tan(shear_angle)
|
||||
# Positive slope means text tilts down-right → vertical columns lean right
|
||||
median_slope = float(np.median(slopes))
|
||||
shear_degrees = math.degrees(math.atan(median_slope))
|
||||
# Median dx/dy → shear angle
|
||||
# dx/dy > 0 means left-edges move RIGHT as we go DOWN → columns lean right
|
||||
median_drift = float(np.median(drifts))
|
||||
shear_degrees = math.degrees(math.atan(median_drift))
|
||||
|
||||
# Confidence from line count + slope consistency
|
||||
slope_std = float(np.std(slopes))
|
||||
consistency = max(0.0, 1.0 - slope_std * 20) # penalise high variance
|
||||
count_factor = min(1.0, len(slopes) / 8.0)
|
||||
confidence = count_factor * 0.6 + consistency * 0.4
|
||||
# Confidence from column count + drift consistency
|
||||
drift_std = float(np.std(drifts))
|
||||
consistency = max(0.0, 1.0 - drift_std * 50) # tighter penalty for drift variance
|
||||
count_factor = min(1.0, len(drifts) / 4.0)
|
||||
confidence = count_factor * 0.5 + consistency * 0.5
|
||||
|
||||
result["shear_degrees"] = round(shear_degrees, 3)
|
||||
result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
|
||||
logger.info("text_lines(v2): %d columns, %d drifts, median=%.4f, "
|
||||
"shear=%.3f°, conf=%.2f",
|
||||
len(columns), len(drifts), median_drift,
|
||||
shear_degrees, confidence)
|
||||
return result
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user