fix(columns): extend rightmost column to full image width (w), not content right_x

right_x is the detected content boundary, which can still fall several
pixels short of the actual text near the page margin. Beyond that text the
page margin contains only white space, so extending the last column's OCR
crop to the full image width (w) is always safe and prevents right-edge
text cutoff.

Affects three locations in detect_column_geometry():
- Word count logging loop
- ColumnGeometry boundary building (Step 8)
- Phantom filter boundary adjustment (Step 9)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-03 16:25:07 +01:00
parent 123b7ada0b
commit b914b6f49d

View File

@@ -1631,10 +1631,11 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
if i + 1 < len(col_starts):
next_start = col_starts[i + 1]
else:
# Rightmost column always extends to content right edge.
# There is nothing to the right of the last column except page margin,
# so there is no reason to end earlier.
next_start = right_x
# Rightmost column always extends to full image width (w).
# The page margin contains only white space — extending the OCR
# crop to the image edge is safe and prevents text near the right
# border from being cut off.
next_start = w
col_left_rel = start_x - left_x
col_right_rel = next_start - left_x
@@ -1653,9 +1654,8 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
if i + 1 < len(col_starts):
end_x = col_starts[i + 1]
else:
# Rightmost column always extends to content right edge (right_x).
# Page margin detection may underestimate — extend fully so no text is cropped.
end_x = right_x
# Rightmost column always extends to full image width (w).
end_x = w
all_boundaries.append((start_x, end_x))
geometries = []
@@ -1696,7 +1696,7 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
if i + 1 < len(filtered_geoms):
g.width = filtered_geoms[i + 1].x - g.x
else:
g.width = right_x - g.x
g.width = w - g.x
g.index = i
col_left_rel = g.x - left_x
col_right_rel = col_left_rel + g.width