fix(columns): extend rightmost column to full image width (w), not content right_x

right_x is the detected content boundary, which can still fall several
pixels short of the actual text near the page margin. Since the page margin
contains only whitespace, extending the last column's OCR crop to the
full image width (w) is always safe and prevents right-edge text cutoff.

Affects three locations in detect_column_geometry():
- Word count logging loop
- ColumnGeometry boundary building (Step 8)
- Phantom filter boundary adjustment (Step 9)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-03 16:25:07 +01:00
parent 123b7ada0b
commit b914b6f49d

View File

@@ -1631,10 +1631,11 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
if i + 1 < len(col_starts): if i + 1 < len(col_starts):
next_start = col_starts[i + 1] next_start = col_starts[i + 1]
else: else:
# Rightmost column always extends to content right edge. # Rightmost column always extends to full image width (w).
# There is nothing to the right of the last column except page margin, # The page margin contains only white space — extending the OCR
# so there is no reason to end earlier. # crop to the image edge is safe and prevents text near the right
next_start = right_x # border from being cut off.
next_start = w
col_left_rel = start_x - left_x col_left_rel = start_x - left_x
col_right_rel = next_start - left_x col_right_rel = next_start - left_x
@@ -1653,9 +1654,8 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
if i + 1 < len(col_starts): if i + 1 < len(col_starts):
end_x = col_starts[i + 1] end_x = col_starts[i + 1]
else: else:
# Rightmost column always extends to content right edge (right_x). # Rightmost column always extends to full image width (w).
# Page margin detection may underestimate — extend fully so no text is cropped. end_x = w
end_x = right_x
all_boundaries.append((start_x, end_x)) all_boundaries.append((start_x, end_x))
geometries = [] geometries = []
@@ -1696,7 +1696,7 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
if i + 1 < len(filtered_geoms): if i + 1 < len(filtered_geoms):
g.width = filtered_geoms[i + 1].x - g.x g.width = filtered_geoms[i + 1].x - g.x
else: else:
g.width = right_x - g.x g.width = w - g.x
g.index = i g.index = i
col_left_rel = g.x - left_x col_left_rel = g.x - left_x
col_right_rel = col_left_rel + g.width col_right_rel = col_left_rel + g.width