fix(columns): extend rightmost column to full image width (w), not content right_x
right_x is the detected content boundary, which can still fall several pixels short of the actual text near the page margin.

Since the page margin beyond the text contains only white space, extending the last column's OCR crop to the full image width (w) is always safe and prevents right-edge text cutoff.

Affects three locations in detect_column_geometry():
- Word count logging loop
- ColumnGeometry boundary building (Step 8)
- Phantom filter boundary adjustment (Step 9)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1631,10 +1631,11 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
|
||||
if i + 1 < len(col_starts):
|
||||
next_start = col_starts[i + 1]
|
||||
else:
|
||||
# Rightmost column always extends to content right edge.
|
||||
# There is nothing to the right of the last column except page margin,
|
||||
# so there is no reason to end earlier.
|
||||
next_start = right_x
|
||||
# Rightmost column always extends to full image width (w).
|
||||
# The page margin contains only white space — extending the OCR
|
||||
# crop to the image edge is safe and prevents text near the right
|
||||
# border from being cut off.
|
||||
next_start = w
|
||||
|
||||
col_left_rel = start_x - left_x
|
||||
col_right_rel = next_start - left_x
|
||||
@@ -1653,9 +1654,8 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
|
||||
if i + 1 < len(col_starts):
|
||||
end_x = col_starts[i + 1]
|
||||
else:
|
||||
# Rightmost column always extends to content right edge (right_x).
|
||||
# Page margin detection may underestimate — extend fully so no text is cropped.
|
||||
end_x = right_x
|
||||
# Rightmost column always extends to full image width (w).
|
||||
end_x = w
|
||||
all_boundaries.append((start_x, end_x))
|
||||
|
||||
geometries = []
|
||||
@@ -1696,7 +1696,7 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
|
||||
if i + 1 < len(filtered_geoms):
|
||||
g.width = filtered_geoms[i + 1].x - g.x
|
||||
else:
|
||||
g.width = right_x - g.x
|
||||
g.width = w - g.x
|
||||
g.index = i
|
||||
col_left_rel = g.x - left_x
|
||||
col_right_rel = col_left_rel + g.width
|
||||
|
||||
Reference in New Issue
Block a user