From b914b6f49db8df3347bc2ac95bc8ba9b866dfbd1 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Tue, 3 Mar 2026 16:25:07 +0100 Subject: [PATCH] fix(columns): extend rightmost column to full image width (w) not content right_x right_x is the detected content boundary, which can still be several pixels short of actual text near the page margin. Since the page margin contains only white space, extending the last column's OCR crop to the full image width (w) is always safe and prevents right-edge text cutoff. Affects three locations in detect_column_geometry(): - Word count logging loop - ColumnGeometry boundary building (Step 8) - Phantom filter boundary adjustment (Step 9) Co-Authored-By: Claude Sonnet 4.6 --- klausur-service/backend/cv_vocab_pipeline.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 89d3238..1303c22 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -1631,10 +1631,11 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt if i + 1 < len(col_starts): next_start = col_starts[i + 1] else: - # Rightmost column always extends to content right edge. - # There is nothing to the right of the last column except page margin, - # so there is no reason to end earlier. - next_start = right_x + # Rightmost column always extends to full image width (w). + # The page margin contains only white space — extending the OCR + # crop to the image edge is safe and prevents text near the right + # border from being cut off. + next_start = w col_left_rel = start_x - left_x col_right_rel = next_start - left_x @@ -1653,9 +1654,8 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt if i + 1 < len(col_starts): end_x = col_starts[i + 1] else: - # Rightmost column always extends to content right edge (right_x). - # Page margin detection may underestimate — extend fully so no text is cropped. - end_x = right_x + # Rightmost column always extends to full image width (w). + end_x = w all_boundaries.append((start_x, end_x)) geometries = [] @@ -1696,7 +1696,7 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt if i + 1 < len(filtered_geoms): g.width = filtered_geoms[i + 1].x - g.x else: - g.width = right_x - g.x + g.width = w - g.x g.index = i col_left_rel = g.x - left_x col_right_rel = col_left_rel + g.width