diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py index d204da0..0b0f157 100644 --- a/klausur-service/backend/ocr_pipeline_api.py +++ b/klausur-service/backend/ocr_pipeline_api.py @@ -1627,19 +1627,28 @@ async def detect_rows(session_id: str): else: zones = column_result.get("zones") or [] # zones can be None for sub-sessions - # Collect box y-ranges for filtering + # Collect box y-ranges for filtering. + # Use border_thickness to shrink the exclusion zone: the border pixels + # belong visually to the box frame, but text rows above/below the box + # may overlap with the border area and must not be clipped. box_ranges = [] # [(y_start, y_end)] + box_ranges_inner = [] # [(y_start + border, y_end - border)] for row filtering for zone in zones: if zone.get("zone_type") == "box" and zone.get("box"): box = zone["box"] + bt = max(box.get("border_thickness", 0), 5) # minimum 5px margin box_ranges.append((box["y"], box["y"] + box["height"])) + # Inner range: shrink by border thickness so boundary rows aren't excluded + box_ranges_inner.append((box["y"] + bt, box["y"] + box["height"] - bt)) if box_ranges and inv is not None: # Combined-image approach: strip box regions from inv image, # run row detection on the combined image, then remap y-coords back. content_strips = [] # [(y_start, y_end)] in absolute coords - # Build content strips by subtracting box ranges from [top_y, bottom_y] - sorted_boxes = sorted(box_ranges, key=lambda r: r[0]) + # Build content strips by subtracting box inner ranges from [top_y, bottom_y]. + # Using inner ranges means the border area is included in the content + # strips, so the last row above a box isn't clipped by the border. + sorted_boxes = sorted(box_ranges_inner, key=lambda r: r[0]) strip_start = top_y for by_start, by_end in sorted_boxes: if by_start > strip_start: @@ -1934,18 +1943,21 @@ async def detect_words( ] row.word_count = len(row.words) - # Exclude rows that fall within box zones + # Exclude rows that fall within box zones. + # Use inner box range (shrunk by border_thickness) so that rows at + # the boundary (overlapping with the box border) are NOT excluded. zones = column_result.get("zones") or [] - box_ranges = [] + box_ranges_inner = [] for zone in zones: if zone.get("zone_type") == "box" and zone.get("box"): box = zone["box"] - box_ranges.append((box["y"], box["y"] + box["height"])) + bt = max(box.get("border_thickness", 0), 5) # minimum 5px margin + box_ranges_inner.append((box["y"] + bt, box["y"] + box["height"] - bt)) - if box_ranges: + if box_ranges_inner: def _row_in_box(r): center_y = r.y + r.height / 2 - return any(by_s <= center_y < by_e for by_s, by_e in box_ranges) + return any(by_s <= center_y < by_e for by_s, by_e in box_ranges_inner) before_count = len(row_geoms) row_geoms = [r for r in row_geoms if not _row_in_box(r)]