"""
Tests for box boundary row filtering logic (box_ranges_inner).

Verifies that rows at the border of box zones are NOT excluded during
row detection and word filtering. This prevents the last row above a
box from being clipped by the box's border pixels.

Related fix in ocr_pipeline_api.py: detect_rows() and detect_words()
use box_ranges_inner (shrunk by border_thickness, min 5px) instead of
full box_ranges for row exclusion.
"""

import pytest
import numpy as np
from dataclasses import dataclass


# ---------------------------------------------------------------------------
# Simulate the box_ranges_inner calculation from ocr_pipeline_api.py
# ---------------------------------------------------------------------------

def compute_box_ranges(zones: list[dict]) -> tuple[list, list]:
    """
    Replicates the box_ranges / box_ranges_inner calculation
    from detect_rows() in ocr_pipeline_api.py.

    Only zones whose "zone_type" is "box" and that carry a truthy "box"
    dict contribute a range; every other zone is skipped.

    Args:
        zones: Zone dicts. A box zone's "box" dict must provide "y" and
            "height"; "border_thickness" is optional and defaults to 0.

    Returns:
        A ``(box_ranges, box_ranges_inner)`` pair of lists of
        ``(y_start, y_end)`` tuples. ``box_ranges`` holds the full vertical
        extent of each box; ``box_ranges_inner`` holds the same extent
        shrunk on both sides by ``max(border_thickness, 5)``.
    """
    box_ranges = []
    box_ranges_inner = []
    for zone in zones:
        if zone.get("zone_type") == "box" and zone.get("box"):
            box = zone["box"]
            bt = max(box.get("border_thickness", 0), 5)  # minimum 5px margin
            box_ranges.append((box["y"], box["y"] + box["height"]))
            box_ranges_inner.append((box["y"] + bt, box["y"] + box["height"] - bt))
    return box_ranges, box_ranges_inner


def build_content_strips(box_ranges_inner: list, top_y: int, bottom_y: int) -> list:
    """
    Replicates the content_strips calculation
    from detect_rows() in ocr_pipeline_api.py.

    Walks the inner box ranges from top to bottom and collects the gaps
    between them (plus the gap above the first box and below the last).

    Args:
        box_ranges_inner: ``(y_start, y_end)`` tuples to cut out.
        top_y: Upper bound of the content area.
        bottom_y: Lower bound of the content area.

    Returns:
        A list of ``(y_start, y_end)`` strips. Strips with a height below
        20 are dropped.
    """
    sorted_boxes = sorted(box_ranges_inner, key=lambda r: r[0])
    content_strips = []
    strip_start = top_y
    for by_start, by_end in sorted_boxes:
        if by_start > strip_start:
            content_strips.append((strip_start, by_start))
        # max() keeps the cursor from moving backwards when ranges overlap.
        strip_start = max(strip_start, by_end)
    if strip_start < bottom_y:
        content_strips.append((strip_start, bottom_y))
    return [(ys, ye) for ys, ye in content_strips if ye - ys >= 20]


def row_in_box(row_y: int, row_height: int, box_ranges_inner: list) -> bool:
    """
    Replicates the _row_in_box filter
    from detect_words() in ocr_pipeline_api.py.

    Args:
        row_y: Top y coordinate of the row.
        row_height: Height of the row.
        box_ranges_inner: ``(y_start, y_end)`` tuples of inner box ranges.

    Returns:
        True when the row's vertical center lies inside any range. Ranges
        are treated as half-open: the start is inclusive, the end is not.
    """
    center_y = row_y + row_height / 2
    return any(by_s <= center_y < by_e for by_s, by_e in box_ranges_inner)


# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------

class TestBoxRangesInner:
    """Tests for box_ranges_inner calculation."""

    def test_border_thickness_shrinks_inner_range(self):
        """Inner range should be shrunk by border_thickness."""
        zones = [{
            "zone_type": "box",
            "box": {"x": 50, "y": 500, "width": 1100, "height": 200, "border_thickness": 10},
        }]
        box_ranges, inner = compute_box_ranges(zones)

        assert box_ranges == [(500, 700)]
        assert inner == [(510, 690)]  # shrunk by 10px on each side

    def test_minimum_5px_margin(self):
        """Even with border_thickness=0, minimum 5px margin should apply."""
        zones = [{
            "zone_type": "box",
            "box": {"x": 50, "y": 500, "width": 1100, "height": 200, "border_thickness": 0},
        }]
        _, inner = compute_box_ranges(zones)

        assert inner == [(505, 695)]  # minimum 5px applied

    def test_no_box_zones_returns_empty(self):
        """Without box zones, both ranges should be empty."""
        zones = [
            {"zone_type": "content", "y": 0, "height": 500},
        ]
        box_ranges, inner = compute_box_ranges(zones)

        assert box_ranges == []
        assert inner == []

    def test_multiple_boxes(self):
        """Multiple boxes should each get their own inner range."""
        zones = [
            {"zone_type": "box", "box": {"x": 50, "y": 300, "width": 1100, "height": 150, "border_thickness": 8}},
            {"zone_type": "box", "box": {"x": 50, "y": 700, "width": 1100, "height": 150, "border_thickness": 3}},
        ]
        box_ranges, inner = compute_box_ranges(zones)

        assert len(box_ranges) == 2
        assert len(inner) == 2
        assert inner[0] == (308, 442)  # 300+8 to 450-8
        assert inner[1] == (705, 845)  # 700+5(min) to 850-5(min)


class TestContentStrips:
    """Tests for content strip building with box_ranges_inner."""

    def test_single_box_creates_two_strips(self):
        """A single box in the middle should create two content strips."""
        inner = [(505, 695)]  # box inner at y=505..695
        strips = build_content_strips(inner, top_y=100, bottom_y=1700)

        assert len(strips) == 2
        assert strips[0] == (100, 505)  # above box
        assert strips[1] == (695, 1700)  # below box

    def test_content_strip_includes_box_border_area(self):
        """Content strips should INCLUDE the box border area (not just stop at box outer edge)."""
        # Box at y=500, height=200, border=10 → inner=(510, 690)
        inner = [(510, 690)]
        strips = build_content_strips(inner, top_y=100, bottom_y=1700)

        # Strip above extends to 510 (not 500), including border area
        assert strips[0] == (100, 510)
        # Strip below starts at 690 (not 700), including border area
        assert strips[1] == (690, 1700)

    def test_row_at_box_border_is_in_content_strip(self):
        """A row at y=495 (just above box at y=500) should be in the content strip."""
        # Box at y=500, height=200, border=10 → inner=(510, 690)
        inner = [(510, 690)]
        strips = build_content_strips(inner, top_y=100, bottom_y=1700)

        # Row at y=495, height=30 → center at y=510 → just at the edge
        row_center = 495 + 15  # = 510
        # This row center is at the boundary — it should be in the first strip
        # NOTE(review): this check is inclusive on the strip's upper bound,
        # whereas row_in_box() uses a half-open range and reports a center of
        # exactly 510 as inside the box. Confirm which boundary rule the
        # pipeline applies to content strips.
        in_first_strip = strips[0][0] <= row_center <= strips[0][1]
        assert in_first_strip

    def test_no_boxes_single_strip(self):
        """Without boxes, a single strip covering the full content should be returned."""
        strips = build_content_strips([], top_y=100, bottom_y=1700)

        assert len(strips) == 1
        assert strips[0] == (100, 1700)


class TestRowInBoxFilter:
    """Tests for the _row_in_box filter using box_ranges_inner."""

    def test_row_inside_box_is_excluded(self):
        """A row clearly inside the box inner range should be excluded."""
        inner = [(510, 690)]
        # Row at y=550, height=30 → center at 565
        assert row_in_box(550, 30, inner) is True

    def test_row_above_box_not_excluded(self):
        """A row above the box (at the border area) should NOT be excluded."""
        inner = [(510, 690)]
        # Row at y=490, height=30 → center at 505 → below inner start (510)
        assert row_in_box(490, 30, inner) is False

    def test_row_below_box_not_excluded(self):
        """A row below the box (at the border area) should NOT be excluded."""
        inner = [(510, 690)]
        # Row at y=695, height=30 → center at 710 → above inner end (690)
        assert row_in_box(695, 30, inner) is False

    def test_row_at_box_border_not_excluded(self):
        """
        A row overlapping with the box border should NOT be excluded.

        This is the key fix: previously, box_ranges (not inner) was used,
        which would exclude this row because its center (505) falls
        within the full box range (500-700).
        """
        # Full box range: (500, 700), inner: (510, 690)
        inner = [(510, 690)]
        # Row at y=490, height=30 → center at 505
        # With box_ranges (500, 700): 500 <= 505 < 700 → excluded (BUG!)
        # With box_ranges_inner (510, 690): 510 <= 505 → False → not excluded (FIXED!)
        assert row_in_box(490, 30, inner) is False

    def test_row_at_bottom_border_not_excluded(self):
        """A row overlapping with the bottom box border should NOT be excluded."""
        inner = [(510, 690)]
        # Row at y=685, height=30 → center at 700
        # With box_ranges (500, 700): 500 <= 700 < 700 → not excluded (edge)
        # With box_ranges_inner (510, 690): 510 <= 700 → True but 700 >= 690 → False
        assert row_in_box(685, 30, inner) is False

    def test_no_boxes_nothing_excluded(self):
        """Without box zones, no rows should be excluded."""
        assert row_in_box(500, 30, []) is False


class TestBoxBoundaryIntegration:
    """Integration test: simulate the full row → content strip → filter pipeline."""

    def test_boundary_row_preserved_with_inner_ranges(self):
        """
        End-to-end: A row at the box boundary is preserved in content strips
        and not filtered out by _row_in_box.

        Simulates the real scenario: page with a box at y=500..700,
        border_thickness=10. Row at y=488..518 (center=503) sits just
        above the box border.
        """
        zones = [{
            "zone_type": "box",
            "box": {"x": 50, "y": 500, "width": 1100, "height": 200, "border_thickness": 10},
        }]

        # Step 1: Compute inner ranges (the full ranges are not needed here)
        _, inner = compute_box_ranges(zones)
        assert inner == [(510, 690)]

        # Step 2: Build content strips
        strips = build_content_strips(inner, top_y=20, bottom_y=2400)
        assert len(strips) == 2
        # First strip extends to 510 (includes the border area 500-510)
        assert strips[0] == (20, 510)

        # Step 3: Check that the boundary row is NOT in box
        row_y, row_h = 488, 30  # center = 503
        assert row_in_box(row_y, row_h, inner) is False

        # Step 4: Verify the row's center falls within a content strip
        row_center = row_y + row_h / 2  # 503
        in_any_strip = any(ys <= row_center < ye for ys, ye in strips)
        assert in_any_strip, f"Row center {row_center} should be in content strips {strips}"

    def test_boundary_row_would_be_lost_with_full_ranges(self):
        """
        Demonstrates the bug: using full box_ranges (not inner) WOULD
        exclude the boundary row.
        """
        zones = [{
            "zone_type": "box",
            "box": {"x": 50, "y": 500, "width": 1100, "height": 200, "border_thickness": 10},
        }]
        box_ranges, _ = compute_box_ranges(zones)

        # The full range is (500, 700)
        row_center = 488 + 30 / 2  # 503
        # With full range: 500 <= 503 < 700 → would be excluded!
        in_box_full = any(by_s <= row_center < by_e for by_s, by_e in box_ranges)
        assert in_box_full is True, "Full range SHOULD incorrectly exclude this row"