Files
breakpilot-lehrer/klausur-service/backend/tests/test_box_boundary_rows.py
Benjamin Admin f39cbe9283 refactor: remove unused pages and backends (model-management, OCR legacy, GPU/vast.ai, video-chat, matrix)
Deleted pages:
- /ai/model-management (mock data only, no real backend)
- /ai/ocr-compare (old /vocab/ backend, replaced by ocr-kombi)
- /ai/ocr-pipeline (minimal session browser, redundant)
- /ai/ocr-overlay (legacy monolith, redundant)
- /ai/gpu (vast.ai GPU management, no longer used)
- /infrastructure/gpu (same)
- /communication/video-chat (moved to core)
- /communication/matrix (moved to core)

Deleted backends:
- backend-lehrer/infra/vast_client.py + vast_power.py
- backend-lehrer/meetings_api.py + jitsi_api.py
- website/app/api/admin/gpu/
- edu-search-service/scripts/vast_ai_extractor.py

Total: ~7,800 LOC removed. All code preserved in git history.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-23 13:14:12 +02:00

257 lines
10 KiB
Python

"""
Tests for box boundary row filtering logic (box_ranges_inner).
Verifies that rows at the border of box zones are NOT excluded during
row detection and word filtering. This prevents the last row above a
box from being clipped by the box's border pixels.
Related fix in ocr_pipeline_api.py: detect_rows() and detect_words()
use box_ranges_inner (shrunk by border_thickness, min 5px) instead of
full box_ranges for row exclusion.
"""
import pytest
import numpy as np
from dataclasses import dataclass
# ---------------------------------------------------------------------------
# Simulate the box_ranges_inner calculation from ocr_pipeline_api.py
# ---------------------------------------------------------------------------
def compute_box_ranges(zones: list[dict]) -> tuple[list, list]:
    """
    Derive the vertical spans of all box zones, mirroring detect_rows()
    in ocr_pipeline_api.py.

    Args:
        zones: Zone dicts. Only entries whose "zone_type" is "box" and that
            carry a non-empty "box" dict (with "y" and "height", optionally
            "border_thickness") contribute to the result.

    Returns:
        A pair ``(box_ranges, box_ranges_inner)``. Each is a list of
        ``(y_start, y_end)`` tuples in input order; the inner span is the
        outer span shrunk on both sides by the border thickness, with a
        floor of 5px.
    """
    outer_spans = []
    inner_spans = []
    for entry in zones:
        if entry.get("zone_type") != "box":
            continue
        geometry = entry.get("box")
        if not geometry:
            continue
        # Never shrink by less than 5px, even for thin or missing borders.
        margin = max(geometry.get("border_thickness", 0), 5)
        top = geometry["y"]
        bottom = top + geometry["height"]
        outer_spans.append((top, bottom))
        inner_spans.append((top + margin, bottom - margin))
    return outer_spans, inner_spans
def build_content_strips(box_ranges_inner: list, top_y: int, bottom_y: int) -> list:
    """
    Compute the vertical strips between boxes, mirroring the content_strips
    logic of detect_rows() in ocr_pipeline_api.py.

    Args:
        box_ranges_inner: ``(y_start, y_end)`` spans to cut out of the content.
        top_y: Upper bound of the content area.
        bottom_y: Lower bound of the content area.

    Returns:
        ``(y_start, y_end)`` tuples, top to bottom, covering the gaps between
        the given spans. Strips shorter than 20px are dropped.
    """
    ordered = list(box_ranges_inner)
    ordered.sort(key=lambda span: span[0])

    kept = []

    def _keep_if_tall_enough(start, end):
        # Strips below 20px are discarded.
        if end - start >= 20:
            kept.append((start, end))

    cursor = top_y
    for box_top, box_bottom in ordered:
        if cursor < box_top:
            _keep_if_tall_enough(cursor, box_top)
        # Only ever move the cursor downwards, so overlapping or nested
        # spans cannot pull it back up.
        if box_bottom > cursor:
            cursor = box_bottom
    if cursor < bottom_y:
        _keep_if_tall_enough(cursor, bottom_y)
    return kept
def row_in_box(row_y: int, row_height: int, box_ranges_inner: list) -> bool:
    """
    Tell whether a row belongs to a box, mirroring the _row_in_box filter
    of detect_words() in ocr_pipeline_api.py.

    Args:
        row_y: Top edge of the row.
        row_height: Height of the row.
        box_ranges_inner: ``(y_start, y_end)`` spans of the box interiors.

    Returns:
        True when the row's vertical center lies in any span, where the
        start is inclusive and the end is exclusive; False otherwise.
    """
    midpoint = row_y + row_height / 2
    for lower, upper in box_ranges_inner:
        if lower <= midpoint < upper:
            return True
    return False
# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------
class TestBoxRangesInner:
    """Checks how compute_box_ranges derives the outer and inner box spans."""

    @staticmethod
    def _box_zone(y, height, border_thickness):
        """Build a box zone dict; x and width are fixed and not used by the helper under test."""
        return {
            "zone_type": "box",
            "box": {
                "x": 50,
                "y": y,
                "width": 1100,
                "height": height,
                "border_thickness": border_thickness,
            },
        }

    def test_border_thickness_shrinks_inner_range(self):
        """The inner span is the outer span minus border_thickness on each side."""
        outer, inner = compute_box_ranges([self._box_zone(500, 200, 10)])
        assert outer == [(500, 700)]
        # 10px removed at the top and at the bottom.
        assert inner == [(510, 690)]

    def test_minimum_5px_margin(self):
        """A border_thickness of 0 still produces a 5px margin."""
        _, inner = compute_box_ranges([self._box_zone(500, 200, 0)])
        # The 5px floor applies on both sides.
        assert inner == [(505, 695)]

    def test_no_box_zones_returns_empty(self):
        """Zones that are not boxes yield two empty lists."""
        only_content = [{"zone_type": "content", "y": 0, "height": 500}]
        outer, inner = compute_box_ranges(only_content)
        assert outer == []
        assert inner == []

    def test_multiple_boxes(self):
        """Every box contributes its own inner span, in input order."""
        zones = [
            self._box_zone(300, 150, 8),
            self._box_zone(700, 150, 3),
        ]
        outer, inner = compute_box_ranges(zones)
        assert len(outer) == 2
        assert len(inner) == 2
        # First box: 300+8 .. 450-8.
        assert inner[0] == (308, 442)
        # Second box: border of 3 is raised to the 5px floor → 700+5 .. 850-5.
        assert inner[1] == (705, 845)
class TestContentStrips:
    """Tests for content strip building with box_ranges_inner."""

    def test_single_box_creates_two_strips(self):
        """A single box in the middle should create two content strips."""
        inner = [(505, 695)]  # box inner at y=505..695
        strips = build_content_strips(inner, top_y=100, bottom_y=1700)
        assert len(strips) == 2
        assert strips[0] == (100, 505)  # above box
        assert strips[1] == (695, 1700)  # below box

    def test_content_strip_includes_box_border_area(self):
        """Content strips should INCLUDE the box border area (not just stop at box outer edge)."""
        # Box at y=500, height=200, border=10 → inner=(510, 690)
        inner = [(510, 690)]
        strips = build_content_strips(inner, top_y=100, bottom_y=1700)
        # Strip above extends to 510 (not 500), including border area
        assert strips[0] == (100, 510)
        # Strip below starts at 690 (not 700), including border area
        assert strips[1] == (690, 1700)

    def test_row_at_box_border_is_in_content_strip(self):
        """A row whose center lies in the box's border area belongs to the strip above the box."""
        # Box at y=500, height=200, border=10 → inner=(510, 690)
        inner = [(510, 690)]
        strips = build_content_strips(inner, top_y=100, bottom_y=1700)
        strip_top, strip_bottom = strips[0]
        # Row at y=490, height=30 → center at 505, inside the border area 500..510.
        row_center = 490 + 30 / 2
        # Use the same half-open interval as row_in_box, so that "in a content
        # strip" and "in a box" can never both be true for one row center.
        assert strip_top <= row_center < strip_bottom
        # A center of exactly 510 is the first position owned by the box
        # interior, so it must NOT count as part of the strip above.
        assert not (strip_top <= 510 < strip_bottom)

    def test_no_boxes_single_strip(self):
        """Without boxes, a single strip covering the full content should be returned."""
        strips = build_content_strips([], top_y=100, bottom_y=1700)
        assert len(strips) == 1
        assert strips[0] == (100, 1700)
class TestRowInBoxFilter:
    """Checks row_in_box when it is fed inner (border-shrunk) box spans."""

    # Box with outer span (500, 700) and a 10px border → inner span (510, 690).
    # Shared read-only fixture; no test mutates it.
    INNER = [(510, 690)]

    def test_row_inside_box_is_excluded(self):
        """A row well within the inner span is reported as being in the box."""
        # y=550, height=30 → center 565.
        result = row_in_box(550, 30, self.INNER)
        assert result is True

    def test_row_above_box_not_excluded(self):
        """A row in the upper border area is not reported as being in the box."""
        # y=490, height=30 → center 505, which is smaller than the inner start 510.
        result = row_in_box(490, 30, self.INNER)
        assert result is False

    def test_row_below_box_not_excluded(self):
        """A row in the lower border area is not reported as being in the box."""
        # y=695, height=30 → center 710, which is past the inner end 690.
        result = row_in_box(695, 30, self.INNER)
        assert result is False

    def test_row_at_box_border_not_excluded(self):
        """A row that overlaps the top border is not reported as being in the box.

        This is the regression the fix targets: filtering with the full box
        span (500-700) instead of the inner span would drop this row, because
        its center (505) lies inside the full span.
        """
        # y=490, height=30 → center 505.
        #   full span  (500, 700): 500 <= 505 < 700 holds → row excluded (the bug)
        #   inner span (510, 690): 510 <= 505 fails       → row kept (the fix)
        result = row_in_box(490, 30, self.INNER)
        assert result is False

    def test_row_at_bottom_border_not_excluded(self):
        """A row that overlaps the bottom border is not reported as being in the box."""
        # y=685, height=30 → center 700.
        #   full span  (500, 700): 700 < 700 fails → not excluded (exclusive end)
        #   inner span (510, 690): 700 < 690 fails → not excluded
        result = row_in_box(685, 30, self.INNER)
        assert result is False

    def test_no_boxes_nothing_excluded(self):
        """With no box spans at all, no row is reported as being in a box."""
        result = row_in_box(500, 30, [])
        assert result is False
class TestBoxBoundaryIntegration:
    """Runs the helpers back to back: zones → inner spans → content strips → row filter."""

    @staticmethod
    def _page_with_one_box():
        """Return zones for a page with one box at y=500..700 and a 10px border."""
        return [{
            "zone_type": "box",
            "box": {"x": 50, "y": 500, "width": 1100, "height": 200, "border_thickness": 10},
        }]

    def test_boundary_row_preserved_with_inner_ranges(self):
        """
        A row touching the box boundary survives both stages: it lies in a
        content strip and row_in_box does not claim it.

        Scenario: box at y=500..700 with border_thickness=10, and a row
        spanning y=488..518 whose center is 503.
        """
        # Stage 1: inner spans.
        _outer, inner = compute_box_ranges(self._page_with_one_box())
        assert inner == [(510, 690)]

        # Stage 2: content strips; the first one reaches 510 and therefore
        # covers the border area 500-510.
        strips = build_content_strips(inner, top_y=20, bottom_y=2400)
        assert len(strips) == 2
        assert strips[0] == (20, 510)

        # Stage 3: the boundary row (center 503) is not attributed to the box.
        row_y = 488
        row_h = 30
        assert row_in_box(row_y, row_h, inner) is False

        # Stage 4: the same center falls into one of the content strips.
        row_center = row_y + row_h / 2
        in_any_strip = False
        for strip_top, strip_bottom in strips:
            if strip_top <= row_center < strip_bottom:
                in_any_strip = True
                break
        assert in_any_strip, f"Row center {row_center} should be in content strips {strips}"

    def test_boundary_row_would_be_lost_with_full_ranges(self):
        """
        Counter-example showing the original bug: filtering with the full
        (outer) box spans does exclude the boundary row.
        """
        full_ranges, _ = compute_box_ranges(self._page_with_one_box())
        # The full span is (500, 700); the row center is 488 + 15 = 503.
        row_center = 488 + 30 / 2
        # 500 <= 503 < 700 holds, so the outer span swallows the row.
        in_box_full = any(
            span_start <= row_center < span_end
            for span_start, span_end in full_ranges
        )
        assert in_box_full is True, "Full range SHOULD incorrectly exclude this row"