Deleted pages: - /ai/model-management (mock data only, no real backend) - /ai/ocr-compare (old /vocab/ backend, replaced by ocr-kombi) - /ai/ocr-pipeline (minimal session browser, redundant) - /ai/ocr-overlay (legacy monolith, redundant) - /ai/gpu (vast.ai GPU management, no longer used) - /infrastructure/gpu (same) - /communication/video-chat (moved to core) - /communication/matrix (moved to core) Deleted backends: - backend-lehrer/infra/vast_client.py + vast_power.py - backend-lehrer/meetings_api.py + jitsi_api.py - website/app/api/admin/gpu/ - edu-search-service/scripts/vast_ai_extractor.py Total: ~7,800 LOC removed. All code preserved in git history. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
257 lines
10 KiB
Python
257 lines
10 KiB
Python
"""
|
|
Tests for box boundary row filtering logic (box_ranges_inner).
|
|
|
|
Verifies that rows at the border of box zones are NOT excluded during
|
|
row detection and word filtering. This prevents the last row above a
|
|
box from being clipped by the box's border pixels.
|
|
|
|
Related fix in ocr_pipeline_api.py: detect_rows() and detect_words()
|
|
use box_ranges_inner (shrunk by border_thickness, min 5px) instead of
|
|
full box_ranges for row exclusion.
|
|
"""
|
|
|
|
import pytest
|
|
import numpy as np
|
|
from dataclasses import dataclass
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Simulate the box_ranges_inner calculation from ocr_pipeline_api.py
# ---------------------------------------------------------------------------
|
|
|
|
def compute_box_ranges(zones: list[dict]) -> tuple[list, list]:
    """
    Replicates the box_ranges / box_ranges_inner calculation
    from detect_rows() in ocr_pipeline_api.py.

    Args:
        zones: Zone dicts. Only entries whose ``"zone_type"`` is ``"box"``
            and whose ``"box"`` value is truthy are considered. The box
            dict must provide ``"y"`` and ``"height"``; a missing
            ``"border_thickness"`` is treated as 0.

    Returns:
        A ``(box_ranges, box_ranges_inner)`` pair of lists holding one
        ``(y_start, y_end)`` tuple per box zone, in input order.
        ``box_ranges`` spans the full box; ``box_ranges_inner`` is shrunk
        at both ends by ``max(border_thickness, 5)``.
    """
    box_ranges = []
    box_ranges_inner = []
    for zone in zones:
        # Skip anything that is not a box zone carrying box geometry.
        if zone.get("zone_type") != "box" or not zone.get("box"):
            continue
        box = zone["box"]
        top = box["y"]
        bottom = top + box["height"]
        margin = max(box.get("border_thickness", 0), 5)  # minimum 5px margin
        box_ranges.append((top, bottom))
        box_ranges_inner.append((top + margin, bottom - margin))
    return box_ranges, box_ranges_inner
|
|
|
|
|
|
def build_content_strips(box_ranges_inner: list, top_y: int, bottom_y: int) -> list:
    """
    Replicates the content_strips calculation from detect_rows() in ocr_pipeline_api.py.

    Args:
        box_ranges_inner: ``(y_start, y_end)`` tuples of the box interiors;
            they need not be sorted.
        top_y: Y coordinate where the first strip starts.
        bottom_y: Y coordinate where the last strip ends.

    Returns:
        ``(y_start, y_end)`` tuples for the gaps between ``top_y``, the
        boxes (processed in ascending start order) and ``bottom_y``.
        Strips less than 20px tall are dropped.
    """
    content_strips = []
    strip_start = top_y
    for by_start, by_end in sorted(box_ranges_inner, key=lambda r: r[0]):
        # Emit the gap in front of this box, if there is one.
        if by_start > strip_start:
            content_strips.append((strip_start, by_start))
        # max() keeps the cursor from moving backwards when a box ends
        # before a previously processed (overlapping) box.
        strip_start = max(strip_start, by_end)
    # Remaining area below the last box.
    if strip_start < bottom_y:
        content_strips.append((strip_start, bottom_y))
    return [(ys, ye) for ys, ye in content_strips if ye - ys >= 20]
|
|
|
|
|
|
def row_in_box(row_y: int, row_height: int, box_ranges_inner: list) -> bool:
    """
    Replicates the _row_in_box filter from detect_words() in ocr_pipeline_api.py.

    Args:
        row_y: Top Y coordinate of the row.
        row_height: Height of the row.
        box_ranges_inner: ``(y_start, y_end)`` tuples of the box interiors.

    Returns:
        True when the row's vertical center lies in any half-open range
        ``[y_start, y_end)``; False otherwise, including for an empty list.
    """
    center_y = row_y + row_height / 2
    for by_s, by_e in box_ranges_inner:
        # Start is inclusive, end is exclusive.
        if by_s <= center_y < by_e:
            return True
    return False
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------
|
|
|
|
class TestBoxRangesInner:
    """Tests for box_ranges_inner calculation."""

    def test_border_thickness_shrinks_inner_range(self):
        """Inner range should be shrunk by border_thickness."""
        zones = [{
            "zone_type": "box",
            "box": {"x": 50, "y": 500, "width": 1100, "height": 200, "border_thickness": 10},
        }]
        box_ranges, inner = compute_box_ranges(zones)

        assert box_ranges == [(500, 700)]
        assert inner == [(510, 690)]  # shrunk by 10px on each side

    def test_minimum_5px_margin(self):
        """Even with border_thickness=0, minimum 5px margin should apply."""
        zones = [{
            "zone_type": "box",
            "box": {"x": 50, "y": 500, "width": 1100, "height": 200, "border_thickness": 0},
        }]
        _, inner = compute_box_ranges(zones)

        assert inner == [(505, 695)]  # minimum 5px applied

    def test_no_box_zones_returns_empty(self):
        """Without box zones, both ranges should be empty."""
        zones = [
            {"zone_type": "content", "y": 0, "height": 500},
        ]
        box_ranges, inner = compute_box_ranges(zones)

        assert box_ranges == []
        assert inner == []

    def test_multiple_boxes(self):
        """Multiple boxes should each get their own inner range."""
        zones = [
            {"zone_type": "box", "box": {"x": 50, "y": 300, "width": 1100, "height": 150, "border_thickness": 8}},
            {"zone_type": "box", "box": {"x": 50, "y": 700, "width": 1100, "height": 150, "border_thickness": 3}},
        ]
        box_ranges, inner = compute_box_ranges(zones)

        assert len(box_ranges) == 2
        assert len(inner) == 2
        assert inner[0] == (308, 442)  # 300+8 to 450-8
        assert inner[1] == (705, 845)  # 700+5(min) to 850-5(min)
|
|
|
|
|
|
class TestContentStrips:
    """Tests for content strip building with box_ranges_inner."""

    def test_single_box_creates_two_strips(self):
        """A single box in the middle should create two content strips."""
        inner = [(505, 695)]  # box inner at y=505..695
        strips = build_content_strips(inner, top_y=100, bottom_y=1700)

        assert len(strips) == 2
        assert strips[0] == (100, 505)  # above box
        assert strips[1] == (695, 1700)  # below box

    def test_content_strip_includes_box_border_area(self):
        """Content strips should INCLUDE the box border area (not just stop at box outer edge)."""
        # Box at y=500, height=200, border=10 → inner=(510, 690)
        inner = [(510, 690)]
        strips = build_content_strips(inner, top_y=100, bottom_y=1700)

        # Strip above extends to 510 (not 500), including border area
        assert strips[0] == (100, 510)
        # Strip below starts at 690 (not 700), including border area
        assert strips[1] == (690, 1700)

    def test_row_at_box_border_is_in_content_strip(self):
        """A row at y=495 (just above box at y=500) should be in the content strip."""
        # Box at y=500, height=200, border=10 → inner=(510, 690)
        inner = [(510, 690)]
        strips = build_content_strips(inner, top_y=100, bottom_y=1700)

        # Row at y=495, height=30 → center at y=510 → just at the edge
        row_center = 495 + 15  # = 510
        # This row center is at the boundary — it should be in the first strip.
        # NOTE: the upper bound is deliberately inclusive here; row_in_box()
        # uses `by_s <= center`, so it counts this same center as inside the box.
        in_first_strip = strips[0][0] <= row_center <= strips[0][1]
        assert in_first_strip

    def test_no_boxes_single_strip(self):
        """Without boxes, a single strip covering the full content should be returned."""
        strips = build_content_strips([], top_y=100, bottom_y=1700)

        assert len(strips) == 1
        assert strips[0] == (100, 1700)
|
|
|
|
|
|
class TestRowInBoxFilter:
    """Tests for the _row_in_box filter using box_ranges_inner."""

    def test_row_inside_box_is_excluded(self):
        """A row clearly inside the box inner range should be excluded."""
        inner = [(510, 690)]
        # Row at y=550, height=30 → center at 565
        assert row_in_box(550, 30, inner) is True

    def test_row_above_box_not_excluded(self):
        """A row above the box (at the border area) should NOT be excluded."""
        inner = [(510, 690)]
        # Row at y=490, height=30 → center at 505 → below inner start (510)
        assert row_in_box(490, 30, inner) is False

    def test_row_below_box_not_excluded(self):
        """A row below the box (at the border area) should NOT be excluded."""
        inner = [(510, 690)]
        # Row at y=695, height=30 → center at 710 → above inner end (690)
        assert row_in_box(695, 30, inner) is False

    def test_row_at_box_border_not_excluded(self):
        """A row overlapping with the box border should NOT be excluded.

        This is the key fix: previously, box_ranges (not inner) was used,
        which would exclude this row because its center (505) falls within
        the full box range (500-700).
        """
        # Full box range: (500, 700), inner: (510, 690)
        inner = [(510, 690)]
        # Row at y=490, height=30 → center at 505
        # With box_ranges (500, 700): 500 <= 505 < 700 → excluded (BUG!)
        # With box_ranges_inner (510, 690): 510 <= 505 → False → not excluded (FIXED!)
        assert row_in_box(490, 30, inner) is False

    def test_row_at_bottom_border_not_excluded(self):
        """A row overlapping with the bottom box border should NOT be excluded."""
        inner = [(510, 690)]
        # Row at y=685, height=30 → center at 700
        # With box_ranges (500, 700): 500 <= 700 < 700 → not excluded (edge)
        # With box_ranges_inner (510, 690): 510 <= 700 → True but 700 >= 690 → False
        assert row_in_box(685, 30, inner) is False

    def test_no_boxes_nothing_excluded(self):
        """Without box zones, no rows should be excluded."""
        assert row_in_box(500, 30, []) is False
|
|
|
|
|
|
class TestBoxBoundaryIntegration:
    """Integration test: simulate the full row → content strip → filter pipeline."""

    def test_boundary_row_preserved_with_inner_ranges(self):
        """
        End-to-end: A row at the box boundary is preserved in content strips
        and not filtered out by _row_in_box.

        Simulates the real scenario: page with a box at y=500..700,
        border_thickness=10. Row at y=488..518 (center=503) sits just
        above the box border.
        """
        zones = [{
            "zone_type": "box",
            "box": {"x": 50, "y": 500, "width": 1100, "height": 200, "border_thickness": 10},
        }]

        # Step 1: Compute inner ranges (the full ranges are not needed here)
        _, inner = compute_box_ranges(zones)
        assert inner == [(510, 690)]

        # Step 2: Build content strips
        strips = build_content_strips(inner, top_y=20, bottom_y=2400)
        assert len(strips) == 2
        # First strip extends to 510 (includes the border area 500-510)
        assert strips[0] == (20, 510)

        # Step 3: Check that the boundary row is NOT in box
        row_y, row_h = 488, 30  # center = 503
        assert row_in_box(row_y, row_h, inner) is False

        # Step 4: Verify the row's center falls within a content strip
        row_center = row_y + row_h / 2  # 503
        in_any_strip = any(ys <= row_center < ye for ys, ye in strips)
        assert in_any_strip, f"Row center {row_center} should be in content strips {strips}"

    def test_boundary_row_would_be_lost_with_full_ranges(self):
        """
        Demonstrates the bug: using full box_ranges (not inner) WOULD
        exclude the boundary row.
        """
        zones = [{
            "zone_type": "box",
            "box": {"x": 50, "y": 500, "width": 1100, "height": 200, "border_thickness": 10},
        }]
        box_ranges, _ = compute_box_ranges(zones)

        # The full range is (500, 700)
        row_center = 488 + 30 / 2  # 503
        # With full range: 500 <= 503 < 700 → would be excluded!
        in_box_full = any(by_s <= row_center < by_e for by_s, by_e in box_ranges)
        assert in_box_full is True, "Full range SHOULD incorrectly exclude this row"
|