feat(ocr-pipeline): filter scan artifacts in content bounds and add margin regions
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m50s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 18s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m50s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 18s
Thin black lines (1-5px) at page edges from scanning were incorrectly detected as content, shifting content bounds and creating spurious IGNORE columns. This filters narrow projection runs (<1% of image dimension) and introduces explicit margin_left/margin_right regions for downstream page reconstruction. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -32,6 +32,8 @@ from cv_vocab_pipeline import (
|
||||
create_ocr_image,
|
||||
create_layout_image,
|
||||
_find_content_bounds,
|
||||
_filter_narrow_runs,
|
||||
_build_margin_regions,
|
||||
analyze_layout,
|
||||
_group_words_into_lines,
|
||||
match_lines_to_vocab,
|
||||
@@ -843,6 +845,150 @@ class TestMergeContinuationRows:
|
||||
assert len(result) == 2
|
||||
|
||||
|
||||
# =============================================
|
||||
# Test: Content-Bounds Scan-Artifact Filtering
|
||||
# =============================================
|
||||
|
||||
class TestContentBoundsFiltering:
    """Check that _find_content_bounds disregards narrow scan artifacts.

    Every test paints rectangles of 255 onto a zero-filled 400x600 uint8
    array and inspects the (left, right, top, bottom) tuple that
    _find_content_bounds returns for it.
    """

    @staticmethod
    def _blank_page():
        """Return a fresh zero-filled 400x600 uint8 array."""
        return np.zeros((400, 600), dtype=np.uint8)

    def test_thin_vertical_line_ignored(self):
        """A 2px line near the left edge must not drag left_x towards it."""
        page = self._blank_page()
        # 2px thin vertical scan artifact at x=5..6
        page[50:350, 5:7] = 255
        # Real content block in the middle of the page
        page[50:350, 100:550] = 255

        left, _right, _top, _bottom = _find_content_bounds(page)

        # The bound has to sit at the real content (x=100), not at the artifact
        assert left >= 90, f"left_x={left} should be >=90 (near real content, not artifact)"

    def test_thick_content_preserved(self):
        """A 50px wide block counts as genuine content and must survive."""
        page = self._blank_page()
        page[50:350, 200:500] = 255  # wide block
        page[50:350, 80:130] = 255   # 50px wide block further left

        left, _right, _top, _bottom = _find_content_bounds(page)

        assert left <= 82, f"left_x={left} should be <=82 (50px block is real content)"

    def test_no_artifacts_unchanged(self):
        """Without artifacts the bounds should hug the single content block."""
        page = self._blank_page()
        page[100:300, 50:550] = 255

        left, right, top, bottom = _find_content_bounds(page)

        assert left <= 52
        assert right >= 548
        assert top <= 105
        assert bottom >= 295

    def test_right_edge_artifact_ignored(self):
        """A thin line near the right edge must not drag right_x towards it."""
        page = self._blank_page()
        page[50:350, 595:598] = 255  # 3px artifact at right edge
        page[50:350, 50:500] = 255   # real content

        _left, right, _top, _bottom = _find_content_bounds(page)

        assert right <= 510, f"right_x={right} should be <=510, ignoring right-edge artifact"

    def test_horizontal_line_ignored(self):
        """A thin horizontal line near the top must not drag top_y upward."""
        page = self._blank_page()
        page[2:4, 50:550] = 255      # 2px horizontal artifact at top
        page[100:350, 50:550] = 255  # real content

        _left, _right, top, _bottom = _find_content_bounds(page)

        assert top >= 90, f"top_y={top} should be >=90 (ignoring thin top line)"
|
||||
|
||||
|
||||
class TestFilterNarrowRuns:
    """Exercise the _filter_narrow_runs helper in isolation."""

    def test_removes_short_run(self):
        """A run shorter than min_width is cleared; a longer one is kept."""
        # Layout: gap, 2-wide run (idx 1-2), gap, 5-wide run (idx 4-8), gap
        flags = np.array([False] + [True] * 2 + [False] + [True] * 5 + [False])

        filtered = _filter_narrow_runs(flags, min_width=3)

        # The 2-wide run is below min_width=3 and has to be dropped
        assert not filtered[1]
        assert not filtered[2]
        # Both ends of the 5-wide run have to survive
        assert filtered[4]
        assert filtered[8]

    def test_keeps_wide_run(self):
        """A single run that is wider than min_width stays fully set."""
        flags = np.array([True] * 10)

        filtered = _filter_narrow_runs(flags, min_width=5)

        assert all(filtered)

    def test_all_narrow(self):
        """When every run is narrower than min_width nothing remains set."""
        # Runs of width 2 and 1, both below min_width=3
        flags = np.array([True, True, False, True, False])

        filtered = _filter_narrow_runs(flags, min_width=3)

        assert not any(filtered)
|
||||
|
||||
|
||||
# =============================================
|
||||
# Test: Margin Regions
|
||||
# =============================================
|
||||
|
||||
class TestMarginRegions:
    """Test _build_margin_regions and margin integration.

    Each test passes a list of column PageRegion objects plus the content
    bounds and image width to _build_margin_regions, then inspects the
    returned regions by their ``type`` ('margin_left' / 'margin_right').
    """

    def test_margin_left_created(self):
        """When left_x > 5, a margin_left region should be created."""
        existing = [
            PageRegion(type='column_en', x=100, y=50, width=200, height=300),
            PageRegion(type='column_de', x=320, y=50, width=200, height=300),
        ]
        margins = _build_margin_regions(existing, left_x=100, right_x=520,
                                        img_w=600, top_y=50, content_h=300)
        left_margins = [m for m in margins if m.type == 'margin_left']
        assert len(left_margins) == 1
        ml = left_margins[0]
        # The margin spans from the image edge up to left_x
        assert ml.x == 0
        assert ml.width == 100

    def test_margin_right_created(self):
        """When there's space after the last column, margin_right should be created."""
        existing = [
            PageRegion(type='column_en', x=50, y=50, width=200, height=300),
            PageRegion(type='column_de', x=260, y=50, width=200, height=300),
        ]
        # last_col_end = 260 + 200 = 460, img_w = 600 -> gap = 140
        margins = _build_margin_regions(existing, left_x=50, right_x=460,
                                        img_w=600, top_y=50, content_h=300)
        right_margins = [m for m in margins if m.type == 'margin_right']
        assert len(right_margins) == 1
        mr = right_margins[0]
        assert mr.x == 460
        assert mr.width == 140

    def test_no_margin_when_flush(self):
        """When columns are flush with the image edges, no margins should appear."""
        existing = [
            PageRegion(type='column_en', x=0, y=0, width=300, height=400),
            PageRegion(type='column_de', x=300, y=0, width=300, height=400),
        ]
        margins = _build_margin_regions(existing, left_x=0, right_x=600,
                                        img_w=600, top_y=0, content_h=400)
        assert len(margins) == 0

    def test_margins_in_skip_types(self):
        """Verify margin types are in the skip set used by build_cell_grid."""
        # NOTE(review): this set is a local literal, so the assertions below
        # only check the literal against itself. To guard against regressions
        # the test should reference the actual skip set that build_cell_grid
        # uses -- TODO confirm where that set lives and import it here.
        skip = {'column_ignore', 'header', 'footer', 'page_ref', 'margin_left', 'margin_right'}
        assert 'margin_left' in skip
        assert 'margin_right' in skip

    def test_margin_confidence_and_method(self):
        """Margin regions should have confidence 1.0 and method 'content_bounds'."""
        existing = [PageRegion(type='column_en', x=80, y=20, width=400, height=500)]
        margins = _build_margin_regions(existing, left_x=80, right_x=480,
                                        img_w=600, top_y=20, content_h=500)
        # Without this guard the loop below passes vacuously on an empty
        # list. With left_x=80 and 120px free right of the column, the
        # sibling tests in this class expect margins to be produced.
        assert margins, "expected at least one margin region to be built"
        for m in margins:
            assert m.classification_confidence == 1.0
            assert m.classification_method == 'content_bounds'
|
||||
|
||||
|
||||
# =============================================
|
||||
# RUN TESTS
|
||||
# =============================================
|
||||
|
||||
Reference in New Issue
Block a user