feat(ocr-pipeline): distinguish header/footer vs margin_top/margin_bottom
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 2m0s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 19s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 2m0s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 19s
Check for actual ink content in detected top/bottom regions:
- 'header'/'footer' when text is present (e.g. title, page number)
- 'margin_top'/'margin_bottom' when the region is empty page margin

Also update all skip-type sets and color maps for the new types.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -98,7 +98,7 @@ ENGLISH_FUNCTION_WORDS = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'to', 'o
|
|||||||
@dataclass
|
@dataclass
|
||||||
class PageRegion:
|
class PageRegion:
|
||||||
"""A detected region on the page."""
|
"""A detected region on the page."""
|
||||||
type: str # 'column_en', 'column_de', 'column_example', 'page_ref', 'column_marker', 'column_text', 'header', 'footer'
|
type: str # 'column_en', 'column_de', 'column_example', 'page_ref', 'column_marker', 'column_text', 'header', 'footer', 'margin_top', 'margin_bottom'
|
||||||
x: int
|
x: int
|
||||||
y: int
|
y: int
|
||||||
width: int
|
width: int
|
||||||
@@ -924,12 +924,10 @@ def analyze_layout(layout_img: np.ndarray, ocr_img: np.ndarray) -> List[PageRegi
|
|||||||
# Add header/footer info (gap-based detection with fallback)
|
# Add header/footer info (gap-based detection with fallback)
|
||||||
_add_header_footer(regions, top_y, bottom_y, w, h, inv=inv)
|
_add_header_footer(regions, top_y, bottom_y, w, h, inv=inv)
|
||||||
|
|
||||||
has_header = any(r.type == 'header' for r in regions)
|
top_region = next((r.type for r in regions if r.type in ('header', 'margin_top')), 'none')
|
||||||
has_footer = any(r.type == 'footer' for r in regions)
|
bottom_region = next((r.type for r in regions if r.type in ('footer', 'margin_bottom')), 'none')
|
||||||
col_count = len([r for r in regions if r.type.startswith('column')])
|
col_count = len([r for r in regions if r.type.startswith('column')])
|
||||||
logger.info(f"Layout: {col_count} columns, "
|
logger.info(f"Layout: {col_count} columns, top={top_region}, bottom={bottom_region}")
|
||||||
f"header={'yes' if has_header else 'no'}, "
|
|
||||||
f"footer={'yes' if has_footer else 'no'}")
|
|
||||||
|
|
||||||
return regions
|
return regions
|
||||||
|
|
||||||
@@ -2042,7 +2040,8 @@ def _build_margin_regions(
|
|||||||
|
|
||||||
# Right margin: from end of last content column to image edge
|
# Right margin: from end of last content column to image edge
|
||||||
non_margin = [r for r in all_regions
|
non_margin = [r for r in all_regions
|
||||||
if r.type not in ('margin_left', 'margin_right', 'header', 'footer')]
|
if r.type not in ('margin_left', 'margin_right', 'header', 'footer',
|
||||||
|
'margin_top', 'margin_bottom')]
|
||||||
if non_margin:
|
if non_margin:
|
||||||
last_col_end = max(r.x + r.width for r in non_margin)
|
last_col_end = max(r.x + r.width for r in non_margin)
|
||||||
else:
|
else:
|
||||||
@@ -2625,13 +2624,37 @@ def _detect_header_footer_gaps(
|
|||||||
return header_y, footer_y
|
return header_y, footer_y
|
||||||
|
|
||||||
|
|
||||||
|
def _region_has_content(inv: np.ndarray, y_start: int, y_end: int,
|
||||||
|
min_density: float = 0.005) -> bool:
|
||||||
|
"""Check whether a horizontal strip contains meaningful ink.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
inv: Inverted binarized image (white-on-black).
|
||||||
|
y_start: Top of the region (inclusive).
|
||||||
|
y_end: Bottom of the region (exclusive).
|
||||||
|
min_density: Fraction of white pixels required to count as content.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True if the region contains text/graphics, False if empty margin.
|
||||||
|
"""
|
||||||
|
if y_start >= y_end:
|
||||||
|
return False
|
||||||
|
strip = inv[y_start:y_end, :]
|
||||||
|
density = float(np.sum(strip)) / (strip.shape[0] * strip.shape[1] * 255)
|
||||||
|
return density > min_density
|
||||||
|
|
||||||
|
|
||||||
def _add_header_footer(regions: List[PageRegion], top_y: int, bottom_y: int,
|
def _add_header_footer(regions: List[PageRegion], top_y: int, bottom_y: int,
|
||||||
img_w: int, img_h: int,
|
img_w: int, img_h: int,
|
||||||
inv: Optional[np.ndarray] = None) -> None:
|
inv: Optional[np.ndarray] = None) -> None:
|
||||||
"""Add header/footer regions in-place.
|
"""Add header/footer/margin regions in-place.
|
||||||
|
|
||||||
When *inv* is provided, uses gap-based detection to find header/footer
|
Uses gap-based detection when *inv* is provided, otherwise falls back
|
||||||
boundaries. Falls back to simple top_y/bottom_y check otherwise.
|
to simple top_y/bottom_y bounds.
|
||||||
|
|
||||||
|
Region types depend on whether there is actual content (text/graphics):
|
||||||
|
- 'header' / 'footer' — region contains text (e.g. title, page number)
|
||||||
|
- 'margin_top' / 'margin_bottom' — region is empty page margin
|
||||||
"""
|
"""
|
||||||
header_y: Optional[int] = None
|
header_y: Optional[int] = None
|
||||||
footer_y: Optional[int] = None
|
footer_y: Optional[int] = None
|
||||||
@@ -2639,17 +2662,28 @@ def _add_header_footer(regions: List[PageRegion], top_y: int, bottom_y: int,
|
|||||||
if inv is not None:
|
if inv is not None:
|
||||||
header_y, footer_y = _detect_header_footer_gaps(inv, img_w, img_h)
|
header_y, footer_y = _detect_header_footer_gaps(inv, img_w, img_h)
|
||||||
|
|
||||||
# Gap-based header
|
# --- Top region ---
|
||||||
if header_y is not None and header_y > 10:
|
top_boundary = header_y if header_y is not None and header_y > 10 else (
|
||||||
regions.append(PageRegion(type='header', x=0, y=0, width=img_w, height=header_y))
|
top_y if top_y > 10 else None
|
||||||
elif top_y > 10:
|
)
|
||||||
regions.append(PageRegion(type='header', x=0, y=0, width=img_w, height=top_y))
|
if top_boundary is not None:
|
||||||
|
has_content = inv is not None and _region_has_content(inv, 0, top_boundary)
|
||||||
|
rtype = 'header' if has_content else 'margin_top'
|
||||||
|
regions.append(PageRegion(type=rtype, x=0, y=0, width=img_w, height=top_boundary))
|
||||||
|
logger.info(f"HeaderFooter: top region type={rtype} height={top_boundary}px "
|
||||||
|
f"(has_content={has_content})")
|
||||||
|
|
||||||
# Gap-based footer
|
# --- Bottom region ---
|
||||||
if footer_y is not None and footer_y < img_h - 10:
|
bottom_boundary = footer_y if footer_y is not None and footer_y < img_h - 10 else (
|
||||||
regions.append(PageRegion(type='footer', x=0, y=footer_y, width=img_w, height=img_h - footer_y))
|
bottom_y if bottom_y < img_h - 10 else None
|
||||||
elif bottom_y < img_h - 10:
|
)
|
||||||
regions.append(PageRegion(type='footer', x=0, y=bottom_y, width=img_w, height=img_h - bottom_y))
|
if bottom_boundary is not None:
|
||||||
|
has_content = inv is not None and _region_has_content(inv, bottom_boundary, img_h)
|
||||||
|
rtype = 'footer' if has_content else 'margin_bottom'
|
||||||
|
regions.append(PageRegion(type=rtype, x=0, y=bottom_boundary, width=img_w,
|
||||||
|
height=img_h - bottom_boundary))
|
||||||
|
logger.info(f"HeaderFooter: bottom region type={rtype} y={bottom_boundary} "
|
||||||
|
f"height={img_h - bottom_boundary}px (has_content={has_content})")
|
||||||
|
|
||||||
|
|
||||||
# --- Main Entry Point ---
|
# --- Main Entry Point ---
|
||||||
@@ -2690,7 +2724,7 @@ def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Li
|
|||||||
col_count = len([r for r in regions if r.type.startswith('column') or r.type == 'page_ref'])
|
col_count = len([r for r in regions if r.type.startswith('column') or r.type == 'page_ref'])
|
||||||
methods = set(r.classification_method for r in regions if r.classification_method)
|
methods = set(r.classification_method for r in regions if r.classification_method)
|
||||||
logger.info(f"LayoutByWords: {col_count} columns detected (methods: {methods}): "
|
logger.info(f"LayoutByWords: {col_count} columns detected (methods: {methods}): "
|
||||||
f"{[(r.type, r.x, r.width, r.classification_confidence) for r in regions if r.type not in ('header','footer')]}")
|
f"{[(r.type, r.x, r.width, r.classification_confidence) for r in regions if r.type not in ('header','footer','margin_top','margin_bottom')]}")
|
||||||
|
|
||||||
return regions
|
return regions
|
||||||
|
|
||||||
@@ -3797,7 +3831,7 @@ def build_cell_grid(
|
|||||||
return [], []
|
return [], []
|
||||||
|
|
||||||
# Use columns only — skip ignore, header, footer, page_ref
|
# Use columns only — skip ignore, header, footer, page_ref
|
||||||
_skip_types = {'column_ignore', 'header', 'footer', 'page_ref', 'margin_left', 'margin_right'}
|
_skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'page_ref', 'margin_left', 'margin_right'}
|
||||||
relevant_cols = [c for c in column_regions if c.type not in _skip_types]
|
relevant_cols = [c for c in column_regions if c.type not in _skip_types]
|
||||||
if not relevant_cols:
|
if not relevant_cols:
|
||||||
logger.warning("build_cell_grid: no usable columns found")
|
logger.warning("build_cell_grid: no usable columns found")
|
||||||
@@ -3959,7 +3993,7 @@ def build_cell_grid_streaming(
|
|||||||
if not content_rows:
|
if not content_rows:
|
||||||
return
|
return
|
||||||
|
|
||||||
_skip_types = {'column_ignore', 'header', 'footer', 'page_ref', 'margin_left', 'margin_right'}
|
_skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'page_ref', 'margin_left', 'margin_right'}
|
||||||
relevant_cols = [c for c in column_regions if c.type not in _skip_types]
|
relevant_cols = [c for c in column_regions if c.type not in _skip_types]
|
||||||
if not relevant_cols:
|
if not relevant_cols:
|
||||||
return
|
return
|
||||||
@@ -4444,7 +4478,7 @@ def run_multi_pass_ocr(ocr_img: np.ndarray,
|
|||||||
"""
|
"""
|
||||||
results: Dict[str, List[Dict]] = {}
|
results: Dict[str, List[Dict]] = {}
|
||||||
|
|
||||||
_ocr_skip = {'header', 'footer', 'margin_left', 'margin_right'}
|
_ocr_skip = {'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
|
||||||
for region in regions:
|
for region in regions:
|
||||||
if region.type in _ocr_skip:
|
if region.type in _ocr_skip:
|
||||||
continue # Skip non-content regions
|
continue # Skip non-content regions
|
||||||
|
|||||||
@@ -840,6 +840,8 @@ async def _get_columns_overlay(session_id: str) -> Response:
|
|||||||
"column_ignore": (180, 180, 180), # Light Gray
|
"column_ignore": (180, 180, 180), # Light Gray
|
||||||
"header": (128, 128, 128), # Gray
|
"header": (128, 128, 128), # Gray
|
||||||
"footer": (128, 128, 128), # Gray
|
"footer": (128, 128, 128), # Gray
|
||||||
|
"margin_top": (100, 100, 100), # Dark Gray
|
||||||
|
"margin_bottom": (100, 100, 100), # Dark Gray
|
||||||
}
|
}
|
||||||
|
|
||||||
overlay = img.copy()
|
overlay = img.copy()
|
||||||
@@ -1226,7 +1228,7 @@ async def _word_stream_generator(
|
|||||||
|
|
||||||
# Compute grid shape upfront for the meta event
|
# Compute grid shape upfront for the meta event
|
||||||
n_content_rows = len([r for r in row_geoms if r.row_type == 'content'])
|
n_content_rows = len([r for r in row_geoms if r.row_type == 'content'])
|
||||||
_skip_types = {'column_ignore', 'header', 'footer', 'page_ref'}
|
_skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'page_ref'}
|
||||||
n_cols = len([c for c in col_regions if c.type not in _skip_types])
|
n_cols = len([c for c in col_regions if c.type not in _skip_types])
|
||||||
|
|
||||||
# Determine layout
|
# Determine layout
|
||||||
@@ -1712,6 +1714,8 @@ async def _get_rows_overlay(session_id: str) -> Response:
|
|||||||
"content": (255, 180, 0), # Blue
|
"content": (255, 180, 0), # Blue
|
||||||
"header": (128, 128, 128), # Gray
|
"header": (128, 128, 128), # Gray
|
||||||
"footer": (128, 128, 128), # Gray
|
"footer": (128, 128, 128), # Gray
|
||||||
|
"margin_top": (100, 100, 100), # Dark Gray
|
||||||
|
"margin_bottom": (100, 100, 100), # Dark Gray
|
||||||
}
|
}
|
||||||
|
|
||||||
overlay = img.copy()
|
overlay = img.copy()
|
||||||
|
|||||||
@@ -35,6 +35,8 @@ from cv_vocab_pipeline import (
|
|||||||
_filter_narrow_runs,
|
_filter_narrow_runs,
|
||||||
_build_margin_regions,
|
_build_margin_regions,
|
||||||
_detect_header_footer_gaps,
|
_detect_header_footer_gaps,
|
||||||
|
_region_has_content,
|
||||||
|
_add_header_footer,
|
||||||
analyze_layout,
|
analyze_layout,
|
||||||
_group_words_into_lines,
|
_group_words_into_lines,
|
||||||
match_lines_to_vocab,
|
match_lines_to_vocab,
|
||||||
@@ -340,7 +342,8 @@ class TestLayoutAnalysis:
|
|||||||
ocr_img = create_ocr_image(text_like_image)
|
ocr_img = create_ocr_image(text_like_image)
|
||||||
layout_img = create_layout_image(text_like_image)
|
layout_img = create_layout_image(text_like_image)
|
||||||
regions = analyze_layout(layout_img, ocr_img)
|
regions = analyze_layout(layout_img, ocr_img)
|
||||||
valid_types = {'column_en', 'column_de', 'column_example', 'header', 'footer'}
|
valid_types = {'column_en', 'column_de', 'column_example',
|
||||||
|
'header', 'footer', 'margin_top', 'margin_bottom'}
|
||||||
for r in regions:
|
for r in regions:
|
||||||
assert r.type in valid_types, f"Unexpected region type: {r.type}"
|
assert r.type in valid_types, f"Unexpected region type: {r.type}"
|
||||||
|
|
||||||
@@ -976,7 +979,7 @@ class TestMarginRegions:
|
|||||||
|
|
||||||
def test_margins_in_skip_types(self):
|
def test_margins_in_skip_types(self):
|
||||||
"""Verify margin types are in the skip set used by build_cell_grid."""
|
"""Verify margin types are in the skip set used by build_cell_grid."""
|
||||||
skip = {'column_ignore', 'header', 'footer', 'page_ref', 'margin_left', 'margin_right'}
|
skip = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'page_ref', 'margin_left', 'margin_right'}
|
||||||
assert 'margin_left' in skip
|
assert 'margin_left' in skip
|
||||||
assert 'margin_right' in skip
|
assert 'margin_right' in skip
|
||||||
|
|
||||||
@@ -1090,6 +1093,72 @@ class TestHeaderFooterGapDetection:
|
|||||||
assert footer_y is None
|
assert footer_y is None
|
||||||
|
|
||||||
|
|
||||||
|
class TestRegionContentCheck:
|
||||||
|
"""Tests for _region_has_content() and _add_header_footer() type selection."""
|
||||||
|
|
||||||
|
def _make_inv(self, height: int, width: int, bands: list) -> np.ndarray:
|
||||||
|
inv = np.zeros((height, width), dtype=np.uint8)
|
||||||
|
for y1, y2 in bands:
|
||||||
|
inv[y1:y2, :] = 255
|
||||||
|
return inv
|
||||||
|
|
||||||
|
def test_region_with_text_has_content(self):
|
||||||
|
"""Strip with ink → True."""
|
||||||
|
inv = self._make_inv(1000, 800, [(10, 50)])
|
||||||
|
assert _region_has_content(inv, 0, 100) is True
|
||||||
|
|
||||||
|
def test_empty_region_no_content(self):
|
||||||
|
"""Strip without ink → False."""
|
||||||
|
inv = self._make_inv(1000, 800, [(500, 600)])
|
||||||
|
assert _region_has_content(inv, 0, 100) is False
|
||||||
|
|
||||||
|
def test_header_with_text_is_header(self):
|
||||||
|
"""Top region with text → type='header' (via content bounds fallback)."""
|
||||||
|
h, w = 1000, 800
|
||||||
|
# Header text at 20-60, body starts at 200
|
||||||
|
inv = self._make_inv(h, w, [(20, 60), (200, 900)])
|
||||||
|
regions: list = []
|
||||||
|
# Simulate content bounds detecting body start at y=200
|
||||||
|
_add_header_footer(regions, top_y=200, bottom_y=h, img_w=w, img_h=h, inv=inv)
|
||||||
|
top_regions = [r for r in regions if r.type in ('header', 'margin_top')]
|
||||||
|
assert len(top_regions) == 1
|
||||||
|
assert top_regions[0].type == 'header' # text at 20-60 → header
|
||||||
|
|
||||||
|
def test_empty_top_is_margin_top(self):
|
||||||
|
"""Top region without text → type='margin_top'."""
|
||||||
|
h, w = 1000, 800
|
||||||
|
# Content only in body area (200-900), nothing in top 200px
|
||||||
|
inv = self._make_inv(h, w, [(200, 900)])
|
||||||
|
regions: list = []
|
||||||
|
# Simulate top_y=200 from content bounds
|
||||||
|
_add_header_footer(regions, top_y=200, bottom_y=h, img_w=w, img_h=h, inv=inv)
|
||||||
|
top_regions = [r for r in regions if r.type in ('header', 'margin_top')]
|
||||||
|
assert len(top_regions) == 1
|
||||||
|
assert top_regions[0].type == 'margin_top'
|
||||||
|
|
||||||
|
def test_empty_bottom_is_margin_bottom(self):
|
||||||
|
"""Bottom region without text → type='margin_bottom'."""
|
||||||
|
h, w = 1000, 800
|
||||||
|
# Content only in top/body (50-700), nothing below 700
|
||||||
|
inv = self._make_inv(h, w, [(50, 700)])
|
||||||
|
regions: list = []
|
||||||
|
_add_header_footer(regions, top_y=50, bottom_y=700, img_w=w, img_h=h, inv=inv)
|
||||||
|
bottom_regions = [r for r in regions if r.type in ('footer', 'margin_bottom')]
|
||||||
|
assert len(bottom_regions) == 1
|
||||||
|
assert bottom_regions[0].type == 'margin_bottom'
|
||||||
|
|
||||||
|
def test_footer_with_page_number_is_footer(self):
|
||||||
|
"""Bottom region with page number text → type='footer'."""
|
||||||
|
h, w = 1000, 800
|
||||||
|
# Body 50-700, page number at 900-930
|
||||||
|
inv = self._make_inv(h, w, [(50, 700), (900, 930)])
|
||||||
|
regions: list = []
|
||||||
|
_add_header_footer(regions, top_y=50, bottom_y=700, img_w=w, img_h=h, inv=inv)
|
||||||
|
bottom_regions = [r for r in regions if r.type in ('footer', 'margin_bottom')]
|
||||||
|
assert len(bottom_regions) == 1
|
||||||
|
assert bottom_regions[0].type == 'footer'
|
||||||
|
|
||||||
|
|
||||||
# =============================================
|
# =============================================
|
||||||
# RUN TESTS
|
# RUN TESTS
|
||||||
# =============================================
|
# =============================================
|
||||||
|
|||||||
Reference in New Issue
Block a user