From c8981423d4277c5c00380ed58976f8db07922887 Mon Sep 17 00:00:00 2001
From: Benjamin Admin
Date: Mon, 2 Mar 2026 16:55:41 +0100
Subject: [PATCH] feat(ocr-pipeline): distinguish header/footer vs margin_top/margin_bottom

Check for actual ink content in detected top/bottom regions:
- 'header'/'footer' when text is present (e.g. title, page number)
- 'margin_top'/'margin_bottom' when the region is empty page margin

Also update all skip-type sets and color maps for the new types.

Co-Authored-By: Claude Sonnet 4.6
---
 klausur-service/backend/cv_vocab_pipeline.py | 82 +++++++++++++------
 klausur-service/backend/ocr_pipeline_api.py  |  6 +-
 .../backend/tests/test_cv_vocab_pipeline.py  | 73 ++++++++++++++++-
 3 files changed, 134 insertions(+), 27 deletions(-)

diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index 6c2d890..52756aa 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -98,7 +98,7 @@ ENGLISH_FUNCTION_WORDS = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'to', 'o
 @dataclass
 class PageRegion:
     """A detected region on the page."""
-    type: str  # 'column_en', 'column_de', 'column_example', 'page_ref', 'column_marker', 'column_text', 'header', 'footer'
+    type: str  # 'column_en', 'column_de', 'column_example', 'page_ref', 'column_marker', 'column_text', 'header', 'footer', 'margin_top', 'margin_bottom'
     x: int
     y: int
     width: int
@@ -924,12 +924,10 @@ def analyze_layout(layout_img: np.ndarray, ocr_img: np.ndarray) -> List[PageRegi
     # Add header/footer info (gap-based detection with fallback)
     _add_header_footer(regions, top_y, bottom_y, w, h, inv=inv)

-    has_header = any(r.type == 'header' for r in regions)
-    has_footer = any(r.type == 'footer' for r in regions)
+    top_region = next((r.type for r in regions if r.type in ('header', 'margin_top')), 'none')
+    bottom_region = next((r.type for r in regions if r.type in ('footer', 'margin_bottom')), 'none')
     col_count = len([r for r in regions if r.type.startswith('column')])
-    logger.info(f"Layout: {col_count} columns, "
-                f"header={'yes' if has_header else 'no'}, "
-                f"footer={'yes' if has_footer else 'no'}")
+    logger.info(f"Layout: {col_count} columns, top={top_region}, bottom={bottom_region}")

     return regions

@@ -2042,7 +2040,8 @@ def _build_margin_regions(

     # Right margin: from end of last content column to image edge
     non_margin = [r for r in all_regions
-                  if r.type not in ('margin_left', 'margin_right', 'header', 'footer')]
+                  if r.type not in ('margin_left', 'margin_right', 'header', 'footer',
+                                    'margin_top', 'margin_bottom')]
     if non_margin:
         last_col_end = max(r.x + r.width for r in non_margin)
     else:
@@ -2625,13 +2624,37 @@ def _detect_header_footer_gaps(
     return header_y, footer_y


+def _region_has_content(inv: np.ndarray, y_start: int, y_end: int,
+                        min_density: float = 0.005) -> bool:
+    """Check whether a horizontal strip contains meaningful ink.
+
+    Args:
+        inv: Inverted binarized image (white-on-black).
+        y_start: Top of the region (inclusive).
+        y_end: Bottom of the region (exclusive).
+        min_density: Fraction of white pixels required to count as content.
+
+    Returns:
+        True if the region contains text/graphics, False if empty margin.
+    """
+    if y_start >= y_end:
+        return False
+    strip = inv[y_start:y_end, :]
+    density = float(np.sum(strip)) / (strip.shape[0] * strip.shape[1] * 255)
+    return density > min_density
+
+
 def _add_header_footer(regions: List[PageRegion], top_y: int, bottom_y: int,
                        img_w: int, img_h: int,
                        inv: Optional[np.ndarray] = None) -> None:
-    """Add header/footer regions in-place.
+    """Add header/footer/margin regions in-place.

-    When *inv* is provided, uses gap-based detection to find header/footer
-    boundaries. Falls back to simple top_y/bottom_y check otherwise.
+    Uses gap-based detection when *inv* is provided, otherwise falls back
+    to simple top_y/bottom_y bounds.
+
+    Region types depend on whether there is actual content (text/graphics):
+    - 'header' / 'footer' — region contains text (e.g. title, page number)
+    - 'margin_top' / 'margin_bottom' — region is empty page margin
     """
     header_y: Optional[int] = None
     footer_y: Optional[int] = None
@@ -2639,17 +2662,28 @@ def _add_header_footer(regions: List[PageRegion], top_y: int, bottom_y: int,
     if inv is not None:
         header_y, footer_y = _detect_header_footer_gaps(inv, img_w, img_h)

-    # Gap-based header
-    if header_y is not None and header_y > 10:
-        regions.append(PageRegion(type='header', x=0, y=0, width=img_w, height=header_y))
-    elif top_y > 10:
-        regions.append(PageRegion(type='header', x=0, y=0, width=img_w, height=top_y))
+    # --- Top region ---
+    top_boundary = header_y if header_y is not None and header_y > 10 else (
+        top_y if top_y > 10 else None
+    )
+    if top_boundary is not None:
+        has_content = inv is not None and _region_has_content(inv, 0, top_boundary)
+        rtype = 'header' if has_content else 'margin_top'
+        regions.append(PageRegion(type=rtype, x=0, y=0, width=img_w, height=top_boundary))
+        logger.info(f"HeaderFooter: top region type={rtype} height={top_boundary}px "
+                    f"(has_content={has_content})")

-    # Gap-based footer
-    if footer_y is not None and footer_y < img_h - 10:
-        regions.append(PageRegion(type='footer', x=0, y=footer_y, width=img_w, height=img_h - footer_y))
-    elif bottom_y < img_h - 10:
-        regions.append(PageRegion(type='footer', x=0, y=bottom_y, width=img_w, height=img_h - bottom_y))
+    # --- Bottom region ---
+    bottom_boundary = footer_y if footer_y is not None and footer_y < img_h - 10 else (
+        bottom_y if bottom_y < img_h - 10 else None
+    )
+    if bottom_boundary is not None:
+        has_content = inv is not None and _region_has_content(inv, bottom_boundary, img_h)
+        rtype = 'footer' if has_content else 'margin_bottom'
+        regions.append(PageRegion(type=rtype, x=0, y=bottom_boundary, width=img_w,
+                                  height=img_h - bottom_boundary))
+        logger.info(f"HeaderFooter: bottom region type={rtype} y={bottom_boundary} "
+                    f"height={img_h - bottom_boundary}px (has_content={has_content})")


 # --- Main Entry Point ---
@@ -2690,7 +2724,7 @@ def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Li
     col_count = len([r for r in regions if r.type.startswith('column') or r.type == 'page_ref'])
     methods = set(r.classification_method for r in regions if r.classification_method)
     logger.info(f"LayoutByWords: {col_count} columns detected (methods: {methods}): "
-                f"{[(r.type, r.x, r.width, r.classification_confidence) for r in regions if r.type not in ('header','footer')]}")
+                f"{[(r.type, r.x, r.width, r.classification_confidence) for r in regions if r.type not in ('header','footer','margin_top','margin_bottom')]}")

     return regions

@@ -3797,7 +3831,7 @@ def build_cell_grid(
         return [], []

     # Use columns only — skip ignore, header, footer, page_ref
-    _skip_types = {'column_ignore', 'header', 'footer', 'page_ref', 'margin_left', 'margin_right'}
+    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'page_ref', 'margin_left', 'margin_right'}
     relevant_cols = [c for c in column_regions if c.type not in _skip_types]
     if not relevant_cols:
         logger.warning("build_cell_grid: no usable columns found")
@@ -3959,7 +3993,7 @@ def build_cell_grid_streaming(
     if not content_rows:
         return

-    _skip_types = {'column_ignore', 'header', 'footer', 'page_ref', 'margin_left', 'margin_right'}
+    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'page_ref', 'margin_left', 'margin_right'}
     relevant_cols = [c for c in column_regions if c.type not in _skip_types]
     if not relevant_cols:
         return
@@ -4444,7 +4478,7 @@ def run_multi_pass_ocr(ocr_img: np.ndarray,
     """
     results: Dict[str, List[Dict]] = {}

-    _ocr_skip = {'header', 'footer', 'margin_left', 'margin_right'}
+    _ocr_skip = {'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
     for region in regions:
         if region.type in _ocr_skip:
             continue  # Skip non-content regions
diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py
index e70c168..78299f0 100644
--- a/klausur-service/backend/ocr_pipeline_api.py
+++ b/klausur-service/backend/ocr_pipeline_api.py
@@ -840,6 +840,8 @@ async def _get_columns_overlay(session_id: str) -> Response:
         "column_ignore": (180, 180, 180),  # Light Gray
         "header": (128, 128, 128),  # Gray
         "footer": (128, 128, 128),  # Gray
+        "margin_top": (100, 100, 100),  # Dark Gray
+        "margin_bottom": (100, 100, 100),  # Dark Gray
     }

     overlay = img.copy()
@@ -1226,7 +1228,7 @@ async def _word_stream_generator(

     # Compute grid shape upfront for the meta event
     n_content_rows = len([r for r in row_geoms if r.row_type == 'content'])
-    _skip_types = {'column_ignore', 'header', 'footer', 'page_ref'}
+    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'page_ref'}
     n_cols = len([c for c in col_regions if c.type not in _skip_types])

     # Determine layout
@@ -1712,6 +1714,8 @@ async def _get_rows_overlay(session_id: str) -> Response:
         "content": (255, 180, 0),  # Blue
         "header": (128, 128, 128),  # Gray
         "footer": (128, 128, 128),  # Gray
+        "margin_top": (100, 100, 100),  # Dark Gray
+        "margin_bottom": (100, 100, 100),  # Dark Gray
     }

     overlay = img.copy()
diff --git a/klausur-service/backend/tests/test_cv_vocab_pipeline.py b/klausur-service/backend/tests/test_cv_vocab_pipeline.py
index 6569df4..0ce25d7 100644
--- a/klausur-service/backend/tests/test_cv_vocab_pipeline.py
+++ b/klausur-service/backend/tests/test_cv_vocab_pipeline.py
@@ -35,6 +35,8 @@ from cv_vocab_pipeline import (
     _filter_narrow_runs,
     _build_margin_regions,
     _detect_header_footer_gaps,
+    _region_has_content,
+    _add_header_footer,
     analyze_layout,
     _group_words_into_lines,
     match_lines_to_vocab,
@@ -340,7 +342,8 @@ class TestLayoutAnalysis:
         ocr_img = create_ocr_image(text_like_image)
         layout_img = create_layout_image(text_like_image)
         regions = analyze_layout(layout_img, ocr_img)
-        valid_types = {'column_en', 'column_de', 'column_example', 'header', 'footer'}
+        valid_types = {'column_en', 'column_de', 'column_example',
+                       'header', 'footer', 'margin_top', 'margin_bottom'}
         for r in regions:
             assert r.type in valid_types, f"Unexpected region type: {r.type}"

@@ -976,7 +979,7 @@ class TestMarginRegions:

     def test_margins_in_skip_types(self):
         """Verify margin types are in the skip set used by build_cell_grid."""
-        skip = {'column_ignore', 'header', 'footer', 'page_ref', 'margin_left', 'margin_right'}
+        skip = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'page_ref', 'margin_left', 'margin_right'}
         assert 'margin_left' in skip
         assert 'margin_right' in skip

@@ -1090,6 +1093,72 @@ class TestHeaderFooterGapDetection:
         assert footer_y is None


+class TestRegionContentCheck:
+    """Tests for _region_has_content() and _add_header_footer() type selection."""
+
+    def _make_inv(self, height: int, width: int, bands: list) -> np.ndarray:
+        inv = np.zeros((height, width), dtype=np.uint8)
+        for y1, y2 in bands:
+            inv[y1:y2, :] = 255
+        return inv
+
+    def test_region_with_text_has_content(self):
+        """Strip with ink → True."""
+        inv = self._make_inv(1000, 800, [(10, 50)])
+        assert _region_has_content(inv, 0, 100) is True
+
+    def test_empty_region_no_content(self):
+        """Strip without ink → False."""
+        inv = self._make_inv(1000, 800, [(500, 600)])
+        assert _region_has_content(inv, 0, 100) is False
+
+    def test_header_with_text_is_header(self):
+        """Top region with text → type='header' (via content bounds fallback)."""
+        h, w = 1000, 800
+        # Header text at 20-60, body starts at 200
+        inv = self._make_inv(h, w, [(20, 60), (200, 900)])
+        regions: list = []
+        # Simulate content bounds detecting body start at y=200
+        _add_header_footer(regions, top_y=200, bottom_y=h, img_w=w, img_h=h, inv=inv)
+        top_regions = [r for r in regions if r.type in ('header', 'margin_top')]
+        assert len(top_regions) == 1
+        assert top_regions[0].type == 'header'  # text at 20-60 → header
+
+    def test_empty_top_is_margin_top(self):
+        """Top region without text → type='margin_top'."""
+        h, w = 1000, 800
+        # Content only in body area (200-900), nothing in top 200px
+        inv = self._make_inv(h, w, [(200, 900)])
+        regions: list = []
+        # Simulate top_y=200 from content bounds
+        _add_header_footer(regions, top_y=200, bottom_y=h, img_w=w, img_h=h, inv=inv)
+        top_regions = [r for r in regions if r.type in ('header', 'margin_top')]
+        assert len(top_regions) == 1
+        assert top_regions[0].type == 'margin_top'
+
+    def test_empty_bottom_is_margin_bottom(self):
+        """Bottom region without text → type='margin_bottom'."""
+        h, w = 1000, 800
+        # Content only in top/body (50-700), nothing below 700
+        inv = self._make_inv(h, w, [(50, 700)])
+        regions: list = []
+        _add_header_footer(regions, top_y=50, bottom_y=700, img_w=w, img_h=h, inv=inv)
+        bottom_regions = [r for r in regions if r.type in ('footer', 'margin_bottom')]
+        assert len(bottom_regions) == 1
+        assert bottom_regions[0].type == 'margin_bottom'
+
+    def test_footer_with_page_number_is_footer(self):
+        """Bottom region with page number text → type='footer'."""
+        h, w = 1000, 800
+        # Body 50-700, page number at 900-930
+        inv = self._make_inv(h, w, [(50, 700), (900, 930)])
+        regions: list = []
+        _add_header_footer(regions, top_y=50, bottom_y=700, img_w=w, img_h=h, inv=inv)
+        bottom_regions = [r for r in regions if r.type in ('footer', 'margin_bottom')]
+        assert len(bottom_regions) == 1
+        assert bottom_regions[0].type == 'footer'
+
+
 # =============================================
 # RUN TESTS
 # =============================================