diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 155dc6e..625a2d2 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -1038,6 +1038,9 @@ def _detect_sub_columns( geometries: List[ColumnGeometry], content_w: int, left_x: int = 0, + top_y: int = 0, + header_y: Optional[int] = None, + footer_y: Optional[int] = None, _edge_tolerance: int = 8, _min_col_start_ratio: float = 0.10, ) -> List[ColumnGeometry]: @@ -1053,6 +1056,11 @@ def _detect_sub_columns( while ``ColumnGeometry.x`` is in absolute image coordinates. *left_x* bridges the two coordinate systems. + If *header_y* / *footer_y* are provided (absolute y-coordinates), words + in header/footer regions are excluded from alignment clustering to avoid + polluting the bins with page numbers or chapter titles. Word ``top`` + values are relative to *top_y*. + Returns a new list of ColumnGeometry — potentially longer than the input. 
""" if content_w <= 0: @@ -1065,8 +1073,15 @@ def _detect_sub_columns( result.append(geo) continue - # Collect left-edges of confident words - confident = [w for w in geo.words if w.get('conf', 0) >= 30] + # Collect left-edges of confident words, excluding header/footer + # Convert header_y/footer_y from absolute to relative (word 'top' is relative to top_y) + min_top_rel = (header_y - top_y) if header_y is not None else None + max_top_rel = (footer_y - top_y) if footer_y is not None else None + + confident = [w for w in geo.words + if w.get('conf', 0) >= 30 + and (min_top_rel is None or w['top'] >= min_top_rel) + and (max_top_rel is None or w['top'] <= max_top_rel)] if len(confident) < 3: result.append(geo) continue @@ -1101,7 +1116,12 @@ def _detect_sub_columns( sub_words = [w for w in geo.words if w['left'] < split_threshold] main_words = [w for w in geo.words if w['left'] >= split_threshold] - if len(sub_words) < 2 or len(sub_words) / len(geo.words) >= 0.35: + # Count only body words (excluding header/footer) for the threshold check + # so that header/footer words don't artificially trigger a split. + sub_body = [w for w in sub_words + if (min_top_rel is None or w['top'] >= min_top_rel) + and (max_top_rel is None or w['top'] <= max_top_rel)] + if len(sub_body) < 2 or len(sub_body) / len(geo.words) >= 0.35: result.append(geo) continue @@ -2854,8 +2874,12 @@ def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Li geometries, left_x, right_x, top_y, bottom_y, _word_dicts, _inv = result content_w = right_x - left_x + # Detect header/footer early so sub-column clustering ignores them + header_y, footer_y = _detect_header_footer_gaps(_inv, w, h) if _inv is not None else (None, None) + # Split sub-columns (e.g. 
page references) before classification - geometries = _detect_sub_columns(geometries, content_w, left_x=left_x) + geometries = _detect_sub_columns(geometries, content_w, left_x=left_x, + top_y=top_y, header_y=header_y, footer_y=footer_y) # Phase B: Content-based classification regions = classify_column_types(geometries, content_w, top_y, w, h, bottom_y, diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py index 2dff162..ba8dcb8 100644 --- a/klausur-service/backend/ocr_pipeline_api.py +++ b/klausur-service/backend/ocr_pipeline_api.py @@ -34,6 +34,7 @@ from cv_vocab_pipeline import ( PageRegion, RowGeometry, _cells_to_vocab_entries, + _detect_header_footer_gaps, _detect_sub_columns, _fix_character_confusion, _fix_phonetic_brackets, @@ -699,8 +700,12 @@ async def detect_columns(session_id: str): cached["_inv"] = inv cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y) + # Detect header/footer early so sub-column clustering ignores them + header_y, footer_y = _detect_header_footer_gaps(inv, w, h) if inv is not None else (None, None) + # Split sub-columns (e.g. 
page references) before classification - geometries = _detect_sub_columns(geometries, content_w, left_x=left_x) + geometries = _detect_sub_columns(geometries, content_w, left_x=left_x, + top_y=top_y, header_y=header_y, footer_y=footer_y) # Phase B: Content-based classification regions = classify_column_types(geometries, content_w, top_y, w, h, bottom_y, diff --git a/klausur-service/backend/tests/test_cv_vocab_pipeline.py b/klausur-service/backend/tests/test_cv_vocab_pipeline.py index 1752334..3afbf66 100644 --- a/klausur-service/backend/tests/test_cv_vocab_pipeline.py +++ b/klausur-service/backend/tests/test_cv_vocab_pipeline.py @@ -1330,6 +1330,73 @@ class TestSubColumnDetection: assert result[0].word_count == 3 assert result[1].word_count == 40 + def test_header_words_excluded_from_alignment(self): + """Header words (top < header_y) should not participate in alignment clustering. + + Without header_y: 3 header words at left=100 + 40 content words at left=250 + would cause a split (3 outliers vs 40 main). + With header_y: the 3 header words are excluded from clustering, leaving only + 40 uniform words at left=250 → no split. 
+ """ + content_w = 1000 + top_y = 0 + # Header words: top=5 (relative to top_y=0), well above header_y=50 + header_words = [{'left': 100, 'top': 5, 'width': 50, 'height': 20, + 'text': f"Ch.{i}", 'conf': 90} for i in range(3)] + # Content words: top=200, below header_y=50 + content_words = [{'left': 250, 'top': 200, 'width': 50, 'height': 20, + 'text': f"word{i}", 'conf': 90} for i in range(40)] + all_words = header_words + content_words + geo = self._make_geo(x=80, width=300, words=all_words, content_w=content_w) + + # Without header_y: split happens (3 outliers at left=100) + result_no_filter = _detect_sub_columns([geo], content_w) + assert len(result_no_filter) == 2, "Should split without header filtering" + + # With header_y=50: header words excluded, only 40 uniform words remain → no split + result_filtered = _detect_sub_columns([geo], content_w, top_y=top_y, header_y=50) + assert len(result_filtered) == 1, "Should NOT split with header words excluded" + assert result_filtered[0].word_count == 43 # all words still in the geometry + + def test_footer_words_excluded_from_alignment(self): + """Footer words (top > footer_y) should not participate in alignment clustering. + + Analogous to the header test, but with footer words at the bottom. 
+ """ + content_w = 1000 + top_y = 0 + # Content words: top=200, above footer_y=800 + content_words = [{'left': 250, 'top': 200, 'width': 50, 'height': 20, + 'text': f"word{i}", 'conf': 90} for i in range(40)] + # Footer words: top=900, below footer_y=800 + footer_words = [{'left': 100, 'top': 900, 'width': 50, 'height': 20, + 'text': f"p.{i}", 'conf': 90} for i in range(3)] + all_words = content_words + footer_words + geo = self._make_geo(x=80, width=300, words=all_words, content_w=content_w) + + # Without footer_y: split happens (3 outliers at left=100) + result_no_filter = _detect_sub_columns([geo], content_w) + assert len(result_no_filter) == 2, "Should split without footer filtering" + + # With footer_y=800: footer words excluded → no split + result_filtered = _detect_sub_columns([geo], content_w, top_y=top_y, footer_y=800) + assert len(result_filtered) == 1, "Should NOT split with footer words excluded" + assert result_filtered[0].word_count == 43 + + def test_header_footer_none_no_filtering(self): + """header_y=None, footer_y=None → same behavior as before (no filtering).""" + content_w = 1000 + page_words = [self._make_word(100, f"p.{59+i}") for i in range(3)] + vocab_words = [self._make_word(250, f"word{i}") for i in range(40)] + all_words = page_words + vocab_words + geo = self._make_geo(x=80, width=300, words=all_words, content_w=content_w) + + result = _detect_sub_columns([geo], content_w, header_y=None, footer_y=None) + + assert len(result) == 2, "Should still split with None header/footer" + assert result[0].word_count == 3 + assert result[1].word_count == 40 + class TestCellsToVocabEntriesPageRef: """Test that page_ref cells are mapped to source_page field."""