diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index a35e8d8..6c2d890 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -921,22 +921,15 @@ def analyze_layout(layout_img: np.ndarray, ocr_img: np.ndarray) -> List[PageRegi width=w, height=content_h )) - # Add header/footer info - if top_y > 10: - regions.append(PageRegion( - type='header', x=0, y=0, - width=w, height=top_y - )) - if bottom_y < h - 10: - regions.append(PageRegion( - type='footer', x=0, y=bottom_y, - width=w, height=h - bottom_y - )) + # Add header/footer info (gap-based detection with fallback) + _add_header_footer(regions, top_y, bottom_y, w, h, inv=inv) + has_header = any(r.type == 'header' for r in regions) + has_footer = any(r.type == 'footer' for r in regions) col_count = len([r for r in regions if r.type.startswith('column')]) logger.info(f"Layout: {col_count} columns, " - f"header={'yes' if top_y > 10 else 'no'}, " - f"footer={'yes' if bottom_y < h - 10 else 'no'}") + f"header={'yes' if has_header else 'no'}, " + f"footer={'yes' if has_footer else 'no'}") return regions @@ -2076,7 +2069,8 @@ def classify_column_types(geometries: List[ColumnGeometry], img_h: int, bottom_y: int, left_x: int = 0, - right_x: int = 0) -> List[PageRegion]: + right_x: int = 0, + inv: Optional[np.ndarray] = None) -> List[PageRegion]: """Classify column types using a 3-level fallback chain. 
def _detect_header_footer_gaps(
    inv: np.ndarray,
    img_w: int,
    img_h: int,
) -> Tuple[Optional[int], Optional[int]]:
    """Detect header/footer boundaries via horizontal projection gap analysis.

    The rows of *inv* are summed, normalised by ``img_w * 255``, smoothed with
    a box filter and thresholded into "gap" rows.  Contiguous runs of gap rows
    whose midpoint lies in the top/bottom 20% of the page and whose height is
    more than twice the median run height are separator candidates; the
    tallest candidate in each zone wins (the first one on ties).

    Args:
        inv: Image array that is summed along axis 1.  The division by 255
            presumes an inverted binary page with ink = 255 -- TODO confirm
            against callers.
        img_w: Page width used for normalisation.  If it is not positive the
            projection is used unnormalised.
        img_h: Page height used for the smoothing kernel size, the minimum
            gap height and the header/footer zone limits.

    Returns:
        (header_y, footer_y) -- absolute y-coordinates.
        header_y = bottom edge of the winning gap in the header zone
        (None if no header detected).
        footer_y = top edge of the winning gap in the footer zone
        (None if no footer detected).
        An image without rows yields (None, None).
    """
    HEADER_FOOTER_ZONE = 0.20  # fraction of the page height searched at each end
    GAP_MULTIPLIER = 2.0       # separator must exceed this multiple of the median gap

    # Step 1: Horizontal projection over full image width
    h_proj = np.sum(inv, axis=1).astype(float)
    n_rows = h_proj.shape[0]
    if n_rows == 0:
        # np.convolve raises ValueError on an empty input; nothing to detect.
        return None, None
    h_proj_norm = h_proj / (img_w * 255) if img_w > 0 else h_proj

    # Step 2: Smoothing with an odd-sized box kernel
    kernel_size = max(3, img_h // 200)
    if kernel_size % 2 == 0:
        kernel_size += 1
    if kernel_size > n_rows:
        # mode='same' returns max(M, N) samples, so an oversized kernel would
        # produce rows that do not exist in the image.  Clamp to an odd size.
        kernel_size = n_rows if n_rows % 2 == 1 else n_rows - 1
    h_smooth = np.convolve(h_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')

    # Step 3: Gap threshold relative to the typical non-empty row density
    positive = h_smooth[h_smooth > 0]
    median_density = float(np.median(positive)) if positive.size > 0 else 0.01
    gap_threshold = max(median_density * 0.15, 0.003)

    in_gap = h_smooth < gap_threshold
    MIN_GAP_HEIGHT = max(3, img_h // 500)

    # Step 4: Collect contiguous gaps as (start, end_exclusive) pairs.
    # Padding with False on both sides makes every run produce exactly one
    # rising and one falling edge, including runs touching the image border.
    padded = np.concatenate(([False], in_gap, [False]))
    edges = np.flatnonzero(padded[1:] != padded[:-1])
    raw_gaps: List[Tuple[int, int]] = [
        (start, end)
        for start, end in zip(edges[0::2].tolist(), edges[1::2].tolist())
        if end - start >= MIN_GAP_HEIGHT
    ]

    if not raw_gaps:
        return None, None

    # Step 5: Compute median gap size and large-gap threshold
    median_gap = float(np.median([end - start for start, end in raw_gaps]))
    large_gap_threshold = median_gap * GAP_MULTIPLIER

    # Step 6: Find largest qualifying gap in header / footer zones
    header_zone_limit = int(img_h * HEADER_FOOTER_ZONE)
    footer_zone_start = int(img_h * (1.0 - HEADER_FOOTER_ZONE))

    # NOTE(review): a run touching the top/bottom image edge (a blank page
    # margin) is treated like any other gap and can win over an interior
    # separator -- confirm this is intended.
    large_gaps = [g for g in raw_gaps if g[1] - g[0] > large_gap_threshold]

    def _size(gap: Tuple[int, int]) -> int:
        return gap[1] - gap[0]

    # max() returns the first maximal element, matching a strict ">" scan.
    best_header = max(
        (g for g in large_gaps if (g[0] + g[1]) / 2 < header_zone_limit),
        key=_size, default=None,
    )
    best_footer = max(
        (g for g in large_gaps if (g[0] + g[1]) / 2 > footer_zone_start),
        key=_size, default=None,
    )

    header_y: Optional[int] = None
    footer_y: Optional[int] = None

    if best_header is not None:
        header_y = best_header[1]  # bottom edge of gap
        best_header_size = _size(best_header)
        logger.info(f"HeaderFooterGaps: header boundary at y={header_y} "
                    f"(gap={best_header_size}px, median_gap={median_gap:.0f}px)")
    if best_footer is not None:
        footer_y = best_footer[0]  # top edge of gap
        best_footer_size = _size(best_footer)
        logger.info(f"HeaderFooterGaps: footer boundary at y={footer_y} "
                    f"(gap={best_footer_size}px, median_gap={median_gap:.0f}px)")

    return header_y, footer_y
def _add_header_footer(regions: List[PageRegion], top_y: int, bottom_y: int,
                       img_w: int, img_h: int,
                       inv: Optional[np.ndarray] = None) -> None:
    """Append header/footer regions to *regions* in place.

    If *inv* is given, the boundaries reported by
    ``_detect_header_footer_gaps`` are preferred.  Whenever that detection
    yields nothing usable for one side (or *inv* is None), the plain
    ``top_y`` / ``bottom_y`` content bounds are used for that side instead.
    A side is only added when its boundary is more than 10 px away from the
    corresponding page edge.
    """
    gap_header: Optional[int] = None
    gap_footer: Optional[int] = None
    if inv is not None:
        gap_header, gap_footer = _detect_header_footer_gaps(inv, img_w, img_h)

    # Header: gap-based boundary first, content-bound fallback second.
    header_bottom: Optional[int] = None
    if gap_header is not None and gap_header > 10:
        header_bottom = gap_header
    elif top_y > 10:
        header_bottom = top_y
    if header_bottom is not None:
        regions.append(PageRegion(type='header', x=0, y=0,
                                  width=img_w, height=header_bottom))

    # Footer: same preference order, measured from the bottom edge.
    footer_top: Optional[int] = None
    if gap_footer is not None and gap_footer < img_h - 10:
        footer_top = gap_footer
    elif bottom_y < img_h - 10:
        footer_top = bottom_y
    if footer_top is not None:
        regions.append(PageRegion(type='footer', x=0, y=footer_top,
                                  width=img_w, height=img_h - footer_top))
# =============================================
# Header/Footer Gap Detection
# =============================================

class TestHeaderFooterGapDetection:
    """Exercise _detect_header_footer_gaps() on synthetic band images."""

    def _make_inv(self, height: int, width: int, bands: list) -> np.ndarray:
        """Build a black uint8 image with full-width white (255) row bands.

        Args:
            height: Number of image rows.
            width: Number of image columns.
            bands: (y_start, y_end) pairs; rows y_start..y_end-1 become 255.
        """
        canvas = np.zeros((height, width), dtype=np.uint8)
        for top, bottom in bands:
            canvas[top:bottom, :] = 255
        return canvas

    def _make_body_with_lines(self, h, w, body_start, body_end,
                              line_h=15, gap_h=12):
        """Return bands that mimic text lines separated by blank rows.

        gap_h has to be big enough that the blank rows are still visible
        after smoothing (kernel ~ h//200).
        """
        pitch = line_h + gap_h
        lines = []
        cursor = body_start
        while True:
            line_end = cursor + line_h
            if line_end > body_end:
                break
            lines.append((cursor, line_end))
            cursor += pitch
        return lines

    def test_header_gap_detected(self):
        """Top content, then a large gap, then the body → header_y in the gap."""
        page_h, page_w = 2000, 800
        # Header occupies rows 20-80; the 220px gap up to row 300 dwarfs the
        # 12px line spacing.  The body runs close to the bottom, so there is
        # no footer gap.
        header_band = (20, 80)
        body = self._make_body_with_lines(page_h, page_w, 300, 1990)
        inv = self._make_inv(page_h, page_w, [header_band, *body])
        header_y, footer_y = _detect_header_footer_gaps(inv, page_w, page_h)
        assert header_y is not None
        assert 80 <= header_y <= 310

    def test_footer_gap_detected(self):
        """Body, then a large gap, then a page number → footer_y in the gap."""
        page_h, page_w = 2000, 800
        # The body starts near the top (no header gap) and stops by row 1600;
        # a 280px gap follows before the page number at rows 1880-1920.
        body = self._make_body_with_lines(page_h, page_w, 10, 1600)
        page_number = (1880, 1920)
        inv = self._make_inv(page_h, page_w, [*body, page_number])
        header_y, footer_y = _detect_header_footer_gaps(inv, page_w, page_h)
        assert footer_y is not None
        assert 1580 <= footer_y <= 1890

    def test_both_header_and_footer(self):
        """Header, gap, body, gap, footer → both boundaries are reported."""
        page_h, page_w = 2000, 800
        # Header 10-60, 190px gap, body 250-1700, 200px gap, footer 1900-1970.
        layout = [(10, 60)]
        layout.extend(self._make_body_with_lines(page_h, page_w, 250, 1700))
        layout.append((1900, 1970))
        inv = self._make_inv(page_h, page_w, layout)
        header_y, footer_y = _detect_header_footer_gaps(inv, page_w, page_h)
        assert header_y is not None
        assert footer_y is not None
        assert 60 <= header_y <= 260
        assert 1690 <= footer_y <= 1910

    def test_no_gaps_returns_none(self):
        """A page filled with content from top to bottom → (None, None)."""
        page_h, page_w = 1000, 800
        inv = self._make_inv(page_h, page_w, [(0, 1000)])
        header_y, footer_y = _detect_header_footer_gaps(inv, page_w, page_h)
        assert header_y is None
        assert footer_y is None

    def test_small_gaps_ignored(self):
        """Gaps that are not larger than 2x the median must be ignored."""
        page_h, page_w = 1000, 800
        # Regular line rhythm: 15px of content followed by 5px of gap, with
        # no outlier gap anywhere on the page.
        rhythm = [(top, top + 15) for top in range(0, 1000, 20)]
        inv = self._make_inv(page_h, page_w, rhythm)
        header_y, footer_y = _detect_header_footer_gaps(inv, page_w, page_h)
        # Every gap has the same size, so none exceeds 2x the median.
        assert header_y is None
        assert footer_y is None


# =============================================
# RUN TESTS
# =============================================