fix(sub-columns): exclude header/footer words from alignment clustering

Header/footer words (page numbers, chapter titles) could pollute the left-edge alignment bins and trigger false sub-column splits. _detect_header_footer_gaps() now runs before sub-column detection, and the header/footer boundaries it returns are passed to _detect_sub_columns() so that those words are excluded from both the alignment clustering and the split-threshold check.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-03 07:33:54 +01:00
parent 3904ddb493
commit 6527beae03
3 changed files with 101 additions and 5 deletions
--- a/klausur-service/backend/ocr_pipeline_api.py
+++ b/klausur-service/backend/ocr_pipeline_api.py
@@ -34,6 +34,7 @@ from cv_vocab_pipeline import (
    PageRegion,
    RowGeometry,
    _cells_to_vocab_entries,
+    _detect_header_footer_gaps,
    _detect_sub_columns,
    _fix_character_confusion,
    _fix_phonetic_brackets,
@@ -699,8 +700,12 @@ async def detect_columns(session_id: str):
        cached["_inv"] = inv
        cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)

+        # Detect header/footer early so sub-column clustering ignores them
+        header_y, footer_y = _detect_header_footer_gaps(inv, w, h) if inv is not None else (None, None)
+
        # Split sub-columns (e.g. page references) before classification
-        geometries = _detect_sub_columns(geometries, content_w, left_x=left_x)
+        geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
+                                          top_y=top_y, header_y=header_y, footer_y=footer_y)

        # Phase B: Content-based classification
        regions = classify_column_types(geometries, content_w, top_y, w, h, bottom_y,