From 0d72f2c836e09dc7e5f8743e2607d25936887036 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Tue, 3 Mar 2026 07:55:53 +0100 Subject: [PATCH] fix(sub-columns): protect sub-columns from column_ignore pre-filter Add is_sub_column flag to ColumnGeometry. Both geometries produced by a _detect_sub_columns() split (the narrow sub-column and the remaining main column) are flagged, and flagged columns are now exempt from the first/last-column word_count < 8 rule that converts them to column_ignore. Co-Authored-By: Claude Sonnet 4.6 --- klausur-service/backend/cv_vocab_pipeline.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 625a2d2..eb873d9 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -118,6 +118,7 @@ class ColumnGeometry: word_count: int words: List[Dict] # Wort-Dicts aus Tesseract (text, conf, left, top, ...) width_ratio: float # width / content_width (0.0-1.0) + is_sub_column: bool = False # True if created by _detect_sub_columns() split @dataclass @@ -1150,6 +1151,7 @@ def _detect_sub_columns( word_count=len(sub_words), words=sub_words, width_ratio=sub_width / content_w if content_w > 0 else 0.0, + is_sub_column=True, ) main_geo = ColumnGeometry( index=0, @@ -1160,6 +1162,7 @@ def _detect_sub_columns( word_count=len(main_words), words=main_words, width_ratio=main_width / content_w if content_w > 0 else 0.0, + is_sub_column=True, ) result.append(sub_geo) @@ -2254,10 +2257,12 @@ def classify_column_types(geometries: List[ColumnGeometry], )]) # --- Pre-filter: first/last columns with very few words → column_ignore --- + # Sub-columns from _detect_sub_columns() are exempt: they intentionally + # have few words (page refs, markers) and should not be discarded. 
ignore_regions = [] active_geometries = [] for idx, g in enumerate(geometries): - if (idx == 0 or idx == len(geometries) - 1) and g.word_count < 8: + if (idx == 0 or idx == len(geometries) - 1) and g.word_count < 8 and not g.is_sub_column: ignore_regions.append(PageRegion( type='column_ignore', x=g.x, y=g.y, width=g.width, height=content_h,