From 4532f6817331b187733259e7198995435eb92061 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 5 Mar 2026 23:13:19 +0100 Subject: [PATCH] fix: Word-Validation auf Segment-Woerter beschraenken Woerter aus Sub-Header-Bereichen ueberlappten korrekte Spaltenluecken und liessen die Word-Validation faelschlich Gaps verwerfen. Jetzt werden nur Woerter aus dem gewaehlten Segment fuer die Validation verwendet. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_vocab_pipeline.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index effb7e2..031b7ff 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -2240,11 +2240,25 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in raw_gaps]}") # --- Step 5: Validate gaps against word bounding boxes --- + # When using a segment for projection, only validate against words + # inside that segment — words from sub-headers or other sections + # would incorrectly overlap with real column gaps. + if segments and len(segments) > 1: + seg_top_abs = best_seg[0] # relative to content strip + seg_bot_abs = best_seg[1] + segment_words = [wd for wd in word_dicts + if wd['top'] >= seg_top_abs + and wd['top'] + wd['height'] <= seg_bot_abs] + logger.info(f"ColumnGeometry: filtering words to segment: " + f"{len(segment_words)}/{len(word_dicts)} words") + else: + segment_words = word_dicts + validated_gaps = [] for gap_start_rel, gap_end_rel in raw_gaps: # Check if any word overlaps with this gap region overlapping = False - for wd in word_dicts: + for wd in segment_words: word_left = wd['left'] word_right = wd['left'] + wd['width'] if word_left < gap_end_rel and word_right > gap_start_rel: @@ -2258,7 +2272,7 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt # Find the tightest word boundaries within the gap region min_word_left = content_w max_word_right = 0 - for wd in word_dicts: + for wd in segment_words: word_left = wd['left'] word_right = wd['left'] + wd['width'] if word_left < gap_end_rel and word_right > gap_start_rel: @@ -2287,7 +2301,7 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt if len(validated_gaps) < 2: logger.info("ColumnGeometry: < 2 pixel-gaps, trying word-coverage gaps") word_coverage = np.zeros(content_w, dtype=np.int32) - for wd in word_dicts: + for wd in segment_words: wl = max(0, wd['left']) wr = min(wd['left'] + wd['width'], content_w) if wr > wl: