diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 90ca2f9..cdada5c 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -2250,6 +2250,50 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt logger.info(f"ColumnGeometry: {len(validated_gaps)} gaps after word validation: " f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in validated_gaps]}") + # --- Step 5b: Word-coverage gap detection (fallback for noisy scans) --- + # When pixel-based projection fails (e.g. due to illustrations or colored + # bands), use word bounding boxes to find clear vertical gaps. This is + # immune to decorative graphics that Tesseract doesn't recognise as words. + if len(validated_gaps) < 2: + logger.info("ColumnGeometry: < 2 pixel-gaps, trying word-coverage gaps") + word_coverage = np.zeros(content_w, dtype=np.int32) + for wd in word_dicts: + wl = max(0, wd['left']) + wr = min(wd['left'] + wd['width'], content_w) + if wr > wl: + word_coverage[wl:wr] += 1 + + # Smooth slightly to bridge tiny 1-2px noise gaps between words + wc_kernel = max(3, content_w // 300) + if wc_kernel % 2 == 0: + wc_kernel += 1 + wc_smooth = np.convolve(word_coverage.astype(float), + np.ones(wc_kernel) / wc_kernel, mode='same') + + wc_in_gap = wc_smooth < 0.5 # effectively zero word coverage + WC_MIN_GAP = max(4, content_w // 300) + + wc_gaps: List[Tuple[int, int]] = [] + wc_gap_start = None + for x in range(len(wc_in_gap)): + if wc_in_gap[x]: + if wc_gap_start is None: + wc_gap_start = x + else: + if wc_gap_start is not None: + if x - wc_gap_start >= WC_MIN_GAP: + wc_gaps.append((wc_gap_start, x)) + wc_gap_start = None + if wc_gap_start is not None and len(wc_in_gap) - wc_gap_start >= WC_MIN_GAP: + wc_gaps.append((wc_gap_start, len(wc_in_gap))) + + logger.info(f"ColumnGeometry: {len(wc_gaps)} word-coverage gaps found " + f"(min_width={WC_MIN_GAP}px): " + f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in wc_gaps]}") + + if len(wc_gaps) >= 2: + validated_gaps = wc_gaps + # --- Step 6: Fallback to clustering if too few gaps --- if len(validated_gaps) < 2: logger.info("ColumnGeometry: < 2 gaps found, falling back to clustering")