diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index cdada5c..effb7e2 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -2131,45 +2131,75 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt logger.info(f"ColumnGeometry: {len(left_edges)} words detected in content area") - # --- Step 2b: Mask out full-width rows (sub-headers, colored bands) --- - # Rows where ink spans nearly the full content width distort the vertical - # projection by filling in column gaps. Detect them via horizontal density - # and zero them out before computing v_proj. + # --- Step 2b: Segment by sub-headers --- + # Pages with sub-headers (e.g. "Unit 4: Bonnie Scotland") have full-width + # text bands that pollute the vertical projection. We detect large + # horizontal gaps (= whitespace rows separating sections) and use only + # the tallest content segment for the projection. This makes column + # detection immune to sub-headers, illustrations, and section dividers. content_strip = inv[top_y:bottom_y, left_x:right_x] h_proj_row = np.sum(content_strip, axis=1).astype(float) h_proj_row_norm = h_proj_row / (content_w * 255) if content_w > 0 else h_proj_row - FULLWIDTH_THRESHOLD = 0.40 # normal text ~10-25%; full-width bands 40%+ - fullwidth_mask = h_proj_row_norm > FULLWIDTH_THRESHOLD + # Find horizontal gaps (near-empty rows) + H_GAP_THRESH = 0.02 # rows with <2% ink density are "empty" + h_in_gap = h_proj_row_norm < H_GAP_THRESH + H_MIN_GAP = max(5, content_h // 200) # min gap height ~5-7px - # Only mask contiguous bands (>=3 rows), not isolated noisy rows - masked_strip = content_strip.copy() - n_masked = 0 - band_start = None - for y_idx in range(len(fullwidth_mask)): - if fullwidth_mask[y_idx]: - if band_start is None: - band_start = y_idx + h_gaps: List[Tuple[int, int]] = [] + h_gap_start = None + for y_idx in range(len(h_in_gap)): + if h_in_gap[y_idx]: + if h_gap_start is None: + h_gap_start = y_idx else: - if band_start is not None: - band_height = y_idx - band_start - if band_height >= 3: - masked_strip[band_start:y_idx, :] = 0 - n_masked += band_height - band_start = None - if band_start is not None: - band_height = len(fullwidth_mask) - band_start - if band_height >= 3: - masked_strip[band_start:len(fullwidth_mask), :] = 0 - n_masked += band_height + if h_gap_start is not None: + if y_idx - h_gap_start >= H_MIN_GAP: + h_gaps.append((h_gap_start, y_idx)) + h_gap_start = None + if h_gap_start is not None and len(h_in_gap) - h_gap_start >= H_MIN_GAP: + h_gaps.append((h_gap_start, len(h_in_gap))) - if n_masked > 0: - logger.info(f"ColumnGeometry: masked {n_masked} full-width rows " - f"({n_masked * 100 / content_h:.1f}% of content height)") + # Identify "large" gaps (significantly bigger than median) that indicate + # section boundaries (sub-headers, chapter titles). + if len(h_gaps) >= 3: + gap_sizes = sorted(g[1] - g[0] for g in h_gaps) + median_gap_h = gap_sizes[len(gap_sizes) // 2] + large_gap_thresh = max(median_gap_h * 1.8, H_MIN_GAP + 3) + large_gaps = [(gs, ge) for gs, ge in h_gaps if ge - gs >= large_gap_thresh] + else: + large_gaps = h_gaps + + # Build content segments between large gaps and pick the tallest + seg_boundaries = [0] + for gs, ge in large_gaps: + seg_boundaries.append(gs) + seg_boundaries.append(ge) + seg_boundaries.append(content_h) + + segments = [] + for i in range(0, len(seg_boundaries) - 1, 2): + seg_top = seg_boundaries[i] + seg_bot = seg_boundaries[i + 1] if i + 1 < len(seg_boundaries) else content_h + seg_height = seg_bot - seg_top + if seg_height > 20: # ignore tiny fragments + segments.append((seg_top, seg_bot, seg_height)) + + if segments: + segments.sort(key=lambda s: s[2], reverse=True) + best_seg = segments[0] + proj_strip = content_strip[best_seg[0]:best_seg[1], :] + effective_h = best_seg[2] + if len(segments) > 1: + logger.info(f"ColumnGeometry: {len(segments)} segments from {len(large_gaps)} " + f"large h-gaps, using tallest: rows {best_seg[0]}..{best_seg[1]} " + f"({effective_h}px, {effective_h*100/content_h:.0f}%)") + else: + proj_strip = content_strip + effective_h = content_h # --- Step 3: Vertical projection profile --- - effective_h = content_h - n_masked - v_proj = np.sum(masked_strip, axis=0).astype(float) + v_proj = np.sum(proj_strip, axis=0).astype(float) v_proj_norm = v_proj / (effective_h * 255) if effective_h > 0 else v_proj # Smooth the projection to avoid noise-induced micro-gaps