diff --git a/klausur-service/backend/cv_layout.py b/klausur-service/backend/cv_layout.py index c16e110..9598668 100644 --- a/klausur-service/backend/cv_layout.py +++ b/klausur-service/backend/cv_layout.py @@ -1265,72 +1265,95 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt if len(wc_gaps) >= 2: validated_gaps = wc_gaps - # --- Step 5c: Left-edge alignment validation --- - # A real column gap must have words to its right whose left-edges are - # consistently aligned (i.e. many words start at nearly the same x). - # If words to the right of a gap have scattered left-edges, the gap is - # just a natural gap within a wide column (e.g. short words ending - # before longer example sentences in the same column). + # --- Step 5c: Left-edge alignment validation (suspicious gaps only) --- + # Only check gaps that would create an unusually wide column to the right. + # These are likely false splits within a single wide column (e.g. short EN + # words followed by longer DE example sentences in the same column). + # Gaps that produce columns of similar width to their neighbors are trusted. if len(validated_gaps) > 2: edge_tolerance_align = max(8, content_w // 150) - min_aligned_ratio = 0.25 # at least 25% of words must share a left-edge bin + min_aligned_ratio = 0.15 # at least 15% of words must share a left-edge bin margin_thresh = max(10, int(content_w * 0.02)) - alignment_validated = [] - for gap_start_rel, gap_end_rel in validated_gaps: - # Skip margin gaps — they don't need alignment validation - if gap_start_rel <= margin_thresh: - alignment_validated.append((gap_start_rel, gap_end_rel)) - continue - if gap_end_rel >= content_w - margin_thresh: - alignment_validated.append((gap_start_rel, gap_end_rel)) - continue + # Compute tentative column widths from all gaps + sorted_gaps = sorted(validated_gaps, key=lambda g: g[0]) + # Interior gaps only (exclude margins) + interior_indices = [] + for gi, (gs, ge) in enumerate(sorted_gaps): + if gs > margin_thresh and ge < content_w - margin_thresh: + interior_indices.append(gi) - # Find the next gap after this one (or content end) - next_gap_start = content_w - for gs, ge in validated_gaps: - if gs > gap_end_rel: - next_gap_start = gs - break - - # Collect words to the right of this gap (up to the next gap) - right_words = [w for w in segment_words - if gap_end_rel <= w['left'] < next_gap_start] - - if len(right_words) < 3: - # Too few words — keep the gap (benefit of the doubt) - alignment_validated.append((gap_start_rel, gap_end_rel)) - continue - - # Cluster left-edges of right-side words - right_lefts = sorted(w['left'] for w in right_words) - bins = [] - cur_bin = [right_lefts[0]] - for le in right_lefts[1:]: - if le - cur_bin[-1] <= edge_tolerance_align: - cur_bin.append(le) + if interior_indices: + # For each interior gap, compute the width of the column it starts + gap_suspicion: dict = {} # gap_index → right_col_width + for gi in interior_indices: + gap_end = sorted_gaps[gi][1] + # Next gap start (or content right edge) + if gi + 1 < len(sorted_gaps): + next_gs = sorted_gaps[gi + 1][0] else: - bins.append(len(cur_bin)) - cur_bin = [le] - bins.append(len(cur_bin)) + next_gs = content_w + right_col_w = next_gs - gap_end + gap_suspicion[gi] = right_col_w - # The largest bin must contain a significant fraction of words - max_bin = max(bins) - ratio = max_bin / len(right_words) + # Median column width (from all gaps, including margins) + all_col_widths = [] + prev_end = 0 + for gs, ge in sorted_gaps: + cw = gs - prev_end + if cw > 0: + all_col_widths.append(cw) + prev_end = ge + trailing = content_w - prev_end + if trailing > 0: + all_col_widths.append(trailing) + median_col_w = sorted(all_col_widths)[len(all_col_widths) // 2] if all_col_widths else content_w - if ratio >= min_aligned_ratio: - alignment_validated.append((gap_start_rel, gap_end_rel)) - logger.debug(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] " - f"passed alignment check (best_bin={max_bin}/{len(right_words)}={ratio:.2f})") - else: - logger.info(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] " - f"REJECTED — words to the right have no consistent left-edge alignment " - f"(best_bin={max_bin}/{len(right_words)}={ratio:.2f} < {min_aligned_ratio})") + # A gap is suspicious if the column to its right is > 2x median width + suspicious_threshold = median_col_w * 2.0 - if len(alignment_validated) >= 2: - validated_gaps = alignment_validated - else: - logger.info("ColumnGeometry: alignment filter removed too many gaps, keeping originals") + alignment_validated = list(validated_gaps) # start with all + for gi in interior_indices: + right_col_w = gap_suspicion[gi] + if right_col_w <= suspicious_threshold: + continue # normal gap, keep it + + # Suspicious — check left-edge alignment + gap_start_rel, gap_end_rel = sorted_gaps[gi] + next_gs = sorted_gaps[gi + 1][0] if gi + 1 < len(sorted_gaps) else content_w + right_words = [w for w in segment_words + if gap_end_rel <= w['left'] < next_gs] + + if len(right_words) < 3: + continue # too few words, keep gap + + # Cluster left-edges + right_lefts = sorted(w['left'] for w in right_words) + bins = [] + cur_bin = [right_lefts[0]] + for le in right_lefts[1:]: + if le - cur_bin[-1] <= edge_tolerance_align: + cur_bin.append(le) + else: + bins.append(len(cur_bin)) + cur_bin = [le] + bins.append(len(cur_bin)) + + max_bin = max(bins) + ratio = max_bin / len(right_words) + + if ratio < min_aligned_ratio: + # Remove this gap + alignment_validated.remove((gap_start_rel, gap_end_rel)) + logger.info(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] " + f"REJECTED — suspicious (right_col={right_col_w}px > 2x median={median_col_w:.0f}px) " + f"and poor left-edge alignment (best_bin={max_bin}/{len(right_words)}={ratio:.2f})") + else: + logger.debug(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] " + f"suspicious but passed alignment (best_bin={max_bin}/{len(right_words)}={ratio:.2f})") + + if len(alignment_validated) >= 2: + validated_gaps = alignment_validated # --- Step 6: Fallback to clustering if too few gaps --- if len(validated_gaps) < 2: