From 7a0ded7562e27690bf8a22e060c3302e2c8458f9 Mon Sep 17 00:00:00 2001
From: Benjamin Admin
Date: Mon, 9 Mar 2026 16:11:58 +0100
Subject: [PATCH] fix: Left-Edge-Alignment-Validierung fuer Spalten-Gaps

Interiore Gaps werden jetzt geprueft: rechts des Gaps muessen mindestens
25% der Woerter eine gemeinsame linke Kante teilen. Verhindert falsche
Spaltentrennungen innerhalb breiter Spalten (z.B. Example-Spalte mit
kurzen und langen Eintraegen).

Co-Authored-By: Claude Opus 4.6
---
 klausur-service/backend/cv_layout.py | 69 ++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)

diff --git a/klausur-service/backend/cv_layout.py b/klausur-service/backend/cv_layout.py
index b9709b8..abf7506 100644
--- a/klausur-service/backend/cv_layout.py
+++ b/klausur-service/backend/cv_layout.py
@@ -1265,6 +1265,75 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
         if len(wc_gaps) >= 2:
             validated_gaps = wc_gaps
 
+    # --- Step 5c: Left-edge alignment validation ---
+    # A real column gap must have words to its right whose left-edges are
+    # consistently aligned (i.e. many words start at nearly the same x).
+    # If words to the right of a gap have scattered left-edges, the gap is
+    # just a natural gap within a wide column (e.g. short words ending
+    # before longer example sentences in the same column).
+    if len(validated_gaps) > 2:
+        edge_tolerance_align = max(8, content_w // 150)
+        min_aligned_ratio = 0.25  # at least 25% of words must share a left-edge bin
+
+        margin_left_end = edge_tolerance if validated_gaps and validated_gaps[0][0] <= max(10, int(content_w * 0.02)) else -1
+        margin_right_start = content_w - max(10, int(content_w * 0.02))
+
+        alignment_validated = []
+        for gap_start_rel, gap_end_rel in validated_gaps:
+            # Skip margin gaps — they don't need alignment validation
+            if gap_start_rel <= max(10, int(content_w * 0.02)):
+                alignment_validated.append((gap_start_rel, gap_end_rel))
+                continue
+            if gap_end_rel >= margin_right_start:
+                alignment_validated.append((gap_start_rel, gap_end_rel))
+                continue
+
+            # Find the next gap after this one (or content end)
+            next_gap_start = content_w
+            for gs, ge in validated_gaps:
+                if gs > gap_end_rel:
+                    next_gap_start = gs
+                    break
+
+            # Collect words to the right of this gap (up to the next gap)
+            right_words = [w for w in segment_words
+                           if gap_end_rel <= w['left'] < next_gap_start]
+
+            if len(right_words) < 3:
+                # Too few words — keep the gap (benefit of the doubt)
+                alignment_validated.append((gap_start_rel, gap_end_rel))
+                continue
+
+            # Cluster left-edges of right-side words
+            right_lefts = sorted(w['left'] for w in right_words)
+            bins = []
+            cur_bin = [right_lefts[0]]
+            for le in right_lefts[1:]:
+                if le - cur_bin[-1] <= edge_tolerance_align:
+                    cur_bin.append(le)
+                else:
+                    bins.append(len(cur_bin))
+                    cur_bin = [le]
+            bins.append(len(cur_bin))
+
+            # The largest bin must contain a significant fraction of words
+            max_bin = max(bins)
+            ratio = max_bin / len(right_words)
+
+            if ratio >= min_aligned_ratio:
+                alignment_validated.append((gap_start_rel, gap_end_rel))
+                logger.debug(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
+                             f"passed alignment check (best_bin={max_bin}/{len(right_words)}={ratio:.2f})")
+            else:
+                logger.info(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
+                            f"REJECTED — words to the right have no consistent left-edge alignment "
+                            f"(best_bin={max_bin}/{len(right_words)}={ratio:.2f} < {min_aligned_ratio})")
+
+        if len(alignment_validated) >= 2:
+            validated_gaps = alignment_validated
+        else:
+            logger.info("ColumnGeometry: alignment filter removed too many gaps, keeping originals")
+
     # --- Step 6: Fallback to clustering if too few gaps ---
     if len(validated_gaps) < 2:
         logger.info("ColumnGeometry: < 2 gaps found, falling back to clustering")