fix: Left-Edge-Alignment-Validierung fuer Spalten-Gaps

Innere Gaps werden jetzt geprueft: rechts des Gaps muessen mindestens 25% der Woerter eine gemeinsame linke Kante teilen. Das verhindert falsche Spaltentrennungen innerhalb breiter Spalten (z.B. Example-Spalte mit kurzen und langen Eintraegen).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 16:11:58 +01:00
parent 04be24a89e
commit 7a0ded7562
1 changed files with 69 additions and 0 deletions
@@ -1265,6 +1265,75 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
        if len(wc_gaps) >= 2:
            validated_gaps = wc_gaps

+    # --- Step 5c: Left-edge alignment validation ---
+    # A real column gap must have words to its right whose left-edges are
+    # consistently aligned (i.e. many words start at nearly the same x).
+    # If words to the right of a gap have scattered left-edges, the gap is
+    # just a natural gap within a wide column (e.g. short words ending
+    # before longer example sentences in the same column).
+    if len(validated_gaps) > 2:
+        edge_tolerance_align = max(8, content_w // 150)
+        min_aligned_ratio = 0.25  # at least 25% of words must share a left-edge bin
+
+        margin_left_end = edge_tolerance if validated_gaps and validated_gaps[0][0] <= max(10, int(content_w * 0.02)) else -1
+        margin_right_start = content_w - max(10, int(content_w * 0.02))
+
+        alignment_validated = []
+        for gap_start_rel, gap_end_rel in validated_gaps:
+            # Skip margin gaps — they don't need alignment validation
+            if gap_start_rel <= max(10, int(content_w * 0.02)):
+                alignment_validated.append((gap_start_rel, gap_end_rel))
+                continue
+            if gap_end_rel >= margin_right_start:
+                alignment_validated.append((gap_start_rel, gap_end_rel))
+                continue
+
+            # Find the next gap after this one (or content end)
+            next_gap_start = content_w
+            for gs, ge in validated_gaps:
+                if gs > gap_end_rel:
+                    next_gap_start = gs
+                    break
+
+            # Collect words to the right of this gap (up to the next gap)
+            right_words = [w for w in segment_words
+                           if gap_end_rel <= w['left'] < next_gap_start]
+
+            if len(right_words) < 3:
+                # Too few words — keep the gap (benefit of the doubt)
+                alignment_validated.append((gap_start_rel, gap_end_rel))
+                continue
+
+            # Cluster left-edges of right-side words
+            right_lefts = sorted(w['left'] for w in right_words)
+            bins = []
+            cur_bin = [right_lefts[0]]
+            for le in right_lefts[1:]:
+                if le - cur_bin[-1] <= edge_tolerance_align:
+                    cur_bin.append(le)
+                else:
+                    bins.append(len(cur_bin))
+                    cur_bin = [le]
+            bins.append(len(cur_bin))
+
+            # The largest bin must contain a significant fraction of words
+            max_bin = max(bins)
+            ratio = max_bin / len(right_words)
+
+            if ratio >= min_aligned_ratio:
+                alignment_validated.append((gap_start_rel, gap_end_rel))
+                logger.debug(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
+                             f"passed alignment check (best_bin={max_bin}/{len(right_words)}={ratio:.2f})")
+            else:
+                logger.info(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
+                            f"REJECTED — words to the right have no consistent left-edge alignment "
+                            f"(best_bin={max_bin}/{len(right_words)}={ratio:.2f} < {min_aligned_ratio})")
+
+        if len(alignment_validated) >= 2:
+            validated_gaps = alignment_validated
+        else:
+            logger.info("ColumnGeometry: alignment filter removed too many gaps, keeping originals")
+
    # --- Step 6: Fallback to clustering if too few gaps ---
    if len(validated_gaps) < 2:
        logger.info("ColumnGeometry: < 2 gaps found, falling back to clustering")