fix: Alignment-Validierung nur fuer verdaechtige Gaps (>2x Median-Breite)

Vorher wurden alle internen Gaps geprueft, was echte Spaltentrennungen (EN→DE) faelschlicherweise entfernte. Jetzt werden nur Gaps geprueft, die eine unverhaeltnismaessig breite rechte Spalte erzeugen wuerden (>2x Median-Spaltenbreite). Schwelle fuer den Mindestanteil gleich ausgerichteter linker Wortkanten (min_aligned_ratio) von 25% auf 15% gesenkt.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 16:27:14 +01:00
parent 11126c4436
commit fb46450802
1 changed files with 80 additions and 57 deletions
@@ -1265,72 +1265,95 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
        if len(wc_gaps) >= 2:
            validated_gaps = wc_gaps

-    # --- Step 5c: Left-edge alignment validation ---
-    # A real column gap must have words to its right whose left-edges are
-    # consistently aligned (i.e. many words start at nearly the same x).
-    # If words to the right of a gap have scattered left-edges, the gap is
-    # just a natural gap within a wide column (e.g. short words ending
-    # before longer example sentences in the same column).
+    # --- Step 5c: Left-edge alignment validation (suspicious gaps only) ---
+    # Only check gaps that would create an unusually wide column to the right.
+    # These are likely false splits within a single wide column (e.g. short EN
+    # words followed by longer DE example sentences in the same column).
+    # Gaps that produce columns of similar width to their neighbors are trusted.
    if len(validated_gaps) > 2:
        edge_tolerance_align = max(8, content_w // 150)
-        min_aligned_ratio = 0.25  # at least 25% of words must share a left-edge bin
+        min_aligned_ratio = 0.15  # at least 15% of words must share a left-edge bin
        margin_thresh = max(10, int(content_w * 0.02))

-        alignment_validated = []
-        for gap_start_rel, gap_end_rel in validated_gaps:
-            # Skip margin gaps — they don't need alignment validation
-            if gap_start_rel <= margin_thresh:
-                alignment_validated.append((gap_start_rel, gap_end_rel))
-                continue
-            if gap_end_rel >= content_w - margin_thresh:
-                alignment_validated.append((gap_start_rel, gap_end_rel))
-                continue
+        # Compute tentative column widths from all gaps
+        sorted_gaps = sorted(validated_gaps, key=lambda g: g[0])
+        # Interior gaps only (exclude margins)
+        interior_indices = []
+        for gi, (gs, ge) in enumerate(sorted_gaps):
+            if gs > margin_thresh and ge < content_w - margin_thresh:
+                interior_indices.append(gi)

-            # Find the next gap after this one (or content end)
-            next_gap_start = content_w
-            for gs, ge in validated_gaps:
-                if gs > gap_end_rel:
-                    next_gap_start = gs
-                    break
-
-            # Collect words to the right of this gap (up to the next gap)
-            right_words = [w for w in segment_words
-                           if gap_end_rel <= w['left'] < next_gap_start]
-
-            if len(right_words) < 3:
-                # Too few words — keep the gap (benefit of the doubt)
-                alignment_validated.append((gap_start_rel, gap_end_rel))
-                continue
-
-            # Cluster left-edges of right-side words
-            right_lefts = sorted(w['left'] for w in right_words)
-            bins = []
-            cur_bin = [right_lefts[0]]
-            for le in right_lefts[1:]:
-                if le - cur_bin[-1] <= edge_tolerance_align:
-                    cur_bin.append(le)
+        if interior_indices:
+            # For each interior gap, compute the width of the column it starts
+            gap_suspicion: dict = {}  # gap_index → right_col_width
+            for gi in interior_indices:
+                gap_end = sorted_gaps[gi][1]
+                # Next gap start (or content right edge)
+                if gi + 1 < len(sorted_gaps):
+                    next_gs = sorted_gaps[gi + 1][0]
                else:
-                    bins.append(len(cur_bin))
-                    cur_bin = [le]
-            bins.append(len(cur_bin))
+                    next_gs = content_w
+                right_col_w = next_gs - gap_end
+                gap_suspicion[gi] = right_col_w

-            # The largest bin must contain a significant fraction of words
-            max_bin = max(bins)
-            ratio = max_bin / len(right_words)
+            # Median column width (from all gaps, including margins)
+            all_col_widths = []
+            prev_end = 0
+            for gs, ge in sorted_gaps:
+                cw = gs - prev_end
+                if cw > 0:
+                    all_col_widths.append(cw)
+                prev_end = ge
+            trailing = content_w - prev_end
+            if trailing > 0:
+                all_col_widths.append(trailing)
+            median_col_w = sorted(all_col_widths)[len(all_col_widths) // 2] if all_col_widths else content_w

-            if ratio >= min_aligned_ratio:
-                alignment_validated.append((gap_start_rel, gap_end_rel))
-                logger.debug(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
-                             f"passed alignment check (best_bin={max_bin}/{len(right_words)}={ratio:.2f})")
-            else:
-                logger.info(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
-                            f"REJECTED — words to the right have no consistent left-edge alignment "
-                            f"(best_bin={max_bin}/{len(right_words)}={ratio:.2f} < {min_aligned_ratio})")
+            # A gap is suspicious if the column to its right is > 2x median width
+            suspicious_threshold = median_col_w * 2.0

-        if len(alignment_validated) >= 2:
-            validated_gaps = alignment_validated
-        else:
-            logger.info("ColumnGeometry: alignment filter removed too many gaps, keeping originals")
+            alignment_validated = list(validated_gaps)  # start with all
+            for gi in interior_indices:
+                right_col_w = gap_suspicion[gi]
+                if right_col_w <= suspicious_threshold:
+                    continue  # normal gap, keep it
+
+                # Suspicious — check left-edge alignment
+                gap_start_rel, gap_end_rel = sorted_gaps[gi]
+                next_gs = sorted_gaps[gi + 1][0] if gi + 1 < len(sorted_gaps) else content_w
+                right_words = [w for w in segment_words
+                               if gap_end_rel <= w['left'] < next_gs]
+
+                if len(right_words) < 3:
+                    continue  # too few words, keep gap
+
+                # Cluster left-edges
+                right_lefts = sorted(w['left'] for w in right_words)
+                bins = []
+                cur_bin = [right_lefts[0]]
+                for le in right_lefts[1:]:
+                    if le - cur_bin[-1] <= edge_tolerance_align:
+                        cur_bin.append(le)
+                    else:
+                        bins.append(len(cur_bin))
+                        cur_bin = [le]
+                bins.append(len(cur_bin))
+
+                max_bin = max(bins)
+                ratio = max_bin / len(right_words)
+
+                if ratio < min_aligned_ratio:
+                    # Remove this gap
+                    alignment_validated.remove((gap_start_rel, gap_end_rel))
+                    logger.info(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
+                                f"REJECTED — suspicious (right_col={right_col_w}px > 2x median={median_col_w:.0f}px) "
+                                f"and poor left-edge alignment (best_bin={max_bin}/{len(right_words)}={ratio:.2f})")
+                else:
+                    logger.debug(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
+                                 f"suspicious but passed alignment (best_bin={max_bin}/{len(right_words)}={ratio:.2f})")
+
+            if len(alignment_validated) >= 2:
+                validated_gaps = alignment_validated

    # --- Step 6: Fallback to clustering if too few gaps ---
    if len(validated_gaps) < 2: