fix: Box-Bereiche aus Bild entfernen statt pro Zone separat Spalten erkennen

Content-Streifen oberhalb, zwischen und unterhalb von Boxen werden zu einem Bild zusammengefügt; die Spaltenerkennung läuft danach einmal auf dem kombinierten Bild. Entfernt Step 5c (suspicion-based gap alignment), da der neue Ansatz das Problem an der Wurzel löst.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 17:03:05 +01:00
parent fb46450802
commit 4610137ecc
1 changed file with 68 additions and 129 deletions
@@ -1265,96 +1265,6 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
        if len(wc_gaps) >= 2:
            validated_gaps = wc_gaps

-    # --- Step 5c: Left-edge alignment validation (suspicious gaps only) ---
-    # Only check gaps that would create an unusually wide column to the right.
-    # These are likely false splits within a single wide column (e.g. short EN
-    # words followed by longer DE example sentences in the same column).
-    # Gaps that produce columns of similar width to their neighbors are trusted.
-    if len(validated_gaps) > 2:
-        edge_tolerance_align = max(8, content_w // 150)
-        min_aligned_ratio = 0.15  # at least 15% of words must share a left-edge bin
-        margin_thresh = max(10, int(content_w * 0.02))
-
-        # Compute tentative column widths from all gaps
-        sorted_gaps = sorted(validated_gaps, key=lambda g: g[0])
-        # Interior gaps only (exclude margins)
-        interior_indices = []
-        for gi, (gs, ge) in enumerate(sorted_gaps):
-            if gs > margin_thresh and ge < content_w - margin_thresh:
-                interior_indices.append(gi)
-
-        if interior_indices:
-            # For each interior gap, compute the width of the column it starts
-            gap_suspicion: dict = {}  # gap_index → right_col_width
-            for gi in interior_indices:
-                gap_end = sorted_gaps[gi][1]
-                # Next gap start (or content right edge)
-                if gi + 1 < len(sorted_gaps):
-                    next_gs = sorted_gaps[gi + 1][0]
-                else:
-                    next_gs = content_w
-                right_col_w = next_gs - gap_end
-                gap_suspicion[gi] = right_col_w
-
-            # Median column width (from all gaps, including margins)
-            all_col_widths = []
-            prev_end = 0
-            for gs, ge in sorted_gaps:
-                cw = gs - prev_end
-                if cw > 0:
-                    all_col_widths.append(cw)
-                prev_end = ge
-            trailing = content_w - prev_end
-            if trailing > 0:
-                all_col_widths.append(trailing)
-            median_col_w = sorted(all_col_widths)[len(all_col_widths) // 2] if all_col_widths else content_w
-
-            # A gap is suspicious if the column to its right is > 2x median width
-            suspicious_threshold = median_col_w * 2.0
-
-            alignment_validated = list(validated_gaps)  # start with all
-            for gi in interior_indices:
-                right_col_w = gap_suspicion[gi]
-                if right_col_w <= suspicious_threshold:
-                    continue  # normal gap, keep it
-
-                # Suspicious — check left-edge alignment
-                gap_start_rel, gap_end_rel = sorted_gaps[gi]
-                next_gs = sorted_gaps[gi + 1][0] if gi + 1 < len(sorted_gaps) else content_w
-                right_words = [w for w in segment_words
-                               if gap_end_rel <= w['left'] < next_gs]
-
-                if len(right_words) < 3:
-                    continue  # too few words, keep gap
-
-                # Cluster left-edges
-                right_lefts = sorted(w['left'] for w in right_words)
-                bins = []
-                cur_bin = [right_lefts[0]]
-                for le in right_lefts[1:]:
-                    if le - cur_bin[-1] <= edge_tolerance_align:
-                        cur_bin.append(le)
-                    else:
-                        bins.append(len(cur_bin))
-                        cur_bin = [le]
-                bins.append(len(cur_bin))
-
-                max_bin = max(bins)
-                ratio = max_bin / len(right_words)
-
-                if ratio < min_aligned_ratio:
-                    # Remove this gap
-                    alignment_validated.remove((gap_start_rel, gap_end_rel))
-                    logger.info(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
-                                f"REJECTED — suspicious (right_col={right_col_w}px > 2x median={median_col_w:.0f}px) "
-                                f"and poor left-edge alignment (best_bin={max_bin}/{len(right_words)}={ratio:.2f})")
-                else:
-                    logger.debug(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
-                                 f"suspicious but passed alignment (best_bin={max_bin}/{len(right_words)}={ratio:.2f})")
-
-            if len(alignment_validated) >= 2:
-                validated_gaps = alignment_validated
-
    # --- Step 6: Fallback to clustering if too few gaps ---
    if len(validated_gaps) < 2:
        logger.info("ColumnGeometry: < 2 gaps found, falling back to clustering")
@@ -3187,13 +3097,75 @@ def detect_column_geometry_zoned(
        return (geometries, left_x, right_x, top_y, bottom_y,
                word_dicts, inv, zone_data, boxes)

-    # Split into zones
+    # --- New approach: concatenate content regions (skip boxes), run column
+    # detection ONCE on the combined image, then map coordinates back. ---
+
+    # Split into zones (for metadata / overlay purposes)
    zones = split_page_into_zones(left_x, top_y, content_w, content_h, boxes)

-    # Run column detection per content zone
-    all_geometries: List[ColumnGeometry] = []
-    zones_data: List[Dict] = []
+    # Collect content strips (above/between/below boxes)
+    content_strips: List[Tuple[int, int]] = []  # (y_start, y_end) in absolute coords
+    for zone in zones:
+        if zone.zone_type == 'content' and zone.height >= 40:
+            content_strips.append((zone.y, zone.y + zone.height))

+    if not content_strips:
+        # Only box zones — fall back to original detection
+        logger.info("ZonedColumns: no content zones with height >= 40, using original result")
+        zone_data = [{"index": 0, "zone_type": "content", "y": top_y,
+                       "height": content_h, "x": left_x, "width": content_w, "columns": []}]
+        return (geometries, left_x, right_x, top_y, bottom_y,
+                word_dicts, inv, zone_data, boxes)
+
+    # Build combined image by vertically stacking content strips
+    ocr_strips = [ocr_img[ys:ye, :] for ys, ye in content_strips]
+    bgr_strips = [dewarped_bgr[ys:ye, :] for ys, ye in content_strips]
+    combined_ocr = np.vstack(ocr_strips)
+    combined_bgr = np.vstack(bgr_strips)
+
+    logger.info(f"ZonedColumns: {len(boxes)} box(es), concatenating {len(content_strips)} "
+                f"content strips into combined image {combined_ocr.shape[1]}x{combined_ocr.shape[0]}")
+
+    # Run column detection on the combined (box-free) image
+    combined_result = detect_column_geometry(combined_ocr, combined_bgr)
+    if combined_result is not None:
+        combined_geoms, c_lx, c_rx, c_ty, c_by, combined_words, combined_inv = combined_result
+    else:
+        # Fallback to original full-page result
+        logger.info("ZonedColumns: combined image column detection failed, using original")
+        combined_geoms = geometries
+
+    # Map combined-image y-coordinates back to absolute page coordinates.
+    # In the combined image, strip i starts at cumulative_y = sum of heights
+    # of strips 0..i-1. We need to add the offset between the strip's
+    # original y-position and its position in the combined image.
+    # Build a mapping: combined_y → absolute_y
+    strip_offsets: List[Tuple[int, int, int]] = []  # (combined_y_start, strip_height, abs_y_start)
+    cum_y = 0
+    for ys, ye in content_strips:
+        h = ye - ys
+        strip_offsets.append((cum_y, h, ys))
+        cum_y += h
+
+    def _combined_y_to_abs(cy: int) -> int:
+        """Map a y-coordinate in combined image back to absolute page coords."""
+        for c_start, s_h, abs_start in strip_offsets:
+            if cy < c_start + s_h:
+                return abs_start + (cy - c_start)
+        # Past last strip — clamp to end of last strip
+        last_c, last_h, last_abs = strip_offsets[-1]
+        return last_abs + last_h
+
+    # Adjust geometries: y and height need remapping
+    if combined_result is not None:
+        for g in combined_geoms:
+            abs_y = _combined_y_to_abs(g.y)
+            abs_y_end = _combined_y_to_abs(g.y + g.height)
+            g.y = abs_y
+            g.height = abs_y_end - abs_y
+
+    # Build zones_data for the response
+    zones_data: List[Dict] = []
    for zone in zones:
        zone_dict: Dict = {
            "index": zone.index,
@@ -3215,45 +3187,12 @@ def detect_column_geometry_zoned(
                "border_thickness": zone.box.border_thickness,
            }

-        if zone.zone_type == 'content' and zone.height >= 40:
-            # Extract sub-image for this zone
-            zone_y_end = zone.y + zone.height
-            sub_ocr = ocr_img[zone.y:zone_y_end, :]
-            sub_bgr = dewarped_bgr[zone.y:zone_y_end, :]
-
-            sub_result = detect_column_geometry(sub_ocr, sub_bgr)
-            if sub_result is not None:
-                sub_geoms, sub_lx, sub_rx, sub_ty, sub_by, _sub_words, _sub_inv = sub_result
-
-                # Offset column y-coordinates back to absolute page coords
-                for g in sub_geoms:
-                    g.y += zone.y
-
-                zone_cols = []
-                for g in sub_geoms:
-                    zone_cols.append({
-                        "index": g.index,
-                        "x": g.x,
-                        "y": g.y,
-                        "width": g.width,
-                        "height": g.height,
-                        "word_count": g.word_count,
-                        "width_ratio": g.width_ratio,
-                        "zone_index": zone.index,
-                    })
-                zone_dict["columns"] = zone_cols
-                all_geometries.extend(sub_geoms)
-            else:
-                logger.debug(f"ZonedColumns: zone {zone.index} column detection returned None")
-
        zones_data.append(zone_dict)

-    # If per-zone detection produced no columns, fall back to the original
-    if not all_geometries:
-        all_geometries = geometries
+    all_geometries = combined_geoms if combined_geoms else geometries

    logger.info(f"ZonedColumns: {len(boxes)} box(es), {len(zones)} zone(s), "
-                f"{len(all_geometries)} total columns")
+                f"{len(all_geometries)} total columns (combined-image approach)")

    return (all_geometries, left_x, right_x, top_y, bottom_y,
            word_dicts, inv, zones_data, boxes)