fix: Zeilen an Box-Grenze nicht mehr abschneiden (border_thickness Margin)

- detect_rows: Content-Strips nutzen jetzt box_ranges_inner (geschrumpft um
  border_thickness, min 5px) statt der vollen Box-Range
- detect_words: _row_in_box Filter nutzt ebenfalls inner Range
- Dadurch wird die letzte Zeile oberhalb einer Box nicht mehr
  faelschlicherweise der Box zugeordnet und ausgeschlossen

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-10 17:44:02 +01:00
parent 080fcb5e3c
commit 618c82ef42
1 changed files with 20 additions and 8 deletions
@@ -1627,19 +1627,28 @@ async def detect_rows(session_id: str):
    else:
        zones = column_result.get("zones") or []  # zones can be None for sub-sessions

-        # Collect box y-ranges for filtering
+        # Collect box y-ranges for filtering.
+        # Use border_thickness to shrink the exclusion zone: the border pixels
+        # belong visually to the box frame, but text rows above/below the box
+        # may overlap with the border area and must not be clipped.
        box_ranges = []  # [(y_start, y_end)]
+        box_ranges_inner = []  # [(y_start + border, y_end - border)] for row filtering
        for zone in zones:
            if zone.get("zone_type") == "box" and zone.get("box"):
                box = zone["box"]
+                bt = max(box.get("border_thickness", 0), 5)  # minimum 5px margin
                box_ranges.append((box["y"], box["y"] + box["height"]))
+                # Inner range: shrink by border thickness so boundary rows aren't excluded
+                box_ranges_inner.append((box["y"] + bt, box["y"] + box["height"] - bt))

        if box_ranges and inv is not None:
            # Combined-image approach: strip box regions from inv image,
            # run row detection on the combined image, then remap y-coords back.
            content_strips = []  # [(y_start, y_end)] in absolute coords
-            # Build content strips by subtracting box ranges from [top_y, bottom_y]
-            sorted_boxes = sorted(box_ranges, key=lambda r: r[0])
+            # Build content strips by subtracting box inner ranges from [top_y, bottom_y].
+            # Using inner ranges means the border area is included in the content
+            # strips, so the last row above a box isn't clipped by the border.
+            sorted_boxes = sorted(box_ranges_inner, key=lambda r: r[0])
            strip_start = top_y
            for by_start, by_end in sorted_boxes:
                if by_start > strip_start:
@@ -1934,18 +1943,21 @@ async def detect_words(
            ]
            row.word_count = len(row.words)

-    # Exclude rows that fall within box zones
+    # Exclude rows that fall within box zones.
+    # Use inner box range (shrunk by border_thickness) so that rows at
+    # the boundary (overlapping with the box border) are NOT excluded.
    zones = column_result.get("zones") or []
-    box_ranges = []
+    box_ranges_inner = []
    for zone in zones:
        if zone.get("zone_type") == "box" and zone.get("box"):
            box = zone["box"]
-            box_ranges.append((box["y"], box["y"] + box["height"]))
+            bt = max(box.get("border_thickness", 0), 5)  # minimum 5px margin
+            box_ranges_inner.append((box["y"] + bt, box["y"] + box["height"] - bt))

-    if box_ranges:
+    if box_ranges_inner:
        def _row_in_box(r):
            center_y = r.y + r.height / 2
-            return any(by_s <= center_y < by_e for by_s, by_e in box_ranges)
+            return any(by_s <= center_y < by_e for by_s, by_e in box_ranges_inner)

        before_count = len(row_geoms)
        row_geoms = [r for r in row_geoms if not _row_in_box(r)]