fix: Zeilen an Box-Grenze nicht mehr abschneiden (border_thickness Margin)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 32s
CI / test-go-edu-search (push) Successful in 35s
CI / test-python-klausur (push) Failing after 2m1s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 25s

- detect_rows: Content-Strips nutzen jetzt box_ranges_inner (oben und unten
  jeweils um border_thickness, mindestens 5px, geschrumpft) statt der vollen Box-Range
- detect_words: Der _row_in_box-Filter nutzt ebenfalls die innere Range
- Dadurch wird die letzte Zeile oberhalb einer Box nicht mehr
  fälschlicherweise der Box zugeordnet und ausgeschlossen

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-10 17:44:02 +01:00
parent 080fcb5e3c
commit 618c82ef42

View File

@@ -1627,19 +1627,28 @@ async def detect_rows(session_id: str):
else:
zones = column_result.get("zones") or [] # zones can be None for sub-sessions
# Collect box y-ranges for filtering
# Collect box y-ranges for filtering.
# Use border_thickness to shrink the exclusion zone: the border pixels
# belong visually to the box frame, but text rows above/below the box
# may overlap with the border area and must not be clipped.
box_ranges = [] # [(y_start, y_end)]
box_ranges_inner = [] # [(y_start + border, y_end - border)] for row filtering
for zone in zones:
if zone.get("zone_type") == "box" and zone.get("box"):
box = zone["box"]
bt = max(box.get("border_thickness", 0), 5) # minimum 5px margin
box_ranges.append((box["y"], box["y"] + box["height"]))
# Inner range: shrink by border thickness so boundary rows aren't excluded
box_ranges_inner.append((box["y"] + bt, box["y"] + box["height"] - bt))
if box_ranges and inv is not None:
# Combined-image approach: strip box regions from inv image,
# run row detection on the combined image, then remap y-coords back.
content_strips = [] # [(y_start, y_end)] in absolute coords
# Build content strips by subtracting box ranges from [top_y, bottom_y]
sorted_boxes = sorted(box_ranges, key=lambda r: r[0])
# Build content strips by subtracting box inner ranges from [top_y, bottom_y].
# Using inner ranges means the border area is included in the content
# strips, so the last row above a box isn't clipped by the border.
sorted_boxes = sorted(box_ranges_inner, key=lambda r: r[0])
strip_start = top_y
for by_start, by_end in sorted_boxes:
if by_start > strip_start:
@@ -1934,18 +1943,21 @@ async def detect_words(
]
row.word_count = len(row.words)
# Exclude rows that fall within box zones
# Exclude rows that fall within box zones.
# Use inner box range (shrunk by border_thickness) so that rows at
# the boundary (overlapping with the box border) are NOT excluded.
zones = column_result.get("zones") or []
box_ranges = []
box_ranges_inner = []
for zone in zones:
if zone.get("zone_type") == "box" and zone.get("box"):
box = zone["box"]
box_ranges.append((box["y"], box["y"] + box["height"]))
bt = max(box.get("border_thickness", 0), 5) # minimum 5px margin
box_ranges_inner.append((box["y"] + bt, box["y"] + box["height"] - bt))
if box_ranges:
if box_ranges_inner:
def _row_in_box(r):
center_y = r.y + r.height / 2
return any(by_s <= center_y < by_e for by_s, by_e in box_ranges)
return any(by_s <= center_y < by_e for by_s, by_e in box_ranges_inner)
before_count = len(row_geoms)
row_geoms = [r for r in row_geoms if not _row_in_box(r)]