fix: Zeilen an Box-Grenze nicht mehr abschneiden (border_thickness Margin)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 32s
CI / test-go-edu-search (push) Successful in 35s
CI / test-python-klausur (push) Failing after 2m1s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 25s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 32s
CI / test-go-edu-search (push) Successful in 35s
CI / test-python-klausur (push) Failing after 2m1s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 25s
- detect_rows: Content-Strips nutzen jetzt box_ranges_inner (geschrumpft um border_thickness, min 5px) statt der vollen Box-Range
- detect_words: _row_in_box Filter nutzt ebenfalls inner Range
- Dadurch wird die letzte Zeile oberhalb einer Box nicht mehr faelschlicherweise der Box zugeordnet und ausgeschlossen

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1627,19 +1627,28 @@ async def detect_rows(session_id: str):
|
||||
else:
|
||||
zones = column_result.get("zones") or [] # zones can be None for sub-sessions
|
||||
|
||||
# Collect box y-ranges for filtering
|
||||
# Collect box y-ranges for filtering.
|
||||
# Use border_thickness to shrink the exclusion zone: the border pixels
|
||||
# belong visually to the box frame, but text rows above/below the box
|
||||
# may overlap with the border area and must not be clipped.
|
||||
box_ranges = [] # [(y_start, y_end)]
|
||||
box_ranges_inner = [] # [(y_start + border, y_end - border)] for row filtering
|
||||
for zone in zones:
|
||||
if zone.get("zone_type") == "box" and zone.get("box"):
|
||||
box = zone["box"]
|
||||
bt = max(box.get("border_thickness", 0), 5) # minimum 5px margin
|
||||
box_ranges.append((box["y"], box["y"] + box["height"]))
|
||||
# Inner range: shrink by border thickness so boundary rows aren't excluded
|
||||
box_ranges_inner.append((box["y"] + bt, box["y"] + box["height"] - bt))
|
||||
|
||||
if box_ranges and inv is not None:
|
||||
# Combined-image approach: strip box regions from inv image,
|
||||
# run row detection on the combined image, then remap y-coords back.
|
||||
content_strips = [] # [(y_start, y_end)] in absolute coords
|
||||
# Build content strips by subtracting box ranges from [top_y, bottom_y]
|
||||
sorted_boxes = sorted(box_ranges, key=lambda r: r[0])
|
||||
# Build content strips by subtracting box inner ranges from [top_y, bottom_y].
|
||||
# Using inner ranges means the border area is included in the content
|
||||
# strips, so the last row above a box isn't clipped by the border.
|
||||
sorted_boxes = sorted(box_ranges_inner, key=lambda r: r[0])
|
||||
strip_start = top_y
|
||||
for by_start, by_end in sorted_boxes:
|
||||
if by_start > strip_start:
|
||||
@@ -1934,18 +1943,21 @@ async def detect_words(
|
||||
]
|
||||
row.word_count = len(row.words)
|
||||
|
||||
# Exclude rows that fall within box zones
|
||||
# Exclude rows that fall within box zones.
|
||||
# Use inner box range (shrunk by border_thickness) so that rows at
|
||||
# the boundary (overlapping with the box border) are NOT excluded.
|
||||
zones = column_result.get("zones") or []
|
||||
box_ranges = []
|
||||
box_ranges_inner = []
|
||||
for zone in zones:
|
||||
if zone.get("zone_type") == "box" and zone.get("box"):
|
||||
box = zone["box"]
|
||||
box_ranges.append((box["y"], box["y"] + box["height"]))
|
||||
bt = max(box.get("border_thickness", 0), 5) # minimum 5px margin
|
||||
box_ranges_inner.append((box["y"] + bt, box["y"] + box["height"] - bt))
|
||||
|
||||
if box_ranges:
|
||||
if box_ranges_inner:
|
||||
def _row_in_box(r):
|
||||
center_y = r.y + r.height / 2
|
||||
return any(by_s <= center_y < by_e for by_s, by_e in box_ranges)
|
||||
return any(by_s <= center_y < by_e for by_s, by_e in box_ranges_inner)
|
||||
|
||||
before_count = len(row_geoms)
|
||||
row_geoms = [r for r in row_geoms if not _row_in_box(r)]
|
||||
|
||||
Reference in New Issue
Block a user