fix: Sub-Session Zeilenerkennung nutzt Word-Grouping statt Gap-Detection
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m0s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 23s

Gap-basierte Erkennung findet bei kleinen Box-Bildern zu wenige Gaps
und mergt Zeilen (7 raw gaps -> 4 validated -> nur 3 rows statt 6).
Sub-Sessions nutzen jetzt direkt _build_rows_from_word_grouping(),
das Woerter nach Y-Position clustert — robuster fuer komplexe Box-Layouts.

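Zusaetzlich: alle zones=None-Crashes gefixt (ueberall .get("zones", []) durch .get("zones") or [] ersetzt, da der Default nur bei fehlendem Key greift, nicht bei explizitem None).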

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-10 09:05:24 +01:00
parent 785b4d7655
commit f65bd11919

View File

@@ -419,7 +419,7 @@ async def create_box_sessions(session_id: str):
if not column_result:
raise HTTPException(status_code=400, detail="Column detection must be completed first")
zones = column_result.get("zones", [])
zones = column_result.get("zones") or []
box_zones = [z for z in zones if z.get("zone_type") == "box" and z.get("box")]
if not box_zones:
return {"session_id": session_id, "sub_sessions": [], "message": "No boxes detected"}
@@ -1532,7 +1532,7 @@ async def _get_columns_overlay(session_id: str) -> Response:
cv2.addWeighted(overlay, 0.2, img, 0.8, 0, img)
# Draw detected box boundaries as dashed rectangles
zones = column_result.get("zones", [])
zones = column_result.get("zones") or []
for zone in zones:
if zone.get("zone_type") == "box" and zone.get("box"):
box = zone["box"]
@@ -1600,83 +1600,99 @@ async def detect_rows(session_id: str):
# Read zones from column_result to exclude box regions
session = await get_session_db(session_id)
column_result = (session or {}).get("column_result") or {}
zones = column_result.get("zones") or [] # zones can be None for sub-sessions
is_sub_session = bool((session or {}).get("parent_session_id"))
# Collect box y-ranges for filtering
box_ranges = [] # [(y_start, y_end)]
for zone in zones:
if zone.get("zone_type") == "box" and zone.get("box"):
box = zone["box"]
box_ranges.append((box["y"], box["y"] + box["height"]))
if box_ranges and inv is not None:
# Combined-image approach: strip box regions from inv image,
# run row detection on the combined image, then remap y-coords back.
content_strips = [] # [(y_start, y_end)] in absolute coords
# Build content strips by subtracting box ranges from [top_y, bottom_y]
sorted_boxes = sorted(box_ranges, key=lambda r: r[0])
strip_start = top_y
for by_start, by_end in sorted_boxes:
if by_start > strip_start:
content_strips.append((strip_start, by_start))
strip_start = max(strip_start, by_end)
if strip_start < bottom_y:
content_strips.append((strip_start, bottom_y))
# Filter to strips with meaningful height
content_strips = [(ys, ye) for ys, ye in content_strips if ye - ys >= 20]
if content_strips:
# Stack content strips vertically
inv_strips = [inv[ys:ye, :] for ys, ye in content_strips]
combined_inv = np.vstack(inv_strips)
# Filter word_dicts to only include words from content strips
combined_words = []
cum_y = 0
strip_offsets = [] # (combined_y_start, strip_height, abs_y_start)
for ys, ye in content_strips:
h = ye - ys
strip_offsets.append((cum_y, h, ys))
for w in word_dicts:
w_abs_y = w['top'] + top_y # word y is relative to content top
w_center = w_abs_y + w['height'] / 2
if ys <= w_center < ye:
# Remap to combined coordinates
w_copy = dict(w)
w_copy['top'] = cum_y + (w_abs_y - ys)
combined_words.append(w_copy)
cum_y += h
# Run row detection on combined image
combined_h = combined_inv.shape[0]
rows = detect_row_geometry(
combined_inv, combined_words, left_x, right_x, 0, combined_h,
)
# Remap y-coordinates back to absolute page coords
def _combined_y_to_abs(cy: int) -> int:
for c_start, s_h, abs_start in strip_offsets:
if cy < c_start + s_h:
return abs_start + (cy - c_start)
last_c, last_h, last_abs = strip_offsets[-1]
return last_abs + last_h
for r in rows:
abs_y = _combined_y_to_abs(r.y)
abs_y_end = _combined_y_to_abs(r.y + r.height)
r.y = abs_y
r.height = abs_y_end - abs_y
else:
rows = detect_row_geometry(inv, word_dicts, left_x, right_x, top_y, bottom_y)
# Sub-sessions (box crops): use word-grouping instead of gap-based
# row detection. Box images are small with complex internal layouts
# (headings, sub-columns) where the horizontal projection approach
# merges rows. Word-grouping directly clusters words by Y proximity,
# which is more robust for these cases.
if is_sub_session and word_dicts:
from cv_layout import _build_rows_from_word_grouping
rows = _build_rows_from_word_grouping(
word_dicts, left_x, right_x, top_y, bottom_y,
right_x - left_x, bottom_y - top_y,
)
logger.info(f"OCR Pipeline: sub-session {session_id}: word-grouping found {len(rows)} rows")
else:
# No boxes — standard row detection
rows = detect_row_geometry(inv, word_dicts, left_x, right_x, top_y, bottom_y)
zones = column_result.get("zones") or [] # zones can be None for sub-sessions
# Collect box y-ranges for filtering
box_ranges = [] # [(y_start, y_end)]
for zone in zones:
if zone.get("zone_type") == "box" and zone.get("box"):
box = zone["box"]
box_ranges.append((box["y"], box["y"] + box["height"]))
if box_ranges and inv is not None:
# Combined-image approach: strip box regions from inv image,
# run row detection on the combined image, then remap y-coords back.
content_strips = [] # [(y_start, y_end)] in absolute coords
# Build content strips by subtracting box ranges from [top_y, bottom_y]
sorted_boxes = sorted(box_ranges, key=lambda r: r[0])
strip_start = top_y
for by_start, by_end in sorted_boxes:
if by_start > strip_start:
content_strips.append((strip_start, by_start))
strip_start = max(strip_start, by_end)
if strip_start < bottom_y:
content_strips.append((strip_start, bottom_y))
# Filter to strips with meaningful height
content_strips = [(ys, ye) for ys, ye in content_strips if ye - ys >= 20]
if content_strips:
# Stack content strips vertically
inv_strips = [inv[ys:ye, :] for ys, ye in content_strips]
combined_inv = np.vstack(inv_strips)
# Filter word_dicts to only include words from content strips
combined_words = []
cum_y = 0
strip_offsets = [] # (combined_y_start, strip_height, abs_y_start)
for ys, ye in content_strips:
h = ye - ys
strip_offsets.append((cum_y, h, ys))
for w in word_dicts:
w_abs_y = w['top'] + top_y # word y is relative to content top
w_center = w_abs_y + w['height'] / 2
if ys <= w_center < ye:
# Remap to combined coordinates
w_copy = dict(w)
w_copy['top'] = cum_y + (w_abs_y - ys)
combined_words.append(w_copy)
cum_y += h
# Run row detection on combined image
combined_h = combined_inv.shape[0]
rows = detect_row_geometry(
combined_inv, combined_words, left_x, right_x, 0, combined_h,
)
# Remap y-coordinates back to absolute page coords
def _combined_y_to_abs(cy: int) -> int:
for c_start, s_h, abs_start in strip_offsets:
if cy < c_start + s_h:
return abs_start + (cy - c_start)
last_c, last_h, last_abs = strip_offsets[-1]
return last_abs + last_h
for r in rows:
abs_y = _combined_y_to_abs(r.y)
abs_y_end = _combined_y_to_abs(r.y + r.height)
r.y = abs_y
r.height = abs_y_end - abs_y
else:
rows = detect_row_geometry(inv, word_dicts, left_x, right_x, top_y, bottom_y)
else:
# No boxes — standard row detection
rows = detect_row_geometry(inv, word_dicts, left_x, right_x, top_y, bottom_y)
duration = time.time() - t0
# Assign zone_index based on which content zone each row falls in
# Build content zone list with indices
zones = column_result.get("zones") or []
content_zones = [(i, z) for i, z in enumerate(zones) if z.get("zone_type") == "content"] if zones else []
# Build serializable result (exclude words to keep payload small)
@@ -1909,7 +1925,7 @@ async def detect_words(
row.word_count = len(row.words)
# Exclude rows that fall within box zones
zones = column_result.get("zones", [])
zones = column_result.get("zones") or []
box_ranges = []
for zone in zones:
if zone.get("zone_type") == "box" and zone.get("box"):
@@ -2676,7 +2692,7 @@ async def get_fabric_json(session_id: str):
subs = await get_sub_sessions(session_id)
if subs:
column_result = session.get("column_result") or {}
zones = column_result.get("zones", [])
zones = column_result.get("zones") or []
box_zones = [z for z in zones if z.get("zone_type") == "box" and z.get("box")]
for sub in subs:
@@ -2733,7 +2749,7 @@ async def get_merged_vocab_entries(session_id: str):
subs = await get_sub_sessions(session_id)
if subs:
column_result = session.get("column_result") or {}
zones = column_result.get("zones", [])
zones = column_result.get("zones") or []
box_zones = [z for z in zones if z.get("zone_type") == "box" and z.get("box")]
for sub in subs:
@@ -3289,7 +3305,7 @@ async def _get_rows_overlay(session_id: str) -> Response:
# Draw zone separator lines if zones exist
column_result = session.get("column_result") or {}
zones = column_result.get("zones", [])
zones = column_result.get("zones") or []
if zones:
img_w_px = img.shape[1]
zone_color = (0, 200, 255) # Yellow (BGR)
@@ -3445,7 +3461,7 @@ async def _get_words_overlay(session_id: str) -> Response:
# Red semi-transparent overlay for box zones
column_result = session.get("column_result") or {}
zones = column_result.get("zones", [])
zones = column_result.get("zones") or []
_draw_box_exclusion_overlay(img, zones)
success, result_png = cv2.imencode(".png", img)