fix: Sub-Session-Zeilenerkennung nutzt Word-Grouping statt Gap-Detection
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m0s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 23s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m0s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 23s
Gap-basierte Erkennung findet bei kleinen Box-Bildern zu wenige Gaps
und mergt Zeilen (7 raw gaps -> 4 validated -> nur 3 rows statt 6).
Sub-Sessions nutzen jetzt direkt _build_rows_from_word_grouping(),
das Wörter nach ihrer Y-Position clustert — robuster für komplexe Box-Layouts.
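Zusätzlich: alle Abstürze bei zones=None behoben (überall .get("zones", []) durch .get("zones") or [] ersetzt, da der Default von dict.get() nur bei fehlendem Key greift, nicht bei explizit gespeichertem None).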
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -419,7 +419,7 @@ async def create_box_sessions(session_id: str):
|
||||
if not column_result:
|
||||
raise HTTPException(status_code=400, detail="Column detection must be completed first")
|
||||
|
||||
zones = column_result.get("zones", [])
|
||||
zones = column_result.get("zones") or []
|
||||
box_zones = [z for z in zones if z.get("zone_type") == "box" and z.get("box")]
|
||||
if not box_zones:
|
||||
return {"session_id": session_id, "sub_sessions": [], "message": "No boxes detected"}
|
||||
@@ -1532,7 +1532,7 @@ async def _get_columns_overlay(session_id: str) -> Response:
|
||||
cv2.addWeighted(overlay, 0.2, img, 0.8, 0, img)
|
||||
|
||||
# Draw detected box boundaries as dashed rectangles
|
||||
zones = column_result.get("zones", [])
|
||||
zones = column_result.get("zones") or []
|
||||
for zone in zones:
|
||||
if zone.get("zone_type") == "box" and zone.get("box"):
|
||||
box = zone["box"]
|
||||
@@ -1600,83 +1600,99 @@ async def detect_rows(session_id: str):
|
||||
# Read zones from column_result to exclude box regions
|
||||
session = await get_session_db(session_id)
|
||||
column_result = (session or {}).get("column_result") or {}
|
||||
zones = column_result.get("zones") or [] # zones can be None for sub-sessions
|
||||
is_sub_session = bool((session or {}).get("parent_session_id"))
|
||||
|
||||
# Collect box y-ranges for filtering
|
||||
box_ranges = [] # [(y_start, y_end)]
|
||||
for zone in zones:
|
||||
if zone.get("zone_type") == "box" and zone.get("box"):
|
||||
box = zone["box"]
|
||||
box_ranges.append((box["y"], box["y"] + box["height"]))
|
||||
|
||||
if box_ranges and inv is not None:
|
||||
# Combined-image approach: strip box regions from inv image,
|
||||
# run row detection on the combined image, then remap y-coords back.
|
||||
content_strips = [] # [(y_start, y_end)] in absolute coords
|
||||
# Build content strips by subtracting box ranges from [top_y, bottom_y]
|
||||
sorted_boxes = sorted(box_ranges, key=lambda r: r[0])
|
||||
strip_start = top_y
|
||||
for by_start, by_end in sorted_boxes:
|
||||
if by_start > strip_start:
|
||||
content_strips.append((strip_start, by_start))
|
||||
strip_start = max(strip_start, by_end)
|
||||
if strip_start < bottom_y:
|
||||
content_strips.append((strip_start, bottom_y))
|
||||
|
||||
# Filter to strips with meaningful height
|
||||
content_strips = [(ys, ye) for ys, ye in content_strips if ye - ys >= 20]
|
||||
|
||||
if content_strips:
|
||||
# Stack content strips vertically
|
||||
inv_strips = [inv[ys:ye, :] for ys, ye in content_strips]
|
||||
combined_inv = np.vstack(inv_strips)
|
||||
|
||||
# Filter word_dicts to only include words from content strips
|
||||
combined_words = []
|
||||
cum_y = 0
|
||||
strip_offsets = [] # (combined_y_start, strip_height, abs_y_start)
|
||||
for ys, ye in content_strips:
|
||||
h = ye - ys
|
||||
strip_offsets.append((cum_y, h, ys))
|
||||
for w in word_dicts:
|
||||
w_abs_y = w['top'] + top_y # word y is relative to content top
|
||||
w_center = w_abs_y + w['height'] / 2
|
||||
if ys <= w_center < ye:
|
||||
# Remap to combined coordinates
|
||||
w_copy = dict(w)
|
||||
w_copy['top'] = cum_y + (w_abs_y - ys)
|
||||
combined_words.append(w_copy)
|
||||
cum_y += h
|
||||
|
||||
# Run row detection on combined image
|
||||
combined_h = combined_inv.shape[0]
|
||||
rows = detect_row_geometry(
|
||||
combined_inv, combined_words, left_x, right_x, 0, combined_h,
|
||||
)
|
||||
|
||||
# Remap y-coordinates back to absolute page coords
|
||||
def _combined_y_to_abs(cy: int) -> int:
|
||||
for c_start, s_h, abs_start in strip_offsets:
|
||||
if cy < c_start + s_h:
|
||||
return abs_start + (cy - c_start)
|
||||
last_c, last_h, last_abs = strip_offsets[-1]
|
||||
return last_abs + last_h
|
||||
|
||||
for r in rows:
|
||||
abs_y = _combined_y_to_abs(r.y)
|
||||
abs_y_end = _combined_y_to_abs(r.y + r.height)
|
||||
r.y = abs_y
|
||||
r.height = abs_y_end - abs_y
|
||||
else:
|
||||
rows = detect_row_geometry(inv, word_dicts, left_x, right_x, top_y, bottom_y)
|
||||
# Sub-sessions (box crops): use word-grouping instead of gap-based
|
||||
# row detection. Box images are small with complex internal layouts
|
||||
# (headings, sub-columns) where the horizontal projection approach
|
||||
# merges rows. Word-grouping directly clusters words by Y proximity,
|
||||
# which is more robust for these cases.
|
||||
if is_sub_session and word_dicts:
|
||||
from cv_layout import _build_rows_from_word_grouping
|
||||
rows = _build_rows_from_word_grouping(
|
||||
word_dicts, left_x, right_x, top_y, bottom_y,
|
||||
right_x - left_x, bottom_y - top_y,
|
||||
)
|
||||
logger.info(f"OCR Pipeline: sub-session {session_id}: word-grouping found {len(rows)} rows")
|
||||
else:
|
||||
# No boxes — standard row detection
|
||||
rows = detect_row_geometry(inv, word_dicts, left_x, right_x, top_y, bottom_y)
|
||||
zones = column_result.get("zones") or [] # zones can be None for sub-sessions
|
||||
|
||||
# Collect box y-ranges for filtering
|
||||
box_ranges = [] # [(y_start, y_end)]
|
||||
for zone in zones:
|
||||
if zone.get("zone_type") == "box" and zone.get("box"):
|
||||
box = zone["box"]
|
||||
box_ranges.append((box["y"], box["y"] + box["height"]))
|
||||
|
||||
if box_ranges and inv is not None:
|
||||
# Combined-image approach: strip box regions from inv image,
|
||||
# run row detection on the combined image, then remap y-coords back.
|
||||
content_strips = [] # [(y_start, y_end)] in absolute coords
|
||||
# Build content strips by subtracting box ranges from [top_y, bottom_y]
|
||||
sorted_boxes = sorted(box_ranges, key=lambda r: r[0])
|
||||
strip_start = top_y
|
||||
for by_start, by_end in sorted_boxes:
|
||||
if by_start > strip_start:
|
||||
content_strips.append((strip_start, by_start))
|
||||
strip_start = max(strip_start, by_end)
|
||||
if strip_start < bottom_y:
|
||||
content_strips.append((strip_start, bottom_y))
|
||||
|
||||
# Filter to strips with meaningful height
|
||||
content_strips = [(ys, ye) for ys, ye in content_strips if ye - ys >= 20]
|
||||
|
||||
if content_strips:
|
||||
# Stack content strips vertically
|
||||
inv_strips = [inv[ys:ye, :] for ys, ye in content_strips]
|
||||
combined_inv = np.vstack(inv_strips)
|
||||
|
||||
# Filter word_dicts to only include words from content strips
|
||||
combined_words = []
|
||||
cum_y = 0
|
||||
strip_offsets = [] # (combined_y_start, strip_height, abs_y_start)
|
||||
for ys, ye in content_strips:
|
||||
h = ye - ys
|
||||
strip_offsets.append((cum_y, h, ys))
|
||||
for w in word_dicts:
|
||||
w_abs_y = w['top'] + top_y # word y is relative to content top
|
||||
w_center = w_abs_y + w['height'] / 2
|
||||
if ys <= w_center < ye:
|
||||
# Remap to combined coordinates
|
||||
w_copy = dict(w)
|
||||
w_copy['top'] = cum_y + (w_abs_y - ys)
|
||||
combined_words.append(w_copy)
|
||||
cum_y += h
|
||||
|
||||
# Run row detection on combined image
|
||||
combined_h = combined_inv.shape[0]
|
||||
rows = detect_row_geometry(
|
||||
combined_inv, combined_words, left_x, right_x, 0, combined_h,
|
||||
)
|
||||
|
||||
# Remap y-coordinates back to absolute page coords
|
||||
def _combined_y_to_abs(cy: int) -> int:
|
||||
for c_start, s_h, abs_start in strip_offsets:
|
||||
if cy < c_start + s_h:
|
||||
return abs_start + (cy - c_start)
|
||||
last_c, last_h, last_abs = strip_offsets[-1]
|
||||
return last_abs + last_h
|
||||
|
||||
for r in rows:
|
||||
abs_y = _combined_y_to_abs(r.y)
|
||||
abs_y_end = _combined_y_to_abs(r.y + r.height)
|
||||
r.y = abs_y
|
||||
r.height = abs_y_end - abs_y
|
||||
else:
|
||||
rows = detect_row_geometry(inv, word_dicts, left_x, right_x, top_y, bottom_y)
|
||||
else:
|
||||
# No boxes — standard row detection
|
||||
rows = detect_row_geometry(inv, word_dicts, left_x, right_x, top_y, bottom_y)
|
||||
|
||||
duration = time.time() - t0
|
||||
|
||||
# Assign zone_index based on which content zone each row falls in
|
||||
# Build content zone list with indices
|
||||
zones = column_result.get("zones") or []
|
||||
content_zones = [(i, z) for i, z in enumerate(zones) if z.get("zone_type") == "content"] if zones else []
|
||||
|
||||
# Build serializable result (exclude words to keep payload small)
|
||||
@@ -1909,7 +1925,7 @@ async def detect_words(
|
||||
row.word_count = len(row.words)
|
||||
|
||||
# Exclude rows that fall within box zones
|
||||
zones = column_result.get("zones", [])
|
||||
zones = column_result.get("zones") or []
|
||||
box_ranges = []
|
||||
for zone in zones:
|
||||
if zone.get("zone_type") == "box" and zone.get("box"):
|
||||
@@ -2676,7 +2692,7 @@ async def get_fabric_json(session_id: str):
|
||||
subs = await get_sub_sessions(session_id)
|
||||
if subs:
|
||||
column_result = session.get("column_result") or {}
|
||||
zones = column_result.get("zones", [])
|
||||
zones = column_result.get("zones") or []
|
||||
box_zones = [z for z in zones if z.get("zone_type") == "box" and z.get("box")]
|
||||
|
||||
for sub in subs:
|
||||
@@ -2733,7 +2749,7 @@ async def get_merged_vocab_entries(session_id: str):
|
||||
subs = await get_sub_sessions(session_id)
|
||||
if subs:
|
||||
column_result = session.get("column_result") or {}
|
||||
zones = column_result.get("zones", [])
|
||||
zones = column_result.get("zones") or []
|
||||
box_zones = [z for z in zones if z.get("zone_type") == "box" and z.get("box")]
|
||||
|
||||
for sub in subs:
|
||||
@@ -3289,7 +3305,7 @@ async def _get_rows_overlay(session_id: str) -> Response:
|
||||
|
||||
# Draw zone separator lines if zones exist
|
||||
column_result = session.get("column_result") or {}
|
||||
zones = column_result.get("zones", [])
|
||||
zones = column_result.get("zones") or []
|
||||
if zones:
|
||||
img_w_px = img.shape[1]
|
||||
zone_color = (0, 200, 255) # Yellow (BGR)
|
||||
@@ -3445,7 +3461,7 @@ async def _get_words_overlay(session_id: str) -> Response:
|
||||
|
||||
# Red semi-transparent overlay for box zones
|
||||
column_result = session.get("column_result") or {}
|
||||
zones = column_result.get("zones", [])
|
||||
zones = column_result.get("zones") or []
|
||||
_draw_box_exclusion_overlay(img, zones)
|
||||
|
||||
success, result_png = cv2.imencode(".png", img)
|
||||
|
||||
Reference in New Issue
Block a user