fix: Sub-Session-Zeilenerkennung nutzt Word-Grouping statt Gap-Detection

Die Gap-basierte Erkennung findet bei kleinen Box-Bildern zu wenige Gaps und fasst dadurch Zeilen zusammen (7 raw gaps -> 4 validated -> nur 3 rows statt 6). Sub-Sessions nutzen jetzt direkt _build_rows_from_word_grouping(), das Wörter nach Y-Position clustert — robuster für komplexe Box-Layouts.

Zusätzlich: alle Crashes bei zones=None gefixt (überall .get("zones", []) durch .get("zones") or [] ersetzt).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-10 09:05:24 +01:00
parent 785b4d7655
commit f65bd11919
1 changed files with 93 additions and 77 deletions
@@ -419,7 +419,7 @@ async def create_box_sessions(session_id: str):
    if not column_result:
        raise HTTPException(status_code=400, detail="Column detection must be completed first")

-    zones = column_result.get("zones", [])
+    zones = column_result.get("zones") or []
    box_zones = [z for z in zones if z.get("zone_type") == "box" and z.get("box")]
    if not box_zones:
        return {"session_id": session_id, "sub_sessions": [], "message": "No boxes detected"}
@@ -1532,7 +1532,7 @@ async def _get_columns_overlay(session_id: str) -> Response:
    cv2.addWeighted(overlay, 0.2, img, 0.8, 0, img)

    # Draw detected box boundaries as dashed rectangles
-    zones = column_result.get("zones", [])
+    zones = column_result.get("zones") or []
    for zone in zones:
        if zone.get("zone_type") == "box" and zone.get("box"):
            box = zone["box"]
@@ -1600,83 +1600,99 @@ async def detect_rows(session_id: str):
    # Read zones from column_result to exclude box regions
    session = await get_session_db(session_id)
    column_result = (session or {}).get("column_result") or {}
-    zones = column_result.get("zones") or []  # zones can be None for sub-sessions
+    is_sub_session = bool((session or {}).get("parent_session_id"))

-    # Collect box y-ranges for filtering
-    box_ranges = []  # [(y_start, y_end)]
-    for zone in zones:
-        if zone.get("zone_type") == "box" and zone.get("box"):
-            box = zone["box"]
-            box_ranges.append((box["y"], box["y"] + box["height"]))
-
-    if box_ranges and inv is not None:
-        # Combined-image approach: strip box regions from inv image,
-        # run row detection on the combined image, then remap y-coords back.
-        content_strips = []  # [(y_start, y_end)] in absolute coords
-        # Build content strips by subtracting box ranges from [top_y, bottom_y]
-        sorted_boxes = sorted(box_ranges, key=lambda r: r[0])
-        strip_start = top_y
-        for by_start, by_end in sorted_boxes:
-            if by_start > strip_start:
-                content_strips.append((strip_start, by_start))
-            strip_start = max(strip_start, by_end)
-        if strip_start < bottom_y:
-            content_strips.append((strip_start, bottom_y))
-
-        # Filter to strips with meaningful height
-        content_strips = [(ys, ye) for ys, ye in content_strips if ye - ys >= 20]
-
-        if content_strips:
-            # Stack content strips vertically
-            inv_strips = [inv[ys:ye, :] for ys, ye in content_strips]
-            combined_inv = np.vstack(inv_strips)
-
-            # Filter word_dicts to only include words from content strips
-            combined_words = []
-            cum_y = 0
-            strip_offsets = []  # (combined_y_start, strip_height, abs_y_start)
-            for ys, ye in content_strips:
-                h = ye - ys
-                strip_offsets.append((cum_y, h, ys))
-                for w in word_dicts:
-                    w_abs_y = w['top'] + top_y  # word y is relative to content top
-                    w_center = w_abs_y + w['height'] / 2
-                    if ys <= w_center < ye:
-                        # Remap to combined coordinates
-                        w_copy = dict(w)
-                        w_copy['top'] = cum_y + (w_abs_y - ys)
-                        combined_words.append(w_copy)
-                cum_y += h
-
-            # Run row detection on combined image
-            combined_h = combined_inv.shape[0]
-            rows = detect_row_geometry(
-                combined_inv, combined_words, left_x, right_x, 0, combined_h,
-            )
-
-            # Remap y-coordinates back to absolute page coords
-            def _combined_y_to_abs(cy: int) -> int:
-                for c_start, s_h, abs_start in strip_offsets:
-                    if cy < c_start + s_h:
-                        return abs_start + (cy - c_start)
-                last_c, last_h, last_abs = strip_offsets[-1]
-                return last_abs + last_h
-
-            for r in rows:
-                abs_y = _combined_y_to_abs(r.y)
-                abs_y_end = _combined_y_to_abs(r.y + r.height)
-                r.y = abs_y
-                r.height = abs_y_end - abs_y
-        else:
-            rows = detect_row_geometry(inv, word_dicts, left_x, right_x, top_y, bottom_y)
+    # Sub-sessions (box crops): use word-grouping instead of gap-based
+    # row detection.  Box images are small with complex internal layouts
+    # (headings, sub-columns) where the horizontal projection approach
+    # merges rows.  Word-grouping directly clusters words by Y proximity,
+    # which is more robust for these cases.
+    if is_sub_session and word_dicts:
+        from cv_layout import _build_rows_from_word_grouping
+        rows = _build_rows_from_word_grouping(
+            word_dicts, left_x, right_x, top_y, bottom_y,
+            right_x - left_x, bottom_y - top_y,
+        )
+        logger.info(f"OCR Pipeline: sub-session {session_id}: word-grouping found {len(rows)} rows")
    else:
-        # No boxes — standard row detection
-        rows = detect_row_geometry(inv, word_dicts, left_x, right_x, top_y, bottom_y)
+        zones = column_result.get("zones") or []  # zones can be None for sub-sessions
+
+        # Collect box y-ranges for filtering
+        box_ranges = []  # [(y_start, y_end)]
+        for zone in zones:
+            if zone.get("zone_type") == "box" and zone.get("box"):
+                box = zone["box"]
+                box_ranges.append((box["y"], box["y"] + box["height"]))
+
+        if box_ranges and inv is not None:
+            # Combined-image approach: strip box regions from inv image,
+            # run row detection on the combined image, then remap y-coords back.
+            content_strips = []  # [(y_start, y_end)] in absolute coords
+            # Build content strips by subtracting box ranges from [top_y, bottom_y]
+            sorted_boxes = sorted(box_ranges, key=lambda r: r[0])
+            strip_start = top_y
+            for by_start, by_end in sorted_boxes:
+                if by_start > strip_start:
+                    content_strips.append((strip_start, by_start))
+                strip_start = max(strip_start, by_end)
+            if strip_start < bottom_y:
+                content_strips.append((strip_start, bottom_y))
+
+            # Filter to strips with meaningful height
+            content_strips = [(ys, ye) for ys, ye in content_strips if ye - ys >= 20]
+
+            if content_strips:
+                # Stack content strips vertically
+                inv_strips = [inv[ys:ye, :] for ys, ye in content_strips]
+                combined_inv = np.vstack(inv_strips)
+
+                # Filter word_dicts to only include words from content strips
+                combined_words = []
+                cum_y = 0
+                strip_offsets = []  # (combined_y_start, strip_height, abs_y_start)
+                for ys, ye in content_strips:
+                    h = ye - ys
+                    strip_offsets.append((cum_y, h, ys))
+                    for w in word_dicts:
+                        w_abs_y = w['top'] + top_y  # word y is relative to content top
+                        w_center = w_abs_y + w['height'] / 2
+                        if ys <= w_center < ye:
+                            # Remap to combined coordinates
+                            w_copy = dict(w)
+                            w_copy['top'] = cum_y + (w_abs_y - ys)
+                            combined_words.append(w_copy)
+                    cum_y += h
+
+                # Run row detection on combined image
+                combined_h = combined_inv.shape[0]
+                rows = detect_row_geometry(
+                    combined_inv, combined_words, left_x, right_x, 0, combined_h,
+                )
+
+                # Remap y-coordinates back to absolute page coords
+                def _combined_y_to_abs(cy: int) -> int:
+                    for c_start, s_h, abs_start in strip_offsets:
+                        if cy < c_start + s_h:
+                            return abs_start + (cy - c_start)
+                    last_c, last_h, last_abs = strip_offsets[-1]
+                    return last_abs + last_h
+
+                for r in rows:
+                    abs_y = _combined_y_to_abs(r.y)
+                    abs_y_end = _combined_y_to_abs(r.y + r.height)
+                    r.y = abs_y
+                    r.height = abs_y_end - abs_y
+            else:
+                rows = detect_row_geometry(inv, word_dicts, left_x, right_x, top_y, bottom_y)
+        else:
+            # No boxes — standard row detection
+            rows = detect_row_geometry(inv, word_dicts, left_x, right_x, top_y, bottom_y)

    duration = time.time() - t0

    # Assign zone_index based on which content zone each row falls in
    # Build content zone list with indices
+    zones = column_result.get("zones") or []
    content_zones = [(i, z) for i, z in enumerate(zones) if z.get("zone_type") == "content"] if zones else []

    # Build serializable result (exclude words to keep payload small)
@@ -1909,7 +1925,7 @@ async def detect_words(
            row.word_count = len(row.words)

    # Exclude rows that fall within box zones
-    zones = column_result.get("zones", [])
+    zones = column_result.get("zones") or []
    box_ranges = []
    for zone in zones:
        if zone.get("zone_type") == "box" and zone.get("box"):
@@ -2676,7 +2692,7 @@ async def get_fabric_json(session_id: str):
    subs = await get_sub_sessions(session_id)
    if subs:
        column_result = session.get("column_result") or {}
-        zones = column_result.get("zones", [])
+        zones = column_result.get("zones") or []
        box_zones = [z for z in zones if z.get("zone_type") == "box" and z.get("box")]

        for sub in subs:
@@ -2733,7 +2749,7 @@ async def get_merged_vocab_entries(session_id: str):
    subs = await get_sub_sessions(session_id)
    if subs:
        column_result = session.get("column_result") or {}
-        zones = column_result.get("zones", [])
+        zones = column_result.get("zones") or []
        box_zones = [z for z in zones if z.get("zone_type") == "box" and z.get("box")]

        for sub in subs:
@@ -3289,7 +3305,7 @@ async def _get_rows_overlay(session_id: str) -> Response:

    # Draw zone separator lines if zones exist
    column_result = session.get("column_result") or {}
-    zones = column_result.get("zones", [])
+    zones = column_result.get("zones") or []
    if zones:
        img_w_px = img.shape[1]
        zone_color = (0, 200, 255)  # Yellow (BGR)
@@ -3445,7 +3461,7 @@ async def _get_words_overlay(session_id: str) -> Response:

    # Red semi-transparent overlay for box zones
    column_result = session.get("column_result") or {}
-    zones = column_result.get("zones", [])
+    zones = column_result.get("zones") or []
    _draw_box_exclusion_overlay(img, zones)

    success, result_png = cv2.imencode(".png", img)