diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py index b898d0d..342fbaa 100644 --- a/klausur-service/backend/ocr_pipeline_api.py +++ b/klausur-service/backend/ocr_pipeline_api.py @@ -419,7 +419,7 @@ async def create_box_sessions(session_id: str): if not column_result: raise HTTPException(status_code=400, detail="Column detection must be completed first") - zones = column_result.get("zones", []) + zones = column_result.get("zones") or [] box_zones = [z for z in zones if z.get("zone_type") == "box" and z.get("box")] if not box_zones: return {"session_id": session_id, "sub_sessions": [], "message": "No boxes detected"} @@ -1532,7 +1532,7 @@ async def _get_columns_overlay(session_id: str) -> Response: cv2.addWeighted(overlay, 0.2, img, 0.8, 0, img) # Draw detected box boundaries as dashed rectangles - zones = column_result.get("zones", []) + zones = column_result.get("zones") or [] for zone in zones: if zone.get("zone_type") == "box" and zone.get("box"): box = zone["box"] @@ -1600,83 +1600,99 @@ async def detect_rows(session_id: str): # Read zones from column_result to exclude box regions session = await get_session_db(session_id) column_result = (session or {}).get("column_result") or {} - zones = column_result.get("zones") or [] # zones can be None for sub-sessions + is_sub_session = bool((session or {}).get("parent_session_id")) - # Collect box y-ranges for filtering - box_ranges = [] # [(y_start, y_end)] - for zone in zones: - if zone.get("zone_type") == "box" and zone.get("box"): - box = zone["box"] - box_ranges.append((box["y"], box["y"] + box["height"])) - - if box_ranges and inv is not None: - # Combined-image approach: strip box regions from inv image, - # run row detection on the combined image, then remap y-coords back. - content_strips = [] # [(y_start, y_end)] in absolute coords - # Build content strips by subtracting box ranges from [top_y, bottom_y] - sorted_boxes = sorted(box_ranges, key=lambda r: r[0]) - strip_start = top_y - for by_start, by_end in sorted_boxes: - if by_start > strip_start: - content_strips.append((strip_start, by_start)) - strip_start = max(strip_start, by_end) - if strip_start < bottom_y: - content_strips.append((strip_start, bottom_y)) - - # Filter to strips with meaningful height - content_strips = [(ys, ye) for ys, ye in content_strips if ye - ys >= 20] - - if content_strips: - # Stack content strips vertically - inv_strips = [inv[ys:ye, :] for ys, ye in content_strips] - combined_inv = np.vstack(inv_strips) - - # Filter word_dicts to only include words from content strips - combined_words = [] - cum_y = 0 - strip_offsets = [] # (combined_y_start, strip_height, abs_y_start) - for ys, ye in content_strips: - h = ye - ys - strip_offsets.append((cum_y, h, ys)) - for w in word_dicts: - w_abs_y = w['top'] + top_y # word y is relative to content top - w_center = w_abs_y + w['height'] / 2 - if ys <= w_center < ye: - # Remap to combined coordinates - w_copy = dict(w) - w_copy['top'] = cum_y + (w_abs_y - ys) - combined_words.append(w_copy) - cum_y += h - - # Run row detection on combined image - combined_h = combined_inv.shape[0] - rows = detect_row_geometry( - combined_inv, combined_words, left_x, right_x, 0, combined_h, - ) - - # Remap y-coordinates back to absolute page coords - def _combined_y_to_abs(cy: int) -> int: - for c_start, s_h, abs_start in strip_offsets: - if cy < c_start + s_h: - return abs_start + (cy - c_start) - last_c, last_h, last_abs = strip_offsets[-1] - return last_abs + last_h - - for r in rows: - abs_y = _combined_y_to_abs(r.y) - abs_y_end = _combined_y_to_abs(r.y + r.height) - r.y = abs_y - r.height = abs_y_end - abs_y - else: - rows = detect_row_geometry(inv, word_dicts, left_x, right_x, top_y, bottom_y) + # Sub-sessions (box crops): use word-grouping instead of gap-based + # row detection. Box images are small with complex internal layouts + # (headings, sub-columns) where the horizontal projection approach + # merges rows. Word-grouping directly clusters words by Y proximity, + # which is more robust for these cases. + if is_sub_session and word_dicts: + from cv_layout import _build_rows_from_word_grouping + rows = _build_rows_from_word_grouping( + word_dicts, left_x, right_x, top_y, bottom_y, + right_x - left_x, bottom_y - top_y, + ) + logger.info(f"OCR Pipeline: sub-session {session_id}: word-grouping found {len(rows)} rows") else: - # No boxes — standard row detection - rows = detect_row_geometry(inv, word_dicts, left_x, right_x, top_y, bottom_y) + zones = column_result.get("zones") or [] # zones can be None for sub-sessions + + # Collect box y-ranges for filtering + box_ranges = [] # [(y_start, y_end)] + for zone in zones: + if zone.get("zone_type") == "box" and zone.get("box"): + box = zone["box"] + box_ranges.append((box["y"], box["y"] + box["height"])) + + if box_ranges and inv is not None: + # Combined-image approach: strip box regions from inv image, + # run row detection on the combined image, then remap y-coords back. + content_strips = [] # [(y_start, y_end)] in absolute coords + # Build content strips by subtracting box ranges from [top_y, bottom_y] + sorted_boxes = sorted(box_ranges, key=lambda r: r[0]) + strip_start = top_y + for by_start, by_end in sorted_boxes: + if by_start > strip_start: + content_strips.append((strip_start, by_start)) + strip_start = max(strip_start, by_end) + if strip_start < bottom_y: + content_strips.append((strip_start, bottom_y)) + + # Filter to strips with meaningful height + content_strips = [(ys, ye) for ys, ye in content_strips if ye - ys >= 20] + + if content_strips: + # Stack content strips vertically + inv_strips = [inv[ys:ye, :] for ys, ye in content_strips] + combined_inv = np.vstack(inv_strips) + + # Filter word_dicts to only include words from content strips + combined_words = [] + cum_y = 0 + strip_offsets = [] # (combined_y_start, strip_height, abs_y_start) + for ys, ye in content_strips: + h = ye - ys + strip_offsets.append((cum_y, h, ys)) + for w in word_dicts: + w_abs_y = w['top'] + top_y # word y is relative to content top + w_center = w_abs_y + w['height'] / 2 + if ys <= w_center < ye: + # Remap to combined coordinates + w_copy = dict(w) + w_copy['top'] = cum_y + (w_abs_y - ys) + combined_words.append(w_copy) + cum_y += h + + # Run row detection on combined image + combined_h = combined_inv.shape[0] + rows = detect_row_geometry( + combined_inv, combined_words, left_x, right_x, 0, combined_h, + ) + + # Remap y-coordinates back to absolute page coords + def _combined_y_to_abs(cy: int) -> int: + for c_start, s_h, abs_start in strip_offsets: + if cy < c_start + s_h: + return abs_start + (cy - c_start) + last_c, last_h, last_abs = strip_offsets[-1] + return last_abs + last_h + + for r in rows: + abs_y = _combined_y_to_abs(r.y) + abs_y_end = _combined_y_to_abs(r.y + r.height) + r.y = abs_y + r.height = abs_y_end - abs_y + else: + rows = detect_row_geometry(inv, word_dicts, left_x, right_x, top_y, bottom_y) + else: + # No boxes — standard row detection + rows = detect_row_geometry(inv, word_dicts, left_x, right_x, top_y, bottom_y) duration = time.time() - t0 # Assign zone_index based on which content zone each row falls in # Build content zone list with indices + zones = column_result.get("zones") or [] content_zones = [(i, z) for i, z in enumerate(zones) if z.get("zone_type") == "content"] if zones else [] # Build serializable result (exclude words to keep payload small) @@ -1909,7 +1925,7 @@ async def detect_words( row.word_count = len(row.words) # Exclude rows that fall within box zones - zones = column_result.get("zones", []) + zones = column_result.get("zones") or [] box_ranges = [] for zone in zones: if zone.get("zone_type") == "box" and zone.get("box"): @@ -2676,7 +2692,7 @@ async def get_fabric_json(session_id: str): subs = await get_sub_sessions(session_id) if subs: column_result = session.get("column_result") or {} - zones = column_result.get("zones", []) + zones = column_result.get("zones") or [] box_zones = [z for z in zones if z.get("zone_type") == "box" and z.get("box")] for sub in subs: @@ -2733,7 +2749,7 @@ async def get_merged_vocab_entries(session_id: str): subs = await get_sub_sessions(session_id) if subs: column_result = session.get("column_result") or {} - zones = column_result.get("zones", []) + zones = column_result.get("zones") or [] box_zones = [z for z in zones if z.get("zone_type") == "box" and z.get("box")] for sub in subs: @@ -3289,7 +3305,7 @@ async def _get_rows_overlay(session_id: str) -> Response: # Draw zone separator lines if zones exist column_result = session.get("column_result") or {} - zones = column_result.get("zones", []) + zones = column_result.get("zones") or [] if zones: img_w_px = img.shape[1] zone_color = (0, 200, 255) # Yellow (BGR) @@ -3445,7 +3461,7 @@ async def _get_words_overlay(session_id: str) -> Response: # Red semi-transparent overlay for box zones column_result = session.get("column_result") or {} - zones = column_result.get("zones", []) + zones = column_result.get("zones") or [] _draw_box_exclusion_overlay(img, zones) success, result_png = cv2.imencode(".png", img)