diff --git a/klausur-service/backend/cv_layout.py b/klausur-service/backend/cv_layout.py
index 9598668..6b49a47 100644
--- a/klausur-service/backend/cv_layout.py
+++ b/klausur-service/backend/cv_layout.py
@@ -1265,96 +1265,6 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
     if len(wc_gaps) >= 2:
         validated_gaps = wc_gaps
 
-    # --- Step 5c: Left-edge alignment validation (suspicious gaps only) ---
-    # Only check gaps that would create an unusually wide column to the right.
-    # These are likely false splits within a single wide column (e.g. short EN
-    # words followed by longer DE example sentences in the same column).
-    # Gaps that produce columns of similar width to their neighbors are trusted.
-    if len(validated_gaps) > 2:
-        edge_tolerance_align = max(8, content_w // 150)
-        min_aligned_ratio = 0.15  # at least 15% of words must share a left-edge bin
-        margin_thresh = max(10, int(content_w * 0.02))
-
-        # Compute tentative column widths from all gaps
-        sorted_gaps = sorted(validated_gaps, key=lambda g: g[0])
-        # Interior gaps only (exclude margins)
-        interior_indices = []
-        for gi, (gs, ge) in enumerate(sorted_gaps):
-            if gs > margin_thresh and ge < content_w - margin_thresh:
-                interior_indices.append(gi)
-
-        if interior_indices:
-            # For each interior gap, compute the width of the column it starts
-            gap_suspicion: dict = {}  # gap_index → right_col_width
-            for gi in interior_indices:
-                gap_end = sorted_gaps[gi][1]
-                # Next gap start (or content right edge)
-                if gi + 1 < len(sorted_gaps):
-                    next_gs = sorted_gaps[gi + 1][0]
-                else:
-                    next_gs = content_w
-                right_col_w = next_gs - gap_end
-                gap_suspicion[gi] = right_col_w
-
-            # Median column width (from all gaps, including margins)
-            all_col_widths = []
-            prev_end = 0
-            for gs, ge in sorted_gaps:
-                cw = gs - prev_end
-                if cw > 0:
-                    all_col_widths.append(cw)
-                prev_end = ge
-            trailing = content_w - prev_end
-            if trailing > 0:
-                all_col_widths.append(trailing)
-            median_col_w = sorted(all_col_widths)[len(all_col_widths) // 2] if all_col_widths else content_w
-
-            # A gap is suspicious if the column to its right is > 2x median width
-            suspicious_threshold = median_col_w * 2.0
-
-            alignment_validated = list(validated_gaps)  # start with all
-            for gi in interior_indices:
-                right_col_w = gap_suspicion[gi]
-                if right_col_w <= suspicious_threshold:
-                    continue  # normal gap, keep it
-
-                # Suspicious — check left-edge alignment
-                gap_start_rel, gap_end_rel = sorted_gaps[gi]
-                next_gs = sorted_gaps[gi + 1][0] if gi + 1 < len(sorted_gaps) else content_w
-                right_words = [w for w in segment_words
-                               if gap_end_rel <= w['left'] < next_gs]
-
-                if len(right_words) < 3:
-                    continue  # too few words, keep gap
-
-                # Cluster left-edges
-                right_lefts = sorted(w['left'] for w in right_words)
-                bins = []
-                cur_bin = [right_lefts[0]]
-                for le in right_lefts[1:]:
-                    if le - cur_bin[-1] <= edge_tolerance_align:
-                        cur_bin.append(le)
-                    else:
-                        bins.append(len(cur_bin))
-                        cur_bin = [le]
-                bins.append(len(cur_bin))
-
-                max_bin = max(bins)
-                ratio = max_bin / len(right_words)
-
-                if ratio < min_aligned_ratio:
-                    # Remove this gap
-                    alignment_validated.remove((gap_start_rel, gap_end_rel))
-                    logger.info(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
-                                f"REJECTED — suspicious (right_col={right_col_w}px > 2x median={median_col_w:.0f}px) "
-                                f"and poor left-edge alignment (best_bin={max_bin}/{len(right_words)}={ratio:.2f})")
-                else:
-                    logger.debug(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
-                                 f"suspicious but passed alignment (best_bin={max_bin}/{len(right_words)}={ratio:.2f})")
-
-            if len(alignment_validated) >= 2:
-                validated_gaps = alignment_validated
-
     # --- Step 6: Fallback to clustering if too few gaps ---
     if len(validated_gaps) < 2:
         logger.info("ColumnGeometry: < 2 gaps found, falling back to clustering")
@@ -3187,13 +3097,75 @@ def detect_column_geometry_zoned(
         return (geometries, left_x, right_x, top_y, bottom_y,
                 word_dicts, inv, zone_data, boxes)
 
-    # Split into zones
+    # --- New approach: concatenate content regions (skip boxes), run column
+    # detection ONCE on the combined image, then map coordinates back. ---
+
+    # Split into zones (for metadata / overlay purposes)
     zones = split_page_into_zones(left_x, top_y, content_w, content_h, boxes)
 
-    # Run column detection per content zone
-    all_geometries: List[ColumnGeometry] = []
-    zones_data: List[Dict] = []
+    # Collect content strips (above/between/below boxes)
+    content_strips: List[Tuple[int, int]] = []  # (y_start, y_end) in absolute coords
+    for zone in zones:
+        if zone.zone_type == 'content' and zone.height >= 40:
+            content_strips.append((zone.y, zone.y + zone.height))
+    if not content_strips:
+        # Only box zones — fall back to original detection
+        logger.info("ZonedColumns: no content zones with height >= 40, using original result")
+        zone_data = [{"index": 0, "zone_type": "content", "y": top_y,
+                      "height": content_h, "x": left_x, "width": content_w, "columns": []}]
+        return (geometries, left_x, right_x, top_y, bottom_y,
+                word_dicts, inv, zone_data, boxes)
+
+    # Build combined image by vertically stacking content strips
+    ocr_strips = [ocr_img[ys:ye, :] for ys, ye in content_strips]
+    bgr_strips = [dewarped_bgr[ys:ye, :] for ys, ye in content_strips]
+    combined_ocr = np.vstack(ocr_strips)
+    combined_bgr = np.vstack(bgr_strips)
+
+    logger.info(f"ZonedColumns: {len(boxes)} box(es), concatenating {len(content_strips)} "
+                f"content strips into combined image {combined_ocr.shape[1]}x{combined_ocr.shape[0]}")
+
+    # Run column detection on the combined (box-free) image
+    combined_result = detect_column_geometry(combined_ocr, combined_bgr)
+    if combined_result is not None:
+        combined_geoms, c_lx, c_rx, c_ty, c_by, combined_words, combined_inv = combined_result
+    else:
+        # Fallback to original full-page result
+        logger.info("ZonedColumns: combined image column detection failed, using original")
+        combined_geoms = geometries
+
+    # Map combined-image y-coordinates back to absolute page coordinates.
+    # In the combined image, strip i starts at cumulative_y = sum of heights
+    # of strips 0..i-1. We need to add the offset between the strip's
+    # original y-position and its position in the combined image.
+    # Build a mapping: combined_y → absolute_y
+    strip_offsets: List[Tuple[int, int, int]] = []  # (combined_y_start, strip_height, abs_y_start)
+    cum_y = 0
+    for ys, ye in content_strips:
+        h = ye - ys
+        strip_offsets.append((cum_y, h, ys))
+        cum_y += h
+
+    def _combined_y_to_abs(cy: int) -> int:
+        """Map a y-coordinate in combined image back to absolute page coords."""
+        for c_start, s_h, abs_start in strip_offsets:
+            if cy < c_start + s_h:
+                return abs_start + (cy - c_start)
+        # Past last strip — clamp to end of last strip
+        last_c, last_h, last_abs = strip_offsets[-1]
+        return last_abs + last_h
+
+    # Adjust geometries: y and height need remapping
+    if combined_result is not None:
+        for g in combined_geoms:
+            abs_y = _combined_y_to_abs(g.y)
+            abs_y_end = _combined_y_to_abs(g.y + g.height)
+            g.y = abs_y
+            g.height = abs_y_end - abs_y
+
+    # Build zones_data for the response
+    zones_data: List[Dict] = []
 
     for zone in zones:
         zone_dict: Dict = {
             "index": zone.index,
@@ -3215,45 +3187,12 @@ def detect_column_geometry_zoned(
                 "border_thickness": zone.box.border_thickness,
             }
 
-        if zone.zone_type == 'content' and zone.height >= 40:
-            # Extract sub-image for this zone
-            zone_y_end = zone.y + zone.height
-            sub_ocr = ocr_img[zone.y:zone_y_end, :]
-            sub_bgr = dewarped_bgr[zone.y:zone_y_end, :]
-
-            sub_result = detect_column_geometry(sub_ocr, sub_bgr)
-            if sub_result is not None:
-                sub_geoms, sub_lx, sub_rx, sub_ty, sub_by, _sub_words, _sub_inv = sub_result
-
-                # Offset column y-coordinates back to absolute page coords
-                for g in sub_geoms:
-                    g.y += zone.y
-
-                zone_cols = []
-                for g in sub_geoms:
-                    zone_cols.append({
-                        "index": g.index,
-                        "x": g.x,
-                        "y": g.y,
-                        "width": g.width,
-                        "height": g.height,
-                        "word_count": g.word_count,
-                        "width_ratio": g.width_ratio,
-                        "zone_index": zone.index,
-                    })
-                zone_dict["columns"] = zone_cols
-                all_geometries.extend(sub_geoms)
-            else:
-                logger.debug(f"ZonedColumns: zone {zone.index} column detection returned None")
-
         zones_data.append(zone_dict)
 
-    # If per-zone detection produced no columns, fall back to the original
-    if not all_geometries:
-        all_geometries = geometries
+    all_geometries = combined_geoms if combined_geoms else geometries
 
     logger.info(f"ZonedColumns: {len(boxes)} box(es), {len(zones)} zone(s), "
-                f"{len(all_geometries)} total columns")
+                f"{len(all_geometries)} total columns (combined-image approach)")
 
     return (all_geometries, left_x, right_x, top_y, bottom_y,
             word_dicts, inv, zones_data, boxes)