feat(ocr-pipeline): move oversized row splitting from Step 5 to Step 4

Implement _split_oversized_rows() in detect_row_geometry() (Step 7) to
split content rows >1.5× median height using local horizontal projection.
This produces correctly-sized rows before word OCR runs, instead of
working around the issue in Step 5 with sub-cell splitting hacks.

Removed Step 5 workarounds: _split_oversized_entries(), sub-cell
splitting in build_word_grid(), and median_row_h calculation.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-01 11:46:18 +01:00
parent 8507e2e035
commit ba65e47654

View File

@@ -1539,6 +1539,11 @@ def detect_row_geometry(
gap_before=gap_before,
))
# --- Step 7: Split oversized rows ---
# If a content row is >1.5× the median height, re-analyze it with a local
# horizontal projection to find missed row boundaries within.
rows = _split_oversized_rows(rows, inv, left_x, right_x, top_y, word_dicts)
type_counts = {}
for r in rows:
type_counts[r.row_type] = type_counts.get(r.row_type, 0) + 1
@@ -1547,6 +1552,176 @@ def detect_row_geometry(
return rows
def _split_oversized_rows(
    rows: List['RowGeometry'],
    inv: np.ndarray,
    left_x: int, right_x: int,
    top_y: int,
    word_dicts: List[Dict],
) -> List['RowGeometry']:
    """Split content rows that are >1.5× the median height.

    Re-analyses oversized rows with a local horizontal projection profile
    to find missed row boundaries within. This catches cases where Step 4's
    initial gap analysis merged multiple vocabulary lines (e.g. because an
    image or dense text obscured the gap).

    Args:
        rows: Detected rows (content and non-content) from Step 4.
        inv: Inverted binary page image (ink is non-zero) indexed [y, x].
        left_x, right_x: Horizontal extent of the content area in `inv`.
        top_y: Vertical offset added to `word['top']` to get absolute y.
        word_dicts: OCR word boxes with 'top', 'left', 'width', 'height'.

    Returns:
        A new list with oversized rows replaced by sub-rows, re-indexed
        by vertical position when any split occurred; otherwise the
        original `rows` list is returned unchanged.
    """
    # Median height is computed over content rows only; with fewer than 3
    # content rows the median is too unstable to trust.
    content_rows = [r for r in rows if r.row_type == 'content']
    if len(content_rows) < 3:
        return rows
    heights = sorted(r.height for r in content_rows)
    median_h = heights[len(heights) // 2]
    # Guard against degenerate geometry (tiny rows → threshold meaningless).
    if median_h <= 10:
        return rows
    threshold = median_h * 1.5
    content_w = right_x - left_x
    result: List[RowGeometry] = []
    split_total = 0
    for row in rows:
        # Only oversized *content* rows are candidates; everything else
        # passes through untouched.
        if row.row_type != 'content' or row.height <= threshold:
            result.append(row)
            continue
        # --- Local horizontal projection on this row's strip ---
        row_y_abs = row.y
        row_h = row.height
        strip = inv[row_y_abs:row_y_abs + row_h, left_x:right_x]
        if strip.size == 0:
            result.append(row)
            continue
        # Word-coverage mask (same approach as main detection): only pixels
        # inside padded OCR word boxes contribute to the projection, so
        # images/rules crossing the row don't hide the inter-line gaps.
        pad_y = max(2, row_h // 50)
        word_mask = np.zeros_like(strip)
        # Words whose (absolute) top falls within this row strip, ± padding.
        row_words = [w for w in word_dicts
                     if w['top'] + top_y >= row_y_abs - pad_y
                     and w['top'] + top_y < row_y_abs + row_h + pad_y]
        for wd in row_words:
            wy = wd['top'] + top_y - row_y_abs  # relative to strip
            y1 = max(0, wy - pad_y)
            y2 = min(row_h, wy + wd['height'] + pad_y)
            # NOTE(review): 'left' is used as strip-relative here while 'top'
            # needed a +top_y offset — assumes word x-coords are already
            # relative to left_x; TODO confirm against the main detection.
            x1 = max(0, wd['left'])
            x2 = min(content_w, wd['left'] + wd['width'])
            word_mask[y1:y2, x1:x2] = 255
        masked = cv2.bitwise_and(strip, word_mask)
        # Ink density per scanline, normalised to [0, 1] by full-white width.
        h_proj = np.sum(masked, axis=1).astype(float)
        h_proj_norm = h_proj / (content_w * 255) if content_w > 0 else h_proj
        # Smooth with a small odd-width box filter to suppress single-line noise.
        k = max(3, row_h // 40)
        if k % 2 == 0:
            k += 1
        h_smooth = np.convolve(h_proj_norm, np.ones(k) / k, mode='same')
        # Gap detection within the row: scanlines well below the median ink
        # density of non-empty lines count as gap candidates.
        med_density = float(np.median(h_smooth[h_smooth > 0])) if np.any(h_smooth > 0) else 0.01
        gap_thresh = max(med_density * 0.15, 0.003)
        in_gap = h_smooth < gap_thresh
        min_gap_h = max(2, row_h // 30)  # smaller threshold for sub-gaps
        # Collect maximal runs of gap scanlines at least min_gap_h tall,
        # as (start, end) index pairs relative to the strip.
        local_gaps = []
        gap_start = None
        for y in range(len(in_gap)):
            if in_gap[y]:
                if gap_start is None:
                    gap_start = y
            else:
                if gap_start is not None:
                    if y - gap_start >= min_gap_h:
                        local_gaps.append((gap_start, y))
                    gap_start = None
        # A gap run may extend to the bottom edge of the strip.
        if gap_start is not None and len(in_gap) - gap_start >= min_gap_h:
            local_gaps.append((gap_start, len(in_gap)))
        if not local_gaps:
            # No sub-gaps found — keep original row
            result.append(row)
            continue
        # Validate gaps against words (don't split through a word)
        valid_gaps = []
        for gs, ge in local_gaps:
            overlapping = False
            for wd in row_words:
                wy = wd['top'] + top_y - row_y_abs
                wy2 = wy + wd['height']
                # Standard interval-overlap test between word and gap.
                if wy < ge and wy2 > gs:
                    overlapping = True
                    break
            if not overlapping:
                valid_gaps.append((gs, ge))
        if not valid_gaps:
            result.append(row)
            continue
        valid_gaps.sort()
        # Build sub-row boundaries from gaps
        sub_bounds = []  # (start_rel, end_rel) within the row strip
        # Before first gap
        if valid_gaps[0][0] > 0:
            sub_bounds.append((0, valid_gaps[0][0]))
        # Between gaps
        for i in range(len(valid_gaps) - 1):
            sub_bounds.append((valid_gaps[i][1], valid_gaps[i + 1][0]))
        # After last gap
        if valid_gaps[-1][1] < row_h:
            sub_bounds.append((valid_gaps[-1][1], row_h))
        # Filter out empty sub-rows (≤2 px tall are projection artifacts).
        sub_bounds = [(s, e) for s, e in sub_bounds if e - s > 2]
        if len(sub_bounds) <= 1:
            result.append(row)
            continue
        # Create sub-rows
        for sb_start, sb_end in sub_bounds:
            sub_y_abs = row_y_abs + sb_start
            sub_h = sb_end - sb_start
            # Assign words to this sub-row: the whole word box must fit
            # within the sub-row, with a 2 px tolerance on both edges.
            sub_words = [w for w in row_words
                         if w['top'] + top_y >= sub_y_abs - 2
                         and w['top'] + top_y + w['height'] <= sub_y_abs + sub_h + 2]
            result.append(RowGeometry(
                index=0,  # re-indexed below
                x=row.x,
                y=sub_y_abs,
                width=row.width,
                height=sub_h,
                word_count=len(sub_words),
                words=sub_words,
                row_type='content',
                gap_before=0,
            ))
        split_total += len(sub_bounds) - 1
        logger.info(f"RowGeometry: split oversized row (h={row_h}) "
                    f"into {len(sub_bounds)} sub-rows "
                    f"(median_h={median_h}, {len(valid_gaps)} gaps)")
    if split_total > 0:
        # Re-index all rows by vertical position so indices stay monotonic
        # after sub-rows were appended out of order.
        result.sort(key=lambda r: r.y)
        for i, r in enumerate(result):
            r.index = i
        logger.info(f"RowGeometry: {split_total} oversized splits → "
                    f"{len(result)} total rows (was {len(rows)})")
    return result
def _build_rows_from_word_grouping(
word_dicts: List[Dict],
left_x: int, right_x: int,
@@ -2708,119 +2883,6 @@ def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str
return _PHONETIC_BRACKET_RE.sub(replacer, text)
def _split_oversized_entries(
    entries: List[Dict[str, Any]],
    content_rows: List[RowGeometry],
    img_w: int,
    img_h: int,
) -> List[Dict[str, Any]]:
    """Split entries that came from oversized rows into one entry per line.

    A row taller than 1.5× the median content-row height most likely holds
    several vocabulary entries that Step 4 failed to separate. When the EN
    and DE texts break into a matching number of newline-separated lines
    (or only one side is multi-line), each line becomes its own entry.

    This is a deterministic plausibility check — no LLM needed.
    """
    if len(entries) < 3:
        return entries

    # Median pixel height of the detected content rows.
    ordered_heights = sorted(r.height for r in content_rows)
    median_h = ordered_heights[len(ordered_heights) // 2]
    if median_h <= 0:
        return entries
    height_threshold = median_h * 1.5

    def nonempty_lines(text):
        # Newline-split with blank lines dropped; never an empty list.
        parts = text.split('\n') if text else ['']
        return [p for p in parts if p.strip()] or ['']

    out: List[Dict[str, Any]] = []
    split_count = 0
    for entry in entries:
        # bbox heights are stored as percent of the page — convert to pixels.
        entry_h_px = entry['bbox']['h'] / 100.0 * img_h
        if entry_h_px <= height_threshold:
            out.append(entry)
            continue

        # Oversized — see whether the texts suggest a clean line-wise split.
        en_lines = nonempty_lines(entry['english'])
        de_lines = nonempty_lines(entry['german'])
        ex_lines = nonempty_lines(entry['example'])
        n_en, n_de = len(en_lines), len(de_lines)

        # Split count: EN and DE must agree, or only one side is multi-line
        # (the single-line side then goes to the first sub-entry).
        if n_en > 1 and (n_de == n_en or n_de <= 1):
            n_split = n_en
        elif n_de > 1 and n_en <= 1:
            n_split = n_de
        else:
            n_split = 1
        if n_split <= 1:
            out.append(entry)
            continue

        # Divide the bbox evenly among the sub-entries.
        base_y = entry['bbox']['y']
        slice_h = entry['bbox']['h'] / n_split
        for k in range(n_split):
            out.append({
                'row_index': entry['row_index'],
                'english': en_lines[k] if k < len(en_lines) else '',
                'german': de_lines[k] if k < len(de_lines) else '',
                'example': ex_lines[k] if k < len(ex_lines) else '',
                'confidence': entry['confidence'],
                'bbox': {
                    'x': entry['bbox']['x'],
                    'y': round(base_y + k * slice_h, 2),
                    'w': entry['bbox']['w'],
                    'h': round(slice_h, 2),
                },
                'bbox_en': entry['bbox_en'],
                'bbox_de': entry['bbox_de'],
                'bbox_ex': entry['bbox_ex'],
                'ocr_engine': entry.get('ocr_engine', ''),
                'split_from_row': entry['row_index'],
            })
        split_count += 1
        logger.info(f"split_oversized: row {entry['row_index']} "
                    f"(h={entry_h_px:.0f}px > {height_threshold:.0f}px) "
                    f"{n_split} sub-entries")

    if split_count > 0:
        # Re-number row indices so downstream steps see a contiguous sequence.
        for i, e in enumerate(out):
            e['row_index'] = i
        logger.info(f"split_oversized: {split_count} rows split, "
                    f"{len(entries)}{len(out)} entries")
    return out
def build_word_grid(
ocr_img: np.ndarray,
column_regions: List[PageRegion],
@@ -2885,10 +2947,6 @@ def build_word_grid(
entries: List[Dict[str, Any]] = []
# Calculate median row height for oversized detection
row_heights = sorted(r.height for r in content_rows)
median_row_h = row_heights[len(row_heights) // 2] if row_heights else 100
for row_idx, row in enumerate(content_rows):
entry: Dict[str, Any] = {
'row_index': row_idx,
@@ -2930,40 +2988,17 @@ def build_word_grid(
if cell_w <= 0 or cell_h <= 0:
continue
# For oversized cells (>1.5× median), split vertically into sub-cells
# and OCR each separately. This prevents OCR from missing text at
# the bottom of tall cells (RapidOCR downscales tall narrow crops).
is_oversized = row.height > median_row_h * 1.5 and median_row_h > 20
if is_oversized:
n_splits = max(2, round(row.height / median_row_h))
sub_h = cell_h / n_splits
words = []
for s in range(n_splits):
sub_y = int(cell_y + s * sub_h)
sub_height = int(sub_h) if s < n_splits - 1 else (cell_y + cell_h - sub_y)
sub_region = PageRegion(
type=col.type,
x=cell_x, y=sub_y,
width=cell_w, height=max(1, sub_height),
)
if use_rapid:
sub_words = ocr_region_rapid(img_bgr, sub_region)
else:
cell_lang = lang_map.get(col.type, lang)
sub_words = ocr_region(ocr_img, sub_region, lang=cell_lang, psm=6)
words.extend(sub_words)
cell_region = PageRegion(
type=col.type,
x=cell_x, y=cell_y,
width=cell_w, height=cell_h,
)
# OCR the cell
if use_rapid:
words = ocr_region_rapid(img_bgr, cell_region)
else:
cell_region = PageRegion(
type=col.type,
x=cell_x, y=cell_y,
width=cell_w, height=cell_h,
)
# OCR the cell
if use_rapid:
words = ocr_region_rapid(img_bgr, cell_region)
else:
cell_lang = lang_map.get(col.type, lang)
words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6)
cell_lang = lang_map.get(col.type, lang)
words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6)
# Group into lines, then join in reading order (Fix A)
# Use half of average word height as Y-tolerance
@@ -3007,16 +3042,13 @@ def build_word_grid(
# --- Post-processing pipeline (deterministic, no LLM) ---
n_raw = len(entries)
# 1. Split oversized rows (missed Step 4 boundaries)
entries = _split_oversized_entries(entries, content_rows, img_w, img_h)
# 2. Fix character confusion (I/1/l based on context)
# 1. Fix character confusion (I/1/l based on context)
entries = _fix_character_confusion(entries)
# 3. Replace OCR'd phonetics with dictionary IPA
# 2. Replace OCR'd phonetics with dictionary IPA
entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
# 4. Split comma-separated word forms (break, broke, broken → 3 entries)
# 3. Split comma-separated word forms (break, broke, broken → 3 entries)
entries = _split_comma_entries(entries)
# 5. Attach example sentences (rows without DE → examples for preceding entry)