feat(ocr-pipeline): move oversized row splitting from Step 5 to Step 4

Implement _split_oversized_rows() and call it as sub-step 7 of detect_row_geometry() (pipeline Step 4) to split content rows taller than 1.5× the median row height using a local horizontal projection. This produces correctly-sized rows before word OCR runs, instead of working around the issue in Step 5 with sub-cell splitting hacks.

Removed Step 5 workarounds: _split_oversized_entries(), sub-cell splitting in build_word_grid(), and the median_row_h calculation.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-01 11:46:18 +01:00
parent 8507e2e035
commit ba65e47654
1 changed files with 188 additions and 156 deletions
@@ -1539,6 +1539,11 @@ def detect_row_geometry(
            gap_before=gap_before,
        ))
    # --- Step 7: Split oversized rows ---
    # If a content row is >1.5× the median height, re-analyze it with a local
    # horizontal projection to find missed row boundaries within.
    rows = _split_oversized_rows(rows, inv, left_x, right_x, top_y, word_dicts)
    type_counts = {}
    for r in rows:
        type_counts[r.row_type] = type_counts.get(r.row_type, 0) + 1
@@ -1547,6 +1552,176 @@ def detect_row_geometry(
    return rows
 def _split_oversized_rows(
    rows: List['RowGeometry'],
    inv: np.ndarray,
    left_x: int, right_x: int,
    top_y: int,
    word_dicts: List[Dict],
 ) -> List['RowGeometry']:
    """Split content rows that are >1.5× the median content-row height.

    Each oversized content row is re-analysed with a local horizontal
    projection profile (restricted to pixels covered by OCR word boxes) to
    find row boundaries that the initial gap analysis missed.

    Args:
        rows: Detected rows; only rows with ``row_type == 'content'`` are
            measured and considered for splitting.
        inv: 2-D page image indexed as ``inv[y, x]``.  The projection is
            normalised by 255, so this presumably is the inverted binarised
            page with ink == 255 -- TODO confirm against the caller.
        left_x: Left column bound of the content area inside ``inv``.
        right_x: Right column bound (exclusive) of the content area.
        top_y: Offset added to a word's ``'top'`` to obtain its row in ``inv``.
        word_dicts: Word boxes with ``'top'``, ``'left'``, ``'width'`` and
            ``'height'`` keys.  ``'left'`` is used without an offset, i.e. it
            is treated as relative to ``left_x``.

    Returns:
        ``rows`` itself when there are fewer than three content rows or the
        median height is <= 10.  Otherwise a new list in which every row that
        could be split is replaced by its sub-rows; if at least one split
        happened the list is sorted by ``y`` and every row's ``index`` is
        rewritten in place.
    """
    content_heights = sorted(r.height for r in rows if r.row_type == 'content')
    if len(content_heights) < 3:
        return rows
    median_h = content_heights[len(content_heights) // 2]
    if median_h <= 10:
        return rows

    threshold = median_h * 1.5
    content_w = right_x - left_x
    result: List['RowGeometry'] = []
    split_total = 0

    for row in rows:
        if row.row_type != 'content' or row.height <= threshold:
            result.append(row)
            continue

        # --- Local horizontal projection on this row's strip ---
        row_y = row.y
        row_h = row.height
        strip = inv[row_y:row_y + row_h, left_x:right_x]
        if strip.size == 0:
            result.append(row)
            continue

        # Words whose top edge lies inside the (slightly padded) row.
        pad_y = max(2, row_h // 50)
        row_words = [
            w for w in word_dicts
            if row_y - pad_y <= w['top'] + top_y < row_y + row_h + pad_y
        ]
        # (top, bottom) of every row word, relative to the strip.
        word_spans = []
        for wd in row_words:
            rel_top = wd['top'] + top_y - row_y
            word_spans.append((rel_top, rel_top + wd['height']))

        # Word-coverage mask: only pixels under a (vertically padded) word box
        # contribute to the projection.  Both slice bounds are clamped so a
        # box lying outside the strip yields an empty slice instead of
        # wrapping around through a negative index.
        covered = np.zeros(strip.shape, dtype=bool)
        for wd, (rel_top, rel_bottom) in zip(row_words, word_spans):
            y1 = max(0, rel_top - pad_y)
            y2 = max(y1, min(row_h, rel_bottom + pad_y))
            x1 = max(0, wd['left'])
            x2 = max(x1, min(content_w, wd['left'] + wd['width']))
            covered[y1:y2, x1:x2] = True
        h_proj = np.where(covered, strip, 0).sum(axis=1).astype(float)
        h_proj_norm = h_proj / (content_w * 255) if content_w > 0 else h_proj

        # Smooth with an odd-sized box filter.
        k = max(3, row_h // 40)
        if k % 2 == 0:
            k += 1
        h_smooth = np.convolve(h_proj_norm, np.ones(k) / k, mode='same')

        # Gap detection within the row: runs of low density that are at least
        # min_gap_h lines tall (a smaller minimum than a full row gap).
        positive = h_smooth[h_smooth > 0]
        med_density = float(np.median(positive)) if positive.size else 0.01
        gap_thresh = max(med_density * 0.15, 0.003)
        min_gap_h = max(2, row_h // 30)
        flags = np.concatenate(
            ([False], h_smooth < gap_thresh, [False])
        ).astype(np.int8)
        edges = np.diff(flags)  # +1 where a gap run starts, -1 just past its end
        run_starts = np.flatnonzero(edges == 1)
        run_ends = np.flatnonzero(edges == -1)
        local_gaps = [
            (int(s), int(e))
            for s, e in zip(run_starts, run_ends)
            if e - s >= min_gap_h
        ]

        # Validate gaps against words (don't split through a word).
        valid_gaps = [
            (gs, ge) for gs, ge in local_gaps
            if not any(wt < ge and wb > gs for wt, wb in word_spans)
        ]
        if not valid_gaps:
            # No usable sub-gaps found -- keep original row.
            result.append(row)
            continue

        # Sub-row spans are what remains of the strip once the gaps are
        # removed; slivers of 2 px or less are discarded.
        segments = []
        cursor = 0
        for gs, ge in valid_gaps:
            if gs - cursor > 2:
                segments.append((cursor, gs))
            cursor = ge
        if row_h - cursor > 2:
            segments.append((cursor, row_h))
        if len(segments) <= 1:
            result.append(row)
            continue

        # Build sub-rows with the parent row's own class (RowGeometry for
        # rows produced by detect_row_geometry).
        row_cls = type(row)
        prev_end = None
        for seg_start, seg_end in segments:
            # A word belongs to the sub-row containing its vertical centre.
            # Valid gaps never overlap a word, so a word is never shared
            # between sub-rows, and words overhanging the parent row's edge
            # are still kept.
            members = [
                wd for wd, (wt, wb) in zip(row_words, word_spans)
                if seg_start <= (wt + wb) / 2 < seg_end
            ]
            # The first sub-row inherits the parent's gap (plus any leading
            # gap inside the strip); later sub-rows get the distance to the
            # previous sub-row.
            # NOTE(review): assumes gap_before is a pixel distance like the
            # heights used here -- confirm against detect_row_geometry.
            if prev_end is None:
                gap_before = (row.gap_before or 0) + seg_start
            else:
                gap_before = seg_start - prev_end
            result.append(row_cls(
                index=0,  # re-indexed below
                x=row.x,
                y=row_y + seg_start,
                width=row.width,
                height=seg_end - seg_start,
                word_count=len(members),
                words=members,
                row_type='content',
                gap_before=gap_before,
            ))
            prev_end = seg_end

        split_total += len(segments) - 1
        logger.info(
            "RowGeometry: split oversized row (h=%s) "
            "into %s sub-rows "
            "(median_h=%s, %s gaps)",
            row_h, len(segments), median_h, len(valid_gaps),
        )

    if split_total > 0:
        # Re-index all rows in top-to-bottom order.
        result.sort(key=lambda r: r.y)
        for i, r in enumerate(result):
            r.index = i
        logger.info(
            "RowGeometry: %s oversized splits → "
            "%s total rows (was %s)",
            split_total, len(result), len(rows),
        )
    return result
 def _build_rows_from_word_grouping(
    word_dicts: List[Dict],
    left_x: int, right_x: int,
@@ -2708,119 +2883,6 @@ def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str
    return _PHONETIC_BRACKET_RE.sub(replacer, text)
 def _split_oversized_entries(
    entries: List[Dict[str, Any]],
    content_rows: List['RowGeometry'],
    img_w: int,
    img_h: int,
 ) -> List[Dict[str, Any]]:
    """Split entries from oversized rows into multiple entries.

    If an entry's row is >1.5× the median content-row height it likely holds
    several vocabulary entries that row detection failed to separate.  Such
    an entry is split by line count: when EN and DE have the same number of
    non-empty lines (or only one of them has several), each line becomes its
    own entry.

    This is a deterministic plausibility check -- no LLM needed.

    Args:
        entries: Entry dicts with ``'row_index'``, ``'english'``,
            ``'german'``, ``'example'``, ``'confidence'``, ``'bbox'`` (with
            ``'x'``/``'y'``/``'w'``/``'h'`` in percent) and ``'bbox_en'``/
            ``'bbox_de'``/``'bbox_ex'``; ``'ocr_engine'`` is optional.
        content_rows: Rows whose pixel ``height`` defines the median.
        img_w: Image width in pixels (accepted for interface symmetry; not
            used by the computation).
        img_h: Image height in pixels, used to convert ``bbox['h']`` from
            percent to pixels.

    Returns:
        ``entries`` itself when there are fewer than three entries, no
        content rows, or a non-positive median height.  Otherwise a new list;
        if anything was split, every entry's ``'row_index'`` is renumbered
        sequentially.
    """
    # An empty content_rows would make the median lookup below raise.
    if len(entries) < 3 or not content_rows:
        return entries

    # Median row height from pixel heights.
    heights = sorted(r.height for r in content_rows)
    median_h = heights[len(heights) // 2]
    if median_h <= 0:
        return entries
    height_threshold = median_h * 1.5

    def _lines(text):
        """Return the non-blank lines of *text*, or [''] when there are none."""
        kept = [ln for ln in (text or '').split('\n') if ln.strip()]
        return kept or ['']

    result: List[Dict[str, Any]] = []
    split_count = 0
    for entry in entries:
        # Pixel height from bbox percent.
        entry_h_px = entry['bbox']['h'] / 100.0 * img_h
        if entry_h_px <= height_threshold:
            result.append(entry)
            continue

        # This row is oversized -- check whether it can be split.
        en_lines = _lines(entry['english'])
        de_lines = _lines(entry['german'])
        ex_lines = _lines(entry['example'])
        n_en = len(en_lines)
        n_de = len(de_lines)

        # EN and DE must agree on the line count, unless only one of them
        # has several lines (then the other goes to the first sub-entry).
        if n_en > 1 and n_de > 1:
            n_split = n_en if n_en == n_de else 1
        else:
            n_split = max(n_en, n_de)
        if n_split <= 1:
            result.append(entry)
            continue

        # Surplus example lines are folded into the last sub-entry so that
        # no recognised text is lost.
        if len(ex_lines) > n_split:
            ex_lines = (ex_lines[:n_split - 1]
                        + ['\n'.join(ex_lines[n_split - 1:])])

        # Split into n_split sub-entries of equal height.
        orig_y = entry['bbox']['y']
        sub_h = entry['bbox']['h'] / n_split
        for k in range(n_split):
            result.append({
                'row_index': entry['row_index'],
                'english': en_lines[k] if k < n_en else '',
                'german': de_lines[k] if k < n_de else '',
                'example': ex_lines[k] if k < len(ex_lines) else '',
                'confidence': entry['confidence'],
                'bbox': {
                    'x': entry['bbox']['x'],
                    'y': round(orig_y + k * sub_h, 2),
                    'w': entry['bbox']['w'],
                    'h': round(sub_h, 2),
                },
                # Per-column boxes are passed through unsplit and shared by
                # all sub-entries.
                'bbox_en': entry['bbox_en'],
                'bbox_de': entry['bbox_de'],
                'bbox_ex': entry['bbox_ex'],
                'ocr_engine': entry.get('ocr_engine', ''),
                'split_from_row': entry['row_index'],
            })
        split_count += 1
        logger.info(
            "split_oversized: row %s "
            "(h=%.0fpx > %.0fpx) "
            "→ %s sub-entries",
            entry['row_index'], entry_h_px, height_threshold, n_split,
        )

    if split_count > 0:
        # Re-number row indices.
        for i, e in enumerate(result):
            e['row_index'] = i
        logger.info(
            "split_oversized: %s rows split, "
            "%s → %s entries",
            split_count, len(entries), len(result),
        )
    return result
 def build_word_grid(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
@@ -2885,10 +2947,6 @@ def build_word_grid(
    entries: List[Dict[str, Any]] = []
    # Calculate median row height for oversized detection
    row_heights = sorted(r.height for r in content_rows)
    median_row_h = row_heights[len(row_heights) // 2] if row_heights else 100
    for row_idx, row in enumerate(content_rows):
        entry: Dict[str, Any] = {
            'row_index': row_idx,
@@ -2930,40 +2988,17 @@ def build_word_grid(
            if cell_w <= 0 or cell_h <= 0:
                continue
-            # For oversized cells (>1.5× median), split vertically into sub-cells
+            cell_region = PageRegion(
-            # and OCR each separately. This prevents OCR from missing text at
+                type=col.type,
-            # the bottom of tall cells (RapidOCR downscales tall narrow crops).
+                x=cell_x, y=cell_y,
-            is_oversized = row.height > median_row_h * 1.5 and median_row_h > 20
+                width=cell_w, height=cell_h,
-            if is_oversized:
+            )
-                n_splits = max(2, round(row.height / median_row_h))
+            # OCR the cell
-                sub_h = cell_h / n_splits
+            if use_rapid:
-                words = []
+                words = ocr_region_rapid(img_bgr, cell_region)
                for s in range(n_splits):
                    sub_y = int(cell_y + s * sub_h)
                    sub_height = int(sub_h) if s < n_splits - 1 else (cell_y + cell_h - sub_y)
                    sub_region = PageRegion(
                        type=col.type,
                        x=cell_x, y=sub_y,
                        width=cell_w, height=max(1, sub_height),
                    )
                    if use_rapid:
                        sub_words = ocr_region_rapid(img_bgr, sub_region)
                    else:
                        cell_lang = lang_map.get(col.type, lang)
                        sub_words = ocr_region(ocr_img, sub_region, lang=cell_lang, psm=6)
                    words.extend(sub_words)
            else:
-                cell_region = PageRegion(
+                cell_lang = lang_map.get(col.type, lang)
-                    type=col.type,
+                words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6)
                    x=cell_x, y=cell_y,
                    width=cell_w, height=cell_h,
                )
                # OCR the cell
                if use_rapid:
                    words = ocr_region_rapid(img_bgr, cell_region)
                else:
                    cell_lang = lang_map.get(col.type, lang)
                    words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6)
            # Group into lines, then join in reading order (Fix A)
            # Use half of average word height as Y-tolerance
@@ -3007,16 +3042,13 @@ def build_word_grid(
    # --- Post-processing pipeline (deterministic, no LLM) ---
    n_raw = len(entries)
-    # 1. Split oversized rows (missed Step 4 boundaries)
+    # 1. Fix character confusion (I/1/l based on context)
    entries = _split_oversized_entries(entries, content_rows, img_w, img_h)
    # 2. Fix character confusion (I/1/l based on context)
    entries = _fix_character_confusion(entries)
-    # 3. Replace OCR'd phonetics with dictionary IPA
+    # 2. Replace OCR'd phonetics with dictionary IPA
    entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
-    # 4. Split comma-separated word forms (break, broke, broken → 3 entries)
+    # 3. Split comma-separated word forms (break, broke, broken → 3 entries)
    entries = _split_comma_entries(entries)
    # 5. Attach example sentences (rows without DE → examples for preceding entry)