diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index ea283b7..905c11e 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -1539,6 +1539,11 @@ def detect_row_geometry( gap_before=gap_before, )) + # --- Step 7: Split oversized rows --- + # If a content row is >1.5× the median height, re-analyze it with a local + # horizontal projection to find missed row boundaries within. + rows = _split_oversized_rows(rows, inv, left_x, right_x, top_y, word_dicts) + type_counts = {} for r in rows: type_counts[r.row_type] = type_counts.get(r.row_type, 0) + 1 @@ -1547,6 +1552,176 @@ def detect_row_geometry( return rows +def _split_oversized_rows( + rows: List['RowGeometry'], + inv: np.ndarray, + left_x: int, right_x: int, + top_y: int, + word_dicts: List[Dict], +) -> List['RowGeometry']: + """Split content rows that are >1.5× the median height. + + Re-analyses oversized rows with a local horizontal projection profile + to find missed row boundaries within. This catches cases where Step 4's + initial gap analysis merged multiple vocabulary lines (e.g. because an + image or dense text obscured the gap). + + Returns a new list with oversized rows replaced by sub-rows, re-indexed. 
+ """ + content_rows = [r for r in rows if r.row_type == 'content'] + if len(content_rows) < 3: + return rows + + heights = sorted(r.height for r in content_rows) + median_h = heights[len(heights) // 2] + + if median_h <= 10: + return rows + + threshold = median_h * 1.5 + content_w = right_x - left_x + + result: List[RowGeometry] = [] + split_total = 0 + + for row in rows: + if row.row_type != 'content' or row.height <= threshold: + result.append(row) + continue + + # --- Local horizontal projection on this row's strip --- + row_y_abs = row.y + row_h = row.height + strip = inv[row_y_abs:row_y_abs + row_h, left_x:right_x] + + if strip.size == 0: + result.append(row) + continue + + # Word-coverage mask (same approach as main detection) + pad_y = max(2, row_h // 50) + word_mask = np.zeros_like(strip) + row_words = [w for w in word_dicts + if w['top'] + top_y >= row_y_abs - pad_y + and w['top'] + top_y < row_y_abs + row_h + pad_y] + + for wd in row_words: + wy = wd['top'] + top_y - row_y_abs # relative to strip + y1 = max(0, wy - pad_y) + y2 = min(row_h, wy + wd['height'] + pad_y) + x1 = max(0, wd['left']) + x2 = min(content_w, wd['left'] + wd['width']) + word_mask[y1:y2, x1:x2] = 255 + + masked = cv2.bitwise_and(strip, word_mask) + h_proj = np.sum(masked, axis=1).astype(float) + h_proj_norm = h_proj / (content_w * 255) if content_w > 0 else h_proj + + # Smooth + k = max(3, row_h // 40) + if k % 2 == 0: + k += 1 + h_smooth = np.convolve(h_proj_norm, np.ones(k) / k, mode='same') + + # Gap detection within the row + med_density = float(np.median(h_smooth[h_smooth > 0])) if np.any(h_smooth > 0) else 0.01 + gap_thresh = max(med_density * 0.15, 0.003) + in_gap = h_smooth < gap_thresh + + min_gap_h = max(2, row_h // 30) # smaller threshold for sub-gaps + local_gaps = [] + gap_start = None + for y in range(len(in_gap)): + if in_gap[y]: + if gap_start is None: + gap_start = y + else: + if gap_start is not None: + if y - gap_start >= min_gap_h: + local_gaps.append((gap_start, 
y)) + gap_start = None + if gap_start is not None and len(in_gap) - gap_start >= min_gap_h: + local_gaps.append((gap_start, len(in_gap))) + + if not local_gaps: + # No sub-gaps found — keep original row + result.append(row) + continue + + # Validate gaps against words (don't split through a word) + valid_gaps = [] + for gs, ge in local_gaps: + overlapping = False + for wd in row_words: + wy = wd['top'] + top_y - row_y_abs + wy2 = wy + wd['height'] + if wy < ge and wy2 > gs: + overlapping = True + break + if not overlapping: + valid_gaps.append((gs, ge)) + + if not valid_gaps: + result.append(row) + continue + + valid_gaps.sort() + + # Build sub-row boundaries from gaps + sub_bounds = [] # (start_rel, end_rel) within the row strip + # Before first gap + if valid_gaps[0][0] > 0: + sub_bounds.append((0, valid_gaps[0][0])) + # Between gaps + for i in range(len(valid_gaps) - 1): + sub_bounds.append((valid_gaps[i][1], valid_gaps[i + 1][0])) + # After last gap + if valid_gaps[-1][1] < row_h: + sub_bounds.append((valid_gaps[-1][1], row_h)) + + # Filter out empty sub-rows + sub_bounds = [(s, e) for s, e in sub_bounds if e - s > 2] + + if len(sub_bounds) <= 1: + result.append(row) + continue + + # Create sub-rows + for sb_start, sb_end in sub_bounds: + sub_y_abs = row_y_abs + sb_start + sub_h = sb_end - sb_start + # Assign words to this sub-row + sub_words = [w for w in row_words + if w['top'] + top_y >= sub_y_abs - 2 + and w['top'] + top_y + w['height'] <= sub_y_abs + sub_h + 2] + result.append(RowGeometry( + index=0, # re-indexed below + x=row.x, + y=sub_y_abs, + width=row.width, + height=sub_h, + word_count=len(sub_words), + words=sub_words, + row_type='content', + gap_before=0, + )) + + split_total += len(sub_bounds) - 1 + logger.info(f"RowGeometry: split oversized row (h={row_h}) " + f"into {len(sub_bounds)} sub-rows " + f"(median_h={median_h}, {len(valid_gaps)} gaps)") + + if split_total > 0: + # Re-index all rows + result.sort(key=lambda r: r.y) + for i, r in 
enumerate(result): + r.index = i + logger.info(f"RowGeometry: {split_total} oversized splits → " + f"{len(result)} total rows (was {len(rows)})") + + return result + + def _build_rows_from_word_grouping( word_dicts: List[Dict], left_x: int, right_x: int, @@ -2708,119 +2883,6 @@ def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str return _PHONETIC_BRACKET_RE.sub(replacer, text) -def _split_oversized_entries( - entries: List[Dict[str, Any]], - content_rows: List[RowGeometry], - img_w: int, - img_h: int, -) -> List[Dict[str, Any]]: - """Split entries from oversized rows into multiple entries. - - If a row is >1.5× the median height, it likely contains multiple vocabulary - entries that Step 4 failed to separate. We split based on line count: - if EN and DE have the same number of newline-separated lines, each line - becomes its own entry. - - This is a deterministic plausibility check — no LLM needed. - """ - if len(entries) < 3: - return entries - - # Calculate median row height from pixel heights - row_heights = [r.height for r in content_rows] - row_heights_sorted = sorted(row_heights) - median_h = row_heights_sorted[len(row_heights_sorted) // 2] - - if median_h <= 0: - return entries - - height_threshold = median_h * 1.5 - result: List[Dict[str, Any]] = [] - split_count = 0 - - for entry in entries: - # Get pixel height from bbox percent - entry_h_px = entry['bbox']['h'] / 100.0 * img_h - - if entry_h_px <= height_threshold: - result.append(entry) - continue - - # This row is oversized — check if we can split - en_lines = entry['english'].split('\n') if entry['english'] else [''] - de_lines = entry['german'].split('\n') if entry['german'] else [''] - ex_lines = entry['example'].split('\n') if entry['example'] else [''] - - # Filter empty lines - en_lines = [l for l in en_lines if l.strip()] or [''] - de_lines = [l for l in de_lines if l.strip()] or [''] - ex_lines = [l for l in ex_lines if l.strip()] or [''] - - # Determine split count: 
EN and DE must agree (or one is empty) - n_en = len(en_lines) - n_de = len(de_lines) - n_ex = len(ex_lines) - - can_split = False - n_split = 1 - - if n_en > 1 and n_de > 1 and n_en == n_de: - n_split = n_en - can_split = True - elif n_en > 1 and n_de <= 1: - # Only EN has multiple lines — still split, DE goes to first - n_split = n_en - can_split = True - elif n_de > 1 and n_en <= 1: - # Only DE has multiple lines - n_split = n_de - can_split = True - - if not can_split or n_split <= 1: - result.append(entry) - continue - - # Split into n_split sub-entries - orig_y = entry['bbox']['y'] - orig_h = entry['bbox']['h'] - sub_h = orig_h / n_split - - for k in range(n_split): - sub_entry = { - 'row_index': entry['row_index'], - 'english': en_lines[k] if k < len(en_lines) else '', - 'german': de_lines[k] if k < len(de_lines) else '', - 'example': ex_lines[k] if k < len(ex_lines) else '', - 'confidence': entry['confidence'], - 'bbox': { - 'x': entry['bbox']['x'], - 'y': round(orig_y + k * sub_h, 2), - 'w': entry['bbox']['w'], - 'h': round(sub_h, 2), - }, - 'bbox_en': entry['bbox_en'], - 'bbox_de': entry['bbox_de'], - 'bbox_ex': entry['bbox_ex'], - 'ocr_engine': entry.get('ocr_engine', ''), - 'split_from_row': entry['row_index'], - } - result.append(sub_entry) - - split_count += 1 - logger.info(f"split_oversized: row {entry['row_index']} " - f"(h={entry_h_px:.0f}px > {height_threshold:.0f}px) " - f"→ {n_split} sub-entries") - - if split_count > 0: - # Re-number row indices - for i, e in enumerate(result): - e['row_index'] = i - logger.info(f"split_oversized: {split_count} rows split, " - f"{len(entries)} → {len(result)} entries") - - return result - - def build_word_grid( ocr_img: np.ndarray, column_regions: List[PageRegion], @@ -2885,10 +2947,6 @@ def build_word_grid( entries: List[Dict[str, Any]] = [] - # Calculate median row height for oversized detection - row_heights = sorted(r.height for r in content_rows) - median_row_h = row_heights[len(row_heights) // 2] if 
row_heights else 100 - for row_idx, row in enumerate(content_rows): entry: Dict[str, Any] = { 'row_index': row_idx, @@ -2930,40 +2988,17 @@ def build_word_grid( if cell_w <= 0 or cell_h <= 0: continue - # For oversized cells (>1.5× median), split vertically into sub-cells - # and OCR each separately. This prevents OCR from missing text at - # the bottom of tall cells (RapidOCR downscales tall narrow crops). - is_oversized = row.height > median_row_h * 1.5 and median_row_h > 20 - if is_oversized: - n_splits = max(2, round(row.height / median_row_h)) - sub_h = cell_h / n_splits - words = [] - for s in range(n_splits): - sub_y = int(cell_y + s * sub_h) - sub_height = int(sub_h) if s < n_splits - 1 else (cell_y + cell_h - sub_y) - sub_region = PageRegion( - type=col.type, - x=cell_x, y=sub_y, - width=cell_w, height=max(1, sub_height), - ) - if use_rapid: - sub_words = ocr_region_rapid(img_bgr, sub_region) - else: - cell_lang = lang_map.get(col.type, lang) - sub_words = ocr_region(ocr_img, sub_region, lang=cell_lang, psm=6) - words.extend(sub_words) + cell_region = PageRegion( + type=col.type, + x=cell_x, y=cell_y, + width=cell_w, height=cell_h, + ) + # OCR the cell + if use_rapid: + words = ocr_region_rapid(img_bgr, cell_region) else: - cell_region = PageRegion( - type=col.type, - x=cell_x, y=cell_y, - width=cell_w, height=cell_h, - ) - # OCR the cell - if use_rapid: - words = ocr_region_rapid(img_bgr, cell_region) - else: - cell_lang = lang_map.get(col.type, lang) - words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6) + cell_lang = lang_map.get(col.type, lang) + words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6) # Group into lines, then join in reading order (Fix A) # Use half of average word height as Y-tolerance @@ -3007,16 +3042,13 @@ def build_word_grid( # --- Post-processing pipeline (deterministic, no LLM) --- n_raw = len(entries) - # 1. 
Split oversized rows (missed Step 4 boundaries)
-    entries = _split_oversized_entries(entries, content_rows, img_w, img_h)
-
-    # 2. Fix character confusion (I/1/l based on context)
+    # 1. Fix character confusion (I/1/l based on context)
     entries = _fix_character_confusion(entries)
 
-    # 3. Replace OCR'd phonetics with dictionary IPA
+    # 2. Replace OCR'd phonetics with dictionary IPA
     entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
 
-    # 4. Split comma-separated word forms (break, broke, broken → 3 entries)
+    # 3. Split comma-separated word forms (break, broke, broken → 3 entries)
     entries = _split_comma_entries(entries)
 
-    # 5. Attach example sentences (rows without DE → examples for preceding entry)
+    # 4. Attach example sentences (rows without DE → examples for preceding entry)