"""
Row grid regularization for document layout analysis.

Provides word-center-based row boundary refinement to improve
gap-based row detection.

Extracted from cv_layout_rows.py.

License: Apache 2.0 (commercial use permitted)
PRIVACY: All processing happens locally.
"""

import logging
from typing import Dict, List, Tuple

import numpy as np

from cv_vocab_types import RowGeometry

logger = logging.getLogger(__name__)

# Activation guards: below these counts the gap-based rows are returned as-is.
_MIN_CONTENT_ROWS = 5
_MIN_CONTENT_WORDS = 5
_MIN_CLUSTERS = 3

# A gap between line centers larger than this multiple of the median pitch
# starts a new section (sub-headings, section titles, etc.).
_BREAK_FACTOR = 1.8

# Grid rows thinner than this (in pixels, after clamping) are discarded.
_MIN_ROW_HEIGHT = 5

# Minimum fraction of content words that must land in a grid row for the
# regularized grid to be accepted.
_MIN_MATCH_RATIO = 0.85


def _word_center(word: Dict) -> float:
    """Return the vertical center of a word box, in the frame of ``word['top']``."""
    return word['top'] + word['height'] / 2


def _upper_median(values):
    """Return the element at index ``len // 2`` of the sorted *values*."""
    ordered = sorted(values)
    return ordered[len(ordered) // 2]


def _collect_content_words(content_rows: List['RowGeometry']) -> List[Dict]:
    """Gather the words of all content rows, keeping one word per distinct box."""
    words: List[Dict] = []
    seen_boxes: set = set()
    for row in content_rows:
        for word in row.words:
            box = (word['left'], word['top'], word['width'], word['height'])
            if box not in seen_boxes:
                seen_boxes.add(box)
                words.append(word)
    return words


def _cluster_words_by_center(words: List[Dict], y_tol: float) -> List[List[Dict]]:
    """Group words into text lines by proximity of their vertical centers.

    A word joins the current line while its center is within *y_tol* of the
    center of the line's FIRST word; otherwise it opens a new line. Each
    returned line is sorted left-to-right. *words* must be non-empty.
    """
    ordered = sorted(words, key=lambda w: (_word_center(w), w['left']))
    clusters: List[List[Dict]] = []
    current = [ordered[0]]
    anchor = _word_center(ordered[0])
    for word in ordered[1:]:
        center = _word_center(word)
        if abs(center - anchor) <= y_tol:
            current.append(word)
        else:
            clusters.append(current)
            current = [word]
            anchor = center
    clusters.append(current)
    for cluster in clusters:
        cluster.sort(key=lambda w: w['left'])
    return clusters


def _summarize_cluster(words: List[Dict], median_wh: float, top_y: int) -> Dict:
    """Compute the median center and typical letter height of one line cluster.

    Words taller than 2x *median_wh* are ignored for the letter height so
    that tall brackets/IPA symbols do not skew it (unless every word is tall).
    """
    centers = [_word_center(w) for w in words]
    heights = [w['height'] for w in words if w['height'] <= median_wh * 2.0]
    if not heights:
        heights = [w['height'] for w in words]
    center_y = float(np.median(centers))
    return {
        'center_y_rel': center_y,          # same frame as word['top']
        'center_y_abs': center_y + top_y,  # shifted by top_y
        'letter_h': float(np.median(heights)),
        'words': words,
    }


def _merge_close_clusters(
    cluster_info: List[Dict],
    merge_threshold: float,
    median_wh: float,
    top_y: int,
) -> List[Dict]:
    """Fuse consecutive clusters whose centers are closer than *merge_threshold*."""
    merged: List[Dict] = [cluster_info[0]]
    for cl in cluster_info[1:]:
        prev = merged[-1]
        if cl['center_y_rel'] - prev['center_y_rel'] < merge_threshold:
            # Same text line split by OCR jitter: recompute stats on the union.
            merged[-1] = _summarize_cluster(
                prev['words'] + cl['words'], median_wh, top_y
            )
        else:
            merged.append(cl)
    return merged


def _split_into_sections(cluster_info: List[Dict], break_gap: float) -> List[List[Dict]]:
    """Split consecutive clusters into sections wherever the center gap exceeds *break_gap*."""
    sections: List[List[Dict]] = [[cluster_info[0]]]
    for prev, cl in zip(cluster_info, cluster_info[1:]):
        if cl['center_y_rel'] - prev['center_y_rel'] > break_gap:
            sections.append([cl])
        else:
            sections[-1].append(cl)
    return sections


def _section_row_spans(
    section: List[Dict], median_pitch: float
) -> List[Tuple[Dict, float, float]]:
    """Return unclamped ``(cluster, top, bottom)`` spans for one section.

    A single-line section (likely a heading) extends
    ``max(letter_h, 0.4 * median_pitch)`` above and below its center.
    Otherwise the boundaries are the midpoints between consecutive centers,
    and the outer edges lie half a local pitch beyond the first/last center.
    """
    if len(section) == 1:
        cl = section[0]
        half_h = max(cl['letter_h'], median_pitch * 0.4)
        return [(cl, cl['center_y_abs'] - half_h, cl['center_y_abs'] + half_h)]

    local_pitch = float(np.median([
        nxt['center_y_rel'] - cur['center_y_rel']
        for cur, nxt in zip(section, section[1:])
    ]))
    centers = [cl['center_y_abs'] for cl in section]
    edges = [centers[0] - local_pitch / 2]
    edges.extend((upper + lower) / 2 for upper, lower in zip(centers, centers[1:]))
    edges.append(centers[-1] + local_pitch / 2)
    return list(zip(section, edges, edges[1:]))


def _assign_words_to_rows(
    grid_rows: List['RowGeometry'],
    words: List[Dict],
    top_y: int,
    max_dist: float,
) -> int:
    """Attach each word to the row with the nearest vertical center.

    Resets ``words``/``word_count`` on every grid row. A word farther than
    *max_dist* from every row center is left unassigned. Ties go to the
    earlier row. Returns the number of words placed.
    """
    for gr in grid_rows:
        gr.words = []
    row_centers = [gr.y + gr.height / 2 for gr in grid_rows]

    placed = 0
    for word in words:
        w_center = word['top'] + top_y + word['height'] / 2
        best_dist, best_idx = min(
            (abs(w_center - center), idx) for idx, center in enumerate(row_centers)
        )
        if best_dist < max_dist:
            grid_rows[best_idx].words.append(word)
            placed += 1

    for gr in grid_rows:
        gr.word_count = len(gr.words)
    return placed


def _regularize_row_grid(
    rows: List['RowGeometry'],
    word_dicts: List[Dict],
    left_x: int,
    right_x: int,
    top_y: int,
    content_w: int,
    content_h: int,
    inv: np.ndarray,
) -> List['RowGeometry']:
    """Rebuild row boundaries from word center-lines with section-break awareness.

    Instead of overlaying a rigid grid, this derives row positions bottom-up
    from the words themselves:

    Step A:  Group all content words into line clusters by Y-proximity.
             Tolerance = 40% of the median gap-based row height (min 10 px).
    Step B:  For each cluster compute:
             - center_y = median of (word_top + word_height/2)
             - letter_h = median of word heights (excluding outliers > 2x median)
    Step B2: Merge clusters whose centers are closer than 30% of the row
             height (min 8 px) - spurious splits from OCR jitter.
    Step C:  Compute pitches (distances between consecutive centers).
    Step D:  Split clusters into sections where the gap exceeds 1.8x the
             median pitch.
    Step E:  Within each section, place row boundaries at the midpoints
             between consecutive line centers; the first/last row extend
             half a local pitch beyond their center. Every row is clamped
             to ``[top_y, top_y + content_h]`` and both edges are rounded,
             so neighbouring rows share the same integer boundary.
    Step F:  Re-assign words to the nearest grid row by vertical center
             distance (must be closer than the median pitch).
    Step G:  Validate that >= 85% of words land in a grid row; otherwise
             fall back to the original gap-based rows.
    Step H:  Merge with the preserved non-content rows and re-index.

    Guard: requires >= 5 content rows and >= 5 distinct content words from
    the gap-based detection, at least 3 line clusters, and a median pitch
    above 5 px; otherwise *rows* is returned unchanged.

    Args:
        rows: Gap-based rows; those with ``row_type == 'content'`` are
            regularized, all others are preserved.
        word_dicts: Unused; kept for interface compatibility.
        left_x: ``x`` given to every generated row.
        right_x: Unused; kept for interface compatibility.
        top_y: Offset added to word ``top`` values to obtain row ``y`` values.
        content_w: ``width`` given to every generated row.
        content_h: Height of the content area; rows are clamped to
            ``top_y + content_h``.
        inv: Unused; kept for interface compatibility.

    Returns:
        The original *rows* list when the regularizer does not activate or
        validation fails; otherwise a new list of the non-content rows plus
        the regenerated content rows, sorted by ``y`` and re-indexed (the
        preserved non-content rows get their ``index`` updated in place).
    """
    content_rows = [r for r in rows if r.row_type == 'content']
    non_content = [r for r in rows if r.row_type != 'content']
    if len(content_rows) < _MIN_CONTENT_ROWS:
        return rows

    # --- Step A: Group ALL content words into line clusters ---
    content_words = _collect_content_words(content_rows)
    if len(content_words) < _MIN_CONTENT_WORDS:
        return rows

    median_wh = _upper_median(w['height'] for w in content_words)
    # The gap-based row height is the line height found by the horizontal
    # projection; it is a more reliable grouping scale than word height,
    # which varies a lot within one line (lowercase vs uppercase, brackets,
    # phonetic symbols).
    median_row_h = _upper_median(r.height for r in content_rows)
    y_tol = max(10, int(median_row_h * 0.4))

    line_clusters = _cluster_words_by_center(content_words, y_tol)
    if len(line_clusters) < _MIN_CLUSTERS:
        return rows

    # --- Step B: Per-cluster center and letter height ---
    cluster_info = [_summarize_cluster(cl, median_wh, top_y) for cl in line_clusters]
    cluster_info.sort(key=lambda c: c['center_y_rel'])

    # --- Step B2: Merge clusters that are too close together ---
    merge_threshold = max(8, median_row_h * 0.3)
    cluster_info = _merge_close_clusters(cluster_info, merge_threshold, median_wh, top_y)
    if len(cluster_info) < _MIN_CLUSTERS:
        return rows

    # --- Step C: Pitches between consecutive line centers ---
    pitches = [
        nxt['center_y_rel'] - cur['center_y_rel']
        for cur, nxt in zip(cluster_info, cluster_info[1:])
    ]
    median_pitch = float(np.median(pitches))
    if median_pitch <= 5:
        return rows

    # --- Step D: Sections of consecutive lines with normal spacing ---
    sections = _split_into_sections(cluster_info, median_pitch * _BREAK_FACTOR)

    # --- Step E: Build row boundaries per section ---
    bottom_limit = top_y + content_h
    grid_rows: List[RowGeometry] = []
    for section in sections:
        for cl, row_top, row_bot in _section_row_spans(section, median_pitch):
            # Clamp every row (headings included) to the content area.
            row_top = max(top_y, row_top)
            row_bot = min(bottom_limit, row_bot)
            if row_bot - row_top < _MIN_ROW_HEIGHT:
                continue
            # Round both edges instead of top and height independently, so
            # rows sharing a float boundary also share the integer boundary.
            y_px = round(row_top)
            grid_rows.append(RowGeometry(
                index=0,
                x=left_x,
                y=y_px,
                width=content_w,
                height=round(row_bot) - y_px,
                word_count=len(cl['words']),
                words=cl['words'],
                row_type='content',
                gap_before=0,
            ))

    if not grid_rows:
        return rows

    # --- Step F: Re-assign words to grid rows ---
    words_placed = _assign_words_to_rows(grid_rows, content_words, top_y, median_pitch)

    # --- Step G: Validate ---
    match_ratio = words_placed / len(content_words)
    if match_ratio < _MIN_MATCH_RATIO:
        logger.info(f"RowGrid: word-center grid only matches {match_ratio:.0%} "
                    f"of words, keeping gap-based rows")
        return rows

    # Remove empty grid rows (no words assigned)
    grid_rows = [gr for gr in grid_rows if gr.word_count > 0]

    # --- Step H: Merge preserved rows + re-index ---
    result = list(non_content) + grid_rows
    result.sort(key=lambda r: r.y)
    for i, r in enumerate(result):
        r.index = i

    row_heights = [gr.height for gr in grid_rows]
    min_h = min(row_heights) if row_heights else 0
    max_h = max(row_heights) if row_heights else 0
    logger.info(f"RowGrid: word-center grid applied "
                f"(median_pitch={median_pitch:.0f}px, median_row_h={median_row_h}px, median_wh={median_wh}px, "
                f"y_tol={y_tol}px, {len(line_clusters)} clusters→{len(cluster_info)} merged, "
                f"{len(sections)} sections, "
                f"{len(grid_rows)} grid rows [h={min_h}-{max_h}px], "
                f"was {len(content_rows)} gap-based rows)")

    return result