"""Row geometry detection for document layout analysis.

Provides horizontal whitespace-gap analysis to detect text rows,
word-center grid regularization, and fallback word-grouping.
Extracted from cv_layout.py.

License: Apache 2.0 (commercial use permitted)
PRIVACY: All processing happens locally.
"""
import logging
from typing import Dict, List, Optional, Tuple

import numpy as np

try:
    import cv2
except ImportError:
    cv2 = None  # type: ignore[assignment]

from cv_vocab_types import RowGeometry
from cv_ocr_word_assembly import _group_words_into_lines
from cv_layout_row_regularize import _regularize_row_grid

logger = logging.getLogger(__name__)


# =============================================================================
# Row Geometry Detection (horizontal whitespace-gap analysis)
# =============================================================================

def _largest_gap_in_zone(
    gaps: List[Tuple[int, int]],
    midpoint_in_zone,
    size_threshold: float,
) -> Optional[Tuple[int, int]]:
    """Return the largest gap whose midpoint lies inside a zone.

    Args:
        gaps: (start_y, end_y) gap spans, relative to the content ROI.
        midpoint_in_zone: Predicate called with the gap's vertical midpoint;
            True if the midpoint lies inside the header/footer zone.
        size_threshold: Minimum gap height (exclusive) to qualify.

    Returns:
        The qualifying (start_y, end_y) gap with the greatest height,
        or None if no gap qualifies.
    """
    best: Optional[Tuple[int, int]] = None
    for gs, ge in gaps:
        gap_size = ge - gs
        if midpoint_in_zone((gs + ge) / 2) and gap_size > size_threshold:
            if best is None or gap_size > (best[1] - best[0]):
                best = (gs, ge)
    return best


def detect_row_geometry(
    inv: np.ndarray,
    word_dicts: List[Dict],
    left_x: int,
    right_x: int,
    top_y: int,
    bottom_y: int,
) -> List['RowGeometry']:
    """Detect row geometry using horizontal whitespace-gap analysis.

    Algorithm overview (two phases):

    Phase 1 - Gap-based detection (Steps 1-6):
      1. Build a horizontal projection profile: for each y-pixel, sum the
         ink density across the content width. Only pixels within/near
         Tesseract word bounding boxes contribute (word_mask), so that
         images/illustrations don't merge adjacent text rows.
      2. Smooth the projection and find contiguous regions below a
         threshold (= gaps / horizontal whitespace between text lines).
         The threshold is 15% of the median non-zero density.
      3. Validate gaps against word bounding boxes -- discard any gap that
         overlaps a word, or shift the gap boundary to avoid the word.
      4. Build rows from the spans between validated gaps.
      5. Detect header/footer rows: gaps in the top/bottom 15% of the page
         that exceed 2x the median gap size mark section boundaries.

    Phase 2 - Word-center regularization (_regularize_row_grid, Step 7):
      For each word, compute its vertical center (top + height/2). Group
      words into line clusters by Y-proximity (tolerance = 40% of the
      median gap-based row height). For each cluster, the line center is
      the median of all word centers. The "pitch" is the distance between
      consecutive line centers. Section breaks are detected where the
      pitch exceeds 1.8x the median. Within each section, row boundaries
      are placed at the midpoints between consecutive line centers:
        - Row top = midpoint to previous line center
          (or center - pitch/2 for the first row)
        - Row bottom = midpoint to next line center
          (or center + pitch/2 for the last row)
      This ensures rows tile without gaps or overlaps.

    Fallback: If < 2 gaps are found (very dense or uniform text), falls
    back to _build_rows_from_word_grouping() which groups words by Y
    proximity.

    Args:
        inv: Inverted binarized image (white text on black bg, full page).
        word_dicts: Word bounding boxes from Tesseract (relative to the
            content ROI).
        left_x, right_x: Absolute X bounds of the content area.
        top_y, bottom_y: Absolute Y bounds of the content area.

    Returns:
        List of RowGeometry objects sorted top to bottom.
    """
    content_w = right_x - left_x
    content_h = bottom_y - top_y
    if content_h < 10 or content_w < 10:
        logger.warning("detect_row_geometry: content area too small")
        return []

    # --- Step 1: Horizontal projection profile ---
    # For each y-pixel row, sum ink density across the content width.
    # A word-coverage mask ensures only pixels near Tesseract words
    # contribute, so that illustrations/images don't inflate the density
    # and merge rows.
    content_strip = inv[top_y:bottom_y, left_x:right_x]
    WORD_PAD_Y = max(4, content_h // 300)  # small vertical padding around words
    word_mask = np.zeros((content_h, content_w), dtype=np.uint8)
    for wd in word_dicts:
        y1 = max(0, wd['top'] - WORD_PAD_Y)
        y2 = min(content_h, wd['top'] + wd['height'] + WORD_PAD_Y)
        x1 = max(0, wd['left'])
        x2 = min(content_w, wd['left'] + wd['width'])
        word_mask[y1:y2, x1:x2] = 255

    # BUGFIX: cv2 is an optional import (None when OpenCV is missing);
    # np.bitwise_and is bit-identical for uint8 arrays with a 0/255 mask.
    if cv2 is not None:
        masked_strip = cv2.bitwise_and(content_strip, word_mask)
    else:
        masked_strip = np.bitwise_and(content_strip, word_mask)

    h_proj = np.sum(masked_strip, axis=1).astype(float)
    h_proj_norm = h_proj / (content_w * 255) if content_w > 0 else h_proj

    # --- Step 2: Smoothing + gap threshold ---
    # Smooth the projection to reduce noise, then threshold at 15% of the
    # median non-zero density. Pixels below this threshold are considered
    # "gap" (horizontal whitespace between text lines).
    # MIN_GAP_HEIGHT prevents tiny noise gaps from splitting rows.
    kernel_size = max(3, content_h // 200)
    if kernel_size % 2 == 0:
        kernel_size += 1  # keep the box kernel symmetric
    h_smooth = np.convolve(
        h_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')
    median_density = (float(np.median(h_smooth[h_smooth > 0]))
                      if np.any(h_smooth > 0) else 0.01)
    gap_threshold = max(median_density * 0.15, 0.003)
    in_gap = h_smooth < gap_threshold
    MIN_GAP_HEIGHT = max(3, content_h // 500)

    # --- Step 3: Collect contiguous gap regions ---
    raw_gaps: List[Tuple[int, int]] = []  # (start_y_rel, end_y_rel) in ROI coords
    gap_start = None
    for y, is_gap in enumerate(in_gap):
        if is_gap:
            if gap_start is None:
                gap_start = y
        elif gap_start is not None:
            if y - gap_start >= MIN_GAP_HEIGHT:
                raw_gaps.append((gap_start, y))
            gap_start = None
    # Close a gap that runs to the bottom edge of the content area.
    if gap_start is not None and len(in_gap) - gap_start >= MIN_GAP_HEIGHT:
        raw_gaps.append((gap_start, len(in_gap)))

    logger.info(f"RowGeometry: {len(raw_gaps)} raw gaps found (threshold={gap_threshold:.4f}, "
                f"min_height={MIN_GAP_HEIGHT}px)")

    # --- Step 4: Validate gaps against word bounding boxes ---
    # A gap is valid only if no word's bounding box overlaps it vertically.
    # If a word overlaps, try to shift the gap boundary above or below the
    # word. If neither shift yields enough room (>= MIN_GAP_HEIGHT), discard.
    validated_gaps: List[Tuple[int, int]] = []
    for gap_start_rel, gap_end_rel in raw_gaps:
        overlapping = any(
            wd['top'] < gap_end_rel and wd['top'] + wd['height'] > gap_start_rel
            for wd in word_dicts
        )
        if not overlapping:
            validated_gaps.append((gap_start_rel, gap_end_rel))
            continue
        # Try to shift the gap to avoid overlapping words.
        min_word_top = content_h
        max_word_bottom = 0
        for wd in word_dicts:
            word_top = wd['top']
            word_bottom = wd['top'] + wd['height']
            if word_top < gap_end_rel and word_bottom > gap_start_rel:
                min_word_top = min(min_word_top, word_top)
                max_word_bottom = max(max_word_bottom, word_bottom)
        if min_word_top - gap_start_rel >= MIN_GAP_HEIGHT:
            # Enough room above the topmost overlapping word.
            validated_gaps.append((gap_start_rel, min_word_top))
        elif gap_end_rel - max_word_bottom >= MIN_GAP_HEIGHT:
            # Enough room below the bottommost overlapping word.
            validated_gaps.append((max_word_bottom, gap_end_rel))
        else:
            logger.debug(f"RowGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
                         f"discarded (word overlap, no room to shift)")

    logger.info(f"RowGeometry: {len(validated_gaps)} gaps after word validation")

    # --- Fallback if too few gaps ---
    if len(validated_gaps) < 2:
        logger.info("RowGeometry: < 2 gaps found, falling back to word grouping")
        return _build_rows_from_word_grouping(
            word_dicts, left_x, right_x, top_y, bottom_y, content_w, content_h,
        )

    validated_gaps.sort(key=lambda g: g[0])

    # --- Step 5: Header/footer detection via gap size ---
    # A conspicuously large gap near the top/bottom 15% of the page marks
    # a header/footer boundary.
    HEADER_FOOTER_ZONE = 0.15
    GAP_MULTIPLIER = 2.0
    gap_sizes = [g[1] - g[0] for g in validated_gaps]
    median_gap = float(np.median(gap_sizes)) if gap_sizes else 0
    large_gap_threshold = median_gap * GAP_MULTIPLIER
    header_boundary_rel = None  # y below which is header
    footer_boundary_rel = None  # y above which is footer
    header_zone_limit = int(content_h * HEADER_FOOTER_ZONE)
    footer_zone_start = int(content_h * (1.0 - HEADER_FOOTER_ZONE))

    best_header_gap = _largest_gap_in_zone(
        validated_gaps, lambda mid: mid < header_zone_limit, large_gap_threshold)
    if best_header_gap is not None:
        header_boundary_rel = best_header_gap[1]
        logger.info(f"RowGeometry: header boundary at y_rel={header_boundary_rel} "
                    f"(gap={best_header_gap[1] - best_header_gap[0]}px, "
                    f"median_gap={median_gap:.0f}px)")

    best_footer_gap = _largest_gap_in_zone(
        validated_gaps, lambda mid: mid > footer_zone_start, large_gap_threshold)
    if best_footer_gap is not None:
        footer_boundary_rel = best_footer_gap[0]
        logger.info(f"RowGeometry: footer boundary at y_rel={footer_boundary_rel} "
                    f"(gap={best_footer_gap[1] - best_footer_gap[0]}px)")

    # --- Step 6: Build RowGeometry objects from gaps ---
    # Rows are the spans between consecutive gaps. Each row's height
    # extends from the end of the previous gap to the start of the next.
    row_boundaries: List[Tuple[int, int]] = []  # (start_y_rel, end_y_rel)
    # Top of content to first gap.
    if validated_gaps[0][0] > MIN_GAP_HEIGHT:
        row_boundaries.append((0, validated_gaps[0][0]))
    # Between gaps.
    for i in range(len(validated_gaps) - 1):
        row_start = validated_gaps[i][1]
        row_end = validated_gaps[i + 1][0]
        if row_end - row_start > 0:
            row_boundaries.append((row_start, row_end))
    # Last gap to bottom of content.
    if validated_gaps[-1][1] < content_h - MIN_GAP_HEIGHT:
        row_boundaries.append((validated_gaps[-1][1], content_h))

    rows = []
    for idx, (row_start_rel, row_end_rel) in enumerate(row_boundaries):
        # Classify the row by its midpoint relative to header/footer bounds.
        row_mid = (row_start_rel + row_end_rel) / 2
        if header_boundary_rel is not None and row_mid < header_boundary_rel:
            row_type = 'header'
        elif footer_boundary_rel is not None and row_mid > footer_boundary_rel:
            row_type = 'footer'
        else:
            row_type = 'content'

        # A word belongs to the row containing its vertical center.
        row_words = [w for w in word_dicts
                     if row_start_rel <= w['top'] + w['height'] / 2 < row_end_rel]

        # Size of the whitespace gap immediately above this row.
        gap_before = 0
        if idx == 0 and validated_gaps[0][0] > 0:
            gap_before = validated_gaps[0][0]
        elif idx > 0:
            # Find the gap whose end coincides with this row's start.
            for gs, ge in validated_gaps:
                if ge == row_start_rel:
                    gap_before = ge - gs
                    break

        rows.append(RowGeometry(
            index=idx,
            x=left_x,
            y=top_y + row_start_rel,
            width=content_w,
            height=row_end_rel - row_start_rel,
            word_count=len(row_words),
            words=row_words,
            row_type=row_type,
            gap_before=gap_before,
        ))

    # --- Step 7: Word-center grid regularization ---
    # Refine the gap-based rows using word vertical centers: group words
    # into line clusters, compute the pitch between consecutive line
    # centers, and place row boundaries at the midpoints between centers.
    # Also detects section breaks where the pitch exceeds 1.8x the median
    # and handles each section independently.
    rows = _regularize_row_grid(rows, word_dicts, left_x, right_x, top_y,
                                content_w, content_h, inv)

    type_counts: Dict[str, int] = {}
    for r in rows:
        type_counts[r.row_type] = type_counts.get(r.row_type, 0) + 1
    logger.info(f"RowGeometry: {len(rows)} rows detected: {type_counts}")
    return rows


def _build_rows_from_word_grouping(
    word_dicts: List[Dict],
    left_x: int,
    right_x: int,
    top_y: int,
    bottom_y: int,
    content_w: int,
    content_h: int,
) -> List['RowGeometry']:
    """Fallback: build rows by grouping words by Y position.

    Uses _group_words_into_lines() with a generous tolerance.
    No header/footer detection in fallback mode.

    Args:
        word_dicts: Word bounding boxes (relative to the content ROI).
        left_x, right_x: Absolute X bounds of the content area.
        top_y, bottom_y: Absolute Y bounds of the content area.
        content_w, content_h: Content area dimensions in pixels.

    Returns:
        List of RowGeometry objects, all typed 'content', gap_before=0.
    """
    if not word_dicts:
        return []

    y_tolerance = max(20, content_h // 100)
    lines = _group_words_into_lines(word_dicts, y_tolerance_px=y_tolerance)

    rows = []
    for idx, line_words in enumerate(lines):
        if not line_words:
            continue
        min_top = min(w['top'] for w in line_words)
        max_bottom = max(w['top'] + w['height'] for w in line_words)
        rows.append(RowGeometry(
            index=idx,
            x=left_x,
            y=top_y + min_top,
            width=content_w,
            height=max_bottom - min_top,
            word_count=len(line_words),
            words=line_words,
            row_type='content',
            gap_before=0,
        ))

    logger.info(f"RowGeometry (fallback): {len(rows)} rows from word grouping")
    return rows