""" Core column detection: gap-based geometry and clustering fallback. Extracted from the original cv_layout_columns.py — contains: - _detect_columns_by_clustering() (fallback clustering) - _build_geometries_from_starts() (geometry construction) - detect_column_geometry() (main column detection) Post-processing (sub-columns, broad-column split, narrow expansion) lives in cv_layout_column_refine.py. Legacy projection-profile layout lives in cv_layout_analyze.py. Lizenz: Apache 2.0 (kommerziell nutzbar) DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. """ import logging from typing import Dict, List, Optional, Tuple import numpy as np from cv_vocab_types import ColumnGeometry from cv_layout_detection import _find_content_bounds logger = logging.getLogger(__name__) try: import cv2 except ImportError: cv2 = None # type: ignore[assignment] try: import pytesseract from PIL import Image except ImportError: pytesseract = None # type: ignore[assignment] Image = None # type: ignore[assignment,misc] # ============================================================================= # Stage 5b: Word-Based Layout Analysis (Two-Phase Column Detection) # ============================================================================= # --- Phase A: Geometry Detection --- def _detect_columns_by_clustering( word_dicts: List[Dict], left_edges: List[int], edge_word_indices: List[int], content_w: int, content_h: int, left_x: int, right_x: int, top_y: int, bottom_y: int, inv: Optional[np.ndarray] = None, ) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], Optional[np.ndarray]]]: """Fallback: detect columns by clustering left-aligned word positions. Used when the primary gap-based algorithm finds fewer than 2 gaps. """ tolerance = max(10, int(content_w * 0.01)) sorted_pairs = sorted(zip(left_edges, edge_word_indices), key=lambda p: p[0]) clusters = [] cluster_widxs = [] cur_edges = [sorted_pairs[0][0]] cur_widxs = [sorted_pairs[0][1]] for edge, widx in sorted_pairs[1:]: if edge - cur_edges[-1] <= tolerance: cur_edges.append(edge) cur_widxs.append(widx) else: clusters.append(cur_edges) cluster_widxs.append(cur_widxs) cur_edges = [edge] cur_widxs = [widx] clusters.append(cur_edges) cluster_widxs.append(cur_widxs) MIN_Y_COVERAGE_PRIMARY = 0.30 MIN_Y_COVERAGE_SECONDARY = 0.15 MIN_WORDS_SECONDARY = 5 cluster_infos = [] for c_edges, c_widxs in zip(clusters, cluster_widxs): if len(c_edges) < 2: continue y_positions = [word_dicts[idx]['top'] for idx in c_widxs] y_span = max(y_positions) - min(y_positions) y_coverage = y_span / content_h if content_h > 0 else 0.0 cluster_infos.append({ 'mean_x': int(np.mean(c_edges)), 'count': len(c_edges), 'min_edge': min(c_edges), 'max_edge': max(c_edges), 'y_min': min(y_positions), 'y_max': max(y_positions), 'y_coverage': y_coverage, }) primary = [c for c in cluster_infos if c['y_coverage'] >= MIN_Y_COVERAGE_PRIMARY] primary_set = set(id(c) for c in primary) secondary = [c for c in cluster_infos if id(c) not in primary_set and c['y_coverage'] >= MIN_Y_COVERAGE_SECONDARY and c['count'] >= MIN_WORDS_SECONDARY] significant = sorted(primary + secondary, key=lambda c: c['mean_x']) if len(significant) < 3: logger.info("ColumnGeometry clustering fallback: < 3 significant clusters") return None merge_distance = max(30, int(content_w * 0.06)) merged = [significant[0].copy()] for s in significant[1:]: if s['mean_x'] - merged[-1]['mean_x'] < merge_distance: prev = merged[-1] total = prev['count'] + s['count'] avg_x = (prev['mean_x'] * prev['count'] + s['mean_x'] * s['count']) // total prev['mean_x'] = avg_x prev['count'] = total prev['min_edge'] = min(prev['min_edge'], s['min_edge']) prev['max_edge'] = max(prev['max_edge'], s['max_edge']) else: merged.append(s.copy()) if len(merged) < 3: logger.info("ColumnGeometry clustering fallback: < 3 merged clusters") return None logger.info(f"ColumnGeometry clustering fallback: {len(merged)} columns from clustering") margin_px = max(6, int(content_w * 0.003)) return _build_geometries_from_starts( [(max(0, left_x + m['min_edge'] - margin_px), m['count']) for m in merged], word_dicts, left_x, right_x, top_y, bottom_y, content_w, content_h, inv, ) def _build_geometries_from_starts( col_starts: List[Tuple[int, int]], word_dicts: List[Dict], left_x: int, right_x: int, top_y: int, bottom_y: int, content_w: int, content_h: int, inv: Optional[np.ndarray] = None, ) -> Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], Optional[np.ndarray]]: """Build ColumnGeometry objects from a list of (abs_start_x, word_count) pairs.""" geometries = [] for i, (start_x, count) in enumerate(col_starts): if i + 1 < len(col_starts): col_width = col_starts[i + 1][0] - start_x else: col_width = right_x - start_x col_left_rel = start_x - left_x col_right_rel = col_left_rel + col_width col_words = [w for w in word_dicts if col_left_rel <= w['left'] < col_right_rel] geometries.append(ColumnGeometry( index=i, x=start_x, y=top_y, width=col_width, height=content_h, word_count=len(col_words), words=col_words, width_ratio=col_width / content_w if content_w > 0 else 0.0, )) logger.info(f"ColumnGeometry: {len(geometries)} columns: " f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}") return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv) def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], np.ndarray]]: """Detect column geometry using whitespace-gap analysis with word validation. Phase A of the two-phase column detection. Uses vertical projection profiles to find whitespace gaps between columns, then validates that no gap cuts through a word bounding box. Falls back to clustering-based detection if fewer than 2 gaps are found. Args: ocr_img: Binarized grayscale image for layout analysis. dewarped_bgr: Original BGR image (for Tesseract word detection). Returns: Tuple of (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv) or None if detection fails entirely. """ h, w = ocr_img.shape[:2] # --- Step 1: Find content bounds --- inv = cv2.bitwise_not(ocr_img) left_x, right_x, top_y, bottom_y = _find_content_bounds(inv) content_w = right_x - left_x content_h = bottom_y - top_y if content_w < w * 0.3 or content_h < h * 0.3: left_x, right_x = 0, w top_y, bottom_y = 0, h content_w, content_h = w, h logger.info(f"ColumnGeometry: content bounds x=[{left_x}..{right_x}] ({content_w}px), " f"y=[{top_y}..{bottom_y}] ({content_h}px)") # --- Step 2: Get word bounding boxes from Tesseract --- # Crop from left_x to full image width (not right_x) so words at the right # edge of the last column are included even if they extend past the detected # content boundary (right_x). content_roi = dewarped_bgr[top_y:bottom_y, left_x:w] pil_img = Image.fromarray(cv2.cvtColor(content_roi, cv2.COLOR_BGR2RGB)) try: data = pytesseract.image_to_data(pil_img, lang='eng+deu', output_type=pytesseract.Output.DICT) except Exception as e: logger.warning(f"ColumnGeometry: Tesseract image_to_data failed: {e}") return None word_dicts = [] left_edges = [] edge_word_indices = [] n_words = len(data['text']) for i in range(n_words): conf = int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1 text = str(data['text'][i]).strip() if conf < 30 or not text: continue lx = int(data['left'][i]) ty = int(data['top'][i]) bw = int(data['width'][i]) bh = int(data['height'][i]) left_edges.append(lx) edge_word_indices.append(len(word_dicts)) word_dicts.append({ 'text': text, 'conf': conf, 'left': lx, 'top': ty, 'width': bw, 'height': bh, }) if len(left_edges) < 5: logger.warning(f"ColumnGeometry: only {len(left_edges)} words detected") return None logger.info(f"ColumnGeometry: {len(left_edges)} words detected in content area") # --- Step 2b: Segment by sub-headers --- # Pages with sub-headers (e.g. "Unit 4: Bonnie Scotland") have full-width # text bands that pollute the vertical projection. We detect large # horizontal gaps (= whitespace rows separating sections) and use only # the tallest content segment for the projection. This makes column # detection immune to sub-headers, illustrations, and section dividers. content_strip = inv[top_y:bottom_y, left_x:right_x] h_proj_row = np.sum(content_strip, axis=1).astype(float) h_proj_row_norm = h_proj_row / (content_w * 255) if content_w > 0 else h_proj_row # Find horizontal gaps (near-empty rows) H_GAP_THRESH = 0.02 # rows with <2% ink density are "empty" h_in_gap = h_proj_row_norm < H_GAP_THRESH H_MIN_GAP = max(5, content_h // 200) # min gap height ~5-7px h_gaps: List[Tuple[int, int]] = [] h_gap_start = None for y_idx in range(len(h_in_gap)): if h_in_gap[y_idx]: if h_gap_start is None: h_gap_start = y_idx else: if h_gap_start is not None: if y_idx - h_gap_start >= H_MIN_GAP: h_gaps.append((h_gap_start, y_idx)) h_gap_start = None if h_gap_start is not None and len(h_in_gap) - h_gap_start >= H_MIN_GAP: h_gaps.append((h_gap_start, len(h_in_gap))) # Identify "large" gaps (significantly bigger than median) that indicate # section boundaries (sub-headers, chapter titles). if len(h_gaps) >= 3: gap_sizes = sorted(g[1] - g[0] for g in h_gaps) median_gap_h = gap_sizes[len(gap_sizes) // 2] large_gap_thresh = max(median_gap_h * 1.8, H_MIN_GAP + 3) large_gaps = [(gs, ge) for gs, ge in h_gaps if ge - gs >= large_gap_thresh] else: large_gaps = h_gaps # Build content segments between large gaps and pick the tallest seg_boundaries = [0] for gs, ge in large_gaps: seg_boundaries.append(gs) seg_boundaries.append(ge) seg_boundaries.append(content_h) segments = [] for i in range(0, len(seg_boundaries) - 1, 2): seg_top = seg_boundaries[i] seg_bot = seg_boundaries[i + 1] if i + 1 < len(seg_boundaries) else content_h seg_height = seg_bot - seg_top if seg_height > 20: # ignore tiny fragments segments.append((seg_top, seg_bot, seg_height)) if segments: segments.sort(key=lambda s: s[2], reverse=True) best_seg = segments[0] proj_strip = content_strip[best_seg[0]:best_seg[1], :] effective_h = best_seg[2] if len(segments) > 1: logger.info(f"ColumnGeometry: {len(segments)} segments from {len(large_gaps)} " f"large h-gaps, using tallest: rows {best_seg[0]}..{best_seg[1]} " f"({effective_h}px, {effective_h*100/content_h:.0f}%)") else: proj_strip = content_strip effective_h = content_h # --- Step 3: Vertical projection profile --- v_proj = np.sum(proj_strip, axis=0).astype(float) v_proj_norm = v_proj / (effective_h * 255) if effective_h > 0 else v_proj # Smooth the projection to avoid noise-induced micro-gaps kernel_size = max(5, content_w // 80) if kernel_size % 2 == 0: kernel_size += 1 # keep odd for symmetry v_smooth = np.convolve(v_proj_norm, np.ones(kernel_size) / kernel_size, mode='same') # --- Step 4: Find whitespace gaps --- # Threshold: areas with very little ink density are gaps median_density = float(np.median(v_smooth[v_smooth > 0])) if np.any(v_smooth > 0) else 0.01 gap_threshold = max(median_density * 0.15, 0.005) in_gap = v_smooth < gap_threshold MIN_GAP_WIDTH = max(8, content_w // 200) # min ~8px or 0.5% of content width # Collect contiguous gap regions raw_gaps = [] # (start_x_rel, end_x_rel) relative to content ROI gap_start = None for x in range(len(in_gap)): if in_gap[x]: if gap_start is None: gap_start = x else: if gap_start is not None: gap_width = x - gap_start if gap_width >= MIN_GAP_WIDTH: raw_gaps.append((gap_start, x)) gap_start = None # Handle gap at the right edge if gap_start is not None: gap_width = len(in_gap) - gap_start if gap_width >= MIN_GAP_WIDTH: raw_gaps.append((gap_start, len(in_gap))) logger.info(f"ColumnGeometry: {len(raw_gaps)} raw gaps found (threshold={gap_threshold:.4f}, " f"min_width={MIN_GAP_WIDTH}px): " f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in raw_gaps]}") # --- Step 5: Validate gaps against word bounding boxes --- # When using a segment for projection, only validate against words # inside that segment — words from sub-headers or other sections # would incorrectly overlap with real column gaps. if segments and len(segments) > 1: seg_top_abs = best_seg[0] # relative to content strip seg_bot_abs = best_seg[1] segment_words = [wd for wd in word_dicts if wd['top'] >= seg_top_abs and wd['top'] + wd['height'] <= seg_bot_abs] logger.info(f"ColumnGeometry: filtering words to segment: " f"{len(segment_words)}/{len(word_dicts)} words") else: segment_words = word_dicts validated_gaps = [] for gap_start_rel, gap_end_rel in raw_gaps: # Check if any word overlaps with this gap region overlapping = False for wd in segment_words: word_left = wd['left'] word_right = wd['left'] + wd['width'] if word_left < gap_end_rel and word_right > gap_start_rel: overlapping = True break if not overlapping: validated_gaps.append((gap_start_rel, gap_end_rel)) else: # Try to shift the gap to avoid the overlapping word(s) # Find the tightest word boundaries within the gap region min_word_left = content_w max_word_right = 0 for wd in segment_words: word_left = wd['left'] word_right = wd['left'] + wd['width'] if word_left < gap_end_rel and word_right > gap_start_rel: min_word_left = min(min_word_left, word_left) max_word_right = max(max_word_right, word_right) # Try gap before the overlapping words if min_word_left - gap_start_rel >= MIN_GAP_WIDTH: validated_gaps.append((gap_start_rel, min_word_left)) logger.debug(f"ColumnGeometry: gap shifted left to avoid word at {min_word_left}") # Try gap after the overlapping words elif gap_end_rel - max_word_right >= MIN_GAP_WIDTH: validated_gaps.append((max_word_right, gap_end_rel)) logger.debug(f"ColumnGeometry: gap shifted right to avoid word at {max_word_right}") else: logger.debug(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] " f"discarded (word overlap, no room to shift)") logger.info(f"ColumnGeometry: {len(validated_gaps)} gaps after word validation: " f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in validated_gaps]}") # --- Step 5b: Word-coverage gap detection (fallback for noisy scans) --- # When pixel-based projection fails (e.g. due to illustrations or colored # bands), use word bounding boxes to find clear vertical gaps. This is # immune to decorative graphics that Tesseract doesn't recognise as words. if len(validated_gaps) < 2: logger.info("ColumnGeometry: < 2 pixel-gaps, trying word-coverage gaps") word_coverage = np.zeros(content_w, dtype=np.int32) for wd in segment_words: wl = max(0, wd['left']) wr = min(wd['left'] + wd['width'], content_w) if wr > wl: word_coverage[wl:wr] += 1 # Smooth slightly to bridge tiny 1-2px noise gaps between words wc_kernel = max(3, content_w // 300) if wc_kernel % 2 == 0: wc_kernel += 1 wc_smooth = np.convolve(word_coverage.astype(float), np.ones(wc_kernel) / wc_kernel, mode='same') wc_in_gap = wc_smooth < 0.5 # effectively zero word coverage WC_MIN_GAP = max(4, content_w // 300) wc_gaps: List[Tuple[int, int]] = [] wc_gap_start = None for x in range(len(wc_in_gap)): if wc_in_gap[x]: if wc_gap_start is None: wc_gap_start = x else: if wc_gap_start is not None: if x - wc_gap_start >= WC_MIN_GAP: wc_gaps.append((wc_gap_start, x)) wc_gap_start = None if wc_gap_start is not None and len(wc_in_gap) - wc_gap_start >= WC_MIN_GAP: wc_gaps.append((wc_gap_start, len(wc_in_gap))) logger.info(f"ColumnGeometry: {len(wc_gaps)} word-coverage gaps found " f"(min_width={WC_MIN_GAP}px): " f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in wc_gaps]}") if len(wc_gaps) >= 2: validated_gaps = wc_gaps # --- Step 6: Fallback to clustering if too few gaps --- if len(validated_gaps) < 2: logger.info("ColumnGeometry: < 2 gaps found, falling back to clustering") return _detect_columns_by_clustering( word_dicts, left_edges, edge_word_indices, content_w, content_h, left_x, right_x, top_y, bottom_y, inv, ) # --- Step 7: Derive column boundaries from gaps --- # Sort gaps by position validated_gaps.sort(key=lambda g: g[0]) # Identify margin gaps (first and last) vs interior gaps # A margin gap touches the edge of the content area (within 2% tolerance) edge_tolerance = max(10, int(content_w * 0.02)) is_left_margin = validated_gaps[0][0] <= edge_tolerance is_right_margin = validated_gaps[-1][1] >= content_w - edge_tolerance # Interior gaps define column boundaries # Column starts at the end of a gap, ends at the start of the next gap col_starts = [] if is_left_margin: # First column starts after the left margin gap first_gap_end = validated_gaps[0][1] interior_gaps = validated_gaps[1:] else: # No left margin gap — first column starts at content left edge first_gap_end = 0 interior_gaps = validated_gaps[:] if is_right_margin: # Last gap is right margin — don't use it as column start interior_gaps_for_boundaries = interior_gaps[:-1] right_boundary = validated_gaps[-1][0] # last column ends at right margin gap start else: interior_gaps_for_boundaries = interior_gaps right_boundary = content_w # First column col_starts.append(left_x + first_gap_end) # Columns between interior gaps for gap_start_rel, gap_end_rel in interior_gaps_for_boundaries: col_starts.append(left_x + gap_end_rel) # Count words per column region (for logging) col_start_counts = [] for i, start_x in enumerate(col_starts): if i + 1 < len(col_starts): next_start = col_starts[i + 1] else: # Rightmost column always extends to full image width (w). # The page margin contains only white space — extending the OCR # crop to the image edge is safe and prevents text near the right # border from being cut off. next_start = w col_left_rel = start_x - left_x col_right_rel = next_start - left_x n_words_in_col = sum(1 for w in word_dicts if col_left_rel <= w['left'] < col_right_rel) col_start_counts.append((start_x, n_words_in_col)) logger.info(f"ColumnGeometry: {len(col_starts)} columns from {len(validated_gaps)} gaps " f"(left_margin={is_left_margin}, right_margin={is_right_margin}): " f"{col_start_counts}") # --- Step 8: Build ColumnGeometry objects --- # Determine right edge for each column all_boundaries = [] for i, start_x in enumerate(col_starts): if i + 1 < len(col_starts): end_x = col_starts[i + 1] else: # Rightmost column always extends to full image width (w). end_x = w all_boundaries.append((start_x, end_x)) geometries = [] for i, (start_x, end_x) in enumerate(all_boundaries): col_width = end_x - start_x col_left_rel = start_x - left_x col_right_rel = col_left_rel + col_width col_words = [w for w in word_dicts if col_left_rel <= w['left'] < col_right_rel] geometries.append(ColumnGeometry( index=i, x=start_x, y=top_y, width=col_width, height=content_h, word_count=len(col_words), words=col_words, width_ratio=col_width / content_w if content_w > 0 else 0.0, )) logger.info(f"ColumnGeometry: {len(geometries)} columns: " f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}") # --- Step 9: Filter phantom narrow columns --- # Tiny spurious gaps (e.g. 11px + 35px adjacent) can create very narrow # columns (< 3% of content width) with zero or no words. These are not # real columns — remove them and close the gap between neighbors. min_real_col_w = max(20, int(content_w * 0.03)) filtered_geoms = [g for g in geometries if not (g.word_count < 3 and g.width < min_real_col_w)] if len(filtered_geoms) < len(geometries): n_removed = len(geometries) - len(filtered_geoms) logger.info(f"ColumnGeometry: removed {n_removed} phantom column(s) " f"(width < {min_real_col_w}px and words < 3)") # Extend each remaining column to close gaps with its right neighbor for i, g in enumerate(filtered_geoms): if i + 1 < len(filtered_geoms): g.width = filtered_geoms[i + 1].x - g.x else: g.width = w - g.x g.index = i col_left_rel = g.x - left_x col_right_rel = col_left_rel + g.width g.words = [w for w in word_dicts if col_left_rel <= w['left'] < col_right_rel] g.word_count = len(g.words) geometries = filtered_geoms logger.info(f"ColumnGeometry: {len(geometries)} columns after phantom filter: " f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}") return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)