"""
Legacy layout analysis using projection profiles.

Extracted from cv_layout_columns.py — contains:
- analyze_layout() (projection-profile based column/header/footer detection)

License: Apache 2.0 (commercial use permitted)
PRIVACY: All processing is performed locally.
"""

import logging
from typing import List, NamedTuple, Optional, Tuple

import numpy as np

from cv_vocab_types import PageRegion
from cv_layout_detection import _find_content_bounds

logger = logging.getLogger(__name__)

try:
    import cv2
except ImportError:
    cv2 = None  # type: ignore[assignment]


class _Valley(NamedTuple):
    """A low-ink run in the vertical projection (content-relative x coords).

    Kept as a tuple subclass so positional indexing (``v[0]`` .. ``v[4]``)
    keeps working in the log statements below.
    """

    start: int    # first column of the run (inclusive)
    end: int      # column just past the run (exclusive)
    center: int   # (start + end) // 2
    width: int    # end - start
    depth: float  # minimum (or, for segments, mean) profile value in the run


def _invert_image(img: np.ndarray) -> np.ndarray:
    """Return the bitwise inverse of *img*.

    Uses OpenCV when it is importable. The module tolerates a missing ``cv2``
    at import time, so fall back to NumPy instead of failing with an
    ``AttributeError`` on ``None``; for uint8 images both give ``255 - value``.
    """
    if cv2 is not None:
        return cv2.bitwise_not(img)
    return np.bitwise_not(img)


def _find_valleys(profile: np.ndarray, threshold: float) -> List[_Valley]:
    """Collect contiguous runs of ``profile < threshold`` at least 3 px wide.

    A run is only recorded when a value at or above the threshold closes it.
    A run that is still open at the end of the profile is therefore not
    reported.
    """
    found: List[_Valley] = []
    start = None
    for x, below in enumerate(profile < threshold):
        if below:
            if start is None:
                start = x
        elif start is not None:
            width = x - start
            # Valley must be at least 3px wide
            if width >= 3:
                depth = float(np.min(profile[start:x]))
                found.append(_Valley(start, x, (start + x) // 2, width, depth))
            start = None
    return found


def _pick_separator_pair(
    valleys: List[_Valley],
    content_w: int,
    min_col_frac: float,
    min_gap_frac: float = 0.0,
    width_bonus: float = 0.0,
) -> Optional[Tuple[_Valley, _Valley]]:
    """Choose the two valleys that split the content into the most even thirds.

    Args:
        valleys: Candidates, already sorted by ``center`` (ascending).
        content_w: Width of the content area in pixels.
        min_col_frac: Each resulting column must be at least this fraction
            of ``content_w`` wide.
        min_gap_frac: The two centers must be at least this fraction of
            ``content_w`` apart.
        width_bonus: Score reduction per pixel of combined valley width, so
            wider valleys (more likely real separators) are preferred.

    Returns:
        The best ``(left, right)`` pair, or ``None`` if no pair satisfies the
        constraints. Lower score wins; on ties the first pair found is kept.
    """
    min_col = content_w * min_col_frac
    min_gap = content_w * min_gap_frac

    best_pair = None
    best_score = float('inf')
    for i, left in enumerate(valleys):
        for right in valleys[i + 1:]:
            c1, c2 = left.center, right.center
            if (c2 - c1) < min_gap:
                continue
            col1 = c1
            col2 = c2 - c1
            col3 = content_w - c2
            if col1 < min_col or col2 < min_col or col3 < min_col:
                continue
            # Score: lower is better (more even distribution)
            score = max(col1, col2, col3) - min(col1, col2, col3)
            score -= (left.width + right.width) * width_bonus
            if score < best_score:
                best_score = score
                best_pair = (left, right)
    return best_pair


def _local_minima_pair(
    profile: np.ndarray,
    content_w: int,
    profile_mean: float,
    left_x: int,
) -> Optional[Tuple[_Valley, _Valley]]:
    """Fallback separator search using the lowest of 20 equal-width segments.

    Segments in the two outermost positions on either side are ignored, and a
    segment only qualifies if its mean is below 60% of ``profile_mean``.
    ``left_x`` is used only to log absolute coordinates.

    Returns:
        A pair of segment "valleys" that are at least 20% of the content
        width apart and leave every column at least 12% wide, or ``None``.
    """
    seg_count = 20
    seg_width = content_w // seg_count
    if seg_width <= 0:
        # Content narrower than seg_count pixels: every segment would be an
        # empty slice (NaN mean), so no candidate could qualify anyway.
        return None

    seg_scores = []
    for i in range(seg_count):
        sx = i * seg_width
        ex = min((i + 1) * seg_width, content_w)
        seg_scores.append((i, sx, ex, float(np.mean(profile[sx:ex]))))

    seg_scores.sort(key=lambda s: s[3])
    logger.info(f"Layout: segment scores (lowest 5): "
                f"{[(s[0], s[1]+left_x, s[2]+left_x, f'{s[3]:.4f}') for s in seg_scores[:5]]}")

    candidates = [
        _Valley(sx, ex, (sx + ex) // 2, ex - sx, seg_mean)
        for seg_idx, sx, ex, seg_mean in seg_scores
        # Must not be at the edges, and must be significantly lower than
        # the overall mean
        if 1 < seg_idx < seg_count - 2 and seg_mean < profile_mean * 0.6
    ]
    if len(candidates) < 2:
        return None

    candidates.sort(key=lambda v: v.center)
    # Centers at least 20% apart, each column at least 12% of content width
    return _pick_separator_pair(
        candidates, content_w, min_col_frac=0.12, min_gap_frac=0.2
    )


def analyze_layout(layout_img: np.ndarray, ocr_img: np.ndarray) -> List[PageRegion]:
    """Detect columns, header, and footer using projection profiles.

    Uses content-bounds detection to exclude page margins before searching
    for column separators within the actual text area. Two separators give
    three columns (``column_en``, ``column_de``, ``column_example``), one
    gives two, none gives a single full-width ``column_en``. Column regions
    span the full page width horizontally (first column starts at x=0, last
    ends at the image width) and the content area vertically.

    Args:
        layout_img: CLAHE-enhanced grayscale image. Not referenced by this
            function; kept so the signature stays stable for callers.
        ocr_img: Binarized image for text density analysis.

    Returns:
        List of PageRegion objects describing detected regions.
    """
    h, w = ocr_img.shape[:2]

    # Invert: black text on white → white text on black for projection
    inv = _invert_image(ocr_img)

    # --- Find actual content bounds (exclude page margins) ---
    left_x, right_x, top_y, bottom_y = _find_content_bounds(inv)
    content_w = right_x - left_x
    content_h = bottom_y - top_y

    logger.info(f"Layout: content bounds x=[{left_x}..{right_x}] ({content_w}px), "
                f"y=[{top_y}..{bottom_y}] ({content_h}px) in {w}x{h} image")

    if content_w < w * 0.3 or content_h < h * 0.3:
        # Fallback if detection seems wrong
        left_x, right_x = 0, w
        top_y, bottom_y = 0, h
        content_w, content_h = w, h

    # --- Vertical projection within content area to find column separators ---
    content_strip = inv[top_y:bottom_y, left_x:right_x]
    v_proj = np.sum(content_strip, axis=0).astype(float)
    # Normalise to the fraction of "ink" per pixel column (0..1 for 8-bit input)
    v_proj_norm = v_proj / (content_h * 255) if content_h > 0 else v_proj

    # Smooth the projection profile with an odd-sized box filter
    kernel_size = max(5, content_w // 50)
    if kernel_size % 2 == 0:
        kernel_size += 1
    v_proj_smooth = np.convolve(v_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')

    # Debug: log projection profile statistics
    p_mean = float(np.mean(v_proj_smooth))
    p_median = float(np.median(v_proj_smooth))
    p_min = float(np.min(v_proj_smooth))
    p_max = float(np.max(v_proj_smooth))
    logger.info(f"Layout: v_proj stats — min={p_min:.4f}, max={p_max:.4f}, "
                f"mean={p_mean:.4f}, median={p_median:.4f}")

    # Strategy 1: threshold relative to median/mean (catches clear separators)
    threshold = max(p_median * 0.3, p_mean * 0.2)
    logger.info(f"Layout: valley threshold={threshold:.4f}")

    all_valleys = _find_valleys(v_proj_smooth, threshold)

    logger.info(f"Layout: raw valleys (before filter): {len(all_valleys)} — "
                f"{[(v[0]+left_x, v[1]+left_x, v[3], f'{v[4]:.4f}') for v in all_valleys[:10]]}")

    # Filter: valleys must be inside the content area (not at edges)
    inner_margin = int(content_w * 0.08)
    valleys = [v for v in all_valleys
               if inner_margin < v.center < content_w - inner_margin]

    # Strategy 2: if fewer than two separators were found, try the
    # segment-based local minima approach (catches subtle gaps)
    if len(valleys) < 2:
        logger.info("Layout: trying local minima approach for column detection")
        pair = _local_minima_pair(v_proj_smooth, content_w, p_mean, left_x)
        if pair is not None:
            valleys = list(pair)
            logger.info(f"Layout: local minima found 2 valleys: "
                        f"{[(v[0]+left_x, v[1]+left_x, v[3]) for v in valleys]}")

    logger.info(f"Layout: final {len(valleys)} valleys: "
                f"{[(v[0]+left_x, v[1]+left_x, v[3]) for v in valleys]}")

    regions = []

    if len(valleys) >= 2:
        # 3-column layout detected
        valleys.sort(key=lambda v: v.center)

        if len(valleys) == 2:
            sep1_center = valleys[0].center
            sep2_center = valleys[1].center
        else:
            # Pick the two valleys that best divide into 3 parts: each column
            # at least 15% of content width, preferring wider valleys.
            pair = _pick_separator_pair(
                valleys, content_w, min_col_frac=0.15, width_bonus=0.5
            )
            if pair is None:
                # No pair met the constraints — use the two leftmost valleys
                pair = (valleys[0], valleys[1])
            sep1_center = pair[0].center
            sep2_center = pair[1].center

        # Convert from content-relative to absolute coordinates
        abs_sep1 = sep1_center + left_x
        abs_sep2 = sep2_center + left_x

        logger.info(f"Layout: 3 columns at separators x={abs_sep1}, x={abs_sep2} "
                    f"(widths: {abs_sep1}, {abs_sep2-abs_sep1}, {w-abs_sep2})")

        regions.append(PageRegion(
            type='column_en', x=0, y=top_y,
            width=abs_sep1, height=content_h
        ))
        regions.append(PageRegion(
            type='column_de', x=abs_sep1, y=top_y,
            width=abs_sep2 - abs_sep1, height=content_h
        ))
        regions.append(PageRegion(
            type='column_example', x=abs_sep2, y=top_y,
            width=w - abs_sep2, height=content_h
        ))

    elif len(valleys) == 1:
        # 2-column layout
        abs_sep = valleys[0].center + left_x

        logger.info(f"Layout: 2 columns at separator x={abs_sep}")

        regions.append(PageRegion(
            type='column_en', x=0, y=top_y,
            width=abs_sep, height=content_h
        ))
        regions.append(PageRegion(
            type='column_de', x=abs_sep, y=top_y,
            width=w - abs_sep, height=content_h
        ))

    else:
        # No columns detected — run full-page OCR as single column
        logger.warning("Layout: no column separators found, using full page")
        regions.append(PageRegion(
            type='column_en', x=0, y=top_y,
            width=w, height=content_h
        ))

    # Add header/footer info (gap-based detection with fallback)
    # Lazy import to avoid circular dependency with cv_layout.py
    from cv_layout_detection import _add_header_footer
    # NOTE(review): presumably appends header/footer/margin regions to
    # `regions` in place — confirm against cv_layout_detection.
    _add_header_footer(regions, top_y, bottom_y, w, h, inv=inv)

    top_region = next((r.type for r in regions if r.type in ('header', 'margin_top')), 'none')
    bottom_region = next((r.type for r in regions if r.type in ('footer', 'margin_bottom')), 'none')
    col_count = sum(1 for r in regions if r.type.startswith('column'))
    logger.info(f"Layout: {col_count} columns, top={top_region}, bottom={bottom_region}")

    return regions