""" Document type detection, image preparation, content bounds, and header/footer detection. Extracted from cv_layout.py — these are the "input-side" helpers that run before column/row geometry analysis. Lizenz: Apache 2.0 (kommerziell nutzbar) DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. """ import logging from typing import List, Optional, Tuple import numpy as np from cv_vocab_types import ( DocumentTypeResult, PageRegion, ) logger = logging.getLogger(__name__) try: import cv2 except ImportError: cv2 = None # type: ignore[assignment] # ============================================================================= # Document Type Detection # ============================================================================= def detect_document_type(ocr_img: np.ndarray, img_bgr: np.ndarray) -> DocumentTypeResult: """Detect whether the page is a vocab table, generic table, or full text. Uses projection profiles and text density analysis — no OCR required. Runs in < 2 seconds. Args: ocr_img: Binarized grayscale image (for projection profiles). img_bgr: BGR color image. Returns: DocumentTypeResult with doc_type, confidence, pipeline, skip_steps. """ if ocr_img is None or ocr_img.size == 0: return DocumentTypeResult( doc_type='full_text', confidence=0.5, pipeline='full_page', skip_steps=['columns', 'rows'], features={'error': 'empty image'}, ) h, w = ocr_img.shape[:2] # --- 1. Vertical projection profile → detect column gaps --- # Sum dark pixels along each column (x-axis). Gaps = valleys in the profile. # Invert: dark pixels on white background → high values = text. vert_proj = np.sum(ocr_img < 128, axis=0).astype(float) # Smooth the profile to avoid noise spikes kernel_size = max(3, w // 100) if kernel_size % 2 == 0: kernel_size += 1 vert_smooth = np.convolve(vert_proj, np.ones(kernel_size) / kernel_size, mode='same') # Find significant vertical gaps (columns of near-zero text density) # A gap must be at least 1% of image width and have < 5% of max density max_density = max(vert_smooth.max(), 1) gap_threshold = max_density * 0.05 min_gap_width = max(5, w // 100) in_gap = False gap_count = 0 gap_start = 0 vert_gaps = [] for x in range(w): if vert_smooth[x] < gap_threshold: if not in_gap: in_gap = True gap_start = x else: if in_gap: gap_width = x - gap_start if gap_width >= min_gap_width: gap_count += 1 vert_gaps.append((gap_start, x, gap_width)) in_gap = False # Filter out margin gaps (within 10% of image edges) margin_threshold = w * 0.10 internal_gaps = [g for g in vert_gaps if g[0] > margin_threshold and g[1] < w - margin_threshold] internal_gap_count = len(internal_gaps) # --- 2. Horizontal projection profile → detect row gaps --- horiz_proj = np.sum(ocr_img < 128, axis=1).astype(float) h_kernel = max(3, h // 200) if h_kernel % 2 == 0: h_kernel += 1 horiz_smooth = np.convolve(horiz_proj, np.ones(h_kernel) / h_kernel, mode='same') h_max = max(horiz_smooth.max(), 1) h_gap_threshold = h_max * 0.05 min_row_gap = max(3, h // 200) row_gap_count = 0 in_gap = False for y in range(h): if horiz_smooth[y] < h_gap_threshold: if not in_gap: in_gap = True gap_start = y else: if in_gap: if y - gap_start >= min_row_gap: row_gap_count += 1 in_gap = False # --- 3. Text density distribution (4×4 grid) --- grid_rows, grid_cols = 4, 4 cell_h, cell_w = h // grid_rows, w // grid_cols densities = [] for gr in range(grid_rows): for gc in range(grid_cols): cell = ocr_img[gr * cell_h:(gr + 1) * cell_h, gc * cell_w:(gc + 1) * cell_w] if cell.size > 0: d = float(np.count_nonzero(cell < 128)) / cell.size densities.append(d) density_std = float(np.std(densities)) if densities else 0 density_mean = float(np.mean(densities)) if densities else 0 features = { 'vertical_gaps': gap_count, 'internal_vertical_gaps': internal_gap_count, 'vertical_gap_details': [(g[0], g[1], g[2]) for g in vert_gaps[:10]], 'row_gaps': row_gap_count, 'density_mean': round(density_mean, 4), 'density_std': round(density_std, 4), 'image_size': (w, h), } # --- 4. Decision tree --- # Use internal_gap_count (excludes margin gaps) for column detection. if internal_gap_count >= 2 and row_gap_count >= 5: # Multiple internal vertical gaps + many row gaps → table confidence = min(0.95, 0.7 + internal_gap_count * 0.05 + row_gap_count * 0.005) return DocumentTypeResult( doc_type='vocab_table', confidence=round(confidence, 2), pipeline='cell_first', skip_steps=[], features=features, ) elif internal_gap_count >= 1 and row_gap_count >= 3: # Some internal structure, likely a table confidence = min(0.85, 0.5 + internal_gap_count * 0.1 + row_gap_count * 0.01) return DocumentTypeResult( doc_type='generic_table', confidence=round(confidence, 2), pipeline='cell_first', skip_steps=[], features=features, ) elif internal_gap_count == 0: # No internal column gaps → full text (regardless of density) confidence = min(0.95, 0.8 + (1 - min(density_std, 0.1)) * 0.15) return DocumentTypeResult( doc_type='full_text', confidence=round(confidence, 2), pipeline='full_page', skip_steps=['columns', 'rows'], features=features, ) else: # Ambiguous — default to vocab_table (most common use case) return DocumentTypeResult( doc_type='vocab_table', confidence=0.5, pipeline='cell_first', skip_steps=[], features=features, ) # ============================================================================= # Image Creation (Dual Image Preparation) # ============================================================================= def create_ocr_image(img: np.ndarray) -> np.ndarray: """Create a binarized image optimized for Tesseract OCR. Steps: Grayscale → Background normalization → Adaptive threshold → Denoise. Args: img: BGR image. Returns: Binary image (white text on black background inverted to black on white). """ gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) # Background normalization: divide by blurred version bg = cv2.GaussianBlur(gray, (51, 51), 0) normalized = cv2.divide(gray, bg, scale=255) # Adaptive binarization binary = cv2.adaptiveThreshold( normalized, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 31, 10 ) # Light denoise denoised = cv2.medianBlur(binary, 3) return denoised def create_layout_image(img: np.ndarray) -> np.ndarray: """Create a CLAHE-enhanced grayscale image for layout analysis. Args: img: BGR image. Returns: Enhanced grayscale image. """ gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)) enhanced = clahe.apply(gray) return enhanced # ============================================================================= # Content Bounds Detection # ============================================================================= def _filter_narrow_runs(mask: np.ndarray, min_width: int) -> np.ndarray: """Remove contiguous True-runs shorter than *min_width* from a 1-D bool mask.""" out = mask.copy() n = len(out) i = 0 while i < n: if out[i]: start = i while i < n and out[i]: i += 1 if (i - start) < min_width: out[start:i] = False else: i += 1 return out def _find_content_bounds(inv: np.ndarray) -> Tuple[int, int, int, int]: """Find the bounding box of actual text content (excluding page margins). Scan artefacts (thin black lines at page edges) are filtered out by discarding contiguous projection runs narrower than 1 % of the image dimension (min 5 px). Returns: Tuple of (left_x, right_x, top_y, bottom_y). """ h, w = inv.shape[:2] threshold = 0.005 # --- Horizontal projection for top/bottom --- h_proj = np.sum(inv, axis=1).astype(float) / (w * 255) h_mask = h_proj > threshold min_h_run = max(5, h // 100) h_mask = _filter_narrow_runs(h_mask, min_h_run) top_y = 0 for y in range(h): if h_mask[y]: top_y = max(0, y - 5) break bottom_y = h for y in range(h - 1, 0, -1): if h_mask[y]: bottom_y = min(h, y + 5) break # --- Vertical projection for left/right margins --- v_proj = np.sum(inv[top_y:bottom_y, :], axis=0).astype(float) v_proj_norm = v_proj / ((bottom_y - top_y) * 255) if (bottom_y - top_y) > 0 else v_proj v_mask = v_proj_norm > threshold min_v_run = max(5, w // 100) v_mask = _filter_narrow_runs(v_mask, min_v_run) left_x = 0 for x in range(w): if v_mask[x]: left_x = max(0, x - 2) break right_x = w for x in range(w - 1, 0, -1): if v_mask[x]: right_x = min(w, x + 2) break return left_x, right_x, top_y, bottom_y # ============================================================================= # Header / Footer Detection # ============================================================================= def _detect_header_footer_gaps( inv: np.ndarray, img_w: int, img_h: int, ) -> Tuple[Optional[int], Optional[int]]: """Detect header/footer boundaries via horizontal projection gap analysis. Scans the full-page inverted image for large horizontal gaps in the top/bottom 20% that separate header/footer content from the main body. Returns: (header_y, footer_y) — absolute y-coordinates. header_y = bottom edge of header region (None if no header detected). footer_y = top edge of footer region (None if no footer detected). """ HEADER_FOOTER_ZONE = 0.20 GAP_MULTIPLIER = 2.0 # Step 1: Horizontal projection — clamp to img_h to avoid dewarp padding actual_h = min(inv.shape[0], img_h) roi = inv[:actual_h, :] h_proj = np.sum(roi, axis=1).astype(float) proj_w = roi.shape[1] h_proj_norm = h_proj / (proj_w * 255) if proj_w > 0 else h_proj # Step 2: Smoothing kernel_size = max(3, actual_h // 200) if kernel_size % 2 == 0: kernel_size += 1 h_smooth = np.convolve(h_proj_norm, np.ones(kernel_size) / kernel_size, mode='same') # Step 3: Gap threshold positive = h_smooth[h_smooth > 0] median_density = float(np.median(positive)) if len(positive) > 0 else 0.01 gap_threshold = max(median_density * 0.15, 0.003) in_gap = h_smooth < gap_threshold MIN_GAP_HEIGHT = max(3, actual_h // 500) # Step 4: Collect contiguous gaps raw_gaps: List[Tuple[int, int]] = [] gap_start: Optional[int] = None for y in range(len(in_gap)): if in_gap[y]: if gap_start is None: gap_start = y else: if gap_start is not None: gap_height = y - gap_start if gap_height >= MIN_GAP_HEIGHT: raw_gaps.append((gap_start, y)) gap_start = None if gap_start is not None: gap_height = len(in_gap) - gap_start if gap_height >= MIN_GAP_HEIGHT: raw_gaps.append((gap_start, len(in_gap))) if not raw_gaps: return None, None # Step 5: Compute median gap size and large-gap threshold gap_sizes = [g[1] - g[0] for g in raw_gaps] median_gap = float(np.median(gap_sizes)) large_gap_threshold = median_gap * GAP_MULTIPLIER # Step 6: Find largest qualifying gap in header / footer zones # A separator gap must have content on BOTH sides — edge-touching gaps # (e.g. dewarp padding at bottom) are not valid separators. EDGE_MARGIN = max(5, actual_h // 400) header_zone_limit = int(actual_h * HEADER_FOOTER_ZONE) footer_zone_start = int(actual_h * (1.0 - HEADER_FOOTER_ZONE)) header_y: Optional[int] = None footer_y: Optional[int] = None best_header_size = 0 for gs, ge in raw_gaps: if gs <= EDGE_MARGIN: continue # skip gaps touching the top edge gap_mid = (gs + ge) / 2 gap_size = ge - gs if gap_mid < header_zone_limit and gap_size > large_gap_threshold: if gap_size > best_header_size: best_header_size = gap_size header_y = ge # bottom edge of gap best_footer_size = 0 for gs, ge in raw_gaps: if ge >= actual_h - EDGE_MARGIN: continue # skip gaps touching the bottom edge gap_mid = (gs + ge) / 2 gap_size = ge - gs if gap_mid > footer_zone_start and gap_size > large_gap_threshold: if gap_size > best_footer_size: best_footer_size = gap_size footer_y = gs # top edge of gap if header_y is not None: logger.info(f"HeaderFooterGaps: header boundary at y={header_y} " f"(gap={best_header_size}px, median_gap={median_gap:.0f}px)") if footer_y is not None: logger.info(f"HeaderFooterGaps: footer boundary at y={footer_y} " f"(gap={best_footer_size}px, median_gap={median_gap:.0f}px)") return header_y, footer_y def _region_has_content(inv: np.ndarray, y_start: int, y_end: int, min_density: float = 0.005) -> bool: """Check whether a horizontal strip contains meaningful ink. Args: inv: Inverted binarized image (white-on-black). y_start: Top of the region (inclusive). y_end: Bottom of the region (exclusive). min_density: Fraction of white pixels required to count as content. Returns: True if the region contains text/graphics, False if empty margin. """ if y_start >= y_end: return False strip = inv[y_start:y_end, :] density = float(np.sum(strip)) / (strip.shape[0] * strip.shape[1] * 255) return density > min_density def _add_header_footer(regions: List[PageRegion], top_y: int, bottom_y: int, img_w: int, img_h: int, inv: Optional[np.ndarray] = None) -> None: """Add header/footer/margin regions in-place. Uses gap-based detection when *inv* is provided, otherwise falls back to simple top_y/bottom_y bounds. Region types depend on whether there is actual content (text/graphics): - 'header' / 'footer' — region contains text (e.g. title, page number) - 'margin_top' / 'margin_bottom' — region is empty page margin """ header_y: Optional[int] = None footer_y: Optional[int] = None if inv is not None: header_y, footer_y = _detect_header_footer_gaps(inv, img_w, img_h) # --- Top region --- top_boundary = header_y if header_y is not None and header_y > 10 else ( top_y if top_y > 10 else None ) if top_boundary is not None: has_content = inv is not None and _region_has_content(inv, 0, top_boundary) rtype = 'header' if has_content else 'margin_top' regions.append(PageRegion(type=rtype, x=0, y=0, width=img_w, height=top_boundary)) logger.info(f"HeaderFooter: top region type={rtype} height={top_boundary}px " f"(has_content={has_content})") # --- Bottom region --- bottom_boundary = footer_y if footer_y is not None and footer_y < img_h - 10 else ( bottom_y if bottom_y < img_h - 10 else None ) if bottom_boundary is not None: has_content = inv is not None and _region_has_content(inv, bottom_boundary, img_h) rtype = 'footer' if has_content else 'margin_bottom' regions.append(PageRegion(type=rtype, x=0, y=bottom_boundary, width=img_w, height=img_h - bottom_boundary)) logger.info(f"HeaderFooter: bottom region type={rtype} y={bottom_boundary} " f"height={img_h - bottom_boundary}px (has_content={has_content})")