""" Post-processing refinements for column geometry. Extracted from cv_layout_columns.py — contains: - _detect_sub_columns() (sub-column detection via left-edge alignment) - _split_broad_columns() (broad column splitting via word-coverage gaps) - expand_narrow_columns() (narrow column expansion into whitespace) Lizenz: Apache 2.0 (kommerziell nutzbar) DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. """ import logging import statistics from typing import Dict, List, Optional, Tuple import numpy as np from cv_vocab_types import ColumnGeometry logger = logging.getLogger(__name__) def _detect_sub_columns( geometries: List[ColumnGeometry], content_w: int, left_x: int = 0, top_y: int = 0, header_y: Optional[int] = None, footer_y: Optional[int] = None, _edge_tolerance: int = 8, _min_col_start_ratio: float = 0.10, ) -> List[ColumnGeometry]: """Split columns that contain internal sub-columns based on left-edge alignment. For each column, clusters word left-edges into alignment bins (within ``_edge_tolerance`` px). The leftmost bin whose word count reaches ``_min_col_start_ratio`` of the column total is treated as the true column start. Any words to the left of that bin form a sub-column, provided they number >= 2 and < 35 % of total. Word ``left`` values are relative to the content ROI (offset by *left_x*), while ``ColumnGeometry.x`` is in absolute image coordinates. *left_x* bridges the two coordinate systems. If *header_y* / *footer_y* are provided (absolute y-coordinates), words in header/footer regions are excluded from alignment clustering to avoid polluting the bins with page numbers or chapter titles. Word ``top`` values are relative to *top_y*. Returns a new list of ColumnGeometry — potentially longer than the input. """ if content_w <= 0: return geometries result: List[ColumnGeometry] = [] for geo in geometries: # Only consider wide-enough columns with enough words if geo.width_ratio < 0.15 or geo.word_count < 5: result.append(geo) continue # Collect left-edges of confident words, excluding header/footer # Convert header_y/footer_y from absolute to relative (word 'top' is relative to top_y) min_top_rel = (header_y - top_y) if header_y is not None else None max_top_rel = (footer_y - top_y) if footer_y is not None else None confident = [w for w in geo.words if w.get('conf', 0) >= 30 and (min_top_rel is None or w['top'] >= min_top_rel) and (max_top_rel is None or w['top'] <= max_top_rel)] if len(confident) < 3: result.append(geo) continue # --- Cluster left-edges into alignment bins --- sorted_edges = sorted(w['left'] for w in confident) bins: List[Tuple[int, int, int, int]] = [] # (center, count, min_edge, max_edge) cur = [sorted_edges[0]] for i in range(1, len(sorted_edges)): if sorted_edges[i] - cur[-1] <= _edge_tolerance: cur.append(sorted_edges[i]) else: bins.append((sum(cur) // len(cur), len(cur), min(cur), max(cur))) cur = [sorted_edges[i]] bins.append((sum(cur) // len(cur), len(cur), min(cur), max(cur))) # --- Find the leftmost bin qualifying as a real column start --- total = len(confident) min_count = max(3, int(total * _min_col_start_ratio)) col_start_bin = None for b in bins: if b[1] >= min_count: col_start_bin = b break if col_start_bin is None: result.append(geo) continue # Words to the left of the column-start bin are sub-column candidates split_threshold = col_start_bin[2] - _edge_tolerance sub_words = [w for w in geo.words if w['left'] < split_threshold] main_words = [w for w in geo.words if w['left'] >= split_threshold] # Count only body words (excluding header/footer) for the threshold check # so that header/footer words don't artificially trigger a split. sub_body = [w for w in sub_words if (min_top_rel is None or w['top'] >= min_top_rel) and (max_top_rel is None or w['top'] <= max_top_rel)] if len(sub_body) < 2 or len(sub_body) / len(geo.words) >= 0.35: result.append(geo) continue # --- Guard against inline markers (bullet points, numbering) --- # Bullet points like "1.", "2.", "•", "-" sit close to the main # column text and are part of the cell, not a separate column. # Only split if the horizontal gap between the rightmost sub-word # and the main column start is large enough. max_sub_right = max(w['left'] + w.get('width', 0) for w in sub_words) gap_to_main = col_start_bin[2] - max_sub_right # px gap median_heights = [w.get('height', 20) for w in confident] med_h = statistics.median(median_heights) if median_heights else 20 min_gap = max(med_h * 1.2, 20) # at least 1.2× word height or 20px if gap_to_main < min_gap: logger.debug( "SubColumnSplit: column idx=%d skipped — gap=%dpx < min=%dpx " "(likely inline markers, not a sub-column)", geo.index, gap_to_main, min_gap) result.append(geo) continue # --- Build two sub-column geometries --- # Word 'left' values are relative to left_x; geo.x is absolute. # Convert the split position from relative to absolute coordinates. max_sub_left = max(w['left'] for w in sub_words) split_rel = (max_sub_left + col_start_bin[2]) // 2 split_abs = split_rel + left_x sub_x = geo.x sub_width = split_abs - geo.x main_x = split_abs main_width = (geo.x + geo.width) - split_abs if sub_width <= 0 or main_width <= 0: result.append(geo) continue sub_geo = ColumnGeometry( index=0, x=sub_x, y=geo.y, width=sub_width, height=geo.height, word_count=len(sub_words), words=sub_words, width_ratio=sub_width / content_w if content_w > 0 else 0.0, is_sub_column=True, ) main_geo = ColumnGeometry( index=0, x=main_x, y=geo.y, width=main_width, height=geo.height, word_count=len(main_words), words=main_words, width_ratio=main_width / content_w if content_w > 0 else 0.0, is_sub_column=True, ) result.append(sub_geo) result.append(main_geo) logger.info( f"SubColumnSplit: column idx={geo.index} split at abs_x={split_abs} " f"(rel={split_rel}), sub={len(sub_words)} words, " f"main={len(main_words)} words, " f"col_start_bin=({col_start_bin[0]}, n={col_start_bin[1]})" ) # Re-index by left-to-right order result.sort(key=lambda g: g.x) for i, g in enumerate(result): g.index = i return result def _split_broad_columns( geometries: List[ColumnGeometry], content_w: int, left_x: int = 0, _broad_threshold: float = 0.35, _min_gap_px: int = 15, _min_words_per_split: int = 5, ) -> List[ColumnGeometry]: """Split overly broad columns that contain two language blocks (EN+DE). Uses word-coverage gap analysis: builds a per-pixel coverage array from the words inside each broad column, finds the largest horizontal gap, and splits the column at that gap. Args: geometries: Column geometries from _detect_sub_columns. content_w: Width of the content area in pixels. left_x: Left edge of content ROI in absolute image coordinates. _broad_threshold: Minimum width_ratio to consider a column "broad". _min_gap_px: Minimum gap width (pixels) to trigger a split. _min_words_per_split: Both halves must have at least this many words. Returns: Updated list of ColumnGeometry (possibly with more columns). """ result: List[ColumnGeometry] = [] logger.info(f"SplitBroadCols: input {len(geometries)} cols: " f"{[(g.index, g.x, g.width, g.word_count, round(g.width_ratio, 3)) for g in geometries]}") for geo in geometries: if geo.width_ratio <= _broad_threshold or len(geo.words) < 10: result.append(geo) continue # Build word-coverage array (per pixel within column) col_left_rel = geo.x - left_x # column left in content-relative coords coverage = np.zeros(geo.width, dtype=np.float32) for wd in geo.words: # wd['left'] is relative to left_x (content ROI) wl = wd['left'] - col_left_rel wr = wl + wd.get('width', 0) wl = max(0, int(wl)) wr = min(geo.width, int(wr)) if wr > wl: coverage[wl:wr] += 1.0 # Light smoothing (kernel=3px) to avoid noise if len(coverage) > 3: kernel = np.ones(3, dtype=np.float32) / 3.0 coverage = np.convolve(coverage, kernel, mode='same') # Normalise to [0, 1] cmax = coverage.max() if cmax > 0: coverage /= cmax # Find INTERNAL gaps where coverage < 0.5 # Exclude edge gaps (touching pixel 0 or geo.width) — those are margins. low_mask = coverage < 0.5 all_gaps = [] _gs = None for px in range(len(low_mask)): if low_mask[px]: if _gs is None: _gs = px else: if _gs is not None: all_gaps.append((_gs, px, px - _gs)) _gs = None if _gs is not None: all_gaps.append((_gs, len(low_mask), len(low_mask) - _gs)) # Filter: only internal gaps (not touching column edges) _edge_margin = 10 # pixels from edge to ignore internal_gaps = [g for g in all_gaps if g[0] > _edge_margin and g[1] < geo.width - _edge_margin] best_gap = max(internal_gaps, key=lambda g: g[2]) if internal_gaps else None logger.info(f"SplitBroadCols: col {geo.index} all_gaps(>=5px): " f"{[g for g in all_gaps if g[2] >= 5]}, " f"internal(>=5px): {[g for g in internal_gaps if g[2] >= 5]}, " f"best={best_gap}") if best_gap is None or best_gap[2] < _min_gap_px: result.append(geo) continue gap_center = (best_gap[0] + best_gap[1]) // 2 # Split words by midpoint relative to gap left_words = [] right_words = [] for wd in geo.words: wl = wd['left'] - col_left_rel mid = wl + wd.get('width', 0) / 2.0 if mid < gap_center: left_words.append(wd) else: right_words.append(wd) if len(left_words) < _min_words_per_split or len(right_words) < _min_words_per_split: result.append(geo) continue # Build two new ColumnGeometry objects split_x_abs = geo.x + gap_center left_w = gap_center right_w = geo.width - gap_center left_geo = ColumnGeometry( index=0, x=geo.x, y=geo.y, width=left_w, height=geo.height, word_count=len(left_words), words=left_words, width_ratio=left_w / content_w if content_w else 0, is_sub_column=True, ) right_geo = ColumnGeometry( index=0, x=split_x_abs, y=geo.y, width=right_w, height=geo.height, word_count=len(right_words), words=right_words, width_ratio=right_w / content_w if content_w else 0, is_sub_column=True, ) logger.info( f"SplitBroadCols: col {geo.index} SPLIT at gap_center={gap_center} " f"(gap {best_gap[2]}px @ [{best_gap[0]}..{best_gap[1]}]), " f"left={len(left_words)} words (w={left_w}), " f"right={len(right_words)} words (w={right_w})" ) result.append(left_geo) result.append(right_geo) # Re-index left-to-right result.sort(key=lambda g: g.x) for i, g in enumerate(result): g.index = i return result def expand_narrow_columns( geometries: List[ColumnGeometry], content_w: int, left_x: int, word_dicts: List[Dict], ) -> List[ColumnGeometry]: """Expand narrow columns into adjacent whitespace gaps. Narrow columns (marker, page_ref, < 10% content width) often lose content at image edges due to residual shear. This expands them toward the neighbouring column, but never past 40% of the gap or past the nearest word in the neighbour. Must be called AFTER _detect_sub_columns() so that sub-column splits (which create the narrowest columns) have already happened. """ _NARROW_THRESHOLD_PCT = 10.0 _MIN_WORD_MARGIN = 4 if len(geometries) < 2: return geometries logger.info("ExpandNarrowCols: input %d cols: %s", len(geometries), [(i, g.x, g.width, round(g.width / content_w * 100, 1)) for i, g in enumerate(geometries)]) for i, g in enumerate(geometries): col_pct = g.width / content_w * 100 if content_w > 0 else 100 if col_pct >= _NARROW_THRESHOLD_PCT: continue expanded = False orig_pct = col_pct # --- try expanding to the LEFT --- if i > 0: left_nb = geometries[i - 1] # Gap can be 0 if sub-column split created adjacent columns. # In that case, look at where the neighbor's rightmost words # actually are — there may be unused space we can claim. nb_words_right = [wd['left'] + wd.get('width', 0) for wd in left_nb.words] if nb_words_right: rightmost_word_abs = left_x + max(nb_words_right) safe_left_abs = rightmost_word_abs + _MIN_WORD_MARGIN else: # No words in neighbor → we can take up to neighbor's start safe_left_abs = left_nb.x + _MIN_WORD_MARGIN if safe_left_abs < g.x: g.width += (g.x - safe_left_abs) g.x = safe_left_abs expanded = True # --- try expanding to the RIGHT --- if i + 1 < len(geometries): right_nb = geometries[i + 1] nb_words_left = [wd['left'] for wd in right_nb.words] if nb_words_left: leftmost_word_abs = left_x + min(nb_words_left) safe_right_abs = leftmost_word_abs - _MIN_WORD_MARGIN else: safe_right_abs = right_nb.x + right_nb.width - _MIN_WORD_MARGIN cur_right = g.x + g.width if safe_right_abs > cur_right: g.width = safe_right_abs - g.x expanded = True if expanded: col_left_rel = g.x - left_x col_right_rel = col_left_rel + g.width g.words = [wd for wd in word_dicts if col_left_rel <= wd['left'] < col_right_rel] g.word_count = len(g.words) g.width_ratio = g.width / content_w if content_w > 0 else 0.0 logger.info( "ExpandNarrowCols: col %d (%.1f%% → %.1f%%) x=%d w=%d words=%d", i, orig_pct, g.width / content_w * 100, g.x, g.width, g.word_count) # --- Shrink overlapping neighbors to match new boundaries --- # Left neighbor: its right edge must not exceed our new left edge if i > 0: left_nb = geometries[i - 1] nb_right = left_nb.x + left_nb.width if nb_right > g.x: left_nb.width = g.x - left_nb.x if left_nb.width < 0: left_nb.width = 0 left_nb.width_ratio = left_nb.width / content_w if content_w > 0 else 0.0 # Re-assign words nb_left_rel = left_nb.x - left_x nb_right_rel = nb_left_rel + left_nb.width left_nb.words = [wd for wd in word_dicts if nb_left_rel <= wd['left'] < nb_right_rel] left_nb.word_count = len(left_nb.words) # Right neighbor: its left edge must not be before our new right edge if i + 1 < len(geometries): right_nb = geometries[i + 1] my_right = g.x + g.width if right_nb.x < my_right: old_right_edge = right_nb.x + right_nb.width right_nb.x = my_right right_nb.width = old_right_edge - right_nb.x if right_nb.width < 0: right_nb.width = 0 right_nb.width_ratio = right_nb.width / content_w if content_w > 0 else 0.0 # Re-assign words nb_left_rel = right_nb.x - left_x nb_right_rel = nb_left_rel + right_nb.width right_nb.words = [wd for wd in word_dicts if nb_left_rel <= wd['left'] < nb_right_rel] right_nb.word_count = len(right_nb.words) return geometries