""" Position-based column type classification for OCR layout analysis. Contains Level 2 and Level 3 classification functions: Level 2 – _classify_by_position_enhanced: Position + language confirmation Level 3 – _classify_by_position_fallback: Pure positional (no regression) Extracted from cv_layout_classify.py during file-size split. """ import logging from typing import Dict, List, Optional from cv_vocab_types import ColumnGeometry, PageRegion logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Level 2: Position-Enhanced Classification # --------------------------------------------------------------------------- def _classify_by_position_enhanced(geometries: List[ColumnGeometry], lang_scores: List[Dict[str, float]], content_w: int, content_h: int) -> Optional[List[PageRegion]]: """Level 2: Position-based rules enhanced with language confirmation. Uses the old positional heuristics but confirms EN/DE assignment with language scores (swapping if needed). """ regions = [] untyped = list(range(len(geometries))) first_x = geometries[0].x if geometries else 0 left_20_threshold = first_x + content_w * 0.20 # Rule 1: Leftmost narrow column -> page_ref (only if in left 20%, no strong language) g0 = geometries[0] ls0 = lang_scores[0] has_strong_lang_0 = ls0['eng'] > 0.3 or ls0['deu'] > 0.3 if g0.width_ratio < 0.12 and g0.x < left_20_threshold and not has_strong_lang_0: regions.append(PageRegion( type='page_ref', x=g0.x, y=g0.y, width=g0.width, height=content_h, classification_confidence=0.8, classification_method='position_enhanced', )) untyped.remove(0) # Rule 2: Narrow columns with few words -> marker for i in list(untyped): geom = geometries[i] if geom.width_ratio < 0.06 and geom.word_count <= 15: regions.append(PageRegion( type='column_marker', x=geom.x, y=geom.y, width=geom.width, height=content_h, classification_confidence=0.7, classification_method='position_enhanced', )) untyped.remove(i) # Rule 3: Rightmost remaining -> column_example (if 3+ remaining) if len(untyped) >= 3: last_idx = untyped[-1] geom = geometries[last_idx] regions.append(PageRegion( type='column_example', x=geom.x, y=geom.y, width=geom.width, height=content_h, classification_confidence=0.7, classification_method='position_enhanced', )) untyped.remove(last_idx) # Rule 4: First two remaining -> EN/DE, but check language to possibly swap if len(untyped) >= 2: idx_a = untyped[0] idx_b = untyped[1] ls_a = lang_scores[idx_a] ls_b = lang_scores[idx_b] # Default: first=EN, second=DE (old behavior) en_idx, de_idx = idx_a, idx_b conf = 0.7 # Swap if language signals clearly indicate the opposite if ls_a['deu'] > ls_a['eng'] and ls_b['eng'] > ls_b['deu']: en_idx, de_idx = idx_b, idx_a conf = 0.85 logger.info(f"ClassifyColumns: Level 2 swapped EN/DE based on language scores") regions.append(PageRegion( type='column_en', x=geometries[en_idx].x, y=geometries[en_idx].y, width=geometries[en_idx].width, height=content_h, classification_confidence=conf, classification_method='position_enhanced', )) regions.append(PageRegion( type='column_de', x=geometries[de_idx].x, y=geometries[de_idx].y, width=geometries[de_idx].width, height=content_h, classification_confidence=conf, classification_method='position_enhanced', )) untyped = untyped[2:] elif len(untyped) == 1: idx = untyped[0] geom = geometries[idx] regions.append(PageRegion( type='column_en', x=geom.x, y=geom.y, width=geom.width, height=content_h, classification_confidence=0.5, classification_method='position_enhanced', )) untyped = [] # Remaining -> example for idx in untyped: geom = geometries[idx] regions.append(PageRegion( type='column_example', x=geom.x, y=geom.y, width=geom.width, height=content_h, classification_confidence=0.5, classification_method='position_enhanced', )) regions.sort(key=lambda r: r.x) return regions # --------------------------------------------------------------------------- # Level 3: Position Fallback Classification # --------------------------------------------------------------------------- def _classify_by_position_fallback(geometries: List[ColumnGeometry], content_w: int, content_h: int) -> List[PageRegion]: """Level 3: Pure position-based fallback (identical to old code). Guarantees no regression from the previous behavior. """ regions = [] untyped = list(range(len(geometries))) first_x = geometries[0].x if geometries else 0 left_20_threshold = first_x + content_w * 0.20 # Rule 1: Leftmost narrow column -> page_ref (only if in left 20%) g0 = geometries[0] if g0.width_ratio < 0.12 and g0.x < left_20_threshold: regions.append(PageRegion( type='page_ref', x=g0.x, y=g0.y, width=g0.width, height=content_h, classification_confidence=1.0, classification_method='position_fallback', )) untyped.remove(0) # Rule 2: Narrow + few words -> marker for i in list(untyped): geom = geometries[i] if geom.width_ratio < 0.06 and geom.word_count <= 15: regions.append(PageRegion( type='column_marker', x=geom.x, y=geom.y, width=geom.width, height=content_h, classification_confidence=1.0, classification_method='position_fallback', )) untyped.remove(i) # Rule 3: Rightmost remaining -> example (if 3+) if len(untyped) >= 3: last_idx = untyped[-1] geom = geometries[last_idx] regions.append(PageRegion( type='column_example', x=geom.x, y=geom.y, width=geom.width, height=content_h, classification_confidence=1.0, classification_method='position_fallback', )) untyped.remove(last_idx) # Rule 4: First remaining -> EN, second -> DE if len(untyped) >= 2: en_idx = untyped[0] de_idx = untyped[1] regions.append(PageRegion( type='column_en', x=geometries[en_idx].x, y=geometries[en_idx].y, width=geometries[en_idx].width, height=content_h, classification_confidence=1.0, classification_method='position_fallback', )) regions.append(PageRegion( type='column_de', x=geometries[de_idx].x, y=geometries[de_idx].y, width=geometries[de_idx].width, height=content_h, classification_confidence=1.0, classification_method='position_fallback', )) untyped = untyped[2:] elif len(untyped) == 1: idx = untyped[0] geom = geometries[idx] regions.append(PageRegion( type='column_en', x=geom.x, y=geom.y, width=geom.width, height=content_h, classification_confidence=1.0, classification_method='position_fallback', )) untyped = [] for idx in untyped: geom = geometries[idx] regions.append(PageRegion( type='column_example', x=geom.x, y=geom.y, width=geom.width, height=content_h, classification_confidence=1.0, classification_method='position_fallback', )) regions.sort(key=lambda r: r.x) return regions