From db8327f039c151c3a18a8d444d5360ddbbe292d7 Mon Sep 17 00:00:00 2001
From: Benjamin Admin
Date: Fri, 27 Feb 2026 23:16:31 +0100
Subject: [PATCH] fix(ocr-pipeline): tune column detection based on GT
 comparison

Address 5 weaknesses found via ground-truth comparison on session df3548d1:

- Add column_ignore for edge columns with < 3 words (margin detection)
- Absorb tiny clusters (< 5% width) into neighbors post-merge
- Restrict page_ref to left 35% of content area across all 3 levels
- Loosen marker thresholds (width < 6%, words <= 15) and add strong marker
  score for very narrow non-edge columns (< 4%)
- Add EN/DE position tiebreaker when language signals are both weak

Co-Authored-By: Claude Opus 4.6
---
 klausur-service/backend/cv_vocab_pipeline.py | 147 +++++++++++++++++--
 1 file changed, 134 insertions(+), 13 deletions(-)

diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index b1820bd..790e960 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -1031,6 +1031,33 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
         else:
             merged.append(s.copy())
 
+    # --- Post-merge: absorb tiny clusters (< 5% content width) into neighbors ---
+    i = 0
+    absorbed_count = 0
+    while i < len(merged) and len(merged) > 3:
+        if i + 1 < len(merged):
+            cluster_w = merged[i + 1]['mean_x'] - merged[i]['mean_x']
+        else:
+            cluster_w = content_w - (merged[i]['mean_x'] - merged[0]['mean_x'])
+        if cluster_w / content_w < 0.05:
+            # Absorb into neighbor (prefer left)
+            if i > 0:
+                target = merged[i - 1]
+            else:
+                target = merged[i + 1]
+            target['count'] += merged[i]['count']
+            target['min_edge'] = min(target['min_edge'], merged[i]['min_edge'])
+            target['max_edge'] = max(target['max_edge'], merged[i]['max_edge'])
+            target['y_min'] = min(target['y_min'], merged[i]['y_min'])
+            target['y_max'] = max(target['y_max'], merged[i]['y_max'])
+            target['y_coverage'] = (target['y_max'] - target['y_min']) / content_h if content_h > 0 else 0.0
+            del merged[i]
+            absorbed_count += 1
+        else:
+            i += 1
+    if absorbed_count:
+        logger.info(f"ColumnGeometry: absorbed {absorbed_count} tiny clusters (< 5% width)")
+
     _merged_summary = [(m['mean_x']+left_x, m['count'], format(m['y_coverage'], '.0%')) for m in merged]
     logger.info(f"ColumnGeometry: {len(merged)} clusters after merging (dist={merge_distance}px): {_merged_summary}")
 
@@ -1157,11 +1184,14 @@ def _score_role(geom: ColumnGeometry) -> Dict[str, float]:
     if digit_ratio > 0.4:
         scores['reference'] = min(1.0, 0.5 + digit_ratio * 0.5)
 
-    # Marker: very narrow + few short entries
-    if geom.width_ratio < 0.08 and geom.word_count <= 10:
+    # Marker: narrow + few short entries
+    if geom.width_ratio < 0.06 and geom.word_count <= 15:
         scores['marker'] = 0.7
         if avg_word_len < 4:
             scores['marker'] = 0.9
+    # Very narrow non-edge column → strong marker regardless of word count
+    if geom.width_ratio < 0.04 and geom.index > 0:
+        scores['marker'] = max(scores['marker'], 0.9)
 
     # Sentence: longer words + punctuation present
     if geom.width_ratio > 0.15 and has_punctuation > 2:
@@ -1213,6 +1243,39 @@
             classification_method='content',
         )]
 
+    # --- Pre-filter: first/last columns with very few words → column_ignore ---
+    ignore_regions = []
+    active_geometries = []
+    for idx, g in enumerate(geometries):
+        if (idx == 0 or idx == len(geometries) - 1) and g.word_count < 3:
+            ignore_regions.append(PageRegion(
+                type='column_ignore', x=g.x, y=g.y,
+                width=g.width, height=content_h,
+                classification_confidence=0.95,
+                classification_method='content',
+            ))
+            logger.info(f"ClassifyColumns: column {idx} (x={g.x}, words={g.word_count}) → column_ignore (edge, few words)")
+        else:
+            active_geometries.append(g)
+
+    # Re-index active geometries for classification
+    for new_idx, g in enumerate(active_geometries):
+        g.index = new_idx
+    geometries = active_geometries
+
+    # Handle edge case: all columns ignored or only 1 left
+    if len(geometries) == 0:
+        return ignore_regions
+    if len(geometries) == 1:
+        geom = geometries[0]
+        ignore_regions.append(PageRegion(
+            type='column_text', x=geom.x, y=geom.y,
+            width=geom.width, height=geom.height,
+            classification_confidence=0.9,
+            classification_method='content',
+        ))
+        return ignore_regions
+
     # --- Score all columns ---
     lang_scores = [_score_language(g.words) for g in geometries]
     role_scores = [_score_role(g) for g in geometries]
@@ -1227,20 +1290,20 @@
     if regions is not None:
         logger.info("ClassifyColumns: Level 1 (content-based) succeeded")
         _add_header_footer(regions, top_y, bottom_y, img_w, img_h)
-        return regions
+        return ignore_regions + regions
 
     # --- Level 2: Position + language enhanced ---
    regions = _classify_by_position_enhanced(geometries, lang_scores, content_w, content_h)
     if regions is not None:
         logger.info("ClassifyColumns: Level 2 (position+language) succeeded")
         _add_header_footer(regions, top_y, bottom_y, img_w, img_h)
-        return regions
+        return ignore_regions + regions
 
     # --- Level 3: Pure position fallback (old code, no regression) ---
     logger.info("ClassifyColumns: Level 3 (position fallback)")
     regions = _classify_by_position_fallback(geometries, content_w, content_h)
     _add_header_footer(regions, top_y, bottom_y, img_w, img_h)
-    return regions
+    return ignore_regions + regions
 
 
 def _classify_by_content(geometries: List[ColumnGeometry],
@@ -1257,8 +1320,12 @@
     assigned = set()
 
     # Step 1: Assign structural roles first (reference, marker)
+    first_x = geometries[0].x if geometries else 0
+    left_35_threshold = first_x + content_w * 0.35
+
     for i, (geom, rs) in enumerate(zip(geometries, role_scores)):
-        if rs['reference'] >= 0.5 and geom.width_ratio < 0.12:
+        is_left_side = geom.x < left_35_threshold
+        if rs['reference'] >= 0.5 and geom.width_ratio < 0.12 and is_left_side:
             regions.append(PageRegion(
                 type='page_ref', x=geom.x, y=geom.y,
                 width=geom.width, height=content_h,
@@ -1266,7 +1333,7 @@
                 classification_method='content',
             ))
             assigned.add(i)
-        elif rs['marker'] >= 0.7 and geom.width_ratio < 0.08:
+        elif rs['marker'] >= 0.7 and geom.width_ratio < 0.06:
             regions.append(PageRegion(
                 type='column_marker', x=geom.x, y=geom.y,
                 width=geom.width, height=content_h,
@@ -1274,6 +1341,15 @@
                 classification_method='content',
             ))
             assigned.add(i)
+        elif geom.width_ratio < 0.05 and not is_left_side:
+            # Narrow column on the right side → marker, not page_ref
+            regions.append(PageRegion(
+                type='column_marker', x=geom.x, y=geom.y,
+                width=geom.width, height=content_h,
+                classification_confidence=0.8,
+                classification_method='content',
+            ))
+            assigned.add(i)
 
     # Step 2: Among remaining columns, find EN and DE by language scores
     remaining = [(i, geometries[i], lang_scores[i], role_scores[i])
@@ -1296,6 +1372,47 @@
     en_candidates = [(i, g, ls) for i, g, ls, rs in remaining if ls['eng'] > ls['deu'] and ls['eng'] > 0.05]
     de_candidates = [(i, g, ls) for i, g, ls, rs in remaining if ls['deu'] > ls['eng'] and ls['deu'] > 0.05]
 
+    # Position tiebreaker: when language signals are weak, use left=EN, right=DE
+    if (not en_candidates or not de_candidates) and len(remaining) >= 2:
+        max_eng = max(ls['eng'] for _, _, ls, _ in remaining)
+        max_deu = max(ls['deu'] for _, _, ls, _ in remaining)
+        if max_eng < 0.15 and max_deu < 0.15:
+            # Both signals weak — fall back to positional: left=EN, right=DE
+            sorted_remaining = sorted(remaining, key=lambda x: x[1].x)
+            best_en = (sorted_remaining[0][0], sorted_remaining[0][1], sorted_remaining[0][2])
+            best_de = (sorted_remaining[1][0], sorted_remaining[1][1], sorted_remaining[1][2])
+            logger.info("ClassifyColumns: Level 1 using position tiebreaker (weak signals) - left=EN, right=DE")
+            en_conf = 0.4
+            de_conf = 0.4
+
+            regions.append(PageRegion(
+                type='column_en', x=best_en[1].x, y=best_en[1].y,
+                width=best_en[1].width, height=content_h,
+                classification_confidence=en_conf,
+                classification_method='content',
+            ))
+            assigned.add(best_en[0])
+
+            regions.append(PageRegion(
+                type='column_de', x=best_de[1].x, y=best_de[1].y,
+                width=best_de[1].width, height=content_h,
+                classification_confidence=de_conf,
+                classification_method='content',
+            ))
+            assigned.add(best_de[0])
+
+            # Assign remaining as example
+            for i, geom, ls, rs in remaining:
+                if i not in assigned:
+                    regions.append(PageRegion(
+                        type='column_example', x=geom.x, y=geom.y,
+                        width=geom.width, height=content_h,
+                        classification_confidence=0.4,
+                        classification_method='content',
+                    ))
+            regions.sort(key=lambda r: r.x)
+            return regions
+
     if not en_candidates or not de_candidates:
         # Language signals too weak for content-based classification
         logger.info("ClassifyColumns: Level 1 failed - no clear EN/DE language split")
@@ -1363,10 +1480,12 @@
     """
     regions = []
     untyped = list(range(len(geometries)))
+    first_x = geometries[0].x if geometries else 0
+    left_35_threshold = first_x + content_w * 0.35
 
-    # Rule 1: Leftmost narrow column → page_ref
+    # Rule 1: Leftmost narrow column → page_ref (only if in left 35%)
     g0 = geometries[0]
-    if g0.width_ratio < 0.12:
+    if g0.width_ratio < 0.12 and g0.x < left_35_threshold:
         regions.append(PageRegion(
             type='page_ref', x=g0.x, y=g0.y,
             width=g0.width, height=content_h,
@@ -1378,7 +1497,7 @@
     # Rule 2: Narrow columns with few words → marker
     for i in list(untyped):
         geom = geometries[i]
-        if geom.width_ratio < 0.08 and geom.word_count <= 10:
+        if geom.width_ratio < 0.06 and geom.word_count <= 15:
             regions.append(PageRegion(
                 type='column_marker', x=geom.x, y=geom.y,
                 width=geom.width, height=content_h,
@@ -1463,10 +1582,12 @@
     """
    regions = []
     untyped = list(range(len(geometries)))
+    first_x = geometries[0].x if geometries else 0
+    left_35_threshold = first_x + content_w * 0.35
 
-    # Rule 1: Leftmost narrow column → page_ref
+    # Rule 1: Leftmost narrow column → page_ref (only if in left 35%)
     g0 = geometries[0]
-    if g0.width_ratio < 0.12:
+    if g0.width_ratio < 0.12 and g0.x < left_35_threshold:
         regions.append(PageRegion(
             type='page_ref', x=g0.x, y=g0.y,
             width=g0.width, height=content_h,
@@ -1478,7 +1599,7 @@
     # Rule 2: Narrow + few words → marker
     for i in list(untyped):
         geom = geometries[i]
-        if geom.width_ratio < 0.08 and geom.word_count <= 8:
+        if geom.width_ratio < 0.06 and geom.word_count <= 15:
             regions.append(PageRegion(
                 type='column_marker', x=geom.x, y=geom.y,
                 width=geom.width, height=content_h,