From db8327f039c151c3a18a8d444d5360ddbbe292d7 Mon Sep 17 00:00:00 2001
From: Benjamin Admin
Date: Fri, 27 Feb 2026 23:16:31 +0100
Subject: [PATCH] fix(ocr-pipeline): tune column detection based on GT
 comparison

Address 5 weaknesses found via ground-truth comparison on session df3548d1:

- Add column_ignore for edge columns with < 3 words (margin detection)
- Absorb tiny clusters (< 5% width) into neighbors post-merge
- Restrict page_ref to left 35% of content area across all 3 levels
- Loosen marker thresholds (width < 6%, words <= 15) and add strong marker
  score for very narrow non-edge columns (< 4%)
- Add EN/DE position tiebreaker when language signals are both weak

Co-Authored-By: Claude Opus 4.6
---
 klausur-service/backend/cv_vocab_pipeline.py | 147 +++++++++++++++++--
 1 file changed, 134 insertions(+), 13 deletions(-)

diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index b1820bd..790e960 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -1031,6 +1031,33 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
         else:
             merged.append(s.copy())
 
+    # --- Post-merge: absorb tiny clusters (< 5% content width) into neighbors ---
+    i = 0
+    absorbed_count = 0
+    while i < len(merged) and len(merged) > 3:
+        if i + 1 < len(merged):
+            cluster_w = merged[i + 1]['mean_x'] - merged[i]['mean_x']
+        else:
+            cluster_w = content_w - (merged[i]['mean_x'] - merged[0]['mean_x'])
+        if cluster_w / content_w < 0.05:
+            # Absorb into neighbor (prefer left)
+            if i > 0:
+                target = merged[i - 1]
+            else:
+                target = merged[i + 1]
+            target['count'] += merged[i]['count']
+            target['min_edge'] = min(target['min_edge'], merged[i]['min_edge'])
+            target['max_edge'] = max(target['max_edge'], merged[i]['max_edge'])
+            target['y_min'] = min(target['y_min'], merged[i]['y_min'])
+            target['y_max'] = max(target['y_max'], merged[i]['y_max'])
+            target['y_coverage'] = (target['y_max'] - target['y_min']) / content_h if content_h > 0 else 0.0
+            del merged[i]
+            absorbed_count += 1
+        else:
+            i += 1
+    if absorbed_count:
+        logger.info(f"ColumnGeometry: absorbed {absorbed_count} tiny clusters (< 5% width)")
+
     _merged_summary = [(m['mean_x']+left_x, m['count'], format(m['y_coverage'], '.0%')) for m in merged]
     logger.info(f"ColumnGeometry: {len(merged)} clusters after merging (dist={merge_distance}px): {_merged_summary}")
 
@@ -1157,11 +1184,14 @@ def _score_role(geom: ColumnGeometry) -> Dict[str, float]:
     if digit_ratio > 0.4:
         scores['reference'] = min(1.0, 0.5 + digit_ratio * 0.5)
 
-    # Marker: very narrow + few short entries
-    if geom.width_ratio < 0.08 and geom.word_count <= 10:
+    # Marker: narrow + few short entries
+    if geom.width_ratio < 0.06 and geom.word_count <= 15:
         scores['marker'] = 0.7
         if avg_word_len < 4:
             scores['marker'] = 0.9
+    # Very narrow non-edge column → strong marker regardless of word count
+    if geom.width_ratio < 0.04 and geom.index > 0:
+        scores['marker'] = max(scores['marker'], 0.9)
 
     # Sentence: longer words + punctuation present
     if geom.width_ratio > 0.15 and has_punctuation > 2:
@@ -1213,6 +1243,39 @@
             classification_method='content',
         )]
 
+    # --- Pre-filter: first/last columns with very few words → column_ignore ---
+    ignore_regions = []
+    active_geometries = []
+    for idx, g in enumerate(geometries):
+        if (idx == 0 or idx == len(geometries) - 1) and g.word_count < 3:
+            ignore_regions.append(PageRegion(
+                type='column_ignore', x=g.x, y=g.y,
+                width=g.width, height=content_h,
+                classification_confidence=0.95,
+                classification_method='content',
+            ))
+            logger.info(f"ClassifyColumns: column {idx} (x={g.x}, words={g.word_count}) → column_ignore (edge, few words)")
+        else:
+            active_geometries.append(g)
+
+    # Re-index active geometries for classification
+    for new_idx, g in enumerate(active_geometries):
+        g.index = new_idx
+    geometries = active_geometries
+
+    # Handle edge case: all columns ignored or only 1 left
+    if len(geometries) == 0:
+        return ignore_regions
+    if len(geometries) == 1:
+        geom = geometries[0]
+        ignore_regions.append(PageRegion(
+            type='column_text', x=geom.x, y=geom.y,
+            width=geom.width, height=geom.height,
+            classification_confidence=0.9,
+            classification_method='content',
+        ))
+        return ignore_regions
+
     # --- Score all columns ---
     lang_scores = [_score_language(g.words) for g in geometries]
     role_scores = [_score_role(g) for g in geometries]
@@ -1227,20 +1290,20 @@
     if regions is not None:
         logger.info("ClassifyColumns: Level 1 (content-based) succeeded")
         _add_header_footer(regions, top_y, bottom_y, img_w, img_h)
-        return regions
+        return ignore_regions + regions
 
     # --- Level 2: Position + language enhanced ---
    regions = _classify_by_position_enhanced(geometries, lang_scores, content_w, content_h)
     if regions is not None:
         logger.info("ClassifyColumns: Level 2 (position+language) succeeded")
         _add_header_footer(regions, top_y, bottom_y, img_w, img_h)
-        return regions
+        return ignore_regions + regions
 
     # --- Level 3: Pure position fallback (old code, no regression) ---
     logger.info("ClassifyColumns: Level 3 (position fallback)")
     regions = _classify_by_position_fallback(geometries, content_w, content_h)
     _add_header_footer(regions, top_y, bottom_y, img_w, img_h)
-    return regions
+    return ignore_regions + regions
 
 
 def _classify_by_content(geometries: List[ColumnGeometry],
@@ -1257,8 +1320,12 @@
     assigned = set()
 
     # Step 1: Assign structural roles first (reference, marker)
+    first_x = geometries[0].x if geometries else 0
+    left_35_threshold = first_x + content_w * 0.35
+
     for i, (geom, rs) in enumerate(zip(geometries, role_scores)):
-        if rs['reference'] >= 0.5 and geom.width_ratio < 0.12:
+        is_left_side = geom.x < left_35_threshold
+        if rs['reference'] >= 0.5 and geom.width_ratio < 0.12 and is_left_side:
             regions.append(PageRegion(
                 type='page_ref', x=geom.x, y=geom.y,
                 width=geom.width, height=content_h,
@@ -1266,7 +1333,7 @@
                 classification_method='content',
             ))
             assigned.add(i)
-        elif rs['marker'] >= 0.7 and geom.width_ratio < 0.08:
+        elif rs['marker'] >= 0.7 and geom.width_ratio < 0.06:
             regions.append(PageRegion(
                 type='column_marker', x=geom.x, y=geom.y,
                 width=geom.width, height=content_h,
@@ -1274,6 +1341,15 @@
                 classification_method='content',
             ))
             assigned.add(i)
+        elif geom.width_ratio < 0.05 and not is_left_side:
+            # Narrow column on the right side → marker, not page_ref
+            regions.append(PageRegion(
+                type='column_marker', x=geom.x, y=geom.y,
+                width=geom.width, height=content_h,
+                classification_confidence=0.8,
+                classification_method='content',
+            ))
+            assigned.add(i)
 
     # Step 2: Among remaining columns, find EN and DE by language scores
     remaining = [(i, geometries[i], lang_scores[i], role_scores[i])
@@ -1296,6 +1372,47 @@
     en_candidates = [(i, g, ls) for i, g, ls, rs in remaining if ls['eng'] > ls['deu'] and ls['eng'] > 0.05]
     de_candidates = [(i, g, ls) for i, g, ls, rs in remaining if ls['deu'] > ls['eng'] and ls['deu'] > 0.05]
 
+    # Position tiebreaker: when language signals are weak, use left=EN, right=DE
+    if (not en_candidates or not de_candidates) and len(remaining) >= 2:
+        max_eng = max(ls['eng'] for _, _, ls, _ in remaining)
+        max_deu = max(ls['deu'] for _, _, ls, _ in remaining)
+        if max_eng < 0.15 and max_deu < 0.15:
+            # Both signals weak — fall back to positional: left=EN, right=DE
+            sorted_remaining = sorted(remaining, key=lambda x: x[1].x)
+            best_en = (sorted_remaining[0][0], sorted_remaining[0][1], sorted_remaining[0][2])
+            best_de = (sorted_remaining[1][0], sorted_remaining[1][1], sorted_remaining[1][2])
+            logger.info("ClassifyColumns: Level 1 using position tiebreaker (weak signals) - left=EN, right=DE")
+            en_conf = 0.4
+            de_conf = 0.4
+
+            regions.append(PageRegion(
+                type='column_en', x=best_en[1].x, y=best_en[1].y,
+                width=best_en[1].width, height=content_h,
+                classification_confidence=en_conf,
+                classification_method='content',
+            ))
+            assigned.add(best_en[0])
+
+            regions.append(PageRegion(
+                type='column_de', x=best_de[1].x, y=best_de[1].y,
+                width=best_de[1].width, height=content_h,
+                classification_confidence=de_conf,
+                classification_method='content',
+            ))
+            assigned.add(best_de[0])
+
+            # Assign remaining as example
+            for i, geom, ls, rs in remaining:
+                if i not in assigned:
+                    regions.append(PageRegion(
+                        type='column_example', x=geom.x, y=geom.y,
+                        width=geom.width, height=content_h,
+                        classification_confidence=0.4,
+                        classification_method='content',
+                    ))
+            regions.sort(key=lambda r: r.x)
+            return regions
+
     if not en_candidates or not de_candidates:
         # Language signals too weak for content-based classification
         logger.info("ClassifyColumns: Level 1 failed - no clear EN/DE language split")
@@ -1363,10 +1480,12 @@
     """
     regions = []
     untyped = list(range(len(geometries)))
+    first_x = geometries[0].x if geometries else 0
+    left_35_threshold = first_x + content_w * 0.35
 
-    # Rule 1: Leftmost narrow column → page_ref
+    # Rule 1: Leftmost narrow column → page_ref (only if in left 35%)
     g0 = geometries[0]
-    if g0.width_ratio < 0.12:
+    if g0.width_ratio < 0.12 and g0.x < left_35_threshold:
         regions.append(PageRegion(
             type='page_ref', x=g0.x, y=g0.y,
             width=g0.width, height=content_h,
@@ -1378,7 +1497,7 @@
     # Rule 2: Narrow columns with few words → marker
     for i in list(untyped):
         geom = geometries[i]
-        if geom.width_ratio < 0.08 and geom.word_count <= 10:
+        if geom.width_ratio < 0.06 and geom.word_count <= 15:
             regions.append(PageRegion(
                 type='column_marker', x=geom.x, y=geom.y,
                 width=geom.width, height=content_h,
@@ -1463,10 +1582,12 @@
     """
    regions = []
     untyped = list(range(len(geometries)))
+    first_x = geometries[0].x if geometries else 0
+    left_35_threshold = first_x + content_w * 0.35
 
-    # Rule 1: Leftmost narrow column → page_ref
+    # Rule 1: Leftmost narrow column → page_ref (only if in left 35%)
     g0 = geometries[0]
-    if g0.width_ratio < 0.12:
+    if g0.width_ratio < 0.12 and g0.x < left_35_threshold:
         regions.append(PageRegion(
             type='page_ref', x=g0.x, y=g0.y,
             width=g0.width, height=content_h,
@@ -1478,7 +1599,7 @@
     # Rule 2: Narrow + few words → marker
     for i in list(untyped):
         geom = geometries[i]
-        if geom.width_ratio < 0.08 and geom.word_count <= 8:
+        if geom.width_ratio < 0.06 and geom.word_count <= 15:
             regions.append(PageRegion(
                 type='column_marker', x=geom.x, y=geom.y,
                 width=geom.width, height=content_h,