fix(ocr-pipeline): tune column detection based on GT comparison
Address 5 weaknesses found via ground-truth comparison on session df3548d1:

- Add column_ignore for edge columns with < 3 words (margin detection)
- Absorb tiny clusters (< 5% width) into neighbors post-merge
- Restrict page_ref to left 35% of content area across all 3 levels
- Loosen marker thresholds (width < 6%, words <= 15) and add a strong
  marker score for very narrow non-edge columns (< 4%)
- Add EN/DE position tiebreaker when language signals are both weak

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1031,6 +1031,33 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
|
||||
else:
|
||||
merged.append(s.copy())
|
||||
|
||||
# --- Post-merge: absorb tiny clusters (< 5% content width) into neighbors ---
|
||||
i = 0
|
||||
absorbed_count = 0
|
||||
while i < len(merged) and len(merged) > 3:
|
||||
if i + 1 < len(merged):
|
||||
cluster_w = merged[i + 1]['mean_x'] - merged[i]['mean_x']
|
||||
else:
|
||||
cluster_w = content_w - (merged[i]['mean_x'] - merged[0]['mean_x'])
|
||||
if cluster_w / content_w < 0.05:
|
||||
# Absorb into neighbor (prefer left)
|
||||
if i > 0:
|
||||
target = merged[i - 1]
|
||||
else:
|
||||
target = merged[i + 1]
|
||||
target['count'] += merged[i]['count']
|
||||
target['min_edge'] = min(target['min_edge'], merged[i]['min_edge'])
|
||||
target['max_edge'] = max(target['max_edge'], merged[i]['max_edge'])
|
||||
target['y_min'] = min(target['y_min'], merged[i]['y_min'])
|
||||
target['y_max'] = max(target['y_max'], merged[i]['y_max'])
|
||||
target['y_coverage'] = (target['y_max'] - target['y_min']) / content_h if content_h > 0 else 0.0
|
||||
del merged[i]
|
||||
absorbed_count += 1
|
||||
else:
|
||||
i += 1
|
||||
if absorbed_count:
|
||||
logger.info(f"ColumnGeometry: absorbed {absorbed_count} tiny clusters (< 5% width)")
|
||||
|
||||
_merged_summary = [(m['mean_x']+left_x, m['count'], format(m['y_coverage'], '.0%')) for m in merged]
|
||||
logger.info(f"ColumnGeometry: {len(merged)} clusters after merging (dist={merge_distance}px): {_merged_summary}")
|
||||
|
||||
@@ -1157,11 +1184,14 @@ def _score_role(geom: ColumnGeometry) -> Dict[str, float]:
|
||||
if digit_ratio > 0.4:
|
||||
scores['reference'] = min(1.0, 0.5 + digit_ratio * 0.5)
|
||||
|
||||
# Marker: very narrow + few short entries
|
||||
if geom.width_ratio < 0.08 and geom.word_count <= 10:
|
||||
# Marker: narrow + few short entries
|
||||
if geom.width_ratio < 0.06 and geom.word_count <= 15:
|
||||
scores['marker'] = 0.7
|
||||
if avg_word_len < 4:
|
||||
scores['marker'] = 0.9
|
||||
# Very narrow non-edge column → strong marker regardless of word count
|
||||
if geom.width_ratio < 0.04 and geom.index > 0:
|
||||
scores['marker'] = max(scores['marker'], 0.9)
|
||||
|
||||
# Sentence: longer words + punctuation present
|
||||
if geom.width_ratio > 0.15 and has_punctuation > 2:
|
||||
@@ -1213,6 +1243,39 @@ def classify_column_types(geometries: List[ColumnGeometry],
|
||||
classification_method='content',
|
||||
)]
|
||||
|
||||
# --- Pre-filter: first/last columns with very few words → column_ignore ---
|
||||
ignore_regions = []
|
||||
active_geometries = []
|
||||
for idx, g in enumerate(geometries):
|
||||
if (idx == 0 or idx == len(geometries) - 1) and g.word_count < 3:
|
||||
ignore_regions.append(PageRegion(
|
||||
type='column_ignore', x=g.x, y=g.y,
|
||||
width=g.width, height=content_h,
|
||||
classification_confidence=0.95,
|
||||
classification_method='content',
|
||||
))
|
||||
logger.info(f"ClassifyColumns: column {idx} (x={g.x}, words={g.word_count}) → column_ignore (edge, few words)")
|
||||
else:
|
||||
active_geometries.append(g)
|
||||
|
||||
# Re-index active geometries for classification
|
||||
for new_idx, g in enumerate(active_geometries):
|
||||
g.index = new_idx
|
||||
geometries = active_geometries
|
||||
|
||||
# Handle edge case: all columns ignored or only 1 left
|
||||
if len(geometries) == 0:
|
||||
return ignore_regions
|
||||
if len(geometries) == 1:
|
||||
geom = geometries[0]
|
||||
ignore_regions.append(PageRegion(
|
||||
type='column_text', x=geom.x, y=geom.y,
|
||||
width=geom.width, height=geom.height,
|
||||
classification_confidence=0.9,
|
||||
classification_method='content',
|
||||
))
|
||||
return ignore_regions
|
||||
|
||||
# --- Score all columns ---
|
||||
lang_scores = [_score_language(g.words) for g in geometries]
|
||||
role_scores = [_score_role(g) for g in geometries]
|
||||
@@ -1227,20 +1290,20 @@ def classify_column_types(geometries: List[ColumnGeometry],
|
||||
if regions is not None:
|
||||
logger.info("ClassifyColumns: Level 1 (content-based) succeeded")
|
||||
_add_header_footer(regions, top_y, bottom_y, img_w, img_h)
|
||||
return regions
|
||||
return ignore_regions + regions
|
||||
|
||||
# --- Level 2: Position + language enhanced ---
|
||||
regions = _classify_by_position_enhanced(geometries, lang_scores, content_w, content_h)
|
||||
if regions is not None:
|
||||
logger.info("ClassifyColumns: Level 2 (position+language) succeeded")
|
||||
_add_header_footer(regions, top_y, bottom_y, img_w, img_h)
|
||||
return regions
|
||||
return ignore_regions + regions
|
||||
|
||||
# --- Level 3: Pure position fallback (old code, no regression) ---
|
||||
logger.info("ClassifyColumns: Level 3 (position fallback)")
|
||||
regions = _classify_by_position_fallback(geometries, content_w, content_h)
|
||||
_add_header_footer(regions, top_y, bottom_y, img_w, img_h)
|
||||
return regions
|
||||
return ignore_regions + regions
|
||||
|
||||
|
||||
def _classify_by_content(geometries: List[ColumnGeometry],
|
||||
@@ -1257,8 +1320,12 @@ def _classify_by_content(geometries: List[ColumnGeometry],
|
||||
assigned = set()
|
||||
|
||||
# Step 1: Assign structural roles first (reference, marker)
|
||||
first_x = geometries[0].x if geometries else 0
|
||||
left_35_threshold = first_x + content_w * 0.35
|
||||
|
||||
for i, (geom, rs) in enumerate(zip(geometries, role_scores)):
|
||||
if rs['reference'] >= 0.5 and geom.width_ratio < 0.12:
|
||||
is_left_side = geom.x < left_35_threshold
|
||||
if rs['reference'] >= 0.5 and geom.width_ratio < 0.12 and is_left_side:
|
||||
regions.append(PageRegion(
|
||||
type='page_ref', x=geom.x, y=geom.y,
|
||||
width=geom.width, height=content_h,
|
||||
@@ -1266,7 +1333,7 @@ def _classify_by_content(geometries: List[ColumnGeometry],
|
||||
classification_method='content',
|
||||
))
|
||||
assigned.add(i)
|
||||
elif rs['marker'] >= 0.7 and geom.width_ratio < 0.08:
|
||||
elif rs['marker'] >= 0.7 and geom.width_ratio < 0.06:
|
||||
regions.append(PageRegion(
|
||||
type='column_marker', x=geom.x, y=geom.y,
|
||||
width=geom.width, height=content_h,
|
||||
@@ -1274,6 +1341,15 @@ def _classify_by_content(geometries: List[ColumnGeometry],
|
||||
classification_method='content',
|
||||
))
|
||||
assigned.add(i)
|
||||
elif geom.width_ratio < 0.05 and not is_left_side:
|
||||
# Narrow column on the right side → marker, not page_ref
|
||||
regions.append(PageRegion(
|
||||
type='column_marker', x=geom.x, y=geom.y,
|
||||
width=geom.width, height=content_h,
|
||||
classification_confidence=0.8,
|
||||
classification_method='content',
|
||||
))
|
||||
assigned.add(i)
|
||||
|
||||
# Step 2: Among remaining columns, find EN and DE by language scores
|
||||
remaining = [(i, geometries[i], lang_scores[i], role_scores[i])
|
||||
@@ -1296,6 +1372,47 @@ def _classify_by_content(geometries: List[ColumnGeometry],
|
||||
en_candidates = [(i, g, ls) for i, g, ls, rs in remaining if ls['eng'] > ls['deu'] and ls['eng'] > 0.05]
|
||||
de_candidates = [(i, g, ls) for i, g, ls, rs in remaining if ls['deu'] > ls['eng'] and ls['deu'] > 0.05]
|
||||
|
||||
# Position tiebreaker: when language signals are weak, use left=EN, right=DE
|
||||
if (not en_candidates or not de_candidates) and len(remaining) >= 2:
|
||||
max_eng = max(ls['eng'] for _, _, ls, _ in remaining)
|
||||
max_deu = max(ls['deu'] for _, _, ls, _ in remaining)
|
||||
if max_eng < 0.15 and max_deu < 0.15:
|
||||
# Both signals weak — fall back to positional: left=EN, right=DE
|
||||
sorted_remaining = sorted(remaining, key=lambda x: x[1].x)
|
||||
best_en = (sorted_remaining[0][0], sorted_remaining[0][1], sorted_remaining[0][2])
|
||||
best_de = (sorted_remaining[1][0], sorted_remaining[1][1], sorted_remaining[1][2])
|
||||
logger.info("ClassifyColumns: Level 1 using position tiebreaker (weak signals) - left=EN, right=DE")
|
||||
en_conf = 0.4
|
||||
de_conf = 0.4
|
||||
|
||||
regions.append(PageRegion(
|
||||
type='column_en', x=best_en[1].x, y=best_en[1].y,
|
||||
width=best_en[1].width, height=content_h,
|
||||
classification_confidence=en_conf,
|
||||
classification_method='content',
|
||||
))
|
||||
assigned.add(best_en[0])
|
||||
|
||||
regions.append(PageRegion(
|
||||
type='column_de', x=best_de[1].x, y=best_de[1].y,
|
||||
width=best_de[1].width, height=content_h,
|
||||
classification_confidence=de_conf,
|
||||
classification_method='content',
|
||||
))
|
||||
assigned.add(best_de[0])
|
||||
|
||||
# Assign remaining as example
|
||||
for i, geom, ls, rs in remaining:
|
||||
if i not in assigned:
|
||||
regions.append(PageRegion(
|
||||
type='column_example', x=geom.x, y=geom.y,
|
||||
width=geom.width, height=content_h,
|
||||
classification_confidence=0.4,
|
||||
classification_method='content',
|
||||
))
|
||||
regions.sort(key=lambda r: r.x)
|
||||
return regions
|
||||
|
||||
if not en_candidates or not de_candidates:
|
||||
# Language signals too weak for content-based classification
|
||||
logger.info("ClassifyColumns: Level 1 failed - no clear EN/DE language split")
|
||||
@@ -1363,10 +1480,12 @@ def _classify_by_position_enhanced(geometries: List[ColumnGeometry],
|
||||
"""
|
||||
regions = []
|
||||
untyped = list(range(len(geometries)))
|
||||
first_x = geometries[0].x if geometries else 0
|
||||
left_35_threshold = first_x + content_w * 0.35
|
||||
|
||||
# Rule 1: Leftmost narrow column → page_ref
|
||||
# Rule 1: Leftmost narrow column → page_ref (only if in left 35%)
|
||||
g0 = geometries[0]
|
||||
if g0.width_ratio < 0.12:
|
||||
if g0.width_ratio < 0.12 and g0.x < left_35_threshold:
|
||||
regions.append(PageRegion(
|
||||
type='page_ref', x=g0.x, y=g0.y,
|
||||
width=g0.width, height=content_h,
|
||||
@@ -1378,7 +1497,7 @@ def _classify_by_position_enhanced(geometries: List[ColumnGeometry],
|
||||
# Rule 2: Narrow columns with few words → marker
|
||||
for i in list(untyped):
|
||||
geom = geometries[i]
|
||||
if geom.width_ratio < 0.08 and geom.word_count <= 10:
|
||||
if geom.width_ratio < 0.06 and geom.word_count <= 15:
|
||||
regions.append(PageRegion(
|
||||
type='column_marker', x=geom.x, y=geom.y,
|
||||
width=geom.width, height=content_h,
|
||||
@@ -1463,10 +1582,12 @@ def _classify_by_position_fallback(geometries: List[ColumnGeometry],
|
||||
"""
|
||||
regions = []
|
||||
untyped = list(range(len(geometries)))
|
||||
first_x = geometries[0].x if geometries else 0
|
||||
left_35_threshold = first_x + content_w * 0.35
|
||||
|
||||
# Rule 1: Leftmost narrow column → page_ref
|
||||
# Rule 1: Leftmost narrow column → page_ref (only if in left 35%)
|
||||
g0 = geometries[0]
|
||||
if g0.width_ratio < 0.12:
|
||||
if g0.width_ratio < 0.12 and g0.x < left_35_threshold:
|
||||
regions.append(PageRegion(
|
||||
type='page_ref', x=g0.x, y=g0.y,
|
||||
width=g0.width, height=content_h,
|
||||
@@ -1478,7 +1599,7 @@ def _classify_by_position_fallback(geometries: List[ColumnGeometry],
|
||||
# Rule 2: Narrow + few words → marker
|
||||
for i in list(untyped):
|
||||
geom = geometries[i]
|
||||
if geom.width_ratio < 0.08 and geom.word_count <= 8:
|
||||
if geom.width_ratio < 0.06 and geom.word_count <= 15:
|
||||
regions.append(PageRegion(
|
||||
type='column_marker', x=geom.x, y=geom.y,
|
||||
width=geom.width, height=content_h,
|
||||
|
||||
Reference in New Issue
Block a user