refactor: positional_column_regions auch in OCR Pipeline verwenden

Die gemeinsame Funktion positional_column_regions() in cv_vocab_pipeline.py wird jetzt von beiden Pfaden (Vocab-Worksheet + OCR-Pipeline-Admin) genutzt. classify_column_types() bleibt als Legacy erhalten.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-07 17:20:51 +01:00
parent b0bfc0a960
commit 7a1bd5e82d
2 changed files with 89 additions and 74 deletions
@@ -3321,6 +3321,75 @@ def _build_margin_regions(
    return margins


+def positional_column_regions(
+    geometries: List[ColumnGeometry],
+    content_w: int,
+    content_h: int,
+    left_x: int,
+) -> List[PageRegion]:
+    """Classify columns by position only (no language scoring).
+
+    Narrow columns become page_ref / column_marker and are returned first.
+    Other columns are labelled left→right column_en, column_de, column_example
+    (reused for any further columns); a lone content column is column_text.
+    """
+    structural: List[PageRegion] = []
+    content_cols: List[ColumnGeometry] = []
+
+    for g in geometries:
+        rel_x = g.x - left_x
+        # page_ref: narrow column starting in the left 20% (content_w == 0 → ratio 0)
+        if g.width_ratio < 0.12 and (rel_x / content_w if content_w else 0) < 0.20:
+            structural.append(PageRegion(
+                type='page_ref', x=g.x, y=g.y,
+                width=g.width, height=content_h,
+                classification_confidence=0.95,
+                classification_method='positional',
+            ))
+        # column_marker: very narrow (< 6% width ratio) with at most 15 words
+        elif g.width_ratio < 0.06 and g.word_count <= 15:
+            structural.append(PageRegion(
+                type='column_marker', x=g.x, y=g.y,
+                width=g.width, height=content_h,
+                classification_confidence=0.95,
+                classification_method='positional',
+            ))
+        else:
+            content_cols.append(g)
+
+    # Single content column → plain text page (returns before the info log below)
+    if len(content_cols) == 1:
+        g = content_cols[0]
+        return structural + [PageRegion(
+            type='column_text', x=g.x, y=g.y,
+            width=g.width, height=content_h,
+            classification_confidence=0.9,
+            classification_method='positional',
+        )]
+
+    # No content columns → only the structural regions (possibly an empty list)
+    if not content_cols:
+        return structural
+
+    # Two or more content columns: order by x and label purely by position
+    content_cols.sort(key=lambda g: g.x)
+    labels = ['column_en', 'column_de', 'column_example']
+    regions = list(structural)
+    for i, g in enumerate(content_cols):
+        label = labels[i] if i < len(labels) else 'column_example'  # 4th+ column reuses the last label
+        regions.append(PageRegion(
+            type=label, x=g.x, y=g.y,
+            width=g.width, height=content_h,
+            classification_confidence=0.95,
+            classification_method='positional',
+        ))
+
+    logger.info(f"PositionalColumns: {len(structural)} structural, "
+                f"{len(content_cols)} content → "
+                f"{[r.type for r in regions]}")
+    return regions
+
+
 def classify_column_types(geometries: List[ColumnGeometry],
                          content_w: int,
                          top_y: int,
@@ -3548,6 +3617,21 @@ def _classify_by_content(geometries: List[ColumnGeometry],
    best_en = max(en_candidates, key=lambda x: x[2]['eng'])
    best_de = max(de_candidates, key=lambda x: x[2]['deu'])

+    # Position-aware EN selection: in typical textbooks the layout is EN | DE | Example.
+    # Example sentences contain English function words ("the", "a", "is") which inflate
+    # the eng score of the Example column.  When the best EN candidate sits to the RIGHT
+    # of the DE column and there is another EN candidate to the LEFT, prefer the left one
+    # — it is almost certainly the real vocabulary column.
+    if best_de[2]['deu'] > 0.5 and best_en[1].x > best_de[1].x and len(en_candidates) > 1:
+        left_of_de = [c for c in en_candidates if c[1].x < best_de[1].x]
+        if left_of_de:
+            alt_en = max(left_of_de, key=lambda x: x[2]['eng'])
+            logger.info(
+                f"ClassifyColumns: Level 1 position fix — best EN col {best_en[0]} "
+                f"(eng={best_en[2]['eng']:.3f}) is right of DE col {best_de[0]}; "
+                f"preferring left col {alt_en[0]} (eng={alt_en[2]['eng']:.3f})")
+            best_en = alt_en
+
    if best_en[0] == best_de[0]:
        # Same column scored highest for both — ambiguous
        logger.info("ClassifyColumns: Level 1 failed - same column highest for EN and DE")
@@ -3996,9 +4080,9 @@ def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Li
    geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
                                      top_y=top_y, header_y=header_y, footer_y=footer_y)

-    # Phase B: Content-based classification
-    regions = classify_column_types(geometries, content_w, top_y, w, h, bottom_y,
-                                    left_x=left_x, right_x=right_x, inv=_inv)
+    # Phase B: Positional classification (no language scoring)
+    content_h = bottom_y - top_y
+    regions = positional_column_regions(geometries, content_w, content_h, left_x)

    col_count = len([r for r in regions if r.type.startswith('column') or r.type == 'page_ref'])
    methods = set(r.classification_method for r in regions if r.classification_method)
@@ -70,7 +70,7 @@ try:
        detect_column_geometry, analyze_layout_by_words, analyze_layout, create_layout_image,
        detect_row_geometry, build_cell_grid_v2,
        _cells_to_vocab_entries, _detect_sub_columns, _detect_header_footer_gaps,
-        expand_narrow_columns, llm_review_entries,
+        expand_narrow_columns, positional_column_regions, llm_review_entries,
        _fix_phonetic_brackets,
        render_pdf_high_res,
        PageRegion, RowGeometry,
@@ -1336,75 +1336,6 @@ async def process_single_page(
    }


-def _positional_column_regions(
-    geometries: list,
-    content_w: int,
-    content_h: int,
-    left_x: int,
-) -> list:
-    """Classify columns by position only (no language scoring).
-
-    Structural columns (page_ref, column_marker) are identified by geometry.
-    Remaining content columns are labelled left→right as column_en, column_de,
-    column_example.  The names are purely positional – no language analysis.
-    """
-    structural = []
-    content_cols = []
-
-    for g in geometries:
-        rel_x = g.x - left_x
-        # page_ref: narrow column in the leftmost 20% region
-        if g.width_ratio < 0.12 and (rel_x / content_w if content_w else 0) < 0.20:
-            structural.append(PageRegion(
-                type='page_ref', x=g.x, y=g.y,
-                width=g.width, height=content_h,
-                classification_confidence=0.95,
-                classification_method='positional',
-            ))
-        # column_marker: very narrow, few words
-        elif g.width_ratio < 0.06 and g.word_count <= 15:
-            structural.append(PageRegion(
-                type='column_marker', x=g.x, y=g.y,
-                width=g.width, height=content_h,
-                classification_confidence=0.95,
-                classification_method='positional',
-            ))
-        else:
-            content_cols.append(g)
-
-    # Single content column → plain text page
-    if len(content_cols) == 1:
-        g = content_cols[0]
-        return structural + [PageRegion(
-            type='column_text', x=g.x, y=g.y,
-            width=g.width, height=content_h,
-            classification_confidence=0.9,
-            classification_method='positional',
-        )]
-
-    # No content columns
-    if not content_cols:
-        return structural
-
-    # Sort content columns left→right and assign positional labels
-    content_cols.sort(key=lambda g: g.x)
-    labels = ['column_en', 'column_de', 'column_example']
-    regions = list(structural)
-    for i, g in enumerate(content_cols):
-        label = labels[i] if i < len(labels) else 'column_example'
-        regions.append(PageRegion(
-            type=label, x=g.x, y=g.y,
-            width=g.width, height=content_h,
-            classification_confidence=0.95,
-            classification_method='positional',
-        ))
-
-    logger.info(f"PositionalColumns: {len(structural)} structural, "
-                f"{len(content_cols)} content → "
-                f"{[r.type for r in regions]}")
-    return regions
-
-
 async def _run_ocr_pipeline_for_page(
    img_bgr: np.ndarray,
    page_number: int,
@@ -1479,7 +1410,7 @@ async def _run_ocr_pipeline_for_page(
                                          top_y=top_y, header_y=header_y, footer_y=footer_y)
        geometries = expand_narrow_columns(geometries, content_w, left_x, word_dicts)
        content_h = bottom_y - top_y
-        regions = _positional_column_regions(geometries, content_w, content_h, left_x)
+        regions = positional_column_regions(geometries, content_w, content_h, left_x)
        content_bounds = (left_x, right_x, top_y, bottom_y)

    logger.info(f"  columns: {len(regions)} detected ({_time.time() - t0:.1f}s)")