fix: leere Spalten als strukturell behandeln + 2-Spalten-Layout korrekt labeln

Spalten mit <=2 Woertern und <15% Breite werden jetzt als column_marker statt als content-Spalte klassifiziert.
Bei 2 breiten Content-Spalten wird die rechte als column_example statt column_de gelabelt, da die linke Spalte EN+DE kombiniert enthaelt.
OSD-Zoom von 1.0 auf 2.0 erhoeht fuer zuverlaessigere Orientierungserkennung.
Zusaetzlich loggt _split_broad_columns jetzt alle gefundenen Coverage-Luecken (>=5px) zur Fehlersuche.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-07 19:35:21 +01:00
parent e8ba5ec073
commit 4e8ea77140
2 changed files with 35 additions and 2 deletions
@@ -2145,6 +2145,22 @@ def _split_broad_columns(
            if best_gap is None or gw > best_gap[2]:
                best_gap = (gap_start, len(low_mask), gw)

+        # Log all gaps found for debugging
+        all_gaps = []
+        _gs = None
+        for px in range(len(low_mask)):
+            if low_mask[px]:
+                if _gs is None:
+                    _gs = px
+            else:
+                if _gs is not None:
+                    all_gaps.append((_gs, px, px - _gs))
+                    _gs = None
+        if _gs is not None:
+            all_gaps.append((_gs, len(low_mask), len(low_mask) - _gs))
+        logger.info(f"SplitBroadCols: col {geo.index} coverage gaps (>=5px): "
+                    f"{[g for g in all_gaps if g[2] >= 5]}, best={best_gap}")
+
        if best_gap is None or best_gap[2] < _min_gap_px:
            result.append(geo)
            continue
@@ -3547,6 +3563,14 @@ def positional_column_regions(
                classification_confidence=0.95,
                classification_method='positional',
            ))
+        # empty or near-empty narrow column → treat as margin/structural
+        elif g.word_count <= 2 and g.width_ratio < 0.15:
+            structural.append(PageRegion(
+                type='column_marker', x=g.x, y=g.y,
+                width=g.width, height=content_h,
+                classification_confidence=0.85,
+                classification_method='positional',
+            ))
        else:
            content_cols.append(g)

@@ -3566,7 +3590,16 @@ def positional_column_regions(

    # Sort content columns left→right and assign positional labels
    content_cols.sort(key=lambda g: g.x)
-    labels = ['column_en', 'column_de', 'column_example']
+
+    # Exactly 2 content columns, left very wide (>35%) and right also wide (>20%):
+    # the left likely contains EN+DE combined, so the right one is examples.
+    if (len(content_cols) == 2
+            and content_cols[0].width_ratio > 0.35
+            and content_cols[1].width_ratio > 0.20):
+        labels = ['column_en', 'column_example']
+    else:
+        labels = ['column_en', 'column_de', 'column_example']
+
    regions = list(structural)
    for i, g in enumerate(content_cols):
        label = labels[i] if i < len(labels) else 'column_example'
@@ -1177,7 +1177,7 @@ async def upload_pdf_get_info(
    if OCR_PIPELINE_AVAILABLE:
        for pg in range(page_count):
            try:
-                img_bgr = render_pdf_high_res(content, pg, zoom=1.0)
+                img_bgr = render_pdf_high_res(content, pg, zoom=2.0)
                _, rotation = detect_and_fix_orientation(img_bgr)
                if rotation:
                    page_rotations[pg] = rotation