From 4e8ea77140db1cdb5e2b3e7d72046e0ff4820fca Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Sat, 7 Mar 2026 19:35:21 +0100
Subject: [PATCH] fix: leere Spalten als strukturell behandeln +
 2-Spalten-Layout korrekt labeln

Spalten mit <=2 Woertern und <15% Breite werden jetzt als column_marker
statt als content-Spalte klassifiziert. Bei 2 breiten Content-Spalten
wird die rechte als column_example statt column_de gelabelt, da die
linke Spalte EN+DE kombiniert enthaelt.
OSD-Zoom von 1.0 auf 2.0 erhoeht fuer zuverlaessigere Orientierungserkennung.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/cv_vocab_pipeline.py  | 35 ++++++++++++++++++-
 .../backend/vocab_worksheet_api.py            |  2 +-
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index fc5e690..303d890 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -2145,6 +2145,22 @@ def _split_broad_columns(
             if best_gap is None or gw > best_gap[2]:
                 best_gap = (gap_start, len(low_mask), gw)
 
+        # Collect all gaps; log those >=5px wide plus the best gap, for debugging
+        all_gaps = []
+        _gs = None
+        for px in range(len(low_mask)):
+            if low_mask[px]:
+                if _gs is None:
+                    _gs = px
+            else:
+                if _gs is not None:
+                    all_gaps.append((_gs, px, px - _gs))
+                    _gs = None
+        if _gs is not None:
+            all_gaps.append((_gs, len(low_mask), len(low_mask) - _gs))
+        logger.info(f"SplitBroadCols: col {geo.index} coverage gaps (>=5px): "
+                    f"{[g for g in all_gaps if g[2] >= 5]}, best={best_gap}")
+
         if best_gap is None or best_gap[2] < _min_gap_px:
             result.append(geo)
             continue
@@ -3547,6 +3563,14 @@ def positional_column_regions(
                 classification_confidence=0.95,
                 classification_method='positional',
             ))
+        # empty or near-empty narrow column → treat as margin/structural
+        elif g.word_count <= 2 and g.width_ratio < 0.15:
+            structural.append(PageRegion(
+                type='column_marker', x=g.x, y=g.y,
+                width=g.width, height=content_h,
+                classification_confidence=0.85,
+                classification_method='positional',
+            ))
         else:
             content_cols.append(g)
 
@@ -3566,7 +3590,16 @@ def positional_column_regions(
 
     # Sort content columns left→right and assign positional labels
     content_cols.sort(key=lambda g: g.x)
-    labels = ['column_en', 'column_de', 'column_example']
+
+    # With exactly 2 content columns, both wide (left >35%, right >20%):
+    # the left likely contains EN+DE combined, so the right one is examples.
+    if (len(content_cols) == 2
+            and content_cols[0].width_ratio > 0.35
+            and content_cols[1].width_ratio > 0.20):
+        labels = ['column_en', 'column_example']
+    else:
+        labels = ['column_en', 'column_de', 'column_example']
+
     regions = list(structural)
     for i, g in enumerate(content_cols):
         label = labels[i] if i < len(labels) else 'column_example'
diff --git a/klausur-service/backend/vocab_worksheet_api.py b/klausur-service/backend/vocab_worksheet_api.py
index 5b8d45a..2ae025d 100644
--- a/klausur-service/backend/vocab_worksheet_api.py
+++ b/klausur-service/backend/vocab_worksheet_api.py
@@ -1177,7 +1177,7 @@ async def upload_pdf_get_info(
     if OCR_PIPELINE_AVAILABLE:
         for pg in range(page_count):
             try:
-                img_bgr = render_pdf_high_res(content, pg, zoom=1.0)
+                img_bgr = render_pdf_high_res(content, pg, zoom=2.0)
                 _, rotation = detect_and_fix_orientation(img_bgr)
                 if rotation:
                     page_rotations[pg] = rotation