From 4e8ea77140db1cdb5e2b3e7d72046e0ff4820fca Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sat, 7 Mar 2026 19:35:21 +0100 Subject: [PATCH] fix: leere Spalten als strukturell behandeln + 2-Spalten-Layout korrekt labeln Spalten mit <=2 Woertern und <15% Breite werden jetzt als column_marker statt als content-Spalte klassifiziert. Bei 2 breiten Content-Spalten wird die rechte als column_example statt column_de gelabelt, da die linke Spalte EN+DE kombiniert enthaelt. OSD-Zoom von 1.0 auf 2.0 erhoeht fuer zuverlaessigere Orientierungserkennung. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_vocab_pipeline.py | 35 ++++++++++++++++++- .../backend/vocab_worksheet_api.py | 2 +- 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index fc5e690..303d890 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -2145,6 +2145,22 @@ def _split_broad_columns( if best_gap is None or gw > best_gap[2]: best_gap = (gap_start, len(low_mask), gw) + # Log all gaps found for debugging + all_gaps = [] + _gs = None + for px in range(len(low_mask)): + if low_mask[px]: + if _gs is None: + _gs = px + else: + if _gs is not None: + all_gaps.append((_gs, px, px - _gs)) + _gs = None + if _gs is not None: + all_gaps.append((_gs, len(low_mask), len(low_mask) - _gs)) + logger.info(f"SplitBroadCols: col {geo.index} coverage gaps (>=5px): " + f"{[g for g in all_gaps if g[2] >= 5]}, best={best_gap}") + if best_gap is None or best_gap[2] < _min_gap_px: result.append(geo) continue @@ -3547,6 +3563,14 @@ def positional_column_regions( classification_confidence=0.95, classification_method='positional', )) + # empty or near-empty narrow column → treat as margin/structural + elif g.word_count <= 2 and g.width_ratio < 0.15: + structural.append(PageRegion( + type='column_marker', x=g.x, y=g.y, + width=g.width, height=content_h, + classification_confidence=0.85, + classification_method='positional', + )) else: content_cols.append(g) @@ -3566,7 +3590,16 @@ def positional_column_regions( # Sort content columns left→right and assign positional labels content_cols.sort(key=lambda g: g.x) - labels = ['column_en', 'column_de', 'column_example'] + + # With exactly 2 content columns: if the left one is very wide (>35%), + # it likely contains EN+DE combined, so the right one is examples. + if (len(content_cols) == 2 + and content_cols[0].width_ratio > 0.35 + and content_cols[1].width_ratio > 0.20): + labels = ['column_en', 'column_example'] + else: + labels = ['column_en', 'column_de', 'column_example'] + regions = list(structural) for i, g in enumerate(content_cols): label = labels[i] if i < len(labels) else 'column_example' diff --git a/klausur-service/backend/vocab_worksheet_api.py b/klausur-service/backend/vocab_worksheet_api.py index 5b8d45a..2ae025d 100644 --- a/klausur-service/backend/vocab_worksheet_api.py +++ b/klausur-service/backend/vocab_worksheet_api.py @@ -1177,7 +1177,7 @@ async def upload_pdf_get_info( if OCR_PIPELINE_AVAILABLE: for pg in range(page_count): try: - img_bgr = render_pdf_high_res(content, pg, zoom=1.0) + img_bgr = render_pdf_high_res(content, pg, zoom=2.0) _, rotation = detect_and_fix_orientation(img_bgr) if rotation: page_rotations[pg] = rotation