From e1ae5d5fa9d5ffa69b8f0108ed876055278640f9 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sat, 7 Mar 2026 22:16:29 +0100 Subject: [PATCH] fix: Edge-Gaps in _split_broad_columns ignorieren + return-Tuple bei leerem Ergebnis Gaps, die den Spaltenrand beruehren (Margins), werden jetzt ausgeschlossen, nur interne Gaps werden als Split-Kandidaten betrachtet. Behebt das Problem, dass trailing whitespace faelschlich als groesster Gap gewaehlt wurde. Early-return in _run_ocr_pipeline_for_page gibt jetzt korrekt ([], rotation) statt [] zurueck. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_vocab_pipeline.py | 35 +++++++------------ .../backend/vocab_worksheet_api.py | 2 +- 2 files changed, 14 insertions(+), 23 deletions(-) diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 303d890..1c4961d 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -2125,27 +2125,9 @@ def _split_broad_columns( if cmax > 0: coverage /= cmax - # Find gaps where coverage < 0.5 + # Find INTERNAL gaps where coverage < 0.5 + # Exclude edge gaps (touching pixel 0 or geo.width) — those are margins. 
low_mask = coverage < 0.5 - gap_start = None - best_gap = None # (start, end, width) - for px in range(len(low_mask)): - if low_mask[px]: - if gap_start is None: - gap_start = px - else: - if gap_start is not None: - gw = px - gap_start - if best_gap is None or gw > best_gap[2]: - best_gap = (gap_start, px, gw) - gap_start = None - # Handle trailing gap - if gap_start is not None: - gw = len(low_mask) - gap_start - if best_gap is None or gw > best_gap[2]: - best_gap = (gap_start, len(low_mask), gw) - - # Log all gaps found for debugging all_gaps = [] _gs = None for px in range(len(low_mask)): @@ -2158,8 +2140,17 @@ def _split_broad_columns( _gs = None if _gs is not None: all_gaps.append((_gs, len(low_mask), len(low_mask) - _gs)) - logger.info(f"SplitBroadCols: col {geo.index} coverage gaps (>=5px): " - f"{[g for g in all_gaps if g[2] >= 5]}, best={best_gap}") + + # Filter: only internal gaps (not touching column edges) + _edge_margin = 10 # pixels from edge to ignore + internal_gaps = [g for g in all_gaps + if g[0] > _edge_margin and g[1] < geo.width - _edge_margin] + best_gap = max(internal_gaps, key=lambda g: g[2]) if internal_gaps else None + + logger.info(f"SplitBroadCols: col {geo.index} all_gaps(>=5px): " + f"{[g for g in all_gaps if g[2] >= 5]}, " + f"internal(>=5px): {[g for g in internal_gaps if g[2] >= 5]}, " + f"best={best_gap}") if best_gap is None or best_gap[2] < _min_gap_px: result.append(geo) diff --git a/klausur-service/backend/vocab_worksheet_api.py b/klausur-service/backend/vocab_worksheet_api.py index 2ae025d..5f787d8 100644 --- a/klausur-service/backend/vocab_worksheet_api.py +++ b/klausur-service/backend/vocab_worksheet_api.py @@ -1510,7 +1510,7 @@ async def _run_ocr_pipeline_for_page( if not is_vocab: logger.warning(f" Page {page_number + 1}: layout is not vocab table " f"(types: {col_types}), returning empty") - return [] + return [], rotation # 8. Map cells → vocab entries entries = _cells_to_vocab_entries(cells, columns_meta)