From d98359fcebcf2ef1114dd952e7a08c0b99c77639 Mon Sep 17 00:00:00 2001
From: Benjamin Admin
Date: Sat, 7 Mar 2026 22:51:14 +0100
Subject: [PATCH] fix: _split_broad_columns nur bei maximal 1 breiter Spalte ausfuehren

Wenn bereits 2+ breite Content-Spalten existieren, ist das Layout
wahrscheinlich korrekt in EN/DE getrennt. Split wird nur ausgefuehrt
wenn eine einzelne breite Spalte EN+DE kombiniert enthaelt.

Co-Authored-By: Claude Opus 4.6
---
 klausur-service/backend/cv_vocab_pipeline.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index 1c4961d..c2a01d1 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -2097,6 +2097,14 @@ def _split_broad_columns(
     logger.info(f"SplitBroadCols: input {len(geometries)} cols: "
                 f"{[(g.index, g.x, g.width, g.word_count, round(g.width_ratio, 3)) for g in geometries]}")
 
+    # Count how many broad content columns exist. If there are already 2+,
+    # the layout is likely already correctly split into EN / DE — skip.
+    broad_count = sum(1 for g in geometries
+                      if g.width_ratio > _broad_threshold and len(g.words) >= 10)
+    if broad_count >= 2:
+        logger.info(f"SplitBroadCols: {broad_count} broad cols already → skip")
+        return geometries
+
     for geo in geometries:
         if geo.width_ratio <= _broad_threshold or len(geo.words) < 10:
             result.append(geo)