From 964c916a816172decfc1569776e54a5f2be0b5f7 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Tue, 10 Mar 2026 09:41:25 +0100 Subject: [PATCH] fix: _clean_cell_text entfernt Waehrungssymbole am Zeilenende MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _is_noise_tail_token() stuft rein nicht-alphabetische Tokens wie €0.50, £1, €2.50 als OCR-Noise ein und entfernt sie. Zusaetzlich zerstoert ' '.join(tokens) das proportionale Spacing. Fuer Single-Column Sub-Sessions wird _clean_cell_text uebersprungen. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_cell_grid.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/klausur-service/backend/cv_cell_grid.py b/klausur-service/backend/cv_cell_grid.py index 56ccc53..e20499d 100644 --- a/klausur-service/backend/cv_cell_grid.py +++ b/klausur-service/backend/cv_cell_grid.py @@ -393,8 +393,13 @@ def build_cell_grid_v2( logger.info(f"R{row_idx:02d}: 0 words (row has " f"{row.word_count} total, y={row.y}..{row.y+row.height})") - # Apply noise filter - text = _clean_cell_text(text) + # Apply noise filter — but NOT for single-column sub-sessions: + # 1. _clean_cell_text strips trailing non-alpha tokens (e.g. €0.50, + # £1, €2.50) which are valid content in box layouts. + # 2. _clean_cell_text joins tokens with single space, destroying + # the proportional spacing from _words_to_spaced_text. + if not is_single_full_column: + text = _clean_cell_text(text) cell = { 'cell_id': f"R{row_idx:02d}_C{col_idx}",