From 3a791179af21ea75ed00fdfd5a4c6aab9a389b40 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Tue, 10 Mar 2026 09:31:34 +0100 Subject: [PATCH] debug: Logging fuer Sub-Session Woertererkennung Zeigt low-confidence Woerter (conf<30) und Zellinhalte pro Zeile, um fehlende Euro/Pfund-Betraege zu diagnostizieren. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_cell_grid.py | 18 ++++++++++++------ klausur-service/backend/ocr_pipeline_api.py | 12 +++++++++++- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/klausur-service/backend/cv_cell_grid.py b/klausur-service/backend/cv_cell_grid.py index cdae606..71dd587 100644 --- a/klausur-service/backend/cv_cell_grid.py +++ b/klausur-service/backend/cv_cell_grid.py @@ -370,22 +370,28 @@ def build_cell_grid_v2( # Filter low-confidence words words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF] + # Single full-width column (box sub-session): preserve spacing + is_single_full_column = ( + len(relevant_cols) == 1 + and img_w > 0 + and relevant_cols[0].width / img_w > 0.9 + ) + if words: y_tol = max(15, row.height) - # Single full-width column (box sub-session): preserve spacing - is_single_full_column = ( - len(relevant_cols) == 1 - and img_w > 0 - and relevant_cols[0].width / img_w > 0.9 - ) if is_single_full_column: text = _words_to_spaced_text(words, y_tolerance_px=y_tol) + logger.debug(f"R{row_idx:02d}: {len(words)} words, " + f"text={text!r:.100}") else: text = _words_to_reading_order_text(words, y_tolerance_px=y_tol) avg_conf = round(sum(w['conf'] for w in words) / len(words), 1) else: text = '' avg_conf = 0.0 + if is_single_full_column: + logger.debug(f"R{row_idx:02d}: 0 words (row has " + f"{row.word_count} total, y={row.y}..{row.y+row.height})") # Apply noise filter text = _clean_cell_text(text) diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py index 342fbaa..f0bb08d 100644 --- a/klausur-service/backend/ocr_pipeline_api.py +++ b/klausur-service/backend/ocr_pipeline_api.py @@ -1248,7 +1248,17 @@ async def detect_columns(session_id: str): 'width': int(data['width'][i]), 'height': int(data['height'][i]), }) - logger.info(f"OCR Pipeline: sub-session {session_id}: Tesseract found {len(word_dicts)} words") + # Log all words including low-confidence ones for debugging + all_count = sum(1 for i in range(len(data['text'])) + if str(data['text'][i]).strip()) + low_conf = [(str(data['text'][i]).strip(), int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1) + for i in range(len(data['text'])) + if str(data['text'][i]).strip() + and (int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1) < 30 + and (int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1) >= 0] + if low_conf: + logger.info(f"OCR Pipeline: sub-session {session_id}: {len(low_conf)} words below conf 30: {low_conf[:20]}") + logger.info(f"OCR Pipeline: sub-session {session_id}: Tesseract found {len(word_dicts)}/{all_count} words (conf>=30)") except Exception as e: logger.warning(f"OCR Pipeline: sub-session {session_id}: Tesseract failed: {e}") word_dicts = []