From 3a791179af21ea75ed00fdfd5a4c6aab9a389b40 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Tue, 10 Mar 2026 09:31:34 +0100
Subject: [PATCH] debug: Logging fuer Sub-Session Woertererkennung

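Loggt pro Sub-Session die low-confidence Woerter (0 <= conf < 30, max. 20
Eintraege) sowie die Gesamtzahl der erkannten Woerter. Bei einspaltigen
Box-Sub-Sessions (Spalte > 90 % der Bildbreite) wird zusaetzlich der
Zellinhalt pro Zeile auf Debug-Level ausgegeben, um fehlende
Euro/Pfund-Betraege zu diagnostizieren.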

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/cv_cell_grid.py     | 18 ++++++++++++------
 klausur-service/backend/ocr_pipeline_api.py | 12 +++++++++++-
 2 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/klausur-service/backend/cv_cell_grid.py b/klausur-service/backend/cv_cell_grid.py
index cdae606..71dd587 100644
--- a/klausur-service/backend/cv_cell_grid.py
+++ b/klausur-service/backend/cv_cell_grid.py
@@ -370,22 +370,28 @@ def build_cell_grid_v2(
                 # Filter low-confidence words
                 words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]
 
+                # Single full-width column (box sub-session): preserve spacing
+                is_single_full_column = (
+                    len(relevant_cols) == 1
+                    and img_w > 0
+                    and relevant_cols[0].width / img_w > 0.9
+                )
+
                 if words:
                     y_tol = max(15, row.height)
-                    # Single full-width column (box sub-session): preserve spacing
-                    is_single_full_column = (
-                        len(relevant_cols) == 1
-                        and img_w > 0
-                        and relevant_cols[0].width / img_w > 0.9
-                    )
                     if is_single_full_column:
                         text = _words_to_spaced_text(words, y_tolerance_px=y_tol)
+                        logger.debug(f"R{row_idx:02d}: {len(words)} words, "
+                                     f"text={text!r:.100}")
                     else:
                         text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
                     avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
                 else:
                     text = ''
                     avg_conf = 0.0
+                    if is_single_full_column:
+                        logger.debug(f"R{row_idx:02d}: 0 words (row has "
+                                     f"{row.word_count} total, y={row.y}..{row.y+row.height})")
 
                 # Apply noise filter
                 text = _clean_cell_text(text)
diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py
index 342fbaa..f0bb08d 100644
--- a/klausur-service/backend/ocr_pipeline_api.py
+++ b/klausur-service/backend/ocr_pipeline_api.py
@@ -1248,7 +1248,17 @@ async def detect_columns(session_id: str):
                     'width': int(data['width'][i]),
                     'height': int(data['height'][i]),
                 })
-            logger.info(f"OCR Pipeline: sub-session {session_id}: Tesseract found {len(word_dicts)} words")
+            # Count all non-empty words and log the low-confidence ones (0 <= conf < 30) for debugging
+            all_count = sum(1 for i in range(len(data['text']))
+                            if str(data['text'][i]).strip())
+            low_conf = [(str(data['text'][i]).strip(), int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1)
+                        for i in range(len(data['text']))
+                        if str(data['text'][i]).strip()
+                        and (int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1) < 30
+                        and (int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1) >= 0]
+            if low_conf:
+                logger.info(f"OCR Pipeline: sub-session {session_id}: {len(low_conf)} words below conf 30: {low_conf[:20]}")
+            logger.info(f"OCR Pipeline: sub-session {session_id}: Tesseract found {len(word_dicts)}/{all_count} words (conf>=30)")
         except Exception as e:
             logger.warning(f"OCR Pipeline: sub-session {session_id}: Tesseract failed: {e}")
             word_dicts = []