debug: Logging fuer Sub-Session Woertererkennung

Zeigt low-confidence Woerter (conf<30) und Zellinhalte pro Zeile, um fehlende Euro/Pfund-Betraege zu diagnostizieren.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-10 09:31:34 +01:00
parent f65bd11919
commit 3a791179af
2 changed files with 23 additions and 7 deletions
@@ -370,22 +370,28 @@ def build_cell_grid_v2(
                # Filter low-confidence words
                words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]

+                # Single full-width column (box sub-session): preserve spacing
+                is_single_full_column = (
+                    len(relevant_cols) == 1
+                    and img_w > 0
+                    and relevant_cols[0].width / img_w > 0.9
+                )
+
                if words:
                    y_tol = max(15, row.height)
-                    # Single full-width column (box sub-session): preserve spacing
-                    is_single_full_column = (
-                        len(relevant_cols) == 1
-                        and img_w > 0
-                        and relevant_cols[0].width / img_w > 0.9
-                    )
                    if is_single_full_column:
                        text = _words_to_spaced_text(words, y_tolerance_px=y_tol)
+                        logger.debug(f"R{row_idx:02d}: {len(words)} words, "
+                                     f"text={text!r:.100}")
                    else:
                        text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
                    avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
                else:
                    text = ''
                    avg_conf = 0.0
+                    if is_single_full_column:
+                        logger.debug(f"R{row_idx:02d}: 0 words (row has "
+                                     f"{row.word_count} total, y={row.y}..{row.y+row.height})")

                # Apply noise filter
                text = _clean_cell_text(text)
@@ -1248,7 +1248,17 @@ async def detect_columns(session_id: str):
                    'width': int(data['width'][i]),
                    'height': int(data['height'][i]),
                })
-            logger.info(f"OCR Pipeline: sub-session {session_id}: Tesseract found {len(word_dicts)} words")
+            # Log all words including low-confidence ones for debugging
+            all_count = sum(1 for i in range(len(data['text']))
+                            if str(data['text'][i]).strip())
+            low_conf = [(str(data['text'][i]).strip(), int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1)
+                        for i in range(len(data['text']))
+                        if str(data['text'][i]).strip()
+                        and (int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1) < 30
+                        and (int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1) >= 0]
+            if low_conf:
+                logger.info(f"OCR Pipeline: sub-session {session_id}: {len(low_conf)} words below conf 30: {low_conf[:20]}")
+            logger.info(f"OCR Pipeline: sub-session {session_id}: Tesseract found {len(word_dicts)}/{all_count} words (conf>=30)")
        except Exception as e:
            logger.warning(f"OCR Pipeline: sub-session {session_id}: Tesseract failed: {e}")
            word_dicts = []