debug: Logging für Sub-Session-Wörtererkennung
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 31s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-agent-core (push) Has been cancelled
CI / test-nodejs-website (push) Has been cancelled
CI / test-python-klausur (push) Has been cancelled

Zeigt Low-Confidence-Wörter (conf<30) und Zellinhalte pro Zeile,
um fehlende Euro-/Pfund-Beträge zu diagnostizieren.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-10 09:31:34 +01:00
parent f65bd11919
commit 3a791179af
2 changed files with 23 additions and 7 deletions

View File

@@ -370,22 +370,28 @@ def build_cell_grid_v2(
# Filter low-confidence words
words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]
# Single full-width column (box sub-session): preserve spacing
is_single_full_column = (
len(relevant_cols) == 1
and img_w > 0
and relevant_cols[0].width / img_w > 0.9
)
if words:
y_tol = max(15, row.height)
# Single full-width column (box sub-session): preserve spacing
is_single_full_column = (
len(relevant_cols) == 1
and img_w > 0
and relevant_cols[0].width / img_w > 0.9
)
if is_single_full_column:
text = _words_to_spaced_text(words, y_tolerance_px=y_tol)
logger.debug(f"R{row_idx:02d}: {len(words)} words, "
f"text={text!r:.100}")
else:
text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
else:
text = ''
avg_conf = 0.0
if is_single_full_column:
logger.debug(f"R{row_idx:02d}: 0 words (row has "
f"{row.word_count} total, y={row.y}..{row.y+row.height})")
# Apply noise filter
text = _clean_cell_text(text)

View File

@@ -1248,7 +1248,17 @@ async def detect_columns(session_id: str):
'width': int(data['width'][i]),
'height': int(data['height'][i]),
})
logger.info(f"OCR Pipeline: sub-session {session_id}: Tesseract found {len(word_dicts)} words")
# Log all words including low-confidence ones for debugging
all_count = sum(1 for i in range(len(data['text']))
if str(data['text'][i]).strip())
low_conf = [(str(data['text'][i]).strip(), int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1)
for i in range(len(data['text']))
if str(data['text'][i]).strip()
and (int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1) < 30
and (int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1) >= 0]
if low_conf:
logger.info(f"OCR Pipeline: sub-session {session_id}: {len(low_conf)} words below conf 30: {low_conf[:20]}")
logger.info(f"OCR Pipeline: sub-session {session_id}: Tesseract found {len(word_dicts)}/{all_count} words (conf>=30)")
except Exception as e:
logger.warning(f"OCR Pipeline: sub-session {session_id}: Tesseract failed: {e}")
word_dicts = []