debug: Logging für Sub-Session-Wörtererkennung
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 31s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-agent-core (push) Has been cancelled
CI / test-nodejs-website (push) Has been cancelled
CI / test-python-klausur (push) Has been cancelled
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 31s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-agent-core (push) Has been cancelled
CI / test-nodejs-website (push) Has been cancelled
CI / test-python-klausur (push) Has been cancelled
Zeigt Low-Confidence-Wörter (conf < 30) und Zellinhalte pro Zeile, um fehlende Euro-/Pfund-Beträge zu diagnostizieren.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -370,22 +370,28 @@ def build_cell_grid_v2(
|
||||
# Filter low-confidence words
|
||||
words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]
|
||||
|
||||
# Single full-width column (box sub-session): preserve spacing
|
||||
is_single_full_column = (
|
||||
len(relevant_cols) == 1
|
||||
and img_w > 0
|
||||
and relevant_cols[0].width / img_w > 0.9
|
||||
)
|
||||
|
||||
if words:
|
||||
y_tol = max(15, row.height)
|
||||
# Single full-width column (box sub-session): preserve spacing
|
||||
is_single_full_column = (
|
||||
len(relevant_cols) == 1
|
||||
and img_w > 0
|
||||
and relevant_cols[0].width / img_w > 0.9
|
||||
)
|
||||
if is_single_full_column:
|
||||
text = _words_to_spaced_text(words, y_tolerance_px=y_tol)
|
||||
logger.debug(f"R{row_idx:02d}: {len(words)} words, "
|
||||
f"text={text!r:.100}")
|
||||
else:
|
||||
text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
|
||||
avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
|
||||
else:
|
||||
text = ''
|
||||
avg_conf = 0.0
|
||||
if is_single_full_column:
|
||||
logger.debug(f"R{row_idx:02d}: 0 words (row has "
|
||||
f"{row.word_count} total, y={row.y}..{row.y+row.height})")
|
||||
|
||||
# Apply noise filter
|
||||
text = _clean_cell_text(text)
|
||||
|
||||
@@ -1248,7 +1248,17 @@ async def detect_columns(session_id: str):
|
||||
'width': int(data['width'][i]),
|
||||
'height': int(data['height'][i]),
|
||||
})
|
||||
logger.info(f"OCR Pipeline: sub-session {session_id}: Tesseract found {len(word_dicts)} words")
|
||||
# Log all words including low-confidence ones for debugging
|
||||
all_count = sum(1 for i in range(len(data['text']))
|
||||
if str(data['text'][i]).strip())
|
||||
low_conf = [(str(data['text'][i]).strip(), int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1)
|
||||
for i in range(len(data['text']))
|
||||
if str(data['text'][i]).strip()
|
||||
and (int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1) < 30
|
||||
and (int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1) >= 0]
|
||||
if low_conf:
|
||||
logger.info(f"OCR Pipeline: sub-session {session_id}: {len(low_conf)} words below conf 30: {low_conf[:20]}")
|
||||
logger.info(f"OCR Pipeline: sub-session {session_id}: Tesseract found {len(word_dicts)}/{all_count} words (conf>=30)")
|
||||
except Exception as e:
|
||||
logger.warning(f"OCR Pipeline: sub-session {session_id}: Tesseract failed: {e}")
|
||||
word_dicts = []
|
||||
|
||||
Reference in New Issue
Block a user