From 356d39d6eeb3be199ef87a47a3ec38d66f2ebb2f Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sat, 28 Feb 2026 09:40:04 +0100 Subject: [PATCH] fix(ocr-pipeline): use PSM 6 (block) for multi-line cell OCR in word grid PSM 7 (single line) missed the second line in cells with two lines. PSM 6 handles multi-line content. Also fix sort order to Y-then-X for correct reading order. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_vocab_pipeline.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 5acaf1c..72b22a7 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -2264,10 +2264,10 @@ def build_word_grid( ) cell_lang = lang_map.get(col.type, lang) - words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=7) + words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6) - # Sort words by x position, join to text - words.sort(key=lambda w: w['left']) + # Sort words by Y then X (reading order for multi-line cells) + words.sort(key=lambda w: (w['top'], w['left'])) text = ' '.join(w['text'] for w in words) if words: avg_conf = sum(w['conf'] for w in words) / len(words)