From 9bbde1c03e8b4b1c705e4c1d533cf25554540b14 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Mon, 2 Mar 2026 07:38:33 +0100 Subject: [PATCH] fix(ocr-pipeline): re-populate row.words for word-lookup in Step 5 The row_result stored in DB excludes words to keep payload small. When Step 5 reconstructs RowGeometry from DB, words were empty, causing word-lookup to find nothing and return blank cells. Now re-populates row.words from cached _word_dicts (or re-runs detect_column_geometry if cache is cold) before cell grid building. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/ocr_pipeline_api.py | 30 +++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py index 99f6be6..24f1d4a 100644 --- a/klausur-service/backend/ocr_pipeline_api.py +++ b/klausur-service/backend/ocr_pipeline_api.py @@ -1084,6 +1084,36 @@ async def detect_words( for r in row_result["rows"] ] + # Re-populate row.words from cached full-page Tesseract words. + # Word-lookup in _ocr_single_cell needs these to avoid re-running OCR. + word_dicts = cached.get("_word_dicts") + if word_dicts is None: + ocr_img_tmp = create_ocr_image(dewarped_bgr) + geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr) + if geo_result is not None: + _geoms, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result + cached["_word_dicts"] = word_dicts + cached["_inv"] = inv + cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y) + + if word_dicts: + # words['top'] is relative to content-ROI top_y. + # row.y is absolute. Convert: row_y_rel = row.y - top_y. + content_bounds = cached.get("_content_bounds") + if content_bounds: + _lx, _rx, top_y, _by = content_bounds + else: + top_y = min(r.y for r in row_geoms) if row_geoms else 0 + + for row in row_geoms: + row_y_rel = row.y - top_y + row_bottom_rel = row_y_rel + row.height + row.words = [ + w for w in word_dicts + if row_y_rel <= w['top'] + w['height'] / 2 < row_bottom_rel + ] + row.word_count = len(row.words) + if stream: return StreamingResponse( _word_stream_generator(