fix(ocr-pipeline): re-populate row.words for word-lookup in Step 5

The row_result stored in DB excludes words to keep payload small.
When Step 5 reconstructs RowGeometry from DB, words were empty,
causing word-lookup to find nothing and return blank cells.

Now re-populates row.words from cached _word_dicts (or re-runs
detect_column_geometry if cache is cold) before cell grid building.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-02 07:38:33 +01:00
parent 77869e32f4
commit 9bbde1c03e

View File

@@ -1084,6 +1084,36 @@ async def detect_words(
for r in row_result["rows"]
]
# Re-populate row.words from cached full-page Tesseract words.
# Word-lookup in _ocr_single_cell needs these to avoid re-running OCR.
word_dicts = cached.get("_word_dicts")
if word_dicts is None:
ocr_img_tmp = create_ocr_image(dewarped_bgr)
geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr)
if geo_result is not None:
_geoms, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result
cached["_word_dicts"] = word_dicts
cached["_inv"] = inv
cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)
if word_dicts:
# words['top'] is relative to content-ROI top_y.
# row.y is absolute. Convert: row_y_rel = row.y - top_y.
content_bounds = cached.get("_content_bounds")
if content_bounds:
_lx, _rx, top_y, _by = content_bounds
else:
top_y = min(r.y for r in row_geoms) if row_geoms else 0
for row in row_geoms:
row_y_rel = row.y - top_y
row_bottom_rel = row_y_rel + row.height
row.words = [
w for w in word_dicts
if row_y_rel <= w['top'] + w['height'] / 2 < row_bottom_rel
]
row.word_count = len(row.words)
if stream:
return StreamingResponse(
_word_stream_generator(