fix(ocr-pipeline): re-populate row.words for word-lookup in Step 5
The row_result stored in the DB excludes words to keep the payload small. As a result, when Step 5 reconstructed RowGeometry from the DB, row.words was empty, so word-lookup found nothing and returned blank cells. This commit re-populates row.words from the cached _word_dicts (or re-runs detect_column_geometry if the cache is cold) before the cell grid is built.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1084,6 +1084,36 @@ async def detect_words(
|
||||
for r in row_result["rows"]
|
||||
]
|
||||
|
||||
# Re-populate row.words from cached full-page Tesseract words.
|
||||
# Word-lookup in _ocr_single_cell needs these to avoid re-running OCR.
|
||||
word_dicts = cached.get("_word_dicts")
|
||||
if word_dicts is None:
|
||||
ocr_img_tmp = create_ocr_image(dewarped_bgr)
|
||||
geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr)
|
||||
if geo_result is not None:
|
||||
_geoms, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result
|
||||
cached["_word_dicts"] = word_dicts
|
||||
cached["_inv"] = inv
|
||||
cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)
|
||||
|
||||
if word_dicts:
|
||||
# words['top'] is relative to content-ROI top_y.
|
||||
# row.y is absolute. Convert: row_y_rel = row.y - top_y.
|
||||
content_bounds = cached.get("_content_bounds")
|
||||
if content_bounds:
|
||||
_lx, _rx, top_y, _by = content_bounds
|
||||
else:
|
||||
top_y = min(r.y for r in row_geoms) if row_geoms else 0
|
||||
|
||||
for row in row_geoms:
|
||||
row_y_rel = row.y - top_y
|
||||
row_bottom_rel = row_y_rel + row.height
|
||||
row.words = [
|
||||
w for w in word_dicts
|
||||
if row_y_rel <= w['top'] + w['height'] / 2 < row_bottom_rel
|
||||
]
|
||||
row.word_count = len(row.words)
|
||||
|
||||
if stream:
|
||||
return StreamingResponse(
|
||||
_word_stream_generator(
|
||||
|
||||
Reference in New Issue
Block a user