fix(ocr-pipeline): re-populate row.words for word-lookup in Step 5
The row_result stored in the DB excludes words to keep the payload small. As a result, when Step 5 reconstructed RowGeometry from the DB, row.words was empty, so word-lookup found nothing and returned blank cells. This change re-populates row.words from the cached _word_dicts (or re-runs detect_column_geometry if the cache is cold) before the cell grid is built. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1084,6 +1084,36 @@ async def detect_words(
|
|||||||
for r in row_result["rows"]
|
for r in row_result["rows"]
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Re-populate row.words from cached full-page Tesseract words.
|
||||||
|
# Word-lookup in _ocr_single_cell needs these to avoid re-running OCR.
|
||||||
|
word_dicts = cached.get("_word_dicts")
|
||||||
|
if word_dicts is None:
|
||||||
|
ocr_img_tmp = create_ocr_image(dewarped_bgr)
|
||||||
|
geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr)
|
||||||
|
if geo_result is not None:
|
||||||
|
_geoms, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result
|
||||||
|
cached["_word_dicts"] = word_dicts
|
||||||
|
cached["_inv"] = inv
|
||||||
|
cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)
|
||||||
|
|
||||||
|
if word_dicts:
|
||||||
|
# words['top'] is relative to content-ROI top_y.
|
||||||
|
# row.y is absolute. Convert: row_y_rel = row.y - top_y.
|
||||||
|
content_bounds = cached.get("_content_bounds")
|
||||||
|
if content_bounds:
|
||||||
|
_lx, _rx, top_y, _by = content_bounds
|
||||||
|
else:
|
||||||
|
top_y = min(r.y for r in row_geoms) if row_geoms else 0
|
||||||
|
|
||||||
|
for row in row_geoms:
|
||||||
|
row_y_rel = row.y - top_y
|
||||||
|
row_bottom_rel = row_y_rel + row.height
|
||||||
|
row.words = [
|
||||||
|
w for w in word_dicts
|
||||||
|
if row_y_rel <= w['top'] + w['height'] / 2 < row_bottom_rel
|
||||||
|
]
|
||||||
|
row.word_count = len(row.words)
|
||||||
|
|
||||||
if stream:
|
if stream:
|
||||||
return StreamingResponse(
|
return StreamingResponse(
|
||||||
_word_stream_generator(
|
_word_stream_generator(
|
||||||
|
|||||||
Reference in New Issue
Block a user