fix(ocr-pipeline): prevent grid from producing more rows than gap-based
Two fixes:

1. Grid validation: reject the word-center grid if it produces MORE rows than gap-based detection (more rows means lines were split, which is worse). Falls back to the gap-based rows in that case.
2. Words overlay: draw clean grid cells (column × row intersections) instead of padded entry bboxes. This eliminates the confusing double lines. OCR text labels are placed directly inside the grid cells.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1829,6 +1829,13 @@ def _regularize_row_grid(
|
|||||||
# Remove empty grid rows (no words assigned)
|
# Remove empty grid rows (no words assigned)
|
||||||
grid_rows = [gr for gr in grid_rows if gr.word_count > 0]
|
grid_rows = [gr for gr in grid_rows if gr.word_count > 0]
|
||||||
|
|
||||||
|
# The grid must not produce MORE rows than gap-based detection.
|
||||||
|
# More rows means the clustering split actual lines — that's worse.
|
||||||
|
if len(grid_rows) > len(content_rows):
|
||||||
|
logger.info(f"RowGrid: grid produced {len(grid_rows)} rows > "
|
||||||
|
f"{len(content_rows)} gap-based → keeping gap-based rows")
|
||||||
|
return rows
|
||||||
|
|
||||||
# --- Step H: Merge header/footer + re-index ---
|
# --- Step H: Merge header/footer + re-index ---
|
||||||
result = list(non_content) + grid_rows
|
result = list(non_content) + grid_rows
|
||||||
result.sort(key=lambda r: r.y)
|
result.sort(key=lambda r: r.y)
|
||||||
|
|||||||
@@ -1256,8 +1256,8 @@ async def _get_words_overlay(session_id: str) -> Response:
|
|||||||
|
|
||||||
img_h, img_w = img.shape[:2]
|
img_h, img_w = img.shape[:2]
|
||||||
|
|
||||||
# Color map for cell types (BGR)
|
# Color map for column types (BGR)
|
||||||
cell_colors = {
|
col_colors = {
|
||||||
"column_en": (255, 180, 0), # Blue
|
"column_en": (255, 180, 0), # Blue
|
||||||
"column_de": (0, 200, 0), # Green
|
"column_de": (0, 200, 0), # Green
|
||||||
"column_example": (0, 140, 255), # Orange
|
"column_example": (0, 140, 255), # Orange
|
||||||
@@ -1265,28 +1265,43 @@ async def _get_words_overlay(session_id: str) -> Response:
|
|||||||
|
|
||||||
overlay = img.copy()
|
overlay = img.copy()
|
||||||
|
|
||||||
# Draw column divider lines (vertical)
|
# Build grid from column_result × row_result (the actual cells)
|
||||||
|
columns = []
|
||||||
if column_result and column_result.get("columns"):
|
if column_result and column_result.get("columns"):
|
||||||
for col in column_result["columns"]:
|
columns = [c for c in column_result["columns"]
|
||||||
col_type = col.get("type", "")
|
if c.get("type", "").startswith("column_")]
|
||||||
if col_type in cell_colors:
|
|
||||||
cx = col["x"]
|
|
||||||
cv2.line(img, (cx, 0), (cx, img_h), cell_colors[col_type], 1)
|
|
||||||
cx_end = col["x"] + col["width"]
|
|
||||||
cv2.line(img, (cx_end, 0), (cx_end, img_h), cell_colors[col_type], 1)
|
|
||||||
|
|
||||||
# Draw row divider lines (horizontal) for content rows
|
content_rows_data = []
|
||||||
if row_result and row_result.get("rows"):
|
if row_result and row_result.get("rows"):
|
||||||
for row in row_result["rows"]:
|
content_rows_data = [r for r in row_result["rows"]
|
||||||
if row.get("row_type") == "content":
|
if r.get("row_type") == "content"]
|
||||||
ry = row["y"]
|
|
||||||
cv2.line(img, (0, ry), (img_w, ry), (180, 180, 180), 1)
|
|
||||||
|
|
||||||
# Draw entry cells with text labels
|
# Draw grid: column × row cells
|
||||||
|
for col in columns:
|
||||||
|
col_type = col.get("type", "")
|
||||||
|
color = col_colors.get(col_type, (200, 200, 200))
|
||||||
|
cx, cw = col["x"], col["width"]
|
||||||
|
|
||||||
|
for row in content_rows_data:
|
||||||
|
ry, rh = row["y"], row["height"]
|
||||||
|
# Cell rectangle (exact grid intersection, no padding)
|
||||||
|
cv2.rectangle(img, (cx, ry), (cx + cw, ry + rh), color, 1)
|
||||||
|
# Semi-transparent fill
|
||||||
|
cv2.rectangle(overlay, (cx, ry), (cx + cw, ry + rh), color, -1)
|
||||||
|
|
||||||
|
# Place OCR text labels inside grid cells
|
||||||
|
# Build lookup: row_index → entry for fast access
|
||||||
entries = word_result["entries"]
|
entries = word_result["entries"]
|
||||||
|
entry_by_row: Dict[int, Dict] = {}
|
||||||
for entry in entries:
|
for entry in entries:
|
||||||
|
entry_by_row[entry.get("row_index", -1)] = entry
|
||||||
|
|
||||||
|
for row_idx, row in enumerate(content_rows_data):
|
||||||
|
entry = entry_by_row.get(row_idx)
|
||||||
|
if not entry:
|
||||||
|
continue
|
||||||
|
|
||||||
conf = entry.get("confidence", 0)
|
conf = entry.get("confidence", 0)
|
||||||
# Color by confidence: green > 70, yellow 50-70, red < 50
|
|
||||||
if conf >= 70:
|
if conf >= 70:
|
||||||
text_color = (0, 180, 0)
|
text_color = (0, 180, 0)
|
||||||
elif conf >= 50:
|
elif conf >= 50:
|
||||||
@@ -1294,35 +1309,27 @@ async def _get_words_overlay(session_id: str) -> Response:
|
|||||||
else:
|
else:
|
||||||
text_color = (0, 0, 220)
|
text_color = (0, 0, 220)
|
||||||
|
|
||||||
for bbox_key, field_key, col_type in [
|
ry, rh = row["y"], row["height"]
|
||||||
("bbox_en", "english", "column_en"),
|
|
||||||
("bbox_de", "german", "column_de"),
|
|
||||||
("bbox_ex", "example", "column_example"),
|
|
||||||
]:
|
|
||||||
bbox = entry.get(bbox_key)
|
|
||||||
text = entry.get(field_key, "")
|
|
||||||
if not bbox or not text:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Convert percent to pixels
|
for col in columns:
|
||||||
bx = int(bbox["x"] / 100 * img_w)
|
col_type = col.get("type", "")
|
||||||
by = int(bbox["y"] / 100 * img_h)
|
cx, cw = col["x"], col["width"]
|
||||||
bw = int(bbox["w"] / 100 * img_w)
|
|
||||||
bh = int(bbox["h"] / 100 * img_h)
|
|
||||||
|
|
||||||
color = cell_colors.get(col_type, (200, 200, 200))
|
# Pick the right text field for this column
|
||||||
|
if col_type == "column_en":
|
||||||
|
text = entry.get("english", "")
|
||||||
|
elif col_type == "column_de":
|
||||||
|
text = entry.get("german", "")
|
||||||
|
elif col_type == "column_example":
|
||||||
|
text = entry.get("example", "")
|
||||||
|
else:
|
||||||
|
text = ""
|
||||||
|
|
||||||
# Semi-transparent fill
|
if text:
|
||||||
cv2.rectangle(overlay, (bx, by), (bx + bw, by + bh), color, -1)
|
label = text.replace('\n', ' ')[:30]
|
||||||
|
font_scale = 0.35
|
||||||
# Border
|
cv2.putText(img, label, (cx + 3, ry + rh - 4),
|
||||||
cv2.rectangle(img, (bx, by), (bx + bw, by + bh), text_color, 1)
|
cv2.FONT_HERSHEY_SIMPLEX, font_scale, text_color, 1)
|
||||||
|
|
||||||
# Text label (truncate if too long)
|
|
||||||
label = text[:30] if len(text) > 30 else text
|
|
||||||
font_scale = 0.35
|
|
||||||
cv2.putText(img, label, (bx + 3, by + bh - 4),
|
|
||||||
cv2.FONT_HERSHEY_SIMPLEX, font_scale, text_color, 1)
|
|
||||||
|
|
||||||
# Blend overlay at 10% opacity
|
# Blend overlay at 10% opacity
|
||||||
cv2.addWeighted(overlay, 0.1, img, 0.9, 0, img)
|
cv2.addWeighted(overlay, 0.1, img, 0.9, 0, img)
|
||||||
|
|||||||
Reference in New Issue
Block a user