fix: paddle_direct groups words per row (matching _build_cells format)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 34s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m11s
CI / test-python-agent-core (push) Successful in 20s
CI / test-nodejs-website (push) Successful in 24s

Emit one cell per row, with all of the row's words as word_boxes, instead
of one cell per word. This gives OverlayReconstruction a row-spanning
bbox_pct for correct font sizing and per-word positions for slide/cluster
placement.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-12 17:10:10 +01:00
parent c743a38eaf
commit 71a1b5f058

View File

@@ -2597,10 +2597,10 @@ def _paddle_words_to_grid_cells(
) -> tuple:
"""Convert PaddleOCR word dicts into GridCell dicts + columns_meta.
1. Sort words by (top, left).
2. Cluster into rows by Y-proximity (threshold = 50% of median word height).
3. Within each row, sort left→right and assign col_index.
4. Each word → 1 GridCell with word_boxes and bbox_pct.
Groups words into rows (Y-proximity), then builds ONE cell per row
with all words as word_boxes — matching the format of _build_cells()
in cv_words_first.py. This gives OverlayReconstruction a row-spanning
bbox_pct for correct font sizing and per-word positions for placement.
Returns (cells, columns_meta) in the same format as build_grid_from_words.
"""
@@ -2632,69 +2632,82 @@ def _paddle_words_to_grid_cells(
if current_row:
rows.append(current_row)
# Sort each row left→right and build cells
# Build ONE cell per row (all words in reading order, word_boxes for positioning)
cells: List[Dict[str, Any]] = []
max_col = 0
for row_idx, row_words in enumerate(rows):
row_words.sort(key=lambda w: w["left"])
for col_idx, w in enumerate(row_words):
left = w["left"]
top = w["top"]
width = w["width"]
height = w["height"]
conf = w.get("confidence", 0)
if isinstance(conf, float) and conf <= 1.0:
conf = conf * 100 # normalize to 0-100
cell = {
"cell_id": f"PD_R{row_idx:02d}_W{col_idx:02d}",
"x": left,
"y": top,
"width": width,
"height": height,
"text": w.get("text", ""),
"confidence": round(conf, 1),
"column_index": col_idx,
"row_index": row_idx,
"zone_index": 0,
"ocr_engine": "paddle_direct",
"word_boxes": [{
"text": w.get("text", ""),
"left": left,
"top": top,
"width": width,
"height": height,
"confidence": round(conf, 1),
}],
"bbox_pct": {
"x": round(left / img_w * 100, 3),
"y": round(top / img_h * 100, 3),
"w": round(width / img_w * 100, 3),
"h": round(height / img_h * 100, 3),
},
}
cells.append(cell)
if col_idx > max_col:
max_col = col_idx
# Tight bbox spanning all words in this row
x_min = min(w["left"] for w in row_words)
y_min = min(w["top"] for w in row_words)
x_max = max(w["left"] + w["width"] for w in row_words)
y_max = max(w["top"] + w["height"] for w in row_words)
bw = x_max - x_min
bh = y_max - y_min
# Build columns_meta — one pseudo-column per column index
columns_meta = []
for ci in range(max_col + 1):
col_cells = [c for c in cells if c["column_index"] == ci]
if col_cells:
min_x = min(c["x"] for c in col_cells)
max_right = max(c["x"] + c["width"] for c in col_cells)
columns_meta.append({
"type": "column_text",
"x": min_x,
"y": 0,
"width": max_right - min_x,
"height": img_h,
"classification_confidence": 1.0,
"classification_method": "paddle_direct",
# Text: all words joined by space
text = " ".join(w.get("text", "").strip() for w in row_words if w.get("text", "").strip())
# Average confidence
confs = []
for w in row_words:
c = w.get("confidence", 0)
if isinstance(c, float) and c <= 1.0:
c = c * 100
confs.append(c)
avg_conf = sum(confs) / len(confs) if confs else 0.0
# Per-word boxes with absolute pixel coordinates
word_boxes = []
for w in row_words:
raw_text = w.get("text", "").strip()
if not raw_text:
continue
c = w.get("confidence", 0)
if isinstance(c, float) and c <= 1.0:
c = c * 100
word_boxes.append({
"text": raw_text,
"left": w["left"],
"top": w["top"],
"width": w["width"],
"height": w["height"],
"conf": round(c, 1),
})
cell = {
"cell_id": f"PD_R{row_idx:02d}_C0",
"row_index": row_idx,
"col_index": 0,
"col_type": "column_text",
"text": text,
"confidence": round(avg_conf, 1),
"zone_index": 0,
"ocr_engine": "paddle_direct",
"is_bold": False,
"word_boxes": word_boxes,
"bbox_px": {"x": x_min, "y": y_min, "w": bw, "h": bh},
"bbox_pct": {
"x": round(x_min / img_w * 100, 2) if img_w else 0,
"y": round(y_min / img_h * 100, 2) if img_h else 0,
"w": round(bw / img_w * 100, 2) if img_w else 0,
"h": round(bh / img_h * 100, 2) if img_h else 0,
},
}
cells.append(cell)
# Single full-page pseudo-column (all rows belong to column 0)
columns_meta = [{
"type": "column_text",
"x": 0,
"y": 0,
"width": img_w,
"height": img_h,
"classification_confidence": 1.0,
"classification_method": "paddle_direct",
}]
return cells, columns_meta