fix: rewrite Kombi merge with row-based sequence alignment
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 1m59s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 19s

Replaces position-based word matching with row-based sequence alignment
to fix doubled words and cross-line averaging in Kombi-Modus.

New algorithm:
1. Group words into rows by Y-position clustering
2. Match rows between engines by vertical center proximity
3. Within each row: walk both sequences left-to-right, deduplicating
4. Unmatched rows kept as-is

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-13 08:45:03 +01:00
parent 4280298e02
commit 846292f632
2 changed files with 384 additions and 487 deletions

View File

@@ -2599,175 +2599,189 @@ async def paddle_direct(session_id: str):
return {"session_id": session_id, **word_result}
def _box_iou(a: dict, b: dict) -> float:
"""Compute IoU between two word boxes (each has left, top, width, height)."""
ax1, ay1 = a["left"], a["top"]
ax2, ay2 = ax1 + a["width"], ay1 + a["height"]
bx1, by1 = b["left"], b["top"]
bx2, by2 = bx1 + b["width"], by1 + b["height"]
def _group_words_into_rows(words: list, row_gap: int = 12) -> list:
"""Group words into rows by Y-position clustering.
ix1, iy1 = max(ax1, bx1), max(ay1, by1)
ix2, iy2 = min(ax2, bx2), min(ay2, by2)
inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
if inter == 0:
return 0.0
area_a = (ax2 - ax1) * (ay2 - ay1)
area_b = (bx2 - bx1) * (by2 - by1)
return inter / (area_a + area_b - inter) if (area_a + area_b - inter) > 0 else 0.0
def _box_center_dist(a: dict, b: dict) -> float:
"""Euclidean distance between box centers."""
acx = a["left"] + a["width"] / 2
acy = a["top"] + a["height"] / 2
bcx = b["left"] + b["width"] / 2
bcy = b["top"] + b["height"] / 2
return ((acx - bcx) ** 2 + (acy - bcy) ** 2) ** 0.5
def _text_similarity(a: str, b: str) -> float:
"""Simple text similarity (0-1). Handles stripped punctuation."""
if not a or not b:
return 0.0
a_lower = a.lower().strip()
b_lower = b.lower().strip()
if a_lower == b_lower:
return 1.0
# One might be substring of the other (e.g. "!Betonung" vs "Betonung")
if a_lower in b_lower or b_lower in a_lower:
return 0.8
# Check if they share most characters
shorter, longer = (a_lower, b_lower) if len(a_lower) <= len(b_lower) else (b_lower, a_lower)
if len(shorter) == 0:
return 0.0
matches = sum(1 for c in shorter if c in longer)
return matches / max(len(shorter), len(longer))
def _words_match(pw: dict, tw: dict) -> bool:
"""Determine if a Paddle word and a Tesseract word represent the same word.
Uses three criteria (any one is sufficient):
1. IoU > 0.15 (relaxed from 0.3 — engines produce different-sized boxes)
2. Center distance < max(word height, 20px) AND on same row (vertical overlap)
3. Text similarity > 0.7 AND on same row
Words whose vertical centers are within `row_gap` pixels are on the same row.
Returns list of rows, each row is a list of words sorted left-to-right.
"""
iou = _box_iou(pw, tw)
if iou > 0.15:
return True
if not words:
return []
# Sort by vertical center
sorted_words = sorted(words, key=lambda w: w["top"] + w.get("height", 0) / 2)
rows: list = []
current_row: list = [sorted_words[0]]
current_cy = sorted_words[0]["top"] + sorted_words[0].get("height", 0) / 2
# Same row check: vertical overlap > 50% of smaller height
py1, py2 = pw["top"], pw["top"] + pw["height"]
ty1, ty2 = tw["top"], tw["top"] + tw["height"]
v_overlap = max(0, min(py2, ty2) - max(py1, ty1))
min_h = max(min(pw["height"], tw["height"]), 1)
same_row = v_overlap > 0.5 * min_h
if not same_row:
return False
# Center proximity on same row
cdist = _box_center_dist(pw, tw)
h_threshold = max(pw["height"], tw["height"], 20)
if cdist < h_threshold:
return True
# Text similarity on same row
if _text_similarity(pw["text"], tw["text"]) > 0.7:
return True
return False
for w in sorted_words[1:]:
cy = w["top"] + w.get("height", 0) / 2
if abs(cy - current_cy) <= row_gap:
current_row.append(w)
else:
# Sort current row left-to-right before saving
rows.append(sorted(current_row, key=lambda w: w["left"]))
current_row = [w]
current_cy = cy
if current_row:
rows.append(sorted(current_row, key=lambda w: w["left"]))
return rows
def _merge_paddle_tesseract(paddle_words: list, tess_words: list) -> list:
"""Merge word boxes from PaddleOCR and Tesseract.
def _row_center_y(row: list) -> float:
"""Average vertical center of a row of words."""
if not row:
return 0.0
return sum(w["top"] + w.get("height", 0) / 2 for w in row) / len(row)
Strategy:
- For each Paddle word, find the best matching Tesseract word
- Match criteria: IoU, center proximity, or text similarity (see _words_match)
- Matched pairs: keep Paddle text, average coordinates weighted by confidence
- Unmatched Paddle words: keep as-is
- Unmatched Tesseract words (conf >= 40): add (bullet points, symbols, etc.)
def _merge_row_sequences(paddle_row: list, tess_row: list) -> list:
"""Merge two word sequences from the same row using sequence alignment.
Both sequences are sorted left-to-right. Walk through both simultaneously:
- If words match (same/similar text): take Paddle text with averaged coords
- If they don't match: the extra word is unique to one engine, include it
This prevents duplicates because both engines produce words in the same order.
"""
merged = []
used_tess: set = set()
pi, ti = 0, 0
for pw in paddle_words:
best_score, best_ti = 0.0, -1
for ti, tw in enumerate(tess_words):
if ti in used_tess:
continue
if not _words_match(pw, tw):
continue
# Score: IoU + text_similarity to pick best match
score = _box_iou(pw, tw) + _text_similarity(pw["text"], tw["text"])
if score > best_score:
best_score, best_ti = score, ti
while pi < len(paddle_row) and ti < len(tess_row):
pw = paddle_row[pi]
tw = tess_row[ti]
if best_ti >= 0:
tw = tess_words[best_ti]
used_tess.add(best_ti)
# Check if these are the same word
pt = pw.get("text", "").lower().strip()
tt = tw.get("text", "").lower().strip()
# Same text or one contains the other
is_same = (pt == tt) or (len(pt) > 1 and len(tt) > 1 and (pt in tt or tt in pt))
if is_same:
# Matched — average coordinates weighted by confidence
pc = pw.get("conf", 80)
tc = tw.get("conf", 50)
total = pc + tc
if total == 0:
total = 1
merged.append({
"text": pw["text"], # Paddle text usually better
"text": pw["text"], # Paddle text preferred
"left": round((pw["left"] * pc + tw["left"] * tc) / total),
"top": round((pw["top"] * pc + tw["top"] * tc) / total),
"width": round((pw["width"] * pc + tw["width"] * tc) / total),
"height": round((pw["height"] * pc + tw["height"] * tc) / total),
"conf": max(pc, tc),
})
pi += 1
ti += 1
else:
# No Tesseract match — keep Paddle word as-is
merged.append(pw)
# Different text — one engine found something extra
# Look ahead: is the current Paddle word somewhere in Tesseract ahead?
paddle_ahead = any(
tess_row[t].get("text", "").lower().strip() == pt
for t in range(ti + 1, min(ti + 4, len(tess_row)))
)
# Is the current Tesseract word somewhere in Paddle ahead?
tess_ahead = any(
paddle_row[p].get("text", "").lower().strip() == tt
for p in range(pi + 1, min(pi + 4, len(paddle_row)))
)
# Add unmatched Tesseract words (bullet points, symbols, etc.)
for ti, tw in enumerate(tess_words):
if ti not in used_tess and tw.get("conf", 0) >= 40:
merged.append(tw)
# Safety net: deduplicate any remaining near-duplicate words
return _deduplicate_words(merged)
def _deduplicate_words(words: list) -> list:
"""Remove near-duplicate words that slipped through matching.
Two words are considered duplicates if:
- Same text (case-insensitive)
- Centers within 30px horizontally and 15px vertically
The word with higher confidence is kept.
"""
if len(words) <= 1:
return words
keep = [True] * len(words)
for i in range(len(words)):
if not keep[i]:
continue
w1 = words[i]
cx1 = w1["left"] + w1.get("width", 0) / 2
cy1 = w1["top"] + w1.get("height", 0) / 2
t1 = w1.get("text", "").lower().strip()
for j in range(i + 1, len(words)):
if not keep[j]:
continue
w2 = words[j]
t2 = w2.get("text", "").lower().strip()
if t1 != t2:
continue
cx2 = w2["left"] + w2.get("width", 0) / 2
cy2 = w2["top"] + w2.get("height", 0) / 2
if abs(cx1 - cx2) < 30 and abs(cy1 - cy2) < 15:
# Drop the one with lower confidence
if w1.get("conf", 0) >= w2.get("conf", 0):
keep[j] = False
if paddle_ahead and not tess_ahead:
# Tesseract has an extra word (e.g. "!" or bullet) → include it
if tw.get("conf", 0) >= 30:
merged.append(tw)
ti += 1
elif tess_ahead and not paddle_ahead:
# Paddle has an extra word → include it
merged.append(pw)
pi += 1
else:
# Both have unique words or neither found ahead → take leftmost first
if pw["left"] <= tw["left"]:
merged.append(pw)
pi += 1
else:
keep[i] = False
break # w1 is dropped, stop comparing
return [w for w, k in zip(words, keep) if k]
if tw.get("conf", 0) >= 30:
merged.append(tw)
ti += 1
# Remaining words from either engine
while pi < len(paddle_row):
merged.append(paddle_row[pi])
pi += 1
while ti < len(tess_row):
tw = tess_row[ti]
if tw.get("conf", 0) >= 30:
merged.append(tw)
ti += 1
return merged
def _merge_paddle_tesseract(paddle_words: list, tess_words: list) -> list:
    """Merge word boxes from PaddleOCR and Tesseract using row-based sequence alignment.

    Strategy:
    1. Group each engine's words into rows (by Y-position clustering)
    2. Match rows between engines (by vertical center proximity)
    3. Within each matched row: merge sequences left-to-right, deduplicating
       words that appear in both engines at the same sequence position
    4. Unmatched rows from either engine: keep as-is

    This prevents:
    - Cross-line averaging (words from different lines being merged)
    - Duplicate words (same word from both engines shown twice)

    Args:
        paddle_words: PaddleOCR word dicts (left/top/width/height/text/conf).
        tess_words: Tesseract word dicts in the same shape.

    Returns:
        Merged list of word dicts.
    """
    if not paddle_words and not tess_words:
        return []
    if not paddle_words:
        # Tesseract-only fallback: drop low-confidence noise.
        return [w for w in tess_words if w.get("conf", 0) >= 40]
    if not tess_words:
        return list(paddle_words)

    # Step 1: Group into rows.
    paddle_rows = _group_words_into_rows(paddle_words)
    tess_rows = _group_words_into_rows(tess_words)

    # Step 2: Match rows between engines by vertical center proximity.
    used_tess_rows: set = set()
    merged_all: list = []
    for pr in paddle_rows:
        pr_cy = _row_center_y(pr)
        best_dist, best_tri = float("inf"), -1
        for tri, tr in enumerate(tess_rows):
            if tri in used_tess_rows:
                continue
            tr_cy = _row_center_y(tr)
            dist = abs(pr_cy - tr_cy)
            if dist < best_dist:
                best_dist, best_tri = dist, tri
        # Row distance threshold: the tallest word of the Paddle row (fallback
        # 20 when height is missing), never below 15px.
        max_row_dist = max(
            max((w.get("height", 20) for w in pr), default=20),
            15,
        )
        if best_tri >= 0 and best_dist <= max_row_dist:
            # Matched row — merge the two sequences.
            tr = tess_rows[best_tri]
            used_tess_rows.add(best_tri)
            merged_all.extend(_merge_row_sequences(pr, tr))
        else:
            # No matching Tesseract row — keep the Paddle row as-is.
            merged_all.extend(pr)

    # Step 4: Add unmatched Tesseract rows (confident words only).
    for tri, tr in enumerate(tess_rows):
        if tri not in used_tess_rows:
            for tw in tr:
                if tw.get("conf", 0) >= 40:
                    merged_all.append(tw)
    return merged_all
@router.post("/sessions/{session_id}/paddle-kombi")