fix: require both horizontal AND vertical overlap for word dedup
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 30s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 2m11s
CI / test-python-agent-core (push) Successful in 20s
CI / test-nodejs-website (push) Successful in 18s

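The previous version only checked horizontal (X) overlap, causing false
positives for short words like "=" and "I" that appear at similar X
positions in different rows. A word is now treated as a duplicate only
when the overlap is >=50% in both dimensions, measured against the
smaller of the two boxes' width and height. The threshold is now fixed
at 0.5; the former `overlap_ratio` parameter (default 0.4) is removed.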

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-17 10:57:44 +01:00
parent 29d3c1caf5
commit bbf0a5720e

View File

@@ -3359,11 +3359,14 @@ def _merge_paddle_tesseract(paddle_words: list, tess_words: list) -> list:
return merged_all
def _deduplicate_words(words: list, overlap_ratio: float = 0.4) -> list:
def _deduplicate_words(words: list) -> list:
"""Remove duplicate words with same text at overlapping positions.
PaddleOCR can return overlapping phrases (e.g. "von jm." and "jm. =")
that produce duplicate words after splitting. This pass removes them.
A word is a duplicate only when BOTH horizontal AND vertical overlap
exceed 50% — same text on the same visual line at the same position.
"""
if not words:
return words
@@ -3374,19 +3377,25 @@ def _deduplicate_words(words: list, overlap_ratio: float = 0.4) -> list:
if not wt:
continue
is_dup = False
w_right = w["left"] + w.get("width", 0)
w_bottom = w["top"] + w.get("height", 0)
for existing in result:
et = existing.get("text", "").lower().strip()
if wt != et:
continue
# Check horizontal overlap
ol = max(w["left"], existing["left"])
or_ = min(
w["left"] + w.get("width", 0),
existing["left"] + existing.get("width", 0),
)
ow = max(0, or_ - ol)
# Horizontal overlap
ox_l = max(w["left"], existing["left"])
ox_r = min(w_right, existing["left"] + existing.get("width", 0))
ox = max(0, ox_r - ox_l)
min_w = min(w.get("width", 1), existing.get("width", 1))
if min_w > 0 and ow / min_w >= overlap_ratio:
if min_w <= 0 or ox / min_w < 0.5:
continue
# Vertical overlap — must also be on the same line
oy_t = max(w["top"], existing["top"])
oy_b = min(w_bottom, existing["top"] + existing.get("height", 0))
oy = max(0, oy_b - oy_t)
min_h = min(w.get("height", 1), existing.get("height", 1))
if min_h > 0 and oy / min_h >= 0.5:
is_dup = True
break
if not is_dup: