Fix Bug 3: recover OCR-lost prefixes via overlap merge + chain merging

When OCR merge expands a prefix word box (e.g. "zer" w=42 → w=104), the box heavily overlaps (>75%) with the next fragment ("brech"), and the grid builder's overlap filter previously removed the prefix as a duplicate.

Fix: when overlap is > 75% but both boxes are alphabetic with different text and the shorter one is ≤ 4 chars (ignoring trailing punctuation), merge the boxes instead of removing one. Also enable chain merging via merge_parent tracking, so that "zer" + "brech" + "lich" → "zerbrechlich" in a single pass.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-27 15:49:52 +01:00
parent 96ea23164d
commit 0685fb12da
1 changed file with 29 additions and 10 deletions
@@ -1353,6 +1353,19 @@ async def _build_grid_core(
                        to_merge.append((i1, i2))
                        continue

+                    # High overlap (>75%) with different alphabetic text:
+                    # OCR merge can expand a prefix box (e.g. "zer" w=42 → w=104)
+                    # causing it to heavily overlap with the next fragment ("brech").
+                    # Merge instead of removing when one is a short prefix (≤4 chars)
+                    # and the texts are different.
+                    if (overlap_pct > 0.75
+                            and _ALPHA_WORD_RE.match(t1)
+                            and _ALPHA_WORD_RE.match(t2)
+                            and t1.rstrip(".,;:!?").lower() != t2.rstrip(".,;:!?").lower()
+                            and min(len(t1.rstrip(".,;:!?")), len(t2.rstrip(".,;:!?"))) <= 4):
+                        to_merge.append((i1, i2))
+                        continue
+
                    if overlap_pct <= 0.40:
                        continue  # too little overlap and not alphabetic merge

@@ -1393,15 +1406,22 @@ async def _build_grid_core(
                    c2 = w2.get("conf", 50)
                    to_remove.add(i1 if c1 <= c2 else i2)

-            # Execute merges first (syllable-split words)
+            # Execute merges first (syllable-split words).
+            # Use merge_parent to support chain merging: if "zer" absorbed
+            # "brech" and then "brech"+"lich" is a merge pair, redirect to
+            # merge "lich" into "zer" → "zerbrechlich".
            if to_merge:
-                merged_indices: set = set()
+                merge_parent: Dict[int, int] = {}  # absorbed → absorber
                for mi1, mi2 in to_merge:
-                    if mi1 in to_remove or mi2 in to_remove:
-                        continue  # don't merge if one is being removed
-                    if mi1 in merged_indices or mi2 in merged_indices:
-                        continue  # already merged
-                    mw1, mw2 = wbs[mi1], wbs[mi2]
+                    # Follow chain: if mi1 was absorbed, find root absorber
+                    actual_mi1 = mi1
+                    while actual_mi1 in merge_parent:
+                        actual_mi1 = merge_parent[actual_mi1]
+                    if actual_mi1 in to_remove or mi2 in to_remove:
+                        continue
+                    if mi2 in merge_parent:
+                        continue  # mi2 already absorbed
+                    mw1, mw2 = wbs[actual_mi1], wbs[mi2]
                    # Concatenate text (no space — they're parts of one word)
                    mt1 = (mw1.get("text") or "").rstrip(".,;:!?")
                    mt2 = (mw2.get("text") or "").strip()
@@ -1419,9 +1439,8 @@ async def _build_grid_core(
                    mw1["width"] = mr - mx
                    mw1["height"] = mb - my
                    mw1["conf"] = (mw1.get("conf", 50) + mw2.get("conf", 50)) / 2
-                    to_remove.add(mi2)  # remove the second one
-                    merged_indices.add(mi1)
-                    merged_indices.add(mi2)
+                    to_remove.add(mi2)
+                    merge_parent[mi2] = actual_mi1
                    bullet_removed -= 1  # net: merge, not removal

            if to_remove: