Fix Bug 3: recover OCR-lost prefixes via overlap merge + chain merging

When the OCR merge step expands a prefix word box (e.g. "zer", w=42 → w=104), the expanded box heavily overlaps (>75%) the next fragment ("brech"), and the grid builder's overlap filter previously removed the prefix as a duplicate. Fix: when the overlap is > 75%, both boxes are alphabetic with different text, and the shorter one is ≤ 4 characters, merge the two boxes instead of removing one. Also enable chain merging via merge_parent tracking, so that "zer" + "brech" + "lich" becomes "zerbrechlich" in a single pass.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-27 15:49:52 +01:00
parent 96ea23164d
commit 0685fb12da
1 changed file with 29 additions and 10 deletions
@@ -1353,6 +1353,19 @@ async def _build_grid_core(
                        to_merge.append((i1, i2))
                        continue
                    # High overlap (>75%) with different alphabetic text:
                    # OCR merge can expand a prefix box (e.g. "zer" w=42 → w=104)
                    # causing it to heavily overlap with the next fragment ("brech").
                    # Merge instead of removing when one is a short prefix (≤4 chars)
                    # and the texts are different.
                    if (overlap_pct > 0.75
                            and _ALPHA_WORD_RE.match(t1)
                            and _ALPHA_WORD_RE.match(t2)
                            and t1.rstrip(".,;:!?").lower() != t2.rstrip(".,;:!?").lower()
                            and min(len(t1.rstrip(".,;:!?")), len(t2.rstrip(".,;:!?"))) <= 4):
                        to_merge.append((i1, i2))
                        continue
                    if overlap_pct <= 0.40:
                        continue  # too little overlap and not alphabetic merge
@@ -1393,15 +1406,22 @@ async def _build_grid_core(
                    c2 = w2.get("conf", 50)
                    to_remove.add(i1 if c1 <= c2 else i2)
-            # Execute merges first (syllable-split words)
+            # Execute merges first (syllable-split words).
+            # Use merge_parent to support chain merging: if "zer" absorbed
+            # "brech" and then "brech"+"lich" is a merge pair, redirect to
+            # merge "lich" into "zer" → "zerbrechlich".
            if to_merge:
-                merged_indices: set = set()
+                merge_parent: Dict[int, int] = {}  # absorbed → absorber
                for mi1, mi2 in to_merge:
-                    if mi1 in to_remove or mi2 in to_remove:
+                    # Follow chain: if mi1 was absorbed, find root absorber
-                        continue  # don't merge if one is being removed
+                    actual_mi1 = mi1
-                    if mi1 in merged_indices or mi2 in merged_indices:
+                    while actual_mi1 in merge_parent:
-                        continue  # already merged
+                        actual_mi1 = merge_parent[actual_mi1]
-                    mw1, mw2 = wbs[mi1], wbs[mi2]
+                    if actual_mi1 in to_remove or mi2 in to_remove:
+                        continue
+                    if mi2 in merge_parent:
+                        continue  # mi2 already absorbed
+                    mw1, mw2 = wbs[actual_mi1], wbs[mi2]
                    # Concatenate text (no space — they're parts of one word)
                    mt1 = (mw1.get("text") or "").rstrip(".,;:!?")
                    mt2 = (mw2.get("text") or "").strip()
@@ -1419,9 +1439,8 @@ async def _build_grid_core(
                    mw1["width"] = mr - mx
                    mw1["height"] = mb - my
                    mw1["conf"] = (mw1.get("conf", 50) + mw2.get("conf", 50)) / 2
-                    to_remove.add(mi2)  # remove the second one
+                    to_remove.add(mi2)
-                    merged_indices.add(mi1)
+                    merge_parent[mi2] = actual_mi1
-                    merged_indices.add(mi2)
                    bullet_removed -= 1  # net: merge, not removal
            if to_remove: