From 0685fb12da5714d3e17019e9b86b1e1e7a443bb1 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Fri, 27 Mar 2026 15:49:52 +0100 Subject: [PATCH] Fix Bug 3: recover OCR-lost prefixes via overlap merge + chain merging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When OCR merge expands a prefix word box (e.g. "zer" w=42 → w=104), it heavily overlaps (>75%) with the next fragment ("brech"). The grid builder's overlap filter previously removed the prefix as a duplicate. Fix: when overlap > 75% but both boxes are alphabetic with different text and one is ≤ 4 chars, merge instead of removing. Also enable chain merging via merge_parent tracking so "zer" + "brech" + "lich" → "zerbrechlich" in a single pass. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/grid_editor_api.py | 39 ++++++++++++++++------ 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 2aa5ce3..619798c 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -1353,6 +1353,19 @@ async def _build_grid_core( to_merge.append((i1, i2)) continue + # High overlap (>75%) with different alphabetic text: + # OCR merge can expand a prefix box (e.g. "zer" w=42 → w=104) + # causing it to heavily overlap with the next fragment ("brech"). + # Merge instead of removing when one is a short prefix (≤4 chars) + # and the texts are different. + if (overlap_pct > 0.75 + and _ALPHA_WORD_RE.match(t1) + and _ALPHA_WORD_RE.match(t2) + and t1.rstrip(".,;:!?").lower() != t2.rstrip(".,;:!?").lower() + and min(len(t1.rstrip(".,;:!?")), len(t2.rstrip(".,;:!?"))) <= 4): + to_merge.append((i1, i2)) + continue + if overlap_pct <= 0.40: continue # too little overlap and not alphabetic merge @@ -1393,15 +1406,22 @@ async def _build_grid_core( c2 = w2.get("conf", 50) to_remove.add(i1 if c1 <= c2 else i2) - # Execute merges first (syllable-split words) + # Execute merges first (syllable-split words). + # Use merge_parent to support chain merging: if "zer" absorbed + # "brech" and then "brech"+"lich" is a merge pair, redirect to + # merge "lich" into "zer" → "zerbrechlich". if to_merge: - merged_indices: set = set() + merge_parent: Dict[int, int] = {} # absorbed → absorber for mi1, mi2 in to_merge: - if mi1 in to_remove or mi2 in to_remove: - continue # don't merge if one is being removed - if mi1 in merged_indices or mi2 in merged_indices: - continue # already merged - mw1, mw2 = wbs[mi1], wbs[mi2] + # Follow chain: if mi1 was absorbed, find root absorber + actual_mi1 = mi1 + while actual_mi1 in merge_parent: + actual_mi1 = merge_parent[actual_mi1] + if actual_mi1 in to_remove or mi2 in to_remove: + continue + if mi2 in merge_parent: + continue # mi2 already absorbed + mw1, mw2 = wbs[actual_mi1], wbs[mi2] # Concatenate text (no space — they're parts of one word) mt1 = (mw1.get("text") or "").rstrip(".,;:!?") mt2 = (mw2.get("text") or "").strip() @@ -1419,9 +1439,8 @@ async def _build_grid_core( mw1["width"] = mr - mx mw1["height"] = mb - my mw1["conf"] = (mw1.get("conf", 50) + mw2.get("conf", 50)) / 2 - to_remove.add(mi2) # remove the second one - merged_indices.add(mi1) - merged_indices.add(mi2) + to_remove.add(mi2) + merge_parent[mi2] = actual_mi1 bullet_removed -= 1 # net: merge, not removal if to_remove: