Fix bullet overlap disambiguation + raise red threshold to 90

Step 5i: For word_boxes with >90% x-overlap and different text, use the IPA dictionary to decide which one to keep (e.g. "tightly" is in the dictionary, "fighily" is not). The red threshold is raised from 80 to 90 to catch remaining scanner artifacts such as "tight" and "5" that were still misclassified as red. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 18:21:00 +01:00
parent 82433b4bad
commit 2c63beff04
3 changed files with 20 additions and 4 deletions
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -2275,6 +2275,22 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                if overlap_pct > 0.40:
                    c1 = w1.get("conf", 50)
                    c2 = w2.get("conf", 50)
+                    t1 = (w1.get("text") or "").strip().lower()
+                    t2 = (w2.get("text") or "").strip().lower()
+
+                    # For very high overlap (>90%) with different text,
+                    # prefer the word that exists in the IPA dictionary
+                    # over confidence (OCR can give artifacts high conf).
+                    if overlap_pct > 0.90 and t1 != t2:
+                        in_dict_1 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t1), "british")) if t1.isalpha() else False
+                        in_dict_2 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t2), "british")) if t2.isalpha() else False
+                        if in_dict_1 and not in_dict_2:
+                            to_remove.add(i2)
+                            continue
+                        elif in_dict_2 and not in_dict_1:
+                            to_remove.add(i1)
+                            continue
+
                    if c1 < c2:
                        to_remove.add(i1)
                    elif c2 < c1: