From 2c63beff04f0d6577b5b5d3b26c7bb9255b8a115 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Fri, 20 Mar 2026 18:21:00 +0100 Subject: [PATCH] Fix bullet overlap disambiguation + raise red threshold to 90 Step 5i: For word_boxes with >90% x-overlap and different text, use IPA dictionary to decide which to keep (e.g. "tightly" in dict, "fighily" not). Red threshold raised from 80 to 90 to catch remaining scanner artifacts like "tight" and "5" that were still misclassified as red. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_color_detect.py | 2 +- klausur-service/backend/grid_editor_api.py | 16 ++++++++++++++++ .../backend/tests/test_grid_editor_api.py | 6 +++--- 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/klausur-service/backend/cv_color_detect.py b/klausur-service/backend/cv_color_detect.py index 423d61d..998d604 100644 --- a/klausur-service/backend/cv_color_detect.py +++ b/klausur-service/backend/cv_color_detect.py @@ -182,7 +182,7 @@ def detect_word_colors( # Red requires higher saturation — scanner artifacts on black # text often produce a slight warm tint (hue ~0) with low # saturation that would otherwise be misclassified as red. - if name == "red" and median_sat < 80: + if name == "red" and median_sat < 90: wb["color"] = _COLOR_HEX["black"] wb["color_name"] = "black" continue diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 7db58d9..20c048a 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -2275,6 +2275,22 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: if overlap_pct > 0.40: c1 = w1.get("conf", 50) c2 = w2.get("conf", 50) + t1 = (w1.get("text") or "").strip().lower() + t2 = (w2.get("text") or "").strip().lower() + + # For very high overlap (>90%) with different text, + # prefer the word that exists in the IPA dictionary + # over confidence (OCR can give artifacts high conf). + if overlap_pct > 0.90 and t1 != t2: + in_dict_1 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t1), "british")) if t1.isalpha() else False + in_dict_2 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t2), "british")) if t2.isalpha() else False + if in_dict_1 and not in_dict_2: + to_remove.add(i2) + continue + elif in_dict_2 and not in_dict_1: + to_remove.add(i1) + continue + if c1 < c2: to_remove.add(i1) elif c2 < c1: diff --git a/klausur-service/backend/tests/test_grid_editor_api.py b/klausur-service/backend/tests/test_grid_editor_api.py index ccb077f..ff03ab5 100644 --- a/klausur-service/backend/tests/test_grid_editor_api.py +++ b/klausur-service/backend/tests/test_grid_editor_api.py @@ -924,13 +924,13 @@ class TestRedFalsePositiveSuppression: """Red requires median_sat >= 80 to avoid scanner artifact false positives.""" def test_low_saturation_red_classified_as_black(self): - """Black text with slight warm scanner tint (sat ~60) → black, not red.""" + """Black text with slight warm scanner tint (sat ~85) → black, not red.""" import numpy as np from cv_color_detect import detect_word_colors # Create a 40x20 image with dark gray pixels (slight warm tint) - # HSV: hue=5 (red range), sat=60 (above 55 threshold but below 80), val=40 - img_hsv = np.full((40, 200, 3), [5, 60, 40], dtype=np.uint8) + # HSV: hue=5 (red range), sat=85 (above 55 threshold but below 90), val=40 + img_hsv = np.full((40, 200, 3), [5, 85, 40], dtype=np.uint8) img_bgr = cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR) wb = [{"left": 10, "top": 5, "width": 50, "height": 20, "text": "test"}]