diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 241d4c1..ae1d063 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -2431,6 +2431,8 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: # Rule (b) + (c): overlap and duplicate detection # Sort by x for pairwise comparison + _ALPHA_WORD_RE = re.compile(r'^[A-Za-z\u00c0-\u024f\-]+[.,;:!?]*$') + to_merge: List[Tuple[int, int]] = [] # pairs (i1, i2) to merge indexed = sorted(enumerate(wbs), key=lambda iw: iw[1].get("left", 0)) for p in range(len(indexed) - 1): i1, w1 = indexed[p] @@ -2442,19 +2444,33 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: gap = x2s - x1e overlap_pct = overlap / min_w if min_w > 0 else 0 - # (b) Significant x-overlap: remove the lower-confidence one - if overlap_pct > 0.40: + # (b) Significant x-overlap + if overlap_pct > 0.20: + t1 = (w1.get("text") or "").strip() + t2 = (w2.get("text") or "").strip() + + # Syllable-split words: both are alphabetic text with + # moderate overlap (20-75%). Merge instead of removing. + # OCR splits words at syllable marks, producing overlapping + # boxes like "zu" + "tiefst" → "zutiefst". + if (overlap_pct <= 0.75 + and _ALPHA_WORD_RE.match(t1) + and _ALPHA_WORD_RE.match(t2)): + to_merge.append((i1, i2)) + continue + + if overlap_pct <= 0.40: + continue # too little overlap and not alphabetic merge + c1 = w1.get("conf", 50) c2 = w2.get("conf", 50) - t1 = (w1.get("text") or "").strip().lower() - t2 = (w2.get("text") or "").strip().lower() # For very high overlap (>90%) with different text, # prefer the word that exists in the IPA dictionary # over confidence (OCR can give artifacts high conf). - if overlap_pct > 0.90 and t1 != t2: - in_dict_1 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t1), "british")) if t1.isalpha() else False - in_dict_2 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t2), "british")) if t2.isalpha() else False + if overlap_pct > 0.90 and t1.lower() != t2.lower(): + in_dict_1 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t1.lower()), "british")) if t1.isalpha() else False + in_dict_2 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t2.lower()), "british")) if t2.isalpha() else False if in_dict_1 and not in_dict_2: to_remove.add(i2) continue @@ -2483,6 +2499,37 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: c2 = w2.get("conf", 50) to_remove.add(i1 if c1 <= c2 else i2) + # Execute merges first (syllable-split words) + if to_merge: + merged_indices: set = set() + for mi1, mi2 in to_merge: + if mi1 in to_remove or mi2 in to_remove: + continue # don't merge if one is being removed + if mi1 in merged_indices or mi2 in merged_indices: + continue # already merged + mw1, mw2 = wbs[mi1], wbs[mi2] + # Concatenate text (no space — they're parts of one word) + mt1 = (mw1.get("text") or "").rstrip(".,;:!?") + mt2 = (mw2.get("text") or "").strip() + merged_text = mt1 + mt2 + # Union bounding box + mx = min(mw1["left"], mw2["left"]) + my = min(mw1["top"], mw2["top"]) + mr = max(mw1["left"] + mw1["width"], + mw2["left"] + mw2["width"]) + mb = max(mw1["top"] + mw1["height"], + mw2["top"] + mw2["height"]) + mw1["text"] = merged_text + mw1["left"] = mx + mw1["top"] = my + mw1["width"] = mr - mx + mw1["height"] = mb - my + mw1["conf"] = (mw1.get("conf", 50) + mw2.get("conf", 50)) / 2 + to_remove.add(mi2) # remove the second one + merged_indices.add(mi1) + merged_indices.add(mi2) + bullet_removed -= 1 # net: merge, not removal + if to_remove: bullet_removed += len(to_remove) filtered = [wb for i, wb in enumerate(wbs) if i not in to_remove] @@ -2525,7 +2572,8 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: is_artifact = True elif _PURE_JUNK_RE.match(core): is_artifact = True - elif len(core) <= 2 and core.lower() not in _COMMON_SHORT_WORDS: + elif len(core) <= 2 and core.lower() not in _COMMON_SHORT_WORDS and not core.isalpha(): + # Short non-alphabetic text like "a=", not word beginnings like "Zw" is_artifact = True elif len(core) <= 3 and core.isupper() and core.lower() not in _COMMON_SHORT_WORDS: is_artifact = True