fix: merge syllable-split word_boxes + keep dictionary guide words

OCR splits words at syllable marks into overlapping word_boxes (e.g. "zu" + "tiefst" with 52% x-overlap). Step 5i previously removed the lower-confidence box whenever the x-overlap exceeded 40%, losing the prefix. Now, when both boxes contain alphabetic text and their x-overlap is above 20% and at most 75%, they are MERGED into one word_box ("zutiefst") instead of one being removed. The artifact cell filter is also relaxed: 2-character alphabetic text such as "Zw" (a dictionary guide word) is no longer removed; among such short cells, only non-alphabetic text such as "a=" is still filtered. Results for session 5997: "tiefst"→"zutiefst", "zu"→"zuständig", "Zu die Zuschüsse"→"Zuschuss, die Zuschüsse", "Zw" restored. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-22 08:21:00 +01:00
parent 882b177fc3
commit 7b3319be2e
1 changed files with 56 additions and 8 deletions
@@ -2431,6 +2431,8 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:

            # Rule (b) + (c): overlap and duplicate detection
            # Sort by x for pairwise comparison
+            _ALPHA_WORD_RE = re.compile(r'^[A-Za-z\u00c0-\u024f\-]+[.,;:!?]*$')
+            to_merge: List[Tuple[int, int]] = []  # pairs (i1, i2) to merge
            indexed = sorted(enumerate(wbs), key=lambda iw: iw[1].get("left", 0))
            for p in range(len(indexed) - 1):
                i1, w1 = indexed[p]
@@ -2442,19 +2444,33 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                gap = x2s - x1e
                overlap_pct = overlap / min_w if min_w > 0 else 0

-                # (b) Significant x-overlap: remove the lower-confidence one
-                if overlap_pct > 0.40:
+                # (b) Significant x-overlap
+                if overlap_pct > 0.20:
+                    t1 = (w1.get("text") or "").strip()
+                    t2 = (w2.get("text") or "").strip()
+
+                    # Syllable-split words: both are alphabetic text with
+                    # moderate overlap (20-75%).  Merge instead of removing.
+                    # OCR splits words at syllable marks, producing overlapping
+                    # boxes like "zu" + "tiefst" → "zutiefst".
+                    if (overlap_pct <= 0.75
+                            and _ALPHA_WORD_RE.match(t1)
+                            and _ALPHA_WORD_RE.match(t2)):
+                        to_merge.append((i1, i2))
+                        continue
+
+                    if overlap_pct <= 0.40:
+                        continue  # too little overlap and not alphabetic merge
+
                    c1 = w1.get("conf", 50)
                    c2 = w2.get("conf", 50)
-                    t1 = (w1.get("text") or "").strip().lower()
-                    t2 = (w2.get("text") or "").strip().lower()

                    # For very high overlap (>90%) with different text,
                    # prefer the word that exists in the IPA dictionary
                    # over confidence (OCR can give artifacts high conf).
-                    if overlap_pct > 0.90 and t1 != t2:
-                        in_dict_1 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t1), "british")) if t1.isalpha() else False
-                        in_dict_2 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t2), "british")) if t2.isalpha() else False
+                    if overlap_pct > 0.90 and t1.lower() != t2.lower():
+                        in_dict_1 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t1.lower()), "british")) if t1.isalpha() else False
+                        in_dict_2 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t2.lower()), "british")) if t2.isalpha() else False
                        if in_dict_1 and not in_dict_2:
                            to_remove.add(i2)
                            continue
@@ -2483,6 +2499,37 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                    c2 = w2.get("conf", 50)
                    to_remove.add(i1 if c1 <= c2 else i2)

+            # Execute merges first (syllable-split words)
+            if to_merge:
+                merged_indices: set = set()
+                for mi1, mi2 in to_merge:
+                    if mi1 in to_remove or mi2 in to_remove:
+                        continue  # don't merge if one is being removed
+                    if mi1 in merged_indices or mi2 in merged_indices:
+                        continue  # already merged
+                    mw1, mw2 = wbs[mi1], wbs[mi2]
+                    # Concatenate text (no space — they're parts of one word)
+                    mt1 = (mw1.get("text") or "").rstrip(".,;:!?")
+                    mt2 = (mw2.get("text") or "").strip()
+                    merged_text = mt1 + mt2
+                    # Union bounding box
+                    mx = min(mw1["left"], mw2["left"])
+                    my = min(mw1["top"], mw2["top"])
+                    mr = max(mw1["left"] + mw1["width"],
+                             mw2["left"] + mw2["width"])
+                    mb = max(mw1["top"] + mw1["height"],
+                             mw2["top"] + mw2["height"])
+                    mw1["text"] = merged_text
+                    mw1["left"] = mx
+                    mw1["top"] = my
+                    mw1["width"] = mr - mx
+                    mw1["height"] = mb - my
+                    mw1["conf"] = (mw1.get("conf", 50) + mw2.get("conf", 50)) / 2
+                    to_remove.add(mi2)  # remove the second one
+                    merged_indices.add(mi1)
+                    merged_indices.add(mi2)
+                    bullet_removed -= 1  # net: merge, not removal
+
            if to_remove:
                bullet_removed += len(to_remove)
                filtered = [wb for i, wb in enumerate(wbs) if i not in to_remove]
@@ -2525,7 +2572,8 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                is_artifact = True
            elif _PURE_JUNK_RE.match(core):
                is_artifact = True
-            elif len(core) <= 2 and core.lower() not in _COMMON_SHORT_WORDS:
+            elif len(core) <= 2 and core.lower() not in _COMMON_SHORT_WORDS and not core.isalpha():
+                # Short non-alphabetic text like "a=", not word beginnings like "Zw"
                is_artifact = True
            elif len(core) <= 3 and core.isupper() and core.lower() not in _COMMON_SHORT_WORDS:
                is_artifact = True