Step 5i: Remove blue bullet/artifact and overlapping duplicate word_boxes

Dictionary pages have small blue square bullets before entries, which OCR reads as text artifacts. Three detection rules:
a) Tiny blue symbols (area < 150, conf < 85): catches ©, e, *, etc.
b) X-overlapping word_boxes (>40% overlap): remove the lower-confidence one.
c) Duplicate blue text with a gap < 6px: remove one copy.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 18:17:07 +01:00
parent d889a6959e
commit 82433b4bad
2 changed files with 170 additions and 0 deletions
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -2235,6 +2235,84 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
    if slash_ipa_fixed:
        logger.info("Step 5h: converted %d slash-IPA to bracket notation", slash_ipa_fixed)

+    # 5i. Remove blue bullet/artifact word_boxes.
+    # Dictionary pages have small blue square bullets (■) before entries.
+    # OCR reads these as text artifacts (©, e, *, or even plausible words
+    # like "fighily" overlapping the real word "tightly").
+    # Detection rules:
+    #   a) Tiny blue symbols: area < 150 AND conf < 85
+    #   b) Overlapping word_boxes: >40% x-overlap → remove lower confidence
+    #   c) Duplicate text: consecutive blue wbs with identical text, gap < 6px
+    bullet_removed = 0
+    for z in zones_data:
+        for cell in z.get("cells", []):
+            wbs = cell.get("word_boxes") or []
+            if len(wbs) < 2:
+                continue
+            to_remove: set = set()
+
+            # Rule (a): tiny blue symbols
+            for i, wb in enumerate(wbs):
+                if (wb.get("color_name") == "blue"
+                        and wb.get("width", 0) * wb.get("height", 0) < 150
+                        and wb.get("conf", 100) < 85):
+                    to_remove.add(i)
+
+            # Rule (b) + (c): overlap and duplicate detection
+            # Sort by x for pairwise comparison
+            indexed = sorted(enumerate(wbs), key=lambda iw: iw[1].get("left", 0))
+            for p in range(len(indexed) - 1):
+                i1, w1 = indexed[p]
+                i2, w2 = indexed[p + 1]
+                x1s, x1e = w1.get("left", 0), w1.get("left", 0) + w1.get("width", 0)
+                x2s, x2e = w2.get("left", 0), w2.get("left", 0) + w2.get("width", 0)
+                overlap = max(0, min(x1e, x2e) - max(x1s, x2s))
+                min_w = min(w1.get("width", 1), w2.get("width", 1))
+                gap = x2s - x1e
+                overlap_pct = overlap / min_w if min_w > 0 else 0
+
+                # (b) Significant x-overlap: remove the lower-confidence one
+                if overlap_pct > 0.40:
+                    c1 = w1.get("conf", 50)
+                    c2 = w2.get("conf", 50)
+                    if c1 < c2:
+                        to_remove.add(i1)
+                    elif c2 < c1:
+                        to_remove.add(i2)
+                    else:
+                        # Same confidence: remove the taller one (bullet slivers)
+                        if w1.get("height", 0) > w2.get("height", 0):
+                            to_remove.add(i1)
+                        else:
+                            to_remove.add(i2)
+
+                # (c) Duplicate text: consecutive blue with same text, gap < 6px
+                elif (gap < 6
+                      and w1.get("color_name") == "blue"
+                      and w2.get("color_name") == "blue"
+                      and (w1.get("text") or "").strip() == (w2.get("text") or "").strip()):
+                    # Remove the one with lower confidence; if equal, first one
+                    c1 = w1.get("conf", 50)
+                    c2 = w2.get("conf", 50)
+                    to_remove.add(i1 if c1 <= c2 else i2)
+
+            if to_remove:
+                bullet_removed += len(to_remove)
+                filtered = [wb for i, wb in enumerate(wbs) if i not in to_remove]
+                cell["word_boxes"] = filtered
+                cell["text"] = " ".join(
+                    wb.get("text", "").strip()
+                    for wb in sorted(filtered, key=lambda w: (w.get("top", 0), w.get("left", 0)))
+                    if wb.get("text", "").strip()
+                )
+
+    # Remove cells that became empty after bullet removal
+    if bullet_removed:
+        for z in zones_data:
+            z["cells"] = [c for c in z.get("cells", [])
+                          if (c.get("word_boxes") or c.get("text", "").strip())]
+        logger.info("Step 5i: removed %d bullet/artifact word_boxes", bullet_removed)
+
    duration = time.time() - t0

    # 6. Build result