Filter phantom rows from recovered color artifacts and low-conf OCR noise

- Apply recovered-artifact filter to ALL zones (was box-zones only)
- Filter any recovered word with text ≤ 2 chars (not just !?•·)
- Add post-grid junk-row removal: rows where all word_boxes have conf < 50
  and text ≤ 3 chars are dropped as OCR noise

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-18 09:00:43 +01:00
parent 143e41ec76
commit f9bad7beaa
1 changed file with 68 additions and 20 deletions
@@ -805,29 +805,27 @@ async def build_grid(session_id: str):
                # First pass: build grids per zone independently
                zone_grids: List[Dict] = []

-                _RECOVERED_NOISE = {"!", "?", "•", "·"}
-
                for pz in page_zones:
                    zone_words = _words_in_zone(
                        all_words, pz.y, pz.height, pz.x, pz.width
                    )
-                    # In box zones, filter out recovered single-char artifacts
-                    # (decorative elements like !, ?, • from color recovery)
-                    if pz.zone_type == "box":
-                        before = len(zone_words)
-                        zone_words = [
-                            w for w in zone_words
-                            if not (
-                                w.get("recovered")
-                                and w.get("text", "").strip() in _RECOVERED_NOISE
-                            )
-                        ]
-                        removed = before - len(zone_words)
-                        if removed:
-                            logger.info(
-                                "build-grid: filtered %d recovered artifacts from box zone %d",
-                                removed, pz.index,
-                            )
+                    # Filter short recovered artifacts (<= 2 chars) in ALL zones
+                    # (decorative colored pixel blobs like !, ?, • from
+                    # recover_colored_text that don't represent real text)
+                    before = len(zone_words)
+                    zone_words = [
+                        w for w in zone_words
+                        if not (
+                            w.get("recovered")
+                            and len(w.get("text", "").strip()) <= 2
+                        )
+                    ]
+                    removed = before - len(zone_words)
+                    if removed:
+                        logger.info(
+                            "build-grid: filtered %d recovered artifacts from %s zone %d",
+                            removed, pz.zone_type, pz.index,
+                        )
                    grid = _build_zone_grid(
                        zone_words, pz.x, pz.y, pz.width, pz.height,
                        pz.index, img_w, img_h,
@@ -940,8 +938,20 @@ async def build_grid(session_id: str):

    # 4. Fallback: no boxes detected → single zone with all words
    if not zones_data:
+        # Filter short recovered artifacts (<= 2 chars), same as in zone loop above
+        before = len(all_words)
+        filtered_words = [
+            w for w in all_words
+            if not (w.get("recovered") and len(w.get("text", "").strip()) <= 2)
+        ]
+        removed = before - len(filtered_words)
+        if removed:
+            logger.info(
+                "build-grid session %s: filtered %d recovered artifacts (fallback zone)",
+                session_id, removed,
+            )
        grid = _build_zone_grid(
-            all_words, content_x, content_y, content_w, content_h,
+            filtered_words, content_x, content_y, content_w, content_h,
            0, img_w, img_h,
        )
        grid.pop("_raw_columns", None)
@@ -963,6 +973,44 @@ async def build_grid(session_id: str):
            **grid,
        })

+    # 4b. Remove junk rows: rows whose word_boxes are ALL short (<= 3 chars)
+    # and low-confidence (conf < 50) -- OCR noise, stray marks.  Real rows have
+    # a word with conf >= 50 or longer text.  Cells with no word_boxes are junk too.
+    _JUNK_CONF_THRESHOLD = 50
+    _JUNK_MAX_TEXT_LEN = 3
+    for z in zones_data:
+        cells = z.get("cells", [])
+        rows = z.get("rows", [])
+        if not cells or not rows:
+            continue
+        junk_row_indices = set()
+        for row in rows:
+            ri = row["index"]
+            row_cells = [c for c in cells if c.get("row_index") == ri]
+            if not row_cells:
+                continue
+            # Check if ALL word_boxes in ALL cells of this row are junk
+            all_junk = True
+            for cell in row_cells:
+                for wb in cell.get("word_boxes") or []:
+                    text = (wb.get("text") or "").strip()
+                    conf = wb.get("conf", 0)
+                    if conf >= _JUNK_CONF_THRESHOLD or len(text) > _JUNK_MAX_TEXT_LEN:
+                        all_junk = False
+                        break
+                if not all_junk:
+                    break
+            if all_junk:
+                junk_row_indices.add(ri)
+        if junk_row_indices:
+            z["cells"] = [c for c in cells if c.get("row_index") not in junk_row_indices]
+            z["rows"] = [r for r in rows if r["index"] not in junk_row_indices]
+            logger.info(
+                "build-grid: removed %d junk rows from zone %d: %s",
+                len(junk_row_indices), z["zone_index"],
+                sorted(junk_row_indices),
+            )
+
    # 5. Color annotation on final word_boxes in cells
    if img_bgr is not None:
        all_wb: List[Dict] = []