From f9bad7beaa276c3d6fbc07b0d663f78f4898c413 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Wed, 18 Mar 2026 09:00:43 +0100 Subject: [PATCH] Filter phantom rows from recovered color artifacts and low-conf OCR noise MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Apply recovered-artifact filter to ALL zones (was box-zones only) - Filter any recovered word with text ≤ 2 chars (not just !?•·) - Add post-grid junk-row removal: rows where all word_boxes have conf < 50 and text ≤ 3 chars are dropped as OCR noise Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/grid_editor_api.py | 88 +++++++++++++++++----- 1 file changed, 68 insertions(+), 20 deletions(-) diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 9f0137d..5e3561b 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -805,29 +805,27 @@ async def build_grid(session_id: str): # First pass: build grids per zone independently zone_grids: List[Dict] = [] - _RECOVERED_NOISE = {"!", "?", "•", "·"} - for pz in page_zones: zone_words = _words_in_zone( all_words, pz.y, pz.height, pz.x, pz.width ) - # In box zones, filter out recovered single-char artifacts - # (decorative elements like !, ?, • from color recovery) - if pz.zone_type == "box": - before = len(zone_words) - zone_words = [ - w for w in zone_words - if not ( - w.get("recovered") - and w.get("text", "").strip() in _RECOVERED_NOISE - ) - ] - removed = before - len(zone_words) - if removed: - logger.info( - "build-grid: filtered %d recovered artifacts from box zone %d", - removed, pz.index, - ) + # Filter short (<= 2 char) recovered artifacts in ALL zones + # (decorative colored pixel blobs like !, ?, • from + # recover_colored_text that don't represent real text) + before = len(zone_words) + zone_words = [ + w for w in zone_words + if not ( + w.get("recovered") + and len(w.get("text", "").strip()) <= 2 
+ ) + ] + removed = before - len(zone_words) + if removed: + logger.info( + "build-grid: filtered %d recovered artifacts from %s zone %d", + removed, pz.zone_type, pz.index, + ) grid = _build_zone_grid( zone_words, pz.x, pz.y, pz.width, pz.height, pz.index, img_w, img_h, @@ -940,8 +938,20 @@ async def build_grid(session_id: str): # 4. Fallback: no boxes detected → single zone with all words if not zones_data: + # Filter short (<= 2 char) recovered artifacts (same as in zone loop above) + before = len(all_words) + filtered_words = [ + w for w in all_words + if not (w.get("recovered") and len(w.get("text", "").strip()) <= 2) + ] + removed = before - len(filtered_words) + if removed: + logger.info( + "build-grid session %s: filtered %d recovered artifacts (fallback zone)", + session_id, removed, + ) grid = _build_zone_grid( - all_words, content_x, content_y, content_w, content_h, + filtered_words, content_x, content_y, content_w, content_h, 0, img_w, img_h, ) grid.pop("_raw_columns", None) @@ -963,6 +973,44 @@ async def build_grid(session_id: str): **grid, }) + # 4b. Remove junk rows: rows where ALL cells contain only short, + # low-confidence text (OCR noise, stray marks). Real vocabulary rows + # have at least one word with conf >= 50 or text longer than 3 chars. 
+ _JUNK_CONF_THRESHOLD = 50 + _JUNK_MAX_TEXT_LEN = 3 + for z in zones_data: + cells = z.get("cells", []) + rows = z.get("rows", []) + if not cells or not rows: + continue + junk_row_indices = set() + for row in rows: + ri = row["index"] + row_cells = [c for c in cells if c.get("row_index") == ri] + if not row_cells: + continue + # Check if ALL word_boxes in ALL cells of this row are junk + all_junk = True + for cell in row_cells: + for wb in cell.get("word_boxes") or []: + text = (wb.get("text") or "").strip() + conf = wb.get("conf", 0) + if conf >= _JUNK_CONF_THRESHOLD or len(text) > _JUNK_MAX_TEXT_LEN: + all_junk = False + break + if not all_junk: + break + if all_junk: + junk_row_indices.add(ri) + if junk_row_indices: + z["cells"] = [c for c in cells if c.get("row_index") not in junk_row_indices] + z["rows"] = [r for r in rows if r["index"] not in junk_row_indices] + logger.info( + "build-grid: removed %d junk rows from zone %d: %s", + len(junk_row_indices), z["zone_index"], + sorted(junk_row_indices), + ) + # 5. Color annotation on final word_boxes in cells if img_bgr is not None: all_wb: List[Dict] = []