From febd0a2f84e0b9ccf4030e7d57bb387081d2f1fd Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Tue, 17 Mar 2026 09:54:50 +0100 Subject: [PATCH] fix: border ghost filter + row overlap fix for box zones 1. Add _filter_border_ghosts() to grid editor - removes OCR artefacts like | sitting on box borders before row/column clustering. The tall | (h=55) was inflating row 0's y_max, causing row overlap. 2. Fix _assign_word_to_row() to prefer closest y_center when rows overlap, instead of always returning the first matching row. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_words_first.py | 14 +++-- klausur-service/backend/grid_editor_api.py | 62 ++++++++++++++++++++++ 2 files changed, 71 insertions(+), 5 deletions(-) diff --git a/klausur-service/backend/cv_words_first.py b/klausur-service/backend/cv_words_first.py index 83dd24f..b62b547 100644 --- a/klausur-service/backend/cv_words_first.py +++ b/klausur-service/backend/cv_words_first.py @@ -134,12 +134,16 @@ def _assign_word_to_column(word: Dict, columns: List[Dict]) -> int: def _assign_word_to_row(word: Dict, rows: List[Dict]) -> int: - """Return row index for a word based on its Y-center.""" + """Return row index for a word based on its Y-center. + + When rows overlap (e.g. due to tall border-ghost characters inflating + a row's y_max), prefer the row whose y_center is closest. + """ y_center = word['top'] + word['height'] / 2 - # Find the row whose y_range contains this word's center - for row in rows: - if row['y_min'] <= y_center <= row['y_max']: - return row['index'] + # Find all rows whose y_range contains this word's center + matching = [r for r in rows if r['y_min'] <= y_center <= r['y_max']] + if matching: + return min(matching, key=lambda r: abs(r['y_center'] - y_center))['index'] # Fallback: nearest row by Y-center return min(rows, key=lambda r: abs(r['y_center'] - y_center))['index'] diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index d5828d1..3f1e6bd 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -264,6 +264,60 @@ def _cluster_columns_by_alignment( return columns +# Characters that are typically OCR artefacts from box border lines. +# Intentionally excludes ! (red markers) and . , ; (real punctuation). +_GRID_GHOST_CHARS = set("|1lI[](){}/\\-—–_~") + + +def _filter_border_ghosts( + words: List[Dict], + boxes: List, +) -> tuple: + """Remove words sitting on box borders that are OCR artefacts. + + Returns (filtered_words, removed_count). + """ + if not boxes or not words: + return words, 0 + + # Build border bands from detected boxes + x_bands: List[tuple] = [] + y_bands: List[tuple] = [] + for b in boxes: + bx = b.x if hasattr(b, "x") else b.get("x", 0) + by = b.y if hasattr(b, "y") else b.get("y", 0) + bw = b.width if hasattr(b, "width") else b.get("w", b.get("width", 0)) + bh = b.height if hasattr(b, "height") else b.get("h", b.get("height", 0)) + bt = ( + b.border_thickness + if hasattr(b, "border_thickness") + else b.get("border_thickness", 3) + ) + margin = max(bt * 2, 10) + 6 + x_bands.append((bx - margin, bx + margin)) + x_bands.append((bx + bw - margin, bx + bw + margin)) + y_bands.append((by - margin, by + margin)) + y_bands.append((by + bh - margin, by + bh + margin)) + + def _is_ghost(w: Dict) -> bool: + text = (w.get("text") or "").strip() + if not text: + return False + cx = w["left"] + w["width"] / 2 + cy = w["top"] + w["height"] / 2 + on_border = any(lo <= cx <= hi for lo, hi in x_bands) or any( + lo <= cy <= hi for lo, hi in y_bands + ) + if not on_border: + return False + if all(c in _GRID_GHOST_CHARS for c in text): + return True + return False + + filtered = [w for w in words if not _is_ghost(w)] + return filtered, len(words) - len(filtered) + + def _flatten_word_boxes(cells: List[Dict]) -> List[Dict]: """Extract all word_boxes from cells into a flat list of word dicts.""" words: List[Dict] = [] @@ -539,6 +593,14 @@ async def build_grid(session_id: str): boxes_detected = len(boxes) if boxes: + # Filter border ghost words before grid building + all_words, ghost_count = _filter_border_ghosts(all_words, boxes) + if ghost_count: + logger.info( + "build-grid session %s: removed %d border ghost words", + session_id, ghost_count, + ) + # Split page into zones page_zones = split_page_into_zones( content_x, content_y, content_w, content_h, boxes