From cb61fab77b11f4c1fa4c4078142579c9e1a2cdf5 Mon Sep 17 00:00:00 2001
From: Benjamin Admin
Date: Tue, 3 Mar 2026 15:38:58 +0100
Subject: [PATCH] fix(rows): filter artifact rows and heal gaps for full OCR height
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two new functions:
- _is_artifact_row(): marks rows as artifacts if all detected tokens
  are single characters (scanner shadows produce dots/dashes, not words).
  A real vocabulary row always contains at least one 2+ char word.

- _heal_row_gaps(): after removing empty/artifact rows, expands each
  remaining content row to the midpoint of adjacent gaps, so OCR crops
  are not artificially narrow. First row extends to content top_bound;
  last row to content bottom_bound.

Applied in both build_cell_grid() and build_cell_grid_streaming() after
the word_count>0 filter and before OCR.

Addresses cases like:
- Row 21: scan shadow → single-char artifacts → filtered before OCR
- Row 23: completely empty (word_count=0) → already filtered
- Row 22: real content → now expanded upward/downward to fill the space
  that rows 21 and 23 occupied, giving OCR the correct full height

Co-Authored-By: Claude Sonnet 4.6
---
 klausur-service/backend/cv_vocab_pipeline.py | 93 ++++++++++++++++++++
 1 file changed, 93 insertions(+)

diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index cad6ce5..78554e8 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -4303,6 +4303,66 @@ def _ocr_single_cell(
     }
 
 
+def _is_artifact_row(row: RowGeometry) -> bool:
+    """Return True if this row contains only scan artifacts, not real text.
+
+    A row is an artifact when it has no words or when every word's stripped text is
+    at most one character long; a missing or None 'text' value counts as empty.
+    """
+    if row.word_count == 0:
+        return True
+    texts = [(w.get('text') or '').strip() for w in row.words]
+    return all(len(t) <= 1 for t in texts)
+
+
+def _heal_row_gaps(
+    rows: List[RowGeometry],
+    top_bound: int,
+    bottom_bound: int,
+) -> None:
+    """Expand row y/height to fill vertical gaps caused by removed adjacent rows.
+
+    Sorts ``rows`` by y and mutates each row in place. A gap larger than 1
+    between two neighbours is split at its midpoint so both rows grow to meet
+    there; touching or overlapping neighbours keep their original shared edges.
+    The first row grows up to top_bound and the last row down to bottom_bound;
+    a bound lying inside such a row is ignored, so healing never shrinks a row.
+    Every row ends up with a height of at least 5.
+    """
+    if not rows:
+        return
+    rows.sort(key=lambda r: r.y)
+    n = len(rows)
+    orig = [(r.y, r.y + r.height) for r in rows]  # snapshot before mutation
+
+    for i, row in enumerate(rows):
+        # New top: outer bound for the first row (never below its own top), else gap midpoint
+        if i == 0:
+            new_top = min(top_bound, orig[0][0])
+        else:
+            prev_bot = orig[i - 1][1]
+            my_top = orig[i][0]
+            gap = my_top - prev_bot
+            new_top = prev_bot + gap // 2 if gap > 1 else my_top
+
+        # New bottom: outer bound for the last row (never above its own bottom), else gap midpoint
+        if i == n - 1:
+            new_bottom = max(bottom_bound, orig[-1][1])
+        else:
+            my_bot = orig[i][1]
+            next_top = orig[i + 1][0]
+            gap = next_top - my_bot
+            new_bottom = my_bot + gap // 2 if gap > 1 else my_bot
+
+        row.y = new_top
+        row.height = max(5, new_bottom - new_top)
+
+    logger.debug(
+        f"_heal_row_gaps: {n} rows → y range [{rows[0].y}..{rows[-1].y + rows[-1].height}] "
+        f"(bounds: top={top_bound}, bottom={bottom_bound})"
+    )
+
+
 def build_cell_grid(
     ocr_img: np.ndarray,
     column_regions: List[PageRegion],
@@ -4374,6 +4434,25 @@ def build_cell_grid(
         logger.warning("build_cell_grid: no usable columns found")
         return [], []
 
+    # Filter artifact rows: rows whose detected words are all single characters
+    # are caused by scanner shadows or noise, not real text.
+    before_art = len(content_rows)
+    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
+    artifact_skipped = before_art - len(content_rows)
+    if artifact_skipped > 0:
+        logger.info(f"build_cell_grid: skipped {artifact_skipped} artifact rows (all single-char words)")
+    if not content_rows:
+        logger.warning("build_cell_grid: no content rows after artifact filtering")
+        return [], []
+
+    # Heal row gaps: rows removed above leave vertical gaps; expand adjacent rows
+    # to fill the space so OCR crops are not artificially narrow.
+    _heal_row_gaps(
+        content_rows,
+        top_bound=min(c.y for c in relevant_cols),
+        bottom_bound=max(c.y + c.height for c in relevant_cols),
+    )
+
     # Sort columns left-to-right
     relevant_cols.sort(key=lambda c: c.x)
 
@@ -4555,6 +4634,20 @@ def build_cell_grid_streaming(
     if not relevant_cols:
         return
 
+    # Filter artifact rows + heal gaps (same logic as build_cell_grid)
+    before_art = len(content_rows)
+    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
+    artifact_skipped = before_art - len(content_rows)
+    if artifact_skipped > 0:
+        logger.info(f"build_cell_grid_streaming: skipped {artifact_skipped} artifact rows")
+    if not content_rows:
+        return
+    _heal_row_gaps(
+        content_rows,
+        top_bound=min(c.y for c in relevant_cols),
+        bottom_bound=max(c.y + c.height for c in relevant_cols),
+    )
+
     relevant_cols.sort(key=lambda c: c.x)
 
     columns_meta = [