diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index cad6ce5..78554e8 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -4303,6 +4303,66 @@ def _ocr_single_cell( } +def _is_artifact_row(row: RowGeometry) -> bool: + """Return True if this row contains only scan artifacts, not real text. + + Artifact rows (scanner shadows, noise) typically produce only single-character + detections. A real content row always has at least one token with 2+ characters. + """ + if row.word_count == 0: + return True + texts = [w.get('text', '').strip() for w in row.words] + return all(len(t) <= 1 for t in texts) + + +def _heal_row_gaps( + rows: List[RowGeometry], + top_bound: int, + bottom_bound: int, +) -> None: + """Expand row y/height to fill vertical gaps caused by removed adjacent rows. + + After filtering out empty or artifact rows, remaining content rows may have + gaps between them where the removed rows used to be. This function mutates + each row to extend upward/downward to the midpoint of such gaps so that + OCR crops cover the full available content area. + + The first row always extends to top_bound; the last row to bottom_bound. + """ + if not rows: + return + rows.sort(key=lambda r: r.y) + n = len(rows) + orig = [(r.y, r.y + r.height) for r in rows] # snapshot before mutation + + for i, row in enumerate(rows): + # New top: midpoint between previous row's bottom and this row's top + if i == 0: + new_top = top_bound + else: + prev_bot = orig[i - 1][1] + my_top = orig[i][0] + gap = my_top - prev_bot + new_top = prev_bot + gap // 2 if gap > 1 else my_top + + # New bottom: midpoint between this row's bottom and next row's top + if i == n - 1: + new_bottom = bottom_bound + else: + my_bot = orig[i][1] + next_top = orig[i + 1][0] + gap = next_top - my_bot + new_bottom = my_bot + gap // 2 if gap > 1 else my_bot + + row.y = new_top + row.height = max(5, new_bottom - new_top) + + logger.debug( + f"_heal_row_gaps: {n} rows → y range [{rows[0].y}..{rows[-1].y + rows[-1].height}] " + f"(bounds: top={top_bound}, bottom={bottom_bound})" + ) + + def build_cell_grid( ocr_img: np.ndarray, column_regions: List[PageRegion], @@ -4374,6 +4434,25 @@ def build_cell_grid( logger.warning("build_cell_grid: no usable columns found") return [], [] + # Filter artifact rows: rows whose detected words are all single characters + # are caused by scanner shadows or noise, not real text. + before_art = len(content_rows) + content_rows = [r for r in content_rows if not _is_artifact_row(r)] + artifact_skipped = before_art - len(content_rows) + if artifact_skipped > 0: + logger.info(f"build_cell_grid: skipped {artifact_skipped} artifact rows (all single-char words)") + if not content_rows: + logger.warning("build_cell_grid: no content rows after artifact filtering") + return [], [] + + # Heal row gaps: rows removed above leave vertical gaps; expand adjacent rows + # to fill the space so OCR crops are not artificially narrow. + _heal_row_gaps( + content_rows, + top_bound=min(c.y for c in relevant_cols), + bottom_bound=max(c.y + c.height for c in relevant_cols), + ) + # Sort columns left-to-right relevant_cols.sort(key=lambda c: c.x) @@ -4555,6 +4634,20 @@ def build_cell_grid_streaming( if not relevant_cols: return + # Filter artifact rows + heal gaps (same logic as build_cell_grid) + before_art = len(content_rows) + content_rows = [r for r in content_rows if not _is_artifact_row(r)] + artifact_skipped = before_art - len(content_rows) + if artifact_skipped > 0: + logger.info(f"build_cell_grid_streaming: skipped {artifact_skipped} artifact rows") + if not content_rows: + return + _heal_row_gaps( + content_rows, + top_bound=min(c.y for c in relevant_cols), + bottom_bound=max(c.y + c.height for c in relevant_cols), + ) + relevant_cols.sort(key=lambda c: c.x) columns_meta = [