From 123b7ada0bca840c27817a376309dc43536c7a90 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Tue, 3 Mar 2026 16:06:59 +0100 Subject: [PATCH] fix(columns): filter phantom narrow columns + rename step to OCR-Zeichenkorrektur MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phantom column fix: Adjacent tiny gaps (e.g. 11px + 35px) can create very narrow columns (< 3% of content width) with 0 words. These are scan artefacts, not real columns. New Step 9 in detect_column_geometry(): - Filter columns where width < max(20px, 3% content_w) AND words < 3 - After filtering, extend each remaining column to close the gap with its right neighbor, and re-assign words to correct column Example from logs: 5 columns → 4 columns (phantom at x=710, width=36px eliminated; neighbors expanded to cover the gap) UI rename: - 'Schritt 6: LLM-Korrektur' → 'Schritt 6: OCR-Zeichenkorrektur' - 'LLM-Korrektur starten' → 'Zeichenkorrektur starten' - Error message updated accordingly (No LLM involved anymore — spell-checker is the active engine) Co-Authored-By: Claude Sonnet 4.6 --- .../components/ocr-pipeline/StepLlmReview.tsx | 6 ++--- klausur-service/backend/cv_vocab_pipeline.py | 27 +++++++++++++++++++ 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/admin-lehrer/components/ocr-pipeline/StepLlmReview.tsx b/admin-lehrer/components/ocr-pipeline/StepLlmReview.tsx index 7694217..494b37a 100644 --- a/admin-lehrer/components/ocr-pipeline/StepLlmReview.tsx +++ b/admin-lehrer/components/ocr-pipeline/StepLlmReview.tsx @@ -342,7 +342,7 @@ export function StepLlmReview({ sessionId, onNext }: StepLlmReviewProps) { return (
⚠️
-

Fehler bei LLM-Korrektur

+

Fehler bei OCR-Zeichenkorrektur

{error}

)} {status === 'running' && ( diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 78554e8..89d3238 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -1680,6 +1680,33 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt logger.info(f"ColumnGeometry: {len(geometries)} columns: " f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}") + # --- Step 9: Filter phantom narrow columns --- + # Tiny spurious gaps (e.g. 11px + 35px adjacent) can create very narrow + # columns (< 3% of content width) with few or no words (fewer than 3). These are not + # real columns — remove them and close the gap between neighbors. + min_real_col_w = max(20, int(content_w * 0.03)) + filtered_geoms = [g for g in geometries + if not (g.word_count < 3 and g.width < min_real_col_w)] + if len(filtered_geoms) < len(geometries): + n_removed = len(geometries) - len(filtered_geoms) + logger.info(f"ColumnGeometry: removed {n_removed} phantom column(s) " + f"(width < {min_real_col_w}px and words < 3)") + # Extend each remaining column to close gaps with its right neighbor + for i, g in enumerate(filtered_geoms): + if i + 1 < len(filtered_geoms): + g.width = filtered_geoms[i + 1].x - g.x + else: + g.width = right_x - g.x + g.index = i + col_left_rel = g.x - left_x + col_right_rel = col_left_rel + g.width + g.words = [w for w in word_dicts + if col_left_rel <= w['left'] < col_right_rel] + g.word_count = len(g.words) + geometries = filtered_geoms + logger.info(f"ColumnGeometry: {len(geometries)} columns after phantom filter: " + f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}") + return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)