diff --git a/admin-lehrer/components/ocr-pipeline/StepLlmReview.tsx b/admin-lehrer/components/ocr-pipeline/StepLlmReview.tsx index 7694217..494b37a 100644 --- a/admin-lehrer/components/ocr-pipeline/StepLlmReview.tsx +++ b/admin-lehrer/components/ocr-pipeline/StepLlmReview.tsx @@ -342,7 +342,7 @@ export function StepLlmReview({ sessionId, onNext }: StepLlmReviewProps) { return (
⚠️
-

Fehler bei LLM-Korrektur

+

Fehler bei OCR-Zeichenkorrektur

{error}

)} {status === 'running' && ( diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 78554e8..89d3238 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -1680,6 +1680,33 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt logger.info(f"ColumnGeometry: {len(geometries)} columns: " f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}") + # --- Step 9: Filter phantom narrow columns --- + # Tiny spurious gaps (e.g. 11px + 35px adjacent) can create very narrow + # columns (< 3% of content width, but at least 20px) with fewer than 3 words. These are not + # real columns — remove them and close the gap between neighbors. + min_real_col_w = max(20, int(content_w * 0.03)) + filtered_geoms = [g for g in geometries + if not (g.word_count < 3 and g.width < min_real_col_w)] + if len(filtered_geoms) < len(geometries): + n_removed = len(geometries) - len(filtered_geoms) + logger.info(f"ColumnGeometry: removed {n_removed} phantom column(s) " + f"(width < {min_real_col_w}px and words < 3)") + # Extend each remaining column to close gaps with its right neighbor + for i, g in enumerate(filtered_geoms): + if i + 1 < len(filtered_geoms): + g.width = filtered_geoms[i + 1].x - g.x + else: + g.width = right_x - g.x + g.index = i + col_left_rel = g.x - left_x + col_right_rel = col_left_rel + g.width + g.words = [w for w in word_dicts + if col_left_rel <= w['left'] < col_right_rel] + g.word_count = len(g.words) + geometries = filtered_geoms + logger.info(f"ColumnGeometry: {len(geometries)} columns after phantom filter: " + f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}") + return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)