- Schritt 6: LLM-Korrektur
+ Schritt 6: OCR-Zeichenkorrektur
{status === 'ready' && `${vocabEntries.length} Eintraege bereit zur Pruefung`}
@@ -405,7 +405,7 @@ export function StepLlmReview({ sessionId, onNext }: StepLlmReviewProps) {
{status === 'ready' && (
)}
{status === 'running' && (
diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index 78554e8..89d3238 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -1680,6 +1680,33 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
logger.info(f"ColumnGeometry: {len(geometries)} columns: "
f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")
+ # --- Step 9: Filter phantom narrow columns ---
+ # Tiny spurious gaps (e.g. 11px + 35px adjacent) can create very narrow
+ # columns (< 3% of content width, min. 20px) with fewer than 3 words. These
+ # are not real columns — remove them and close the gap between neighbors.
+ min_real_col_w = max(20, int(content_w * 0.03))
+ filtered_geoms = [g for g in geometries
+ if not (g.word_count < 3 and g.width < min_real_col_w)]
+ if len(filtered_geoms) < len(geometries):
+ n_removed = len(geometries) - len(filtered_geoms)
+ logger.info(f"ColumnGeometry: removed {n_removed} phantom column(s) "
+ f"(width < {min_real_col_w}px and words < 3)")
+ # Extend each remaining column up to its right neighbor (the last one up to right_x), then re-index it and re-assign its words
+ for i, g in enumerate(filtered_geoms):
+ if i + 1 < len(filtered_geoms):
+ g.width = filtered_geoms[i + 1].x - g.x
+ else:
+ g.width = right_x - g.x
+ g.index = i
+ col_left_rel = g.x - left_x
+ col_right_rel = col_left_rel + g.width
+ g.words = [w for w in word_dicts
+ if col_left_rel <= w['left'] < col_right_rel]
+ g.word_count = len(g.words)
+ geometries = filtered_geoms
+ logger.info(f"ColumnGeometry: {len(geometries)} columns after phantom filter: "
+ f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")
+
return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)