From 123b7ada0bca840c27817a376309dc43536c7a90 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Tue, 3 Mar 2026 16:06:59 +0100
Subject: [PATCH] fix(columns): filter phantom narrow columns + rename step to
 OCR-Zeichenkorrektur
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phantom column fix:
Adjacent tiny gaps (e.g. 11px + 35px) can create very narrow columns
(< 3% of content width) with 0 words. These are scan artefacts, not
real columns. New Step 9 in detect_column_geometry():
- Filter columns where width < max(20px, 3% content_w) AND words < 3
- After filtering, extend each remaining column to close the gap with
  its right neighbor (the last column is extended to the right content
  edge), and re-assign words to the correct column

Example from logs: 5 columns → 4 columns (phantom at x=710, width=36px
eliminated; neighbors expanded to cover the gap)

UI rename:
- 'Schritt 6: LLM-Korrektur' → 'Schritt 6: OCR-Zeichenkorrektur'
- 'LLM-Korrektur starten' → 'Zeichenkorrektur starten'
- Error message updated accordingly
(No LLM involved anymore — spell-checker is the active engine)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../components/ocr-pipeline/StepLlmReview.tsx |  6 ++---
 klausur-service/backend/cv_vocab_pipeline.py  | 27 +++++++++++++++++++
 2 files changed, 30 insertions(+), 3 deletions(-)
diff --git a/admin-lehrer/components/ocr-pipeline/StepLlmReview.tsx b/admin-lehrer/components/ocr-pipeline/StepLlmReview.tsx
index 7694217..494b37a 100644
--- a/admin-lehrer/components/ocr-pipeline/StepLlmReview.tsx
+++ b/admin-lehrer/components/ocr-pipeline/StepLlmReview.tsx
@@ -342,7 +342,7 @@ export function StepLlmReview({ sessionId, onNext }: StepLlmReviewProps) {
     return (
       <div className="flex flex-col items-center justify-center py-12 text-center">
         <div className="text-5xl mb-4">⚠️</div>
-        <h3 className="text-lg font-medium text-red-600 dark:text-red-400 mb-2">Fehler bei LLM-Korrektur</h3>
+        <h3 className="text-lg font-medium text-red-600 dark:text-red-400 mb-2">Fehler bei OCR-Zeichenkorrektur</h3>
         <p className="text-sm text-gray-500 dark:text-gray-400 max-w-lg mb-4">{error}</p>
         <div className="flex gap-3">
           <button onClick={() => { setError(''); loadSessionData() }}
@@ -387,7 +387,7 @@ export function StepLlmReview({ sessionId, onNext }: StepLlmReviewProps) {
       <div className="flex items-center justify-between">
         <div>
           <h3 className="text-base font-medium text-gray-700 dark:text-gray-300">
-            Schritt 6: LLM-Korrektur
+            Schritt 6: OCR-Zeichenkorrektur
           </h3>
           <p className="text-xs text-gray-400 mt-0.5">
             {status === 'ready' && `${vocabEntries.length} Eintraege bereit zur Pruefung`}
@@ -405,7 +405,7 @@ export function StepLlmReview({ sessionId, onNext }: StepLlmReviewProps) {
           {status === 'ready' && (
             <button onClick={runReview}
               className="px-5 py-2 bg-teal-600 text-white rounded-lg hover:bg-teal-700 transition-colors text-sm font-medium">
-              LLM-Korrektur starten
+              Zeichenkorrektur starten
             </button>
           )}
           {status === 'running' && (
diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index 78554e8..89d3238 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -1680,6 +1680,33 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
     logger.info(f"ColumnGeometry: {len(geometries)} columns: "
                 f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")
 
+    # --- Step 9: Filter phantom narrow columns ---
+    # Tiny spurious gaps (e.g. 11px + 35px adjacent) can create very narrow
+    # columns (< max(20px, 3% of content width)) with fewer than 3 words. These
+    # are not real columns — remove them and close the gap between neighbors.
+    min_real_col_w = max(20, int(content_w * 0.03))
+    filtered_geoms = [g for g in geometries
+                      if not (g.word_count < 3 and g.width < min_real_col_w)]
+    if len(filtered_geoms) < len(geometries):
+        n_removed = len(geometries) - len(filtered_geoms)
+        logger.info(f"ColumnGeometry: removed {n_removed} phantom column(s) "
+                    f"(width < {min_real_col_w}px and words < 3)")
+        # Extend each remaining column to close gaps with its right neighbor
+        for i, g in enumerate(filtered_geoms):
+            if i + 1 < len(filtered_geoms):
+                g.width = filtered_geoms[i + 1].x - g.x
+            else:
+                g.width = right_x - g.x
+            g.index = i
+            col_left_rel = g.x - left_x
+            col_right_rel = col_left_rel + g.width
+            g.words = [w for w in word_dicts
+                       if col_left_rel <= w['left'] < col_right_rel]
+            g.word_count = len(g.words)
+        geometries = filtered_geoms
+        logger.info(f"ColumnGeometry: {len(geometries)} columns after phantom filter: "
+                    f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")
+
     return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)