From 9a8ce697828b4785efa0df40625305d972df78cf Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBook-Pro.local>
Date: Sat, 11 Apr 2026 01:07:49 +0200
Subject: [PATCH] Fix vocab extraction: use original column types for EN/DE
 classification

The grid-build zones use generic column types, which loses the EN/DE
classification from build_grid_from_words(). Vocab extraction now takes
the improved cells from the grid zones but classifies them using the
original columns_meta, which has the correct column_en/column_de types.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../backend/vocab_worksheet_api.py            | 83 +++++--------------
 1 file changed, 23 insertions(+), 60 deletions(-)

diff --git a/klausur-service/backend/vocab_worksheet_api.py b/klausur-service/backend/vocab_worksheet_api.py
index fe0d6d0..6674ce3 100644
--- a/klausur-service/backend/vocab_worksheet_api.py
+++ b/klausur-service/backend/vocab_worksheet_api.py
@@ -1585,71 +1585,34 @@ async def _run_ocr_pipeline_for_page(
         logger.warning(f"  grid-build failed: {e}, falling back to basic grid")
         grid_result = None
 
-    # 9. Extract vocab entries from grid result (zones → cells → vocab)
+    # 9. Extract vocab entries
+    # The grid-build improves text quality (pipe-autocorrect, word-gap merge),
+    # but its zone columns use generic types. For EN/DE classification we use
+    # the improved cells from grid zones with the original columns_meta from
+    # build_grid_from_words() which has the correct column_en/column_de types.
     page_vocabulary = []
 
+    # Collect improved cell texts from grid zones (if available)
+    grid_cells = cells  # default: raw cells from dual-engine OCR
     if grid_result and grid_result.get("zones"):
-        # Extract from the improved zone-based grid
+        grid_cells = []
         for zone in grid_result["zones"]:
-            zone_cols = zone.get("columns", [])
-            zone_cells = zone.get("cells", [])
-            if not zone_cols or not zone_cells:
-                continue
+            for cell in zone.get("cells", []):
+                grid_cells.append(cell)
 
-            # Build col_index → col_type map
-            col_type_map = {}
-            for col in zone_cols:
-                ci = col.get("col_index", col.get("index", -1))
-                col_type_map[ci] = col.get("type", col.get("col_type", ""))
-
-            # Group cells by row
-            rows_map = {}
-            for cell in zone_cells:
-                ri = cell.get("row_index", 0)
-                if ri not in rows_map:
-                    rows_map[ri] = {}
-                ci = cell.get("col_index", 0)
-                rows_map[ri][ci] = cell
-
-            for ri in sorted(rows_map.keys()):
-                row_cells = rows_map[ri]
-                en = ""
-                de = ""
-                ex = ""
-                for ci, cell in row_cells.items():
-                    ct = col_type_map.get(ci, "")
-                    text = (cell.get("text") or "").strip()
-                    if not text:
-                        continue
-                    if "en" in ct:
-                        en = text
-                    elif "de" in ct:
-                        de = text
-                    elif "example" in ct or "text" in ct:
-                        ex = text if not ex else ex + " " + text
-
-                if en or de:
-                    page_vocabulary.append({
-                        "id": str(uuid.uuid4()),
-                        "english": en,
-                        "german": de,
-                        "example_sentence": ex,
-                        "source_page": page_number + 1,
-                    })
-    else:
-        # Fallback: use basic cells → vocab entries
-        entries = _cells_to_vocab_entries(cells, columns_meta)
-        entries = _fix_phonetic_brackets(entries, pronunciation="british")
-        for entry in entries:
-            if not entry.get("english") and not entry.get("german"):
-                continue
-            page_vocabulary.append({
-                "id": str(uuid.uuid4()),
-                "english": entry.get("english", ""),
-                "german": entry.get("german", ""),
-                "example_sentence": entry.get("example", ""),
-                "source_page": page_number + 1,
-            })
+    # Use _cells_to_vocab_entries with original columns_meta for classification
+    entries = _cells_to_vocab_entries(grid_cells, columns_meta)
+    entries = _fix_phonetic_brackets(entries, pronunciation="british")
+    for entry in entries:
+        if not entry.get("english") and not entry.get("german"):
+            continue
+        page_vocabulary.append({
+            "id": str(uuid.uuid4()),
+            "english": entry.get("english", ""),
+            "german": entry.get("german", ""),
+            "example_sentence": entry.get("example", ""),
+            "source_page": page_number + 1,
+        })
 
     total_duration = _time.time() - t_total
     logger.info(f"Kombi Pipeline page {page_number + 1}: "