From 9a8ce697828b4785efa0df40625305d972df78cf Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sat, 11 Apr 2026 01:07:49 +0200 Subject: [PATCH] Fix vocab extraction: use original column types for EN/DE classification The grid-build zones use generic column types, losing the EN/DE classification from build_grid_from_words(). Now extracts improved cells from grid zones but classifies them using the original columns_meta which has the correct column_en/column_de types. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../backend/vocab_worksheet_api.py | 83 +++++-------------- 1 file changed, 23 insertions(+), 60 deletions(-) diff --git a/klausur-service/backend/vocab_worksheet_api.py b/klausur-service/backend/vocab_worksheet_api.py index fe0d6d0..6674ce3 100644 --- a/klausur-service/backend/vocab_worksheet_api.py +++ b/klausur-service/backend/vocab_worksheet_api.py @@ -1585,71 +1585,34 @@ async def _run_ocr_pipeline_for_page( logger.warning(f" grid-build failed: {e}, falling back to basic grid") grid_result = None - # 9. Extract vocab entries from grid result (zones → cells → vocab) + # 9. Extract vocab entries + # The grid-build improves text quality (pipe-autocorrect, word-gap merge), + # but its zone columns use generic types. For EN/DE classification we use + # the improved cells from grid zones with the original columns_meta from + # build_grid_from_words() which has the correct column_en/column_de types. page_vocabulary = [] + # Collect improved cell texts from grid zones (if available) + grid_cells = cells # default: raw cells from dual-engine OCR if grid_result and grid_result.get("zones"): - # Extract from the improved zone-based grid + grid_cells = [] for zone in grid_result["zones"]: - zone_cols = zone.get("columns", []) - zone_cells = zone.get("cells", []) - if not zone_cols or not zone_cells: - continue + for cell in zone.get("cells", []): + grid_cells.append(cell) - # Build col_index → col_type map - col_type_map = {} - for col in zone_cols: - ci = col.get("col_index", col.get("index", -1)) - col_type_map[ci] = col.get("type", col.get("col_type", "")) - - # Group cells by row - rows_map = {} - for cell in zone_cells: - ri = cell.get("row_index", 0) - if ri not in rows_map: - rows_map[ri] = {} - ci = cell.get("col_index", 0) - rows_map[ri][ci] = cell - - for ri in sorted(rows_map.keys()): - row_cells = rows_map[ri] - en = "" - de = "" - ex = "" - for ci, cell in row_cells.items(): - ct = col_type_map.get(ci, "") - text = (cell.get("text") or "").strip() - if not text: - continue - if "en" in ct: - en = text - elif "de" in ct: - de = text - elif "example" in ct or "text" in ct: - ex = text if not ex else ex + " " + text - - if en or de: - page_vocabulary.append({ - "id": str(uuid.uuid4()), - "english": en, - "german": de, - "example_sentence": ex, - "source_page": page_number + 1, - }) - else: - # Fallback: use basic cells → vocab entries - entries = _cells_to_vocab_entries(cells, columns_meta) - entries = _fix_phonetic_brackets(entries, pronunciation="british") - for entry in entries: - if not entry.get("english") and not entry.get("german"): - continue - page_vocabulary.append({ - "id": str(uuid.uuid4()), - "english": entry.get("english", ""), - "german": entry.get("german", ""), - "example_sentence": entry.get("example", ""), - "source_page": page_number + 1, - }) + # Use _cells_to_vocab_entries with original columns_meta for classification + entries = _cells_to_vocab_entries(grid_cells, columns_meta) + entries = _fix_phonetic_brackets(entries, pronunciation="british") + for entry in entries: + if not entry.get("english") and not entry.get("german"): + continue + page_vocabulary.append({ + "id": str(uuid.uuid4()), + "english": entry.get("english", ""), + "german": entry.get("german", ""), + "example_sentence": entry.get("example", ""), + "source_page": page_number + 1, + }) total_duration = _time.time() - t_total logger.info(f"Kombi Pipeline page {page_number + 1}: "