diff --git a/klausur-service/backend/vocab_worksheet_api.py b/klausur-service/backend/vocab_worksheet_api.py index 6674ce3..ea8bd43 100644 --- a/klausur-service/backend/vocab_worksheet_api.py +++ b/klausur-service/backend/vocab_worksheet_api.py @@ -1585,34 +1585,62 @@ async def _run_ocr_pipeline_for_page( logger.warning(f" grid-build failed: {e}, falling back to basic grid") grid_result = None - # 9. Extract vocab entries - # The grid-build improves text quality (pipe-autocorrect, word-gap merge), - # but its zone columns use generic types. For EN/DE classification we use - # the improved cells from grid zones with the original columns_meta from - # build_grid_from_words() which has the correct column_en/column_de types. + # 9. Extract vocab entries from original cells + columns_meta + # (Grid-build still runs for pipeline session / admin debugging, + # but its zone col_index values don't match the original columns_meta.) page_vocabulary = [] - # Collect improved cell texts from grid zones (if available) - grid_cells = cells # default: raw cells from dual-engine OCR - if grid_result and grid_result.get("zones"): - grid_cells = [] - for zone in grid_result["zones"]: - for cell in zone.get("cells", []): - grid_cells.append(cell) + col_types = {c.get("type") for c in columns_meta} + is_vocab = bool(col_types & {"column_en", "column_de"}) - # Use _cells_to_vocab_entries with original columns_meta for classification - entries = _cells_to_vocab_entries(grid_cells, columns_meta) - entries = _fix_phonetic_brackets(entries, pronunciation="british") - for entry in entries: - if not entry.get("english") and not entry.get("german"): - continue - page_vocabulary.append({ - "id": str(uuid.uuid4()), - "english": entry.get("english", ""), - "german": entry.get("german", ""), - "example_sentence": entry.get("example", ""), - "source_page": page_number + 1, - }) + if is_vocab: + # Classified EN/DE columns → extract structured vocab entries + entries = _cells_to_vocab_entries(cells, columns_meta) + entries = _fix_phonetic_brackets(entries, pronunciation="british") + for entry in entries: + if not entry.get("english") and not entry.get("german"): + continue + page_vocabulary.append({ + "id": str(uuid.uuid4()), + "english": entry.get("english", ""), + "german": entry.get("german", ""), + "example_sentence": entry.get("example", ""), + "source_page": page_number + 1, + }) + else: + # Generic layout — return ALL columns as-is + # Group cells by row, collect text per column in order + rows_map: dict = {} + for cell in cells: + ri = cell.get("row_index", 0) + if ri not in rows_map: + rows_map[ri] = {} + ci = cell.get("col_index", 0) + rows_map[ri][ci] = (cell.get("text") or "").strip() + + # Sort columns by index + all_col_indices = sorted({ci for row in rows_map.values() for ci in row.keys()}) + + for ri in sorted(rows_map.keys()): + row = rows_map[ri] + texts = [row.get(ci, "") for ci in all_col_indices] + # Skip completely empty rows + if not any(texts): + continue + # Map to english/german/example by position (best effort) + entry = { + "id": str(uuid.uuid4()), + "english": texts[0] if len(texts) > 0 else "", + "german": texts[1] if len(texts) > 1 else "", + "example_sentence": " ".join(texts[2:]) if len(texts) > 2 else "", + "source_page": page_number + 1, + } + if entry["english"] or entry["german"]: + page_vocabulary.append(entry) + + logger.info(f" vocab extraction: {len(page_vocabulary)} entries " + f"(layout={'vocab' if is_vocab else 'generic'}, " + f"cols={len(columns_meta)})") total_duration = _time.time() - t_total logger.info(f"Kombi Pipeline page {page_number + 1}: "