From 682b306e51d4a43f94cf93e57ca9223dd256a7ec Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sat, 11 Apr 2026 01:17:40 +0200 Subject: [PATCH] Use grid-build zones for vocab extraction (4-column detection) The initial build_grid_from_words() under-clusters to 1 column while _build_grid_core() correctly finds 4 columns (marker, EN, DE, example). Now extracts vocab from grid zones directly, with heuristic to skip narrow marker columns. Falls back to original cells if zones fail. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../backend/vocab_worksheet_api.py | 154 ++++++++++++------ 1 file changed, 105 insertions(+), 49 deletions(-) diff --git a/klausur-service/backend/vocab_worksheet_api.py b/klausur-service/backend/vocab_worksheet_api.py index ea8bd43..bec9294 100644 --- a/klausur-service/backend/vocab_worksheet_api.py +++ b/klausur-service/backend/vocab_worksheet_api.py @@ -1585,62 +1585,118 @@ async def _run_ocr_pipeline_for_page( logger.warning(f" grid-build failed: {e}, falling back to basic grid") grid_result = None - # 9. Extract vocab entries from original cells + columns_meta - # (Grid-build still runs for pipeline session / admin debugging, - # but its zone col_index values don't match the original columns_meta.) + # 9. Extract vocab entries + # Prefer grid-build result (better column detection, more cells) over + # the initial build_grid_from_words() which often under-clusters. page_vocabulary = [] + extraction_source = "none" - col_types = {c.get("type") for c in columns_meta} - is_vocab = bool(col_types & {"column_en", "column_de"}) - - if is_vocab: - # Classified EN/DE columns → extract structured vocab entries - entries = _cells_to_vocab_entries(cells, columns_meta) - entries = _fix_phonetic_brackets(entries, pronunciation="british") - for entry in entries: - if not entry.get("english") and not entry.get("german"): + # A) Try grid-build zones first (best quality: 4-column detection, autocorrect) + if grid_result and grid_result.get("zones"): + for zone in grid_result["zones"]: + zone_cols = zone.get("columns", []) + zone_cells = zone.get("cells", []) + if not zone_cols or not zone_cells: continue - page_vocabulary.append({ - "id": str(uuid.uuid4()), - "english": entry.get("english", ""), - "german": entry.get("german", ""), - "example_sentence": entry.get("example", ""), - "source_page": page_number + 1, - }) - else: - # Generic layout — return ALL columns as-is - # Group cells by row, collect text per column in order - rows_map: dict = {} - for cell in cells: - ri = cell.get("row_index", 0) - if ri not in rows_map: - rows_map[ri] = {} - ci = cell.get("col_index", 0) - rows_map[ri][ci] = (cell.get("text") or "").strip() - # Sort columns by index - all_col_indices = sorted({ci for row in rows_map.values() for ci in row.keys()}) + # Sort columns by x position to determine roles + sorted_cols = sorted(zone_cols, key=lambda c: c.get("x_min_px", 0)) + col_idx_to_pos = {} + for pos, col in enumerate(sorted_cols): + ci = col.get("col_index", col.get("index", -1)) + col_idx_to_pos[ci] = pos - for ri in sorted(rows_map.keys()): - row = rows_map[ri] - texts = [row.get(ci, "") for ci in all_col_indices] - # Skip completely empty rows - if not any(texts): + # Skip zones with only 1 column (likely headers/boxes) + if len(sorted_cols) < 2: continue - # Map to english/german/example by position (best effort) - entry = { - "id": str(uuid.uuid4()), - "english": texts[0] if len(texts) > 0 else "", - "german": texts[1] if len(texts) > 1 else "", - "example_sentence": " ".join(texts[2:]) if len(texts) > 2 else "", - "source_page": page_number + 1, - } - if entry["english"] or entry["german"]: - page_vocabulary.append(entry) - logger.info(f" vocab extraction: {len(page_vocabulary)} entries " - f"(layout={'vocab' if is_vocab else 'generic'}, " - f"cols={len(columns_meta)})") + # Group cells by row + rows_map: dict = {} + for cell in zone_cells: + ri = cell.get("row_index", 0) + if ri not in rows_map: + rows_map[ri] = {} + ci = cell.get("col_index", 0) + rows_map[ri][ci] = (cell.get("text") or "").strip() + + n_cols = len(sorted_cols) + for ri in sorted(rows_map.keys()): + row = rows_map[ri] + # Collect texts in column-position order + texts = [] + for col in sorted_cols: + ci = col.get("col_index", col.get("index", -1)) + texts.append(row.get(ci, "")) + + if not any(texts): + continue + + # Map by position, skipping narrow first column (page refs/markers) + # Heuristic: if first column is very narrow (<15% of zone width), + # it's likely a marker/ref column — skip it for vocab + first_col_width = sorted_cols[0].get("x_max_px", 0) - sorted_cols[0].get("x_min_px", 0) + zone_width = max(1, (sorted_cols[-1].get("x_max_px", 0) - sorted_cols[0].get("x_min_px", 0))) + skip_first = first_col_width / zone_width < 0.15 and n_cols >= 3 + + data_texts = texts[1:] if skip_first else texts + + entry = { + "id": str(uuid.uuid4()), + "english": data_texts[0] if len(data_texts) > 0 else "", + "german": data_texts[1] if len(data_texts) > 1 else "", + "example_sentence": " ".join(t for t in data_texts[2:] if t) if len(data_texts) > 2 else "", + "source_page": page_number + 1, + } + if entry["english"] or entry["german"]: + page_vocabulary.append(entry) + + if page_vocabulary: + extraction_source = f"grid-zones ({len(grid_result['zones'])} zones)" + + # B) Fallback: original cells with column classification + if not page_vocabulary: + col_types = {c.get("type") for c in columns_meta} + is_vocab = bool(col_types & {"column_en", "column_de"}) + + if is_vocab: + entries = _cells_to_vocab_entries(cells, columns_meta) + entries = _fix_phonetic_brackets(entries, pronunciation="british") + for entry in entries: + if not entry.get("english") and not entry.get("german"): + continue + page_vocabulary.append({ + "id": str(uuid.uuid4()), + "english": entry.get("english", ""), + "german": entry.get("german", ""), + "example_sentence": entry.get("example", ""), + "source_page": page_number + 1, + }) + extraction_source = f"classified ({len(columns_meta)} cols)" + else: + # Last resort: all cells by position + rows_map2: dict = {} + for cell in cells: + ri = cell.get("row_index", 0) + if ri not in rows_map2: + rows_map2[ri] = {} + ci = cell.get("col_index", 0) + rows_map2[ri][ci] = (cell.get("text") or "").strip() + all_ci = sorted({ci for r in rows_map2.values() for ci in r.keys()}) + for ri in sorted(rows_map2.keys()): + row = rows_map2[ri] + texts = [row.get(ci, "") for ci in all_ci] + if not any(texts): + continue + page_vocabulary.append({ + "id": str(uuid.uuid4()), + "english": texts[0] if len(texts) > 0 else "", + "german": texts[1] if len(texts) > 1 else "", + "example_sentence": " ".join(texts[2:]) if len(texts) > 2 else "", + "source_page": page_number + 1, + }) + extraction_source = f"generic ({len(all_ci)} cols)" + + logger.info(f" vocab extraction: {len(page_vocabulary)} entries via {extraction_source}") total_duration = _time.time() - t_total logger.info(f"Kombi Pipeline page {page_number + 1}: "