Use grid-build zones for vocab extraction (4-column detection)

The initial build_grid_from_words() under-clusters to 1 column, while _build_grid_core() correctly finds 4 columns (marker, EN, DE, example). Vocab entries are now extracted directly from the grid-build zones, with a heuristic that skips a narrow leading marker column. Falls back to the original cells if the zones yield no entries.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-11 01:17:40 +02:00
parent 3e3116d2fd
commit 682b306e51
1 changed files with 105 additions and 49 deletions
@@ -1585,62 +1585,118 @@ async def _run_ocr_pipeline_for_page(
        logger.warning(f"  grid-build failed: {e}, falling back to basic grid")
        grid_result = None
-    # 9. Extract vocab entries from original cells + columns_meta
+    # 9. Extract vocab entries
-    # (Grid-build still runs for pipeline session / admin debugging,
+    # Prefer grid-build result (better column detection, more cells) over
-    # but its zone col_index values don't match the original columns_meta.)
+    # the initial build_grid_from_words() which often under-clusters.
    page_vocabulary = []
    extraction_source = "none"
-    col_types = {c.get("type") for c in columns_meta}
+    # A) Try grid-build zones first (best quality: 4-column detection, autocorrect)
-    is_vocab = bool(col_types & {"column_en", "column_de"})
+    if grid_result and grid_result.get("zones"):
-
+        for zone in grid_result["zones"]:
-    if is_vocab:
+            zone_cols = zone.get("columns", [])
-        # Classified EN/DE columns → extract structured vocab entries
+            zone_cells = zone.get("cells", [])
-        entries = _cells_to_vocab_entries(cells, columns_meta)
+            if not zone_cols or not zone_cells:
        entries = _fix_phonetic_brackets(entries, pronunciation="british")
        for entry in entries:
            if not entry.get("english") and not entry.get("german"):
                continue
            page_vocabulary.append({
                "id": str(uuid.uuid4()),
                "english": entry.get("english", ""),
                "german": entry.get("german", ""),
                "example_sentence": entry.get("example", ""),
                "source_page": page_number + 1,
            })
    else:
        # Generic layout — return ALL columns as-is
        # Group cells by row, collect text per column in order
        rows_map: dict = {}
        for cell in cells:
            ri = cell.get("row_index", 0)
            if ri not in rows_map:
                rows_map[ri] = {}
            ci = cell.get("col_index", 0)
            rows_map[ri][ci] = (cell.get("text") or "").strip()
-        # Sort columns by index
+            # Sort columns by x position to determine roles
-        all_col_indices = sorted({ci for row in rows_map.values() for ci in row.keys()})
+            sorted_cols = sorted(zone_cols, key=lambda c: c.get("x_min_px", 0))
            col_idx_to_pos = {}
            for pos, col in enumerate(sorted_cols):
                ci = col.get("col_index", col.get("index", -1))
                col_idx_to_pos[ci] = pos
-        for ri in sorted(rows_map.keys()):
+            # Skip zones with only 1 column (likely headers/boxes)
-            row = rows_map[ri]
+            if len(sorted_cols) < 2:
            texts = [row.get(ci, "") for ci in all_col_indices]
            # Skip completely empty rows
            if not any(texts):
                continue
            # Map to english/german/example by position (best effort)
            entry = {
                "id": str(uuid.uuid4()),
                "english": texts[0] if len(texts) > 0 else "",
                "german": texts[1] if len(texts) > 1 else "",
                "example_sentence": " ".join(texts[2:]) if len(texts) > 2 else "",
                "source_page": page_number + 1,
            }
            if entry["english"] or entry["german"]:
                page_vocabulary.append(entry)
-    logger.info(f"  vocab extraction: {len(page_vocabulary)} entries "
+            # Group cells by row
-                f"(layout={'vocab' if is_vocab else 'generic'}, "
+            rows_map: dict = {}
-                f"cols={len(columns_meta)})")
+            for cell in zone_cells:
                ri = cell.get("row_index", 0)
                if ri not in rows_map:
                    rows_map[ri] = {}
                ci = cell.get("col_index", 0)
                rows_map[ri][ci] = (cell.get("text") or "").strip()
            n_cols = len(sorted_cols)
            for ri in sorted(rows_map.keys()):
                row = rows_map[ri]
                # Collect texts in column-position order
                texts = []
                for col in sorted_cols:
                    ci = col.get("col_index", col.get("index", -1))
                    texts.append(row.get(ci, ""))
                if not any(texts):
                    continue
                # Map by position, skipping narrow first column (page refs/markers)
                # Heuristic: if first column is very narrow (<15% of zone width),
                # it's likely a marker/ref column — skip it for vocab
                first_col_width = sorted_cols[0].get("x_max_px", 0) - sorted_cols[0].get("x_min_px", 0)
                zone_width = max(1, (sorted_cols[-1].get("x_max_px", 0) - sorted_cols[0].get("x_min_px", 0)))
                skip_first = first_col_width / zone_width < 0.15 and n_cols >= 3
                data_texts = texts[1:] if skip_first else texts
                entry = {
                    "id": str(uuid.uuid4()),
                    "english": data_texts[0] if len(data_texts) > 0 else "",
                    "german": data_texts[1] if len(data_texts) > 1 else "",
                    "example_sentence": " ".join(t for t in data_texts[2:] if t) if len(data_texts) > 2 else "",
                    "source_page": page_number + 1,
                }
                if entry["english"] or entry["german"]:
                    page_vocabulary.append(entry)
        if page_vocabulary:
            extraction_source = f"grid-zones ({len(grid_result['zones'])} zones)"
    # B) Fallback: original cells with column classification
    if not page_vocabulary:
        col_types = {c.get("type") for c in columns_meta}
        is_vocab = bool(col_types & {"column_en", "column_de"})
        if is_vocab:
            entries = _cells_to_vocab_entries(cells, columns_meta)
            entries = _fix_phonetic_brackets(entries, pronunciation="british")
            for entry in entries:
                if not entry.get("english") and not entry.get("german"):
                    continue
                page_vocabulary.append({
                    "id": str(uuid.uuid4()),
                    "english": entry.get("english", ""),
                    "german": entry.get("german", ""),
                    "example_sentence": entry.get("example", ""),
                    "source_page": page_number + 1,
                })
            extraction_source = f"classified ({len(columns_meta)} cols)"
        else:
            # Last resort: all cells by position
            rows_map2: dict = {}
            for cell in cells:
                ri = cell.get("row_index", 0)
                if ri not in rows_map2:
                    rows_map2[ri] = {}
                ci = cell.get("col_index", 0)
                rows_map2[ri][ci] = (cell.get("text") or "").strip()
            all_ci = sorted({ci for r in rows_map2.values() for ci in r.keys()})
            for ri in sorted(rows_map2.keys()):
                row = rows_map2[ri]
                texts = [row.get(ci, "") for ci in all_ci]
                if not any(texts):
                    continue
                page_vocabulary.append({
                    "id": str(uuid.uuid4()),
                    "english": texts[0] if len(texts) > 0 else "",
                    "german": texts[1] if len(texts) > 1 else "",
                    "example_sentence": " ".join(texts[2:]) if len(texts) > 2 else "",
                    "source_page": page_number + 1,
                })
            extraction_source = f"generic ({len(all_ci)} cols)"
    logger.info(f"  vocab extraction: {len(page_vocabulary)} entries via {extraction_source}")
    total_duration = _time.time() - t_total
    logger.info(f"Kombi Pipeline page {page_number + 1}: "