Use grid-build zones for vocab extraction (4-column detection)

The initial build_grid_from_words() under-clusters to 1 column, while _build_grid_core() correctly finds 4 columns (marker, EN, DE, example). Vocab is now extracted directly from the grid-build zones, with a heuristic that skips a narrow leading marker column. Falls back to the original cells if zone extraction yields no entries.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-11 01:17:40 +02:00
parent 3e3116d2fd
commit 682b306e51
1 changed file with 105 additions and 49 deletions
@@ -1585,62 +1585,118 @@ async def _run_ocr_pipeline_for_page(
        logger.warning(f"  grid-build failed: {e}, falling back to basic grid")
        grid_result = None

-    # 9. Extract vocab entries from original cells + columns_meta
-    # (Grid-build still runs for pipeline session / admin debugging,
-    # but its zone col_index values don't match the original columns_meta.)
+    # 9. Extract vocab entries
+    # Prefer grid-build result (better column detection, more cells) over
+    # the initial build_grid_from_words() which often under-clusters.
    page_vocabulary = []
+    extraction_source = "none"

-    col_types = {c.get("type") for c in columns_meta}
-    is_vocab = bool(col_types & {"column_en", "column_de"})
-
-    if is_vocab:
-        # Classified EN/DE columns → extract structured vocab entries
-        entries = _cells_to_vocab_entries(cells, columns_meta)
-        entries = _fix_phonetic_brackets(entries, pronunciation="british")
-        for entry in entries:
-            if not entry.get("english") and not entry.get("german"):
+    # A) Try grid-build zones first (best quality: 4-column detection, autocorrect)
+    if grid_result and grid_result.get("zones"):
+        for zone in grid_result["zones"]:
+            zone_cols = zone.get("columns", [])
+            zone_cells = zone.get("cells", [])
+            if not zone_cols or not zone_cells:
                continue
-            page_vocabulary.append({
-                "id": str(uuid.uuid4()),
-                "english": entry.get("english", ""),
-                "german": entry.get("german", ""),
-                "example_sentence": entry.get("example", ""),
-                "source_page": page_number + 1,
-            })
-    else:
-        # Generic layout — return ALL columns as-is
-        # Group cells by row, collect text per column in order
-        rows_map: dict = {}
-        for cell in cells:
-            ri = cell.get("row_index", 0)
-            if ri not in rows_map:
-                rows_map[ri] = {}
-            ci = cell.get("col_index", 0)
-            rows_map[ri][ci] = (cell.get("text") or "").strip()

-        # Sort columns by index
-        all_col_indices = sorted({ci for row in rows_map.values() for ci in row.keys()})
+            # Sort columns by x position to determine roles
+            sorted_cols = sorted(zone_cols, key=lambda c: c.get("x_min_px", 0))
+            col_idx_to_pos = {}
+            for pos, col in enumerate(sorted_cols):
+                ci = col.get("col_index", col.get("index", -1))
+                col_idx_to_pos[ci] = pos

-        for ri in sorted(rows_map.keys()):
-            row = rows_map[ri]
-            texts = [row.get(ci, "") for ci in all_col_indices]
-            # Skip completely empty rows
-            if not any(texts):
+            # Skip zones with only 1 column (likely headers/boxes)
+            if len(sorted_cols) < 2:
                continue
-            # Map to english/german/example by position (best effort)
-            entry = {
-                "id": str(uuid.uuid4()),
-                "english": texts[0] if len(texts) > 0 else "",
-                "german": texts[1] if len(texts) > 1 else "",
-                "example_sentence": " ".join(texts[2:]) if len(texts) > 2 else "",
-                "source_page": page_number + 1,
-            }
-            if entry["english"] or entry["german"]:
-                page_vocabulary.append(entry)

-    logger.info(f"  vocab extraction: {len(page_vocabulary)} entries "
-                f"(layout={'vocab' if is_vocab else 'generic'}, "
-                f"cols={len(columns_meta)})")
+            # Group cells by row
+            rows_map: dict = {}
+            for cell in zone_cells:
+                ri = cell.get("row_index", 0)
+                if ri not in rows_map:
+                    rows_map[ri] = {}
+                ci = cell.get("col_index", 0)
+                rows_map[ri][ci] = (cell.get("text") or "").strip()
+
+            n_cols = len(sorted_cols)
+            for ri in sorted(rows_map.keys()):
+                row = rows_map[ri]
+                # Collect texts in column-position order
+                texts = []
+                for col in sorted_cols:
+                    ci = col.get("col_index", col.get("index", -1))
+                    texts.append(row.get(ci, ""))
+
+                if not any(texts):
+                    continue
+
+                # Map by position, skipping narrow first column (page refs/markers)
+                # Heuristic: if first column is very narrow (<15% of zone width),
+                # it's likely a marker/ref column — skip it for vocab
+                first_col_width = sorted_cols[0].get("x_max_px", 0) - sorted_cols[0].get("x_min_px", 0)
+                zone_width = max(1, (sorted_cols[-1].get("x_max_px", 0) - sorted_cols[0].get("x_min_px", 0)))
+                skip_first = first_col_width / zone_width < 0.15 and n_cols >= 3
+
+                data_texts = texts[1:] if skip_first else texts
+
+                entry = {
+                    "id": str(uuid.uuid4()),
+                    "english": data_texts[0] if len(data_texts) > 0 else "",
+                    "german": data_texts[1] if len(data_texts) > 1 else "",
+                    "example_sentence": " ".join(t for t in data_texts[2:] if t) if len(data_texts) > 2 else "",
+                    "source_page": page_number + 1,
+                }
+                if entry["english"] or entry["german"]:
+                    page_vocabulary.append(entry)
+
+        if page_vocabulary:
+            extraction_source = f"grid-zones ({len(grid_result['zones'])} zones)"
+
+    # B) Fallback: original cells with column classification
+    if not page_vocabulary:
+        col_types = {c.get("type") for c in columns_meta}
+        is_vocab = bool(col_types & {"column_en", "column_de"})
+
+        if is_vocab:
+            entries = _cells_to_vocab_entries(cells, columns_meta)
+            entries = _fix_phonetic_brackets(entries, pronunciation="british")
+            for entry in entries:
+                if not entry.get("english") and not entry.get("german"):
+                    continue
+                page_vocabulary.append({
+                    "id": str(uuid.uuid4()),
+                    "english": entry.get("english", ""),
+                    "german": entry.get("german", ""),
+                    "example_sentence": entry.get("example", ""),
+                    "source_page": page_number + 1,
+                })
+            extraction_source = f"classified ({len(columns_meta)} cols)"
+        else:
+            # Last resort: all cells by position
+            rows_map2: dict = {}
+            for cell in cells:
+                ri = cell.get("row_index", 0)
+                if ri not in rows_map2:
+                    rows_map2[ri] = {}
+                ci = cell.get("col_index", 0)
+                rows_map2[ri][ci] = (cell.get("text") or "").strip()
+            all_ci = sorted({ci for r in rows_map2.values() for ci in r.keys()})
+            for ri in sorted(rows_map2.keys()):
+                row = rows_map2[ri]
+                texts = [row.get(ci, "") for ci in all_ci]
+                if not any(texts):
+                    continue
+                page_vocabulary.append({
+                    "id": str(uuid.uuid4()),
+                    "english": texts[0] if len(texts) > 0 else "",
+                    "german": texts[1] if len(texts) > 1 else "",
+                    "example_sentence": " ".join(texts[2:]) if len(texts) > 2 else "",
+                    "source_page": page_number + 1,
+                })
+            extraction_source = f"generic ({len(all_ci)} cols)"
+
+    logger.info(f"  vocab extraction: {len(page_vocabulary)} entries via {extraction_source}")

    total_duration = _time.time() - t_total
    logger.info(f"Kombi Pipeline page {page_number + 1}: "