Fix vocab extraction: show all columns for generic layouts

When columns can't be classified as EN/DE, map them by position: col 0 → english, col 1 → german, col 2+ → example. This ensures vocabulary pages are always extracted, even without explicit language classification. Classified pages still use the proper EN/DE/example mapping.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-11 01:11:40 +02:00
parent 9a8ce69782
commit 3e3116d2fd
1 changed files with 53 additions and 25 deletions
@@ -1585,34 +1585,62 @@ async def _run_ocr_pipeline_for_page(
        logger.warning(f"  grid-build failed: {e}, falling back to basic grid")
        grid_result = None

-    # 9. Extract vocab entries
-    # The grid-build improves text quality (pipe-autocorrect, word-gap merge),
-    # but its zone columns use generic types. For EN/DE classification we use
-    # the improved cells from grid zones with the original columns_meta from
-    # build_grid_from_words() which has the correct column_en/column_de types.
+    # 9. Extract vocab entries from original cells + columns_meta
+    # (Grid-build still runs for pipeline session / admin debugging,
+    # but its zone col_index values don't match the original columns_meta.)
    page_vocabulary = []

-    # Collect improved cell texts from grid zones (if available)
-    grid_cells = cells  # default: raw cells from dual-engine OCR
-    if grid_result and grid_result.get("zones"):
-        grid_cells = []
-        for zone in grid_result["zones"]:
-            for cell in zone.get("cells", []):
-                grid_cells.append(cell)
+    col_types = {c.get("type") for c in columns_meta}
+    is_vocab = bool(col_types & {"column_en", "column_de"})

-    # Use _cells_to_vocab_entries with original columns_meta for classification
-    entries = _cells_to_vocab_entries(grid_cells, columns_meta)
-    entries = _fix_phonetic_brackets(entries, pronunciation="british")
-    for entry in entries:
-        if not entry.get("english") and not entry.get("german"):
-            continue
-        page_vocabulary.append({
-            "id": str(uuid.uuid4()),
-            "english": entry.get("english", ""),
-            "german": entry.get("german", ""),
-            "example_sentence": entry.get("example", ""),
-            "source_page": page_number + 1,
-        })
+    if is_vocab:
+        # Classified EN/DE columns → extract structured vocab entries
+        entries = _cells_to_vocab_entries(cells, columns_meta)
+        entries = _fix_phonetic_brackets(entries, pronunciation="british")
+        for entry in entries:
+            if not entry.get("english") and not entry.get("german"):
+                continue
+            page_vocabulary.append({
+                "id": str(uuid.uuid4()),
+                "english": entry.get("english", ""),
+                "german": entry.get("german", ""),
+                "example_sentence": entry.get("example", ""),
+                "source_page": page_number + 1,
+            })
+    else:
+        # Generic layout — return ALL columns as-is
+        # Group cells by row, collect text per column in order
+        rows_map: dict = {}
+        for cell in cells:
+            ri = cell.get("row_index", 0)
+            if ri not in rows_map:
+                rows_map[ri] = {}
+            ci = cell.get("col_index", 0)
+            rows_map[ri][ci] = (cell.get("text") or "").strip()
+
+        # Sort columns by index
+        all_col_indices = sorted({ci for row in rows_map.values() for ci in row.keys()})
+
+        for ri in sorted(rows_map.keys()):
+            row = rows_map[ri]
+            texts = [row.get(ci, "") for ci in all_col_indices]
+            # Skip completely empty rows
+            if not any(texts):
+                continue
+            # Map to english/german/example by position (best effort)
+            entry = {
+                "id": str(uuid.uuid4()),
+                "english": texts[0] if len(texts) > 0 else "",
+                "german": texts[1] if len(texts) > 1 else "",
+                "example_sentence": " ".join(texts[2:]) if len(texts) > 2 else "",
+                "source_page": page_number + 1,
+            }
+            if entry["english"] or entry["german"]:
+                page_vocabulary.append(entry)
+
+    logger.info(f"  vocab extraction: {len(page_vocabulary)} entries "
+                f"(layout={'vocab' if is_vocab else 'generic'}, "
+                f"cols={len(columns_meta)})")

    total_duration = _time.time() - t_total
    logger.info(f"Kombi Pipeline page {page_number + 1}: "