Fix vocab extraction: show all columns for generic layouts
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 43s
CI / test-go-edu-search (push) Successful in 41s
CI / test-python-klausur (push) Failing after 2m36s
CI / test-python-agent-core (push) Successful in 31s
CI / test-nodejs-website (push) Successful in 36s
(duplicate CI status block removed — identical to the listing above)
When columns can't be classified as EN/DE, map them by position: column 0 → english, column 1 → german, columns 2+ → example. This ensures vocabulary pages are always extracted, even without explicit language classification. Classified pages still use the proper EN/DE/example mapping. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1585,34 +1585,62 @@ async def _run_ocr_pipeline_for_page(
|
||||
logger.warning(f" grid-build failed: {e}, falling back to basic grid")
|
||||
grid_result = None
|
||||
|
||||
# 9. Extract vocab entries
|
||||
# The grid-build improves text quality (pipe-autocorrect, word-gap merge),
|
||||
# but its zone columns use generic types. For EN/DE classification we use
|
||||
# the improved cells from grid zones with the original columns_meta from
|
||||
# build_grid_from_words() which has the correct column_en/column_de types.
|
||||
# 9. Extract vocab entries from original cells + columns_meta
|
||||
# (Grid-build still runs for pipeline session / admin debugging,
|
||||
# but its zone col_index values don't match the original columns_meta.)
|
||||
page_vocabulary = []
|
||||
|
||||
# Collect improved cell texts from grid zones (if available)
|
||||
grid_cells = cells # default: raw cells from dual-engine OCR
|
||||
if grid_result and grid_result.get("zones"):
|
||||
grid_cells = []
|
||||
for zone in grid_result["zones"]:
|
||||
for cell in zone.get("cells", []):
|
||||
grid_cells.append(cell)
|
||||
col_types = {c.get("type") for c in columns_meta}
|
||||
is_vocab = bool(col_types & {"column_en", "column_de"})
|
||||
|
||||
# Use _cells_to_vocab_entries with original columns_meta for classification
|
||||
entries = _cells_to_vocab_entries(grid_cells, columns_meta)
|
||||
entries = _fix_phonetic_brackets(entries, pronunciation="british")
|
||||
for entry in entries:
|
||||
if not entry.get("english") and not entry.get("german"):
|
||||
continue
|
||||
page_vocabulary.append({
|
||||
"id": str(uuid.uuid4()),
|
||||
"english": entry.get("english", ""),
|
||||
"german": entry.get("german", ""),
|
||||
"example_sentence": entry.get("example", ""),
|
||||
"source_page": page_number + 1,
|
||||
})
|
||||
if is_vocab:
|
||||
# Classified EN/DE columns → extract structured vocab entries
|
||||
entries = _cells_to_vocab_entries(cells, columns_meta)
|
||||
entries = _fix_phonetic_brackets(entries, pronunciation="british")
|
||||
for entry in entries:
|
||||
if not entry.get("english") and not entry.get("german"):
|
||||
continue
|
||||
page_vocabulary.append({
|
||||
"id": str(uuid.uuid4()),
|
||||
"english": entry.get("english", ""),
|
||||
"german": entry.get("german", ""),
|
||||
"example_sentence": entry.get("example", ""),
|
||||
"source_page": page_number + 1,
|
||||
})
|
||||
else:
|
||||
# Generic layout — return ALL columns as-is
|
||||
# Group cells by row, collect text per column in order
|
||||
rows_map: dict = {}
|
||||
for cell in cells:
|
||||
ri = cell.get("row_index", 0)
|
||||
if ri not in rows_map:
|
||||
rows_map[ri] = {}
|
||||
ci = cell.get("col_index", 0)
|
||||
rows_map[ri][ci] = (cell.get("text") or "").strip()
|
||||
|
||||
# Sort columns by index
|
||||
all_col_indices = sorted({ci for row in rows_map.values() for ci in row.keys()})
|
||||
|
||||
for ri in sorted(rows_map.keys()):
|
||||
row = rows_map[ri]
|
||||
texts = [row.get(ci, "") for ci in all_col_indices]
|
||||
# Skip completely empty rows
|
||||
if not any(texts):
|
||||
continue
|
||||
# Map to english/german/example by position (best effort)
|
||||
entry = {
|
||||
"id": str(uuid.uuid4()),
|
||||
"english": texts[0] if len(texts) > 0 else "",
|
||||
"german": texts[1] if len(texts) > 1 else "",
|
||||
"example_sentence": " ".join(texts[2:]) if len(texts) > 2 else "",
|
||||
"source_page": page_number + 1,
|
||||
}
|
||||
if entry["english"] or entry["german"]:
|
||||
page_vocabulary.append(entry)
|
||||
|
||||
logger.info(f" vocab extraction: {len(page_vocabulary)} entries "
|
||||
f"(layout={'vocab' if is_vocab else 'generic'}, "
|
||||
f"cols={len(columns_meta)})")
|
||||
|
||||
total_duration = _time.time() - t_total
|
||||
logger.info(f"Kombi Pipeline page {page_number + 1}: "
|
||||
|
||||
Reference in New Issue
Block a user