Fix vocab extraction: use original column types for EN/DE classification
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 37s
CI / test-go-edu-search (push) Successful in 39s
CI / test-python-agent-core (push) Has been cancelled
CI / test-nodejs-website (push) Has been cancelled
CI / test-python-klausur (push) Has been cancelled
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 37s
CI / test-go-edu-search (push) Successful in 39s
CI / test-python-agent-core (push) Has been cancelled
CI / test-nodejs-website (push) Has been cancelled
CI / test-python-klausur (push) Has been cancelled
The grid-build zones use generic column types, which loses the EN/DE classification produced by build_grid_from_words(). Vocab extraction now takes the improved cells from the grid zones but classifies them using the original columns_meta, which carries the correct column_en/column_de types.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1585,71 +1585,34 @@ async def _run_ocr_pipeline_for_page(
|
|||||||
logger.warning(f" grid-build failed: {e}, falling back to basic grid")
|
logger.warning(f" grid-build failed: {e}, falling back to basic grid")
|
||||||
grid_result = None
|
grid_result = None
|
||||||
|
|
||||||
# 9. Extract vocab entries from grid result (zones → cells → vocab)
|
# 9. Extract vocab entries
|
||||||
|
# The grid-build improves text quality (pipe-autocorrect, word-gap merge),
|
||||||
|
# but its zone columns use generic types. For EN/DE classification we use
|
||||||
|
# the improved cells from grid zones with the original columns_meta from
|
||||||
|
# build_grid_from_words() which has the correct column_en/column_de types.
|
||||||
page_vocabulary = []
|
page_vocabulary = []
|
||||||
|
|
||||||
|
# Collect improved cell texts from grid zones (if available)
|
||||||
|
grid_cells = cells # default: raw cells from dual-engine OCR
|
||||||
if grid_result and grid_result.get("zones"):
|
if grid_result and grid_result.get("zones"):
|
||||||
# Extract from the improved zone-based grid
|
grid_cells = []
|
||||||
for zone in grid_result["zones"]:
|
for zone in grid_result["zones"]:
|
||||||
zone_cols = zone.get("columns", [])
|
for cell in zone.get("cells", []):
|
||||||
zone_cells = zone.get("cells", [])
|
grid_cells.append(cell)
|
||||||
if not zone_cols or not zone_cells:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Build col_index → col_type map
|
# Use _cells_to_vocab_entries with original columns_meta for classification
|
||||||
col_type_map = {}
|
entries = _cells_to_vocab_entries(grid_cells, columns_meta)
|
||||||
for col in zone_cols:
|
entries = _fix_phonetic_brackets(entries, pronunciation="british")
|
||||||
ci = col.get("col_index", col.get("index", -1))
|
for entry in entries:
|
||||||
col_type_map[ci] = col.get("type", col.get("col_type", ""))
|
if not entry.get("english") and not entry.get("german"):
|
||||||
|
continue
|
||||||
# Group cells by row
|
page_vocabulary.append({
|
||||||
rows_map = {}
|
"id": str(uuid.uuid4()),
|
||||||
for cell in zone_cells:
|
"english": entry.get("english", ""),
|
||||||
ri = cell.get("row_index", 0)
|
"german": entry.get("german", ""),
|
||||||
if ri not in rows_map:
|
"example_sentence": entry.get("example", ""),
|
||||||
rows_map[ri] = {}
|
"source_page": page_number + 1,
|
||||||
ci = cell.get("col_index", 0)
|
})
|
||||||
rows_map[ri][ci] = cell
|
|
||||||
|
|
||||||
for ri in sorted(rows_map.keys()):
|
|
||||||
row_cells = rows_map[ri]
|
|
||||||
en = ""
|
|
||||||
de = ""
|
|
||||||
ex = ""
|
|
||||||
for ci, cell in row_cells.items():
|
|
||||||
ct = col_type_map.get(ci, "")
|
|
||||||
text = (cell.get("text") or "").strip()
|
|
||||||
if not text:
|
|
||||||
continue
|
|
||||||
if "en" in ct:
|
|
||||||
en = text
|
|
||||||
elif "de" in ct:
|
|
||||||
de = text
|
|
||||||
elif "example" in ct or "text" in ct:
|
|
||||||
ex = text if not ex else ex + " " + text
|
|
||||||
|
|
||||||
if en or de:
|
|
||||||
page_vocabulary.append({
|
|
||||||
"id": str(uuid.uuid4()),
|
|
||||||
"english": en,
|
|
||||||
"german": de,
|
|
||||||
"example_sentence": ex,
|
|
||||||
"source_page": page_number + 1,
|
|
||||||
})
|
|
||||||
else:
|
|
||||||
# Fallback: use basic cells → vocab entries
|
|
||||||
entries = _cells_to_vocab_entries(cells, columns_meta)
|
|
||||||
entries = _fix_phonetic_brackets(entries, pronunciation="british")
|
|
||||||
for entry in entries:
|
|
||||||
if not entry.get("english") and not entry.get("german"):
|
|
||||||
continue
|
|
||||||
page_vocabulary.append({
|
|
||||||
"id": str(uuid.uuid4()),
|
|
||||||
"english": entry.get("english", ""),
|
|
||||||
"german": entry.get("german", ""),
|
|
||||||
"example_sentence": entry.get("example", ""),
|
|
||||||
"source_page": page_number + 1,
|
|
||||||
})
|
|
||||||
|
|
||||||
total_duration = _time.time() - t_total
|
total_duration = _time.time() - t_total
|
||||||
logger.info(f"Kombi Pipeline page {page_number + 1}: "
|
logger.info(f"Kombi Pipeline page {page_number + 1}: "
|
||||||
|
|||||||
Reference in New Issue
Block a user