Use grid-build zones for vocab extraction (4-column detection)
Some checks failed
CI / go-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / test-go-school (push) Successful in 41s
CI / test-go-edu-search (push) Successful in 42s
CI / test-python-klausur (push) Failing after 2m44s
CI / test-python-agent-core (push) Successful in 29s
CI / test-nodejs-website (push) Successful in 36s
Some checks failed
CI / go-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / test-go-school (push) Successful in 41s
CI / test-go-edu-search (push) Successful in 42s
CI / test-python-klausur (push) Failing after 2m44s
CI / test-python-agent-core (push) Successful in 29s
CI / test-nodejs-website (push) Successful in 36s
The initial build_grid_from_words() under-clusters to 1 column, while _build_grid_core() correctly finds 4 columns (marker, EN, DE, example). Vocab is now extracted directly from the grid-build zones, with a heuristic that skips a narrow first column (under 15% of the zone width, in zones with at least 3 columns) because it is likely a marker/page-ref column. Falls back to the original cells if zone extraction yields no entries. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1585,62 +1585,118 @@ async def _run_ocr_pipeline_for_page(
|
|||||||
logger.warning(f" grid-build failed: {e}, falling back to basic grid")
|
logger.warning(f" grid-build failed: {e}, falling back to basic grid")
|
||||||
grid_result = None
|
grid_result = None
|
||||||
|
|
||||||
# 9. Extract vocab entries from original cells + columns_meta
|
# 9. Extract vocab entries
|
||||||
# (Grid-build still runs for pipeline session / admin debugging,
|
# Prefer grid-build result (better column detection, more cells) over
|
||||||
# but its zone col_index values don't match the original columns_meta.)
|
# the initial build_grid_from_words() which often under-clusters.
|
||||||
page_vocabulary = []
|
page_vocabulary = []
|
||||||
|
extraction_source = "none"
|
||||||
|
|
||||||
col_types = {c.get("type") for c in columns_meta}
|
# A) Try grid-build zones first (best quality: 4-column detection, autocorrect)
|
||||||
is_vocab = bool(col_types & {"column_en", "column_de"})
|
if grid_result and grid_result.get("zones"):
|
||||||
|
for zone in grid_result["zones"]:
|
||||||
if is_vocab:
|
zone_cols = zone.get("columns", [])
|
||||||
# Classified EN/DE columns → extract structured vocab entries
|
zone_cells = zone.get("cells", [])
|
||||||
entries = _cells_to_vocab_entries(cells, columns_meta)
|
if not zone_cols or not zone_cells:
|
||||||
entries = _fix_phonetic_brackets(entries, pronunciation="british")
|
|
||||||
for entry in entries:
|
|
||||||
if not entry.get("english") and not entry.get("german"):
|
|
||||||
continue
|
continue
|
||||||
page_vocabulary.append({
|
|
||||||
"id": str(uuid.uuid4()),
|
|
||||||
"english": entry.get("english", ""),
|
|
||||||
"german": entry.get("german", ""),
|
|
||||||
"example_sentence": entry.get("example", ""),
|
|
||||||
"source_page": page_number + 1,
|
|
||||||
})
|
|
||||||
else:
|
|
||||||
# Generic layout — return ALL columns as-is
|
|
||||||
# Group cells by row, collect text per column in order
|
|
||||||
rows_map: dict = {}
|
|
||||||
for cell in cells:
|
|
||||||
ri = cell.get("row_index", 0)
|
|
||||||
if ri not in rows_map:
|
|
||||||
rows_map[ri] = {}
|
|
||||||
ci = cell.get("col_index", 0)
|
|
||||||
rows_map[ri][ci] = (cell.get("text") or "").strip()
|
|
||||||
|
|
||||||
# Sort columns by index
|
# Sort columns by x position to determine roles
|
||||||
all_col_indices = sorted({ci for row in rows_map.values() for ci in row.keys()})
|
sorted_cols = sorted(zone_cols, key=lambda c: c.get("x_min_px", 0))
|
||||||
|
col_idx_to_pos = {}
|
||||||
|
for pos, col in enumerate(sorted_cols):
|
||||||
|
ci = col.get("col_index", col.get("index", -1))
|
||||||
|
col_idx_to_pos[ci] = pos
|
||||||
|
|
||||||
for ri in sorted(rows_map.keys()):
|
# Skip zones with only 1 column (likely headers/boxes)
|
||||||
row = rows_map[ri]
|
if len(sorted_cols) < 2:
|
||||||
texts = [row.get(ci, "") for ci in all_col_indices]
|
|
||||||
# Skip completely empty rows
|
|
||||||
if not any(texts):
|
|
||||||
continue
|
continue
|
||||||
# Map to english/german/example by position (best effort)
|
|
||||||
entry = {
|
|
||||||
"id": str(uuid.uuid4()),
|
|
||||||
"english": texts[0] if len(texts) > 0 else "",
|
|
||||||
"german": texts[1] if len(texts) > 1 else "",
|
|
||||||
"example_sentence": " ".join(texts[2:]) if len(texts) > 2 else "",
|
|
||||||
"source_page": page_number + 1,
|
|
||||||
}
|
|
||||||
if entry["english"] or entry["german"]:
|
|
||||||
page_vocabulary.append(entry)
|
|
||||||
|
|
||||||
logger.info(f" vocab extraction: {len(page_vocabulary)} entries "
|
# Group cells by row
|
||||||
f"(layout={'vocab' if is_vocab else 'generic'}, "
|
rows_map: dict = {}
|
||||||
f"cols={len(columns_meta)})")
|
for cell in zone_cells:
|
||||||
|
ri = cell.get("row_index", 0)
|
||||||
|
if ri not in rows_map:
|
||||||
|
rows_map[ri] = {}
|
||||||
|
ci = cell.get("col_index", 0)
|
||||||
|
rows_map[ri][ci] = (cell.get("text") or "").strip()
|
||||||
|
|
||||||
|
n_cols = len(sorted_cols)
|
||||||
|
for ri in sorted(rows_map.keys()):
|
||||||
|
row = rows_map[ri]
|
||||||
|
# Collect texts in column-position order
|
||||||
|
texts = []
|
||||||
|
for col in sorted_cols:
|
||||||
|
ci = col.get("col_index", col.get("index", -1))
|
||||||
|
texts.append(row.get(ci, ""))
|
||||||
|
|
||||||
|
if not any(texts):
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Map by position, skipping narrow first column (page refs/markers)
|
||||||
|
# Heuristic: if first column is very narrow (<15% of zone width),
|
||||||
|
# it's likely a marker/ref column — skip it for vocab
|
||||||
|
first_col_width = sorted_cols[0].get("x_max_px", 0) - sorted_cols[0].get("x_min_px", 0)
|
||||||
|
zone_width = max(1, (sorted_cols[-1].get("x_max_px", 0) - sorted_cols[0].get("x_min_px", 0)))
|
||||||
|
skip_first = first_col_width / zone_width < 0.15 and n_cols >= 3
|
||||||
|
|
||||||
|
data_texts = texts[1:] if skip_first else texts
|
||||||
|
|
||||||
|
entry = {
|
||||||
|
"id": str(uuid.uuid4()),
|
||||||
|
"english": data_texts[0] if len(data_texts) > 0 else "",
|
||||||
|
"german": data_texts[1] if len(data_texts) > 1 else "",
|
||||||
|
"example_sentence": " ".join(t for t in data_texts[2:] if t) if len(data_texts) > 2 else "",
|
||||||
|
"source_page": page_number + 1,
|
||||||
|
}
|
||||||
|
if entry["english"] or entry["german"]:
|
||||||
|
page_vocabulary.append(entry)
|
||||||
|
|
||||||
|
if page_vocabulary:
|
||||||
|
extraction_source = f"grid-zones ({len(grid_result['zones'])} zones)"
|
||||||
|
|
||||||
|
# B) Fallback: original cells with column classification
|
||||||
|
if not page_vocabulary:
|
||||||
|
col_types = {c.get("type") for c in columns_meta}
|
||||||
|
is_vocab = bool(col_types & {"column_en", "column_de"})
|
||||||
|
|
||||||
|
if is_vocab:
|
||||||
|
entries = _cells_to_vocab_entries(cells, columns_meta)
|
||||||
|
entries = _fix_phonetic_brackets(entries, pronunciation="british")
|
||||||
|
for entry in entries:
|
||||||
|
if not entry.get("english") and not entry.get("german"):
|
||||||
|
continue
|
||||||
|
page_vocabulary.append({
|
||||||
|
"id": str(uuid.uuid4()),
|
||||||
|
"english": entry.get("english", ""),
|
||||||
|
"german": entry.get("german", ""),
|
||||||
|
"example_sentence": entry.get("example", ""),
|
||||||
|
"source_page": page_number + 1,
|
||||||
|
})
|
||||||
|
extraction_source = f"classified ({len(columns_meta)} cols)"
|
||||||
|
else:
|
||||||
|
# Last resort: all cells by position
|
||||||
|
rows_map2: dict = {}
|
||||||
|
for cell in cells:
|
||||||
|
ri = cell.get("row_index", 0)
|
||||||
|
if ri not in rows_map2:
|
||||||
|
rows_map2[ri] = {}
|
||||||
|
ci = cell.get("col_index", 0)
|
||||||
|
rows_map2[ri][ci] = (cell.get("text") or "").strip()
|
||||||
|
all_ci = sorted({ci for r in rows_map2.values() for ci in r.keys()})
|
||||||
|
for ri in sorted(rows_map2.keys()):
|
||||||
|
row = rows_map2[ri]
|
||||||
|
texts = [row.get(ci, "") for ci in all_ci]
|
||||||
|
if not any(texts):
|
||||||
|
continue
|
||||||
|
page_vocabulary.append({
|
||||||
|
"id": str(uuid.uuid4()),
|
||||||
|
"english": texts[0] if len(texts) > 0 else "",
|
||||||
|
"german": texts[1] if len(texts) > 1 else "",
|
||||||
|
"example_sentence": " ".join(texts[2:]) if len(texts) > 2 else "",
|
||||||
|
"source_page": page_number + 1,
|
||||||
|
})
|
||||||
|
extraction_source = f"generic ({len(all_ci)} cols)"
|
||||||
|
|
||||||
|
logger.info(f" vocab extraction: {len(page_vocabulary)} entries via {extraction_source}")
|
||||||
|
|
||||||
total_duration = _time.time() - t_total
|
total_duration = _time.time() - t_total
|
||||||
logger.info(f"Kombi Pipeline page {page_number + 1}: "
|
logger.info(f"Kombi Pipeline page {page_number + 1}: "
|
||||||
|
|||||||
Reference in New Issue
Block a user