Fix: merge cell-wrap continuation rows in vocabulary extraction
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 58s
CI / test-go-edu-search (push) Successful in 48s
CI / test-python-agent-core (push) Has been cancelled
CI / test-nodejs-website (push) Has been cancelled
CI / test-python-klausur (push) Has started running
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 58s
CI / test-go-edu-search (push) Successful in 48s
CI / test-python-agent-core (push) Has been cancelled
CI / test-nodejs-website (push) Has been cancelled
CI / test-python-klausur (push) Has started running
When textbook authors wrap text within a cell (e.g. long German translations), OCR treats each physical line as a separate row. The new _merge_wrapped_rows() detects this by checking whether the primary column (EN) is empty, which indicates a continuation rather than a new entry; it also handles the reverse case, where DE is empty and EN holds a short lowercase or parenthetical fragment. Handles: empty EN + DE text, empty EN + example text, parenthetical continuations like "(bei)", triple wraps, comma-separated lists. 12 tests added covering all cases. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1447,6 +1447,90 @@ def _merge_phonetic_continuation_rows(
|
||||
return merged
|
||||
|
||||
|
||||
def _join_wrapped_text(head: str, tail: str) -> str:
    """Join a cell's text with the wrapped fragment found on the next row.

    No separator is inserted after a trailing hyphen or opening parenthesis
    ("teil-" + "nehmen", "(" + "bei)"); everything else, including a
    trailing comma, is joined with a single space. The hyphen itself is kept.
    """
    sep = '' if head.endswith(('-', '(')) else ' '
    return (head + sep + tail).strip()


def _merge_wrapped_rows(
    entries: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Merge rows where the primary column (EN) is empty — cell wrap continuation.

    In textbook vocabulary tables, columns are often narrow, so the author
    wraps text within a cell. OCR treats each physical line as a separate row.
    The key indicator: if the EN column is empty but DE/example have text,
    this row is a continuation of the previous row's cells.

    Example (original textbook has ONE row):
        Row 2: EN="take part (in)"  DE="teilnehmen (an), mitmachen"  EX="More than 200 singers took"
        Row 3: EN=""                DE="(bei)"                       EX="part in the concert."
        → Merged: EN="take part (in)"  DE="teilnehmen (an), mitmachen (bei)"
                  EX="More than 200 singers took part in the concert."

    Also handles the reverse case: DE is empty and EN holds a short fragment
    (fewer than 5 words) that starts with "(" or a lowercase letter; it is
    appended to the previous row's EN cell.

    Args:
        entries: Row dicts read via the 'english', 'german' and 'example'
            keys ('row_index' is used for logging only). Missing or None
            values are treated as empty strings.

    Returns:
        The list of surviving rows. Inputs with fewer than two rows are
        returned unchanged. Surviving rows are the same dict objects that
        were passed in; a row that absorbs a continuation is updated in place.
    """
    if len(entries) < 2:
        return entries

    merged: List[Dict[str, Any]] = []
    for entry in entries:
        en = (entry.get('english') or '').strip()
        de = (entry.get('german') or '').strip()
        ex = (entry.get('example') or '').strip()

        # The first row has nothing to be merged into.
        if not merged:
            merged.append(entry)
            continue

        prev = merged[-1]
        prev_en = (prev.get('english') or '').strip()
        prev_de = (prev.get('german') or '').strip()
        prev_ex = (prev.get('example') or '').strip()

        # Case 1: EN is empty → continuation of previous row
        # (DE or EX have text that should be appended to previous row)
        if not en and (de or ex) and prev_en:
            if de:
                prev['german'] = _join_wrapped_text(prev_de, de)
            if ex:
                prev['example'] = f"{prev_ex} {ex}".strip()
            # Use .get() here: when only the example wrapped, the previous
            # row may not carry a 'german' key at all.
            logger.debug(
                "Merged wrapped row %s into previous "
                "(empty EN): DE=%r, EX=%r",
                entry.get('row_index'),
                prev.get('german', ''),
                prev.get('example', ''),
            )
            continue

        # Case 2: DE is empty, EN has text that looks like continuation
        # (starts with lowercase or is a parenthetical like "(bei)")
        if en and not de and prev_de:
            is_paren = en.startswith('(')
            first_alpha = next((c for c in en if c.isalpha()), '')
            starts_lower = first_alpha.islower()

            # The word-count cap keeps full phrases from being swallowed
            # as if they were wrap fragments.
            if (is_paren or starts_lower) and len(en.split()) < 5:
                # Same joining rule as the DE column, so "foo," + "bar"
                # becomes "foo, bar" rather than "foo,bar".
                prev['english'] = _join_wrapped_text(prev_en, en)
                if ex:
                    prev['example'] = f"{prev_ex} {ex}".strip()
                logger.debug(
                    "Merged wrapped row %s into previous "
                    "(empty DE): EN=%r",
                    entry.get('row_index'),
                    prev['english'],
                )
                continue

        # Not a continuation: keep the row as its own entry.
        merged.append(entry)

    if len(merged) < len(entries):
        logger.info(
            "_merge_wrapped_rows: merged %d "
            "continuation rows (%d → %d)",
            len(entries) - len(merged),
            len(entries),
            len(merged),
        )
    return merged
|
||||
|
||||
|
||||
def _merge_continuation_rows(
|
||||
entries: List[Dict[str, Any]],
|
||||
) -> List[Dict[str, Any]]:
|
||||
@@ -1561,6 +1645,9 @@ def build_word_grid(
|
||||
# --- Post-processing pipeline (deterministic, no LLM) ---
|
||||
n_raw = len(entries)
|
||||
|
||||
# 0. Merge cell-wrap continuation rows (empty primary column = text wrap)
|
||||
entries = _merge_wrapped_rows(entries)
|
||||
|
||||
# 0a. Merge phonetic-only continuation rows into previous entry
|
||||
entries = _merge_phonetic_continuation_rows(entries)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user