"""
Row-merging logic for vocabulary entries (phonetic, wrapped, continuation rows).

Extracted from cv_cell_grid.py.

License: Apache 2.0 (commercial use permitted)
PRIVACY: All processing happens locally.
"""

import logging
import re
from typing import Any, Dict, List

from cv_ocr_engines import _RE_ALPHA

logger = logging.getLogger(__name__)

# Regex: line starts with phonetic bracket content only (no real word before it)
_PHONETIC_ONLY_RE = re.compile(
    r'''^\s*[\[\('"]*[^\]]*[\])\s]*$'''
)


def _is_phonetic_only_text(text: str) -> bool:
    """Check if text consists only of phonetic transcription.

    The text must contain at least one square bracket. Every complete
    ``[...]`` group is removed, then stray brackets, quotes, parentheses and
    whitespace are dropped. The text counts as phonetic-only when fewer than
    two characters matched by ``_RE_ALPHA`` remain.

    Phonetic-only patterns:
        ['mani serva]      -> True
        [dance]            -> True
        ["a:mand]          -> True
        almond ['a:mand]   -> False  (has real word before bracket)
        Mandel             -> False

    Args:
        text: Raw cell text.

    Returns:
        True if the text looks like a bare phonetic transcription.
    """
    t = text.strip()
    if not t:
        return False

    # Must contain at least one bracket
    if '[' not in t and ']' not in t:
        return False

    # Remove all bracket content and surrounding punctuation/whitespace
    without_brackets = re.sub(r"\[.*?\]", '', t)
    without_brackets = re.sub(r"[\[\]'\"()\s]", '', without_brackets)

    # If nothing meaningful remains, it's phonetic-only.
    # NOTE(review): _RE_ALPHA is imported from cv_ocr_engines; presumably it
    # matches single alphabetic characters -- confirm against its definition.
    alpha_remaining = ''.join(_RE_ALPHA.findall(without_brackets))
    return len(alpha_remaining) < 2


def _merge_phonetic_continuation_rows(
    entries: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Merge rows that contain only phonetic transcription into previous entry.

    In dictionary pages, phonetic transcription sometimes wraps to the next
    row. E.g.:
        Row 28: EN="it's a money-saver"  DE="es spart Kosten"
        Row 29: EN="['mani serva]"       DE=""

    Row 29 is phonetic-only -> merge into row 28's EN field.

    Args:
        entries: Row dicts; the 'english', 'german', 'example' and
            'row_index' keys are read. Entries that absorb a following row
            are modified in place.

    Returns:
        The list of entries left after merging. When fewer than two entries
        are given, the input list itself is returned unchanged.
    """
    if len(entries) < 2:
        return entries

    merged: List[Dict[str, Any]] = []
    for entry in entries:
        en = (entry.get('english') or '').strip()
        de = (entry.get('german') or '').strip()
        ex = (entry.get('example') or '').strip()

        # Check if this entry is phonetic-only (EN has only phonetics, DE empty)
        if merged and _is_phonetic_only_text(en) and not de:
            prev = merged[-1]
            prev_en = (prev.get('english') or '').strip()

            # Append phonetic to previous entry's EN
            prev['english'] = prev_en + ' ' + en if prev_en else en

            # If there was an example, append to previous too
            if ex:
                prev_ex = (prev.get('example') or '').strip()
                prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex

            logger.debug(
                "Merged phonetic row %s into previous entry: %r",
                entry.get('row_index'), prev['english'],
            )
            continue

        merged.append(entry)

    return merged


def _merge_wrapped_rows(
    entries: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Merge rows where the primary column (EN) is empty -- cell wrap continuation.

    In textbook vocabulary tables, columns are often narrow, so the author
    wraps text within a cell. OCR treats each physical line as a separate row.
    The key indicator: if the EN column is empty but DE/example have text,
    this row is a continuation of the previous row's cells.

    Example (original textbook has ONE row):
        Row 2: EN="take part (in)"  DE="teilnehmen (an), mitmachen"  EX="More than 200 singers took"
        Row 3: EN=""                DE="(bei)"                       EX="part in the concert."
        -> Merged: EN="take part (in)"  DE="teilnehmen (an), mitmachen (bei)"  EX="..."

    Also handles the reverse case: DE empty but EN has text (wrap in EN
    column). That case only merges when the EN text starts with '(' or a
    lowercase letter and has fewer than 5 words.

    Args:
        entries: Row dicts; the 'english', 'german', 'example' and
            'row_index' keys are read. Entries that absorb a following row
            are modified in place.

    Returns:
        The list of entries left after merging. When fewer than two entries
        are given, the input list itself is returned unchanged.
    """
    if len(entries) < 2:
        return entries

    merged: List[Dict[str, Any]] = []
    for entry in entries:
        en = (entry.get('english') or '').strip()
        de = (entry.get('german') or '').strip()
        ex = (entry.get('example') or '').strip()

        if not merged:
            merged.append(entry)
            continue

        prev = merged[-1]
        prev_en = (prev.get('english') or '').strip()
        prev_de = (prev.get('german') or '').strip()
        prev_ex = (prev.get('example') or '').strip()

        # Case 1: EN is empty -> continuation of previous row
        if not en and (de or ex) and prev_en:
            if de:
                # A trailing hyphen or open parenthesis joins directly;
                # everything else (including a trailing comma) gets a space.
                sep = '' if prev_de.endswith(('-', '(')) else ' '
                prev['german'] = (prev_de + sep + de).strip()
            if ex:
                sep = ' ' if prev_ex else ''
                prev['example'] = (prev_ex + sep + ex).strip()
            # Use .get() for 'german': when only the example wrapped, the key
            # is never assigned above and may be missing from the previous
            # entry, which used to raise KeyError while building this message.
            logger.debug(
                "Merged wrapped row %s into previous (empty EN): DE=%r, EX=%r",
                entry.get('row_index'),
                prev.get('german', ''),
                prev.get('example', ''),
            )
            continue

        # Case 2: DE is empty, EN has text that looks like continuation
        if en and not de and prev_de:
            is_paren = en.startswith('(')
            first_alpha = next((c for c in en if c.isalpha()), '')
            starts_lower = first_alpha and first_alpha.islower()

            if (is_paren or starts_lower) and len(en.split()) < 5:
                # NOTE(review): a trailing comma yields NO space here, unlike
                # Case 1 where a comma is followed by a space -- confirm this
                # asymmetry is intended.
                sep = ' ' if prev_en and not prev_en.endswith((',', '-', '(')) else ''
                prev['english'] = (prev_en + sep + en).strip()
                if ex:
                    sep2 = ' ' if prev_ex else ''
                    prev['example'] = (prev_ex + sep2 + ex).strip()
                logger.debug(
                    "Merged wrapped row %s into previous (empty DE): EN=%r",
                    entry.get('row_index'), prev['english'],
                )
                continue

        merged.append(entry)

    if len(merged) < len(entries):
        logger.info(
            "_merge_wrapped_rows: merged %d continuation rows (%d -> %d)",
            len(entries) - len(merged), len(entries), len(merged),
        )
    return merged


def _merge_continuation_rows(
    entries: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Merge multi-line vocabulary entries where text wraps to the next row.

    A row is a continuation of the previous entry when:
    - EN has text, but DE is empty
    - EN is not phonetic-only text (such rows are kept as separate entries)
    - EN starts with a lowercase letter (not a new vocab entry)
    - Previous entry's EN does NOT end with a sentence terminator (.!?)
    - The continuation text has fewer than 4 words (not an example sentence)

    Example:
        Row 5: EN="to put up"  DE="aufstellen"
        Row 6: EN="with sth."  DE=""
        -> Merged: EN="to put up with sth."  DE="aufstellen"

    Args:
        entries: Row dicts; the 'english', 'german', 'example' and
            'row_index' keys are read. Entries that absorb a following row
            are modified in place.

    Returns:
        The list of entries left after merging. When fewer than two entries
        are given, the input list itself is returned unchanged.
    """
    if len(entries) < 2:
        return entries

    merged: List[Dict[str, Any]] = []
    for entry in entries:
        en = (entry.get('english') or '').strip()
        de = (entry.get('german') or '').strip()

        if merged and en and not de:
            # Check: not phonetic (already handled)
            if _is_phonetic_only_text(en):
                merged.append(entry)
                continue

            # Check: starts with lowercase
            first_alpha = next((c for c in en if c.isalpha()), '')
            starts_lower = first_alpha and first_alpha.islower()

            # Check: fewer than 4 words (not an example sentence)
            is_short = len(en.split()) < 4

            # Check: previous entry doesn't end with sentence terminator
            prev = merged[-1]
            prev_en = (prev.get('english') or '').strip()
            prev_ends_sentence = prev_en and prev_en[-1] in '.!?'

            if starts_lower and is_short and not prev_ends_sentence:
                # Merge into previous entry
                prev['english'] = (prev_en + ' ' + en).strip()

                # Merge example if present
                ex = (entry.get('example') or '').strip()
                if ex:
                    prev_ex = (prev.get('example') or '').strip()
                    prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex

                logger.debug(
                    "Merged continuation row %s into previous entry: %r",
                    entry.get('row_index'), prev['english'],
                )
                continue

        merged.append(entry)

    return merged