klausur-service (7 monoliths): - grid_editor_helpers.py (1,737 → 5 files: columns, filters, headers, zones) - cv_cell_grid.py (1,675 → 7 files: build, legacy, streaming, merge, vocab) - worksheet_editor_api.py (1,305 → 4 files: models, AI, reconstruct, routes) - legal_corpus_ingestion.py (1,280 → 3 files: registry, chunking, ingestion) - cv_review.py (1,248 → 4 files: pipeline, spell, LLM, barrel) - cv_preprocessing.py (1,166 → 3 files: deskew, dewarp, barrel) - rbac.py, admin_api.py, routes/eh.py remain (next batch) backend-lehrer (1 monolith): - classroom_engine/repository.py (1,705 → 7 files by domain) All re-export barrels preserve backward compatibility. Zero import errors verified. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
236 lines
8.4 KiB
Python
236 lines
8.4 KiB
Python
"""
|
|
Row-merging logic for vocabulary entries (phonetic, wrapped, continuation rows).
|
|
|
|
Extracted from cv_cell_grid.py.
|
|
|
|
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
|
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
|
"""
|
|
|
|
import logging
|
|
import re
|
|
from typing import Any, Dict, List
|
|
|
|
from cv_ocr_engines import _RE_ALPHA
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Regex: line starts with phonetic bracket content only (no real word before it)
|
|
_PHONETIC_ONLY_RE = re.compile(
|
|
r'''^\s*[\[\('"]*[^\]]*[\])\s]*$'''
|
|
)
|
|
|
|
|
|
def _is_phonetic_only_text(text: str) -> bool:
|
|
"""Check if text consists only of phonetic transcription.
|
|
|
|
Phonetic-only patterns:
|
|
['mani serva] -> True
|
|
[dance] -> True
|
|
["a:mand] -> True
|
|
almond ['a:mand] -> False (has real word before bracket)
|
|
Mandel -> False
|
|
"""
|
|
t = text.strip()
|
|
if not t:
|
|
return False
|
|
# Must contain at least one bracket
|
|
if '[' not in t and ']' not in t:
|
|
return False
|
|
# Remove all bracket content and surrounding punctuation/whitespace
|
|
without_brackets = re.sub(r"\[.*?\]", '', t)
|
|
without_brackets = re.sub(r"[\[\]'\"()\s]", '', without_brackets)
|
|
# If nothing meaningful remains, it's phonetic-only
|
|
alpha_remaining = ''.join(_RE_ALPHA.findall(without_brackets))
|
|
return len(alpha_remaining) < 2
|
|
|
|
|
|
def _merge_phonetic_continuation_rows(
|
|
entries: List[Dict[str, Any]],
|
|
) -> List[Dict[str, Any]]:
|
|
"""Merge rows that contain only phonetic transcription into previous entry.
|
|
|
|
In dictionary pages, phonetic transcription sometimes wraps to the next
|
|
row. E.g.:
|
|
Row 28: EN="it's a money-saver" DE="es spart Kosten"
|
|
Row 29: EN="['mani serva]" DE=""
|
|
|
|
Row 29 is phonetic-only -> merge into row 28's EN field.
|
|
"""
|
|
if len(entries) < 2:
|
|
return entries
|
|
|
|
merged: List[Dict[str, Any]] = []
|
|
for entry in entries:
|
|
en = (entry.get('english') or '').strip()
|
|
de = (entry.get('german') or '').strip()
|
|
ex = (entry.get('example') or '').strip()
|
|
|
|
# Check if this entry is phonetic-only (EN has only phonetics, DE empty)
|
|
if merged and _is_phonetic_only_text(en) and not de:
|
|
prev = merged[-1]
|
|
prev_en = (prev.get('english') or '').strip()
|
|
# Append phonetic to previous entry's EN
|
|
if prev_en:
|
|
prev['english'] = prev_en + ' ' + en
|
|
else:
|
|
prev['english'] = en
|
|
# If there was an example, append to previous too
|
|
if ex:
|
|
prev_ex = (prev.get('example') or '').strip()
|
|
prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex
|
|
logger.debug(
|
|
f"Merged phonetic row {entry.get('row_index')} "
|
|
f"into previous entry: {prev['english']!r}"
|
|
)
|
|
continue
|
|
|
|
merged.append(entry)
|
|
|
|
return merged
|
|
|
|
|
|
def _merge_wrapped_rows(
|
|
entries: List[Dict[str, Any]],
|
|
) -> List[Dict[str, Any]]:
|
|
"""Merge rows where the primary column (EN) is empty -- cell wrap continuation.
|
|
|
|
In textbook vocabulary tables, columns are often narrow, so the author
|
|
wraps text within a cell. OCR treats each physical line as a separate row.
|
|
The key indicator: if the EN column is empty but DE/example have text,
|
|
this row is a continuation of the previous row's cells.
|
|
|
|
Example (original textbook has ONE row):
|
|
Row 2: EN="take part (in)" DE="teilnehmen (an), mitmachen" EX="More than 200 singers took"
|
|
Row 3: EN="" DE="(bei)" EX="part in the concert."
|
|
-> Merged: EN="take part (in)" DE="teilnehmen (an), mitmachen (bei)" EX="..."
|
|
|
|
Also handles the reverse case: DE empty but EN has text (wrap in EN column).
|
|
"""
|
|
if len(entries) < 2:
|
|
return entries
|
|
|
|
merged: List[Dict[str, Any]] = []
|
|
for entry in entries:
|
|
en = (entry.get('english') or '').strip()
|
|
de = (entry.get('german') or '').strip()
|
|
ex = (entry.get('example') or '').strip()
|
|
|
|
if not merged:
|
|
merged.append(entry)
|
|
continue
|
|
|
|
prev = merged[-1]
|
|
prev_en = (prev.get('english') or '').strip()
|
|
prev_de = (prev.get('german') or '').strip()
|
|
prev_ex = (prev.get('example') or '').strip()
|
|
|
|
# Case 1: EN is empty -> continuation of previous row
|
|
if not en and (de or ex) and prev_en:
|
|
if de:
|
|
if prev_de.endswith(','):
|
|
sep = ' '
|
|
elif prev_de.endswith(('-', '(')):
|
|
sep = ''
|
|
else:
|
|
sep = ' '
|
|
prev['german'] = (prev_de + sep + de).strip()
|
|
if ex:
|
|
sep = ' ' if prev_ex else ''
|
|
prev['example'] = (prev_ex + sep + ex).strip()
|
|
logger.debug(
|
|
f"Merged wrapped row {entry.get('row_index')} into previous "
|
|
f"(empty EN): DE={prev['german']!r}, EX={prev.get('example', '')!r}"
|
|
)
|
|
continue
|
|
|
|
# Case 2: DE is empty, EN has text that looks like continuation
|
|
if en and not de and prev_de:
|
|
is_paren = en.startswith('(')
|
|
first_alpha = next((c for c in en if c.isalpha()), '')
|
|
starts_lower = first_alpha and first_alpha.islower()
|
|
|
|
if (is_paren or starts_lower) and len(en.split()) < 5:
|
|
sep = ' ' if prev_en and not prev_en.endswith((',', '-', '(')) else ''
|
|
prev['english'] = (prev_en + sep + en).strip()
|
|
if ex:
|
|
sep2 = ' ' if prev_ex else ''
|
|
prev['example'] = (prev_ex + sep2 + ex).strip()
|
|
logger.debug(
|
|
f"Merged wrapped row {entry.get('row_index')} into previous "
|
|
f"(empty DE): EN={prev['english']!r}"
|
|
)
|
|
continue
|
|
|
|
merged.append(entry)
|
|
|
|
if len(merged) < len(entries):
|
|
logger.info(
|
|
f"_merge_wrapped_rows: merged {len(entries) - len(merged)} "
|
|
f"continuation rows ({len(entries)} -> {len(merged)})"
|
|
)
|
|
return merged
|
|
|
|
|
|
def _merge_continuation_rows(
|
|
entries: List[Dict[str, Any]],
|
|
) -> List[Dict[str, Any]]:
|
|
"""Merge multi-line vocabulary entries where text wraps to the next row.
|
|
|
|
A row is a continuation of the previous entry when:
|
|
- EN has text, but DE is empty
|
|
- EN starts with a lowercase letter (not a new vocab entry)
|
|
- Previous entry's EN does NOT end with a sentence terminator (.!?)
|
|
- The continuation text has fewer than 4 words (not an example sentence)
|
|
- The row was not already merged as phonetic
|
|
|
|
Example:
|
|
Row 5: EN="to put up" DE="aufstellen"
|
|
Row 6: EN="with sth." DE=""
|
|
-> Merged: EN="to put up with sth." DE="aufstellen"
|
|
"""
|
|
if len(entries) < 2:
|
|
return entries
|
|
|
|
merged: List[Dict[str, Any]] = []
|
|
for entry in entries:
|
|
en = (entry.get('english') or '').strip()
|
|
de = (entry.get('german') or '').strip()
|
|
|
|
if merged and en and not de:
|
|
# Check: not phonetic (already handled)
|
|
if _is_phonetic_only_text(en):
|
|
merged.append(entry)
|
|
continue
|
|
|
|
# Check: starts with lowercase
|
|
first_alpha = next((c for c in en if c.isalpha()), '')
|
|
starts_lower = first_alpha and first_alpha.islower()
|
|
|
|
# Check: fewer than 4 words (not an example sentence)
|
|
word_count = len(en.split())
|
|
is_short = word_count < 4
|
|
|
|
# Check: previous entry doesn't end with sentence terminator
|
|
prev = merged[-1]
|
|
prev_en = (prev.get('english') or '').strip()
|
|
prev_ends_sentence = prev_en and prev_en[-1] in '.!?'
|
|
|
|
if starts_lower and is_short and not prev_ends_sentence:
|
|
# Merge into previous entry
|
|
prev['english'] = (prev_en + ' ' + en).strip()
|
|
# Merge example if present
|
|
ex = (entry.get('example') or '').strip()
|
|
if ex:
|
|
prev_ex = (prev.get('example') or '').strip()
|
|
prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex
|
|
logger.debug(
|
|
f"Merged continuation row {entry.get('row_index')} "
|
|
f"into previous entry: {prev['english']!r}"
|
|
)
|
|
continue
|
|
|
|
merged.append(entry)
|
|
|
|
return merged
|