diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 50e0a4a..eea7a64 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -2868,15 +2868,21 @@ def _attach_example_sentences(entries: List[Dict[str, Any]]) -> List[Dict[str, A de = (entry.get('german', '') or '').strip() ex = (entry.get('example', '') or '').strip() - # Treat very short DE (≤2 chars) as OCR noise, not real translation - has_de = len(de) > 2 + # Treat single-char DE as OCR noise, not real translation. + # "Ei" (2 chars) is a valid German word, so threshold is 1. + has_de = len(de) > 1 has_en = bool(en) + has_ex = bool(ex) - if has_en and not has_de and vocab_entries: + # A row is an example candidate ONLY if it has EN text but + # NO DE translation AND NO example-column text. Rows with + # text in the example column are real vocab entries (e.g. + # continuation lines like "stand ..." / "German: Ich möchte..."). + is_example_candidate = has_en and not has_de and not has_ex and vocab_entries + + if is_example_candidate: # This is an example sentence — find best matching vocab entry example_text = en - if ex: - example_text = f"{en} — {ex}" match_idx = _find_best_vocab_match(en, vocab_entries) if match_idx < 0: diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py index 24f1d4a..e900e9c 100644 --- a/klausur-service/backend/ocr_pipeline_api.py +++ b/klausur-service/backend/ocr_pipeline_api.py @@ -1179,7 +1179,9 @@ async def detect_words( entries = _cells_to_vocab_entries(cells, columns_meta) entries = _fix_character_confusion(entries) entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation) - entries = _split_comma_entries(entries) + # NOTE: _split_comma_entries disabled — word forms like "mouse, mice" + # / "Maus, Mäuse" belong together in one entry. + # entries = _split_comma_entries(entries) entries = _attach_example_sentences(entries) word_result["vocab_entries"] = entries # Also keep "entries" key for backwards compatibility @@ -1308,7 +1310,9 @@ async def _word_stream_generator( entries = _cells_to_vocab_entries(all_cells, columns_meta) entries = _fix_character_confusion(entries) entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation) - entries = _split_comma_entries(entries) + # NOTE: _split_comma_entries disabled — word forms like "mouse, mice" + # / "Maus, Mäuse" belong together in one entry. + # entries = _split_comma_entries(entries) entries = _attach_example_sentences(entries) word_result["vocab_entries"] = entries word_result["entries"] = entries