From 96ea23164d2ab7d6f395f37c06fd21cb9c8514ec Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Fri, 27 Mar 2026 15:35:12 +0100 Subject: [PATCH] Fix word-gap merge: add missing pronouns to stop words, reduce threshold MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add du/dich/dir/mich/mir/uns/euch/ihm/ihn to _STOP_WORDS to prevent false merges like "du" + "zerlegst" → "duzerlegst" - Reduce max_short threshold from 6 to 5 to prevent merging multi-word phrases like "ziehen lassen" → "ziehenlassen" Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_syllable_detect.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/klausur-service/backend/cv_syllable_detect.py b/klausur-service/backend/cv_syllable_detect.py index 8a47314..c67f3c6 100644 --- a/klausur-service/backend/cv_syllable_detect.py +++ b/klausur-service/backend/cv_syllable_detect.py @@ -34,7 +34,8 @@ _STOP_WORDS = frozenset([ 'der', 'die', 'das', 'dem', 'den', 'des', 'ein', 'eine', 'einem', 'einen', 'einer', # Pronouns - 'er', 'es', 'sie', 'wir', 'ihr', 'ich', 'man', 'sich', + 'du', 'er', 'es', 'sie', 'wir', 'ihr', 'ich', 'man', 'sich', + 'dich', 'dir', 'mich', 'mir', 'uns', 'euch', 'ihm', 'ihn', # Prepositions 'mit', 'von', 'zu', 'für', 'auf', 'in', 'an', 'um', 'am', 'im', 'aus', 'bei', 'nach', 'vor', 'bis', 'durch', 'über', 'unter', @@ -146,7 +147,7 @@ def merge_word_gaps_in_zones(zones_data: List[Dict], session_id: str) -> int: producing text like "zerknit tert" instead of "zerknittert". This function tries to merge adjacent fragments in every content cell. - More permissive than ``_try_merge_pipe_gaps`` (threshold 6 instead of 3) + More permissive than ``_try_merge_pipe_gaps`` (threshold 5 instead of 3) but still guarded by pyphen dictionary lookup and stop-word exclusion. Returns the number of cells modified. 
@@ -186,8 +187,9 @@ def merge_word_gaps_in_zones(zones_data: List[Dict], session_id: str) -> int: def _try_merge_word_gaps(text: str, hyph_de) -> str: - """Merge OCR word fragments with relaxed threshold (max_short=6). + """Merge OCR word fragments with relaxed threshold (max_short=5). - Similar to ``_try_merge_pipe_gaps`` but allows longer fragments to be - merged. Still requires pyphen to recognize the merged word. + Similar to ``_try_merge_pipe_gaps`` but allows slightly longer fragments + (max_short=5 instead of 3). Still requires pyphen to recognize the + merged word. """ parts = text.split(' ') if len(parts) < 2: @@ -207,7 +209,7 @@ def _try_merge_word_gaps(text: str, hyph_de) -> str: and prev_alpha and curr_alpha and prev_alpha.lower() not in _STOP_WORDS and curr_alpha.lower() not in _STOP_WORDS - and min(len(prev_alpha), len(curr_alpha)) <= 6 + and min(len(prev_alpha), len(curr_alpha)) <= 5 and len(prev_alpha) + len(curr_alpha) >= 4 )