Fix word-gap merge: add missing pronouns to stop words, reduce threshold

- Add du/dich/dir/mich/mir/uns/euch/ihm/ihn to _STOP_WORDS to prevent false merges like "du" + "zerlegst" → "duzerlegst"
- Reduce max_short threshold from 6 to 5 to prevent merging multi-word phrases like "ziehen lassen" → "ziehenlassen"

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-27 15:35:12 +01:00
parent a8773d5b00
commit 96ea23164d
1 changed file with 7 additions and 5 deletions
@@ -34,7 +34,8 @@ _STOP_WORDS = frozenset([
    'der', 'die', 'das', 'dem', 'den', 'des',
    'ein', 'eine', 'einem', 'einen', 'einer',
    # Pronouns
-    'er', 'es', 'sie', 'wir', 'ihr', 'ich', 'man', 'sich',
+    'du', 'er', 'es', 'sie', 'wir', 'ihr', 'ich', 'man', 'sich',
+    'dich', 'dir', 'mich', 'mir', 'uns', 'euch', 'ihm', 'ihn',
    # Prepositions
    'mit', 'von', 'zu', 'für', 'auf', 'in', 'an', 'um', 'am', 'im',
    'aus', 'bei', 'nach', 'vor', 'bis', 'durch', 'über', 'unter',
@@ -146,7 +147,7 @@ def merge_word_gaps_in_zones(zones_data: List[Dict], session_id: str) -> int:
    producing text like "zerknit tert" instead of "zerknittert".  This
    function tries to merge adjacent fragments in every content cell.

-    More permissive than ``_try_merge_pipe_gaps`` (threshold 6 instead of 3)
+    More permissive than ``_try_merge_pipe_gaps`` (threshold 5 instead of 3)
    but still guarded by pyphen dictionary lookup and stop-word exclusion.

    Returns the number of cells modified.
@@ -186,8 +187,9 @@ def merge_word_gaps_in_zones(zones_data: List[Dict], session_id: str) -> int:
 def _try_merge_word_gaps(text: str, hyph_de) -> str:
    """Merge OCR word fragments with relaxed threshold (max_short=6).

-    Similar to ``_try_merge_pipe_gaps`` but allows longer fragments to be
-    merged.  Still requires pyphen to recognize the merged word.
+    Similar to ``_try_merge_pipe_gaps`` but allows slightly longer fragments
+    (max_short=5 instead of 3).  Still requires pyphen to recognize the
+    merged word.
    """
    parts = text.split(' ')
    if len(parts) < 2:
@@ -207,7 +209,7 @@ def _try_merge_word_gaps(text: str, hyph_de) -> str:
            and prev_alpha and curr_alpha
            and prev_alpha.lower() not in _STOP_WORDS
            and curr_alpha.lower() not in _STOP_WORDS
-            and min(len(prev_alpha), len(curr_alpha)) <= 6
+            and min(len(prev_alpha), len(curr_alpha)) <= 5
            and len(prev_alpha) + len(curr_alpha) >= 4
        )