From 5c96def4ecd65f6ab7ff4e4ffff1ae41b28911c7 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sat, 11 Apr 2026 00:14:21 +0200 Subject: [PATCH] Skip valid line-break hyphenations in gutter repair MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Words ending with "-" where the stem is a known word (e.g. "wunder-" → "wunder" is known) are valid line-break hyphenations, not gutter errors. Gutter problems cause the hyphen to be LOST ("ve" instead of "ver-"), so a visible hyphen + known stem = intentional word-wrap. Co-Authored-By: Claude Opus 4.6 (1M context) --- klausur-service/backend/cv_gutter_repair.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/klausur-service/backend/cv_gutter_repair.py b/klausur-service/backend/cv_gutter_repair.py index 4f132fe..5c64cd5 100644 --- a/klausur-service/backend/cv_gutter_repair.py +++ b/klausur-service/backend/cv_gutter_repair.py @@ -388,11 +388,22 @@ def analyse_grid_for_gutter_repair( if _is_known(last_word_clean): continue - gutter_candidates += 1 - # Check if the word ends with "-" (explicit hyphen break) ends_with_hyphen = last_word.endswith("-") + # If the word already ends with "-" and the stem (without + # the hyphen) is a known word, this is a VALID line-break + # hyphenation — not a gutter error. Gutter problems cause + # the hyphen to be LOST ("ve" instead of "ver-"), so a + # visible hyphen + known stem = intentional word-wrap. + # Example: "wunder-" → "wunder" is known → skip. + if ends_with_hyphen: + stem = last_word_clean.rstrip("-") + if stem and _is_known(stem): + continue + + gutter_candidates += 1 + # --- Strategy 1: Hyphen join with next row --- next_cell = cell_map.get((ri + 1, ci)) if next_cell: @@ -403,9 +414,8 @@ def analyse_grid_for_gutter_repair( first_next_clean = _TRAILING_PUNCT_RE.sub('', first_next) first_alpha = next((c for c in first_next if c.isalpha()), "") - # If the word already ends with "-" and the direct join - # (no missing chars) is a known word, this is a VALID - # hyphenation — not a gutter error. Skip it. + # Also skip if the joined word is known (covers compound + # words where the stem alone might not be in the dictionary) if ends_with_hyphen and first_next_clean: direct = last_word_clean.rstrip("-") + first_next_clean if _is_known(direct):