From 49d5212f0c35bfc449530422dc586867be52ee8b Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Fri, 10 Apr 2026 19:49:07 +0200 Subject: [PATCH] Fix hyphen-join: preserve next row + skip valid hyphenations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two bugs fixed: - Apply no longer removes the continuation word from the next row. "künden" stays in row 31 — only the current row is repaired ("ve" → "ver-"). The original line-break layout is preserved. - Analysis now skips words that already end with "-" when the direct join with the next row is a known word (valid hyphenation, not an error). Co-Authored-By: Claude Opus 4.6 (1M context) --- klausur-service/backend/cv_gutter_repair.py | 40 +++++++------------ .../backend/tests/test_gutter_repair.py | 4 +- 2 files changed, 16 insertions(+), 28 deletions(-) diff --git a/klausur-service/backend/cv_gutter_repair.py b/klausur-service/backend/cv_gutter_repair.py index 2eead8e..4f132fe 100644 --- a/klausur-service/backend/cv_gutter_repair.py +++ b/klausur-service/backend/cv_gutter_repair.py @@ -400,8 +400,17 @@ def analyse_grid_for_gutter_repair( next_words = next_text.split() if next_words: first_next = next_words[0] + first_next_clean = _TRAILING_PUNCT_RE.sub('', first_next) first_alpha = next((c for c in first_next if c.isalpha()), "") + # If the word already ends with "-" and the direct join + # (no missing chars) is a known word, this is a VALID + # hyphenation — not a gutter error. Skip it. + if ends_with_hyphen and first_next_clean: + direct = last_word_clean.rstrip("-") + first_next_clean + if _is_known(direct): + continue + # Continuation likely if: # - explicit hyphen, OR # - next row starts lowercase (= not a new entry) @@ -557,13 +566,16 @@ def apply_gutter_suggestions( # The first display part is what goes in the current row first_part = display_parts[0] if display_parts else "" - # Replace the last word in current cell + # Replace the last word in current cell with the restored form. + # The next row is NOT modified — "künden" stays in its row + # because the original book layout has it there. We only fix + # the truncated word in the current row (e.g. "ve" → "ver-"). idx = old_text.rfind(original_word) if idx >= 0: new_text = old_text[:idx] + first_part + old_text[idx + len(original_word):] target_cell["text"] = new_text changes.append({ - "type": "hyphen_join_current", + "type": "hyphen_join", "zone_index": zi, "row_index": ri, "col_index": ci, @@ -573,30 +585,6 @@ def apply_gutter_suggestions( "joined_word": joined, }) - # Next row: remove the first word (it's now joined into current row) - if next_ri >= 0: - next_cell = None - for cell in zone_cells: - if cell.get("row_index") == next_ri and cell.get("col_index") == ci: - next_cell = cell - break - - if next_cell: - next_old = next_cell.get("text", "") - next_words = next_old.split() - if next_words: - next_new = " ".join(next_words[1:]) - next_cell["text"] = next_new - changes.append({ - "type": "hyphen_join_next", - "zone_index": zi, - "row_index": next_ri, - "col_index": ci, - "cell_id": next_cell.get("cell_id", ""), - "old_text": next_old, - "new_text": next_new, - }) - logger.info("Gutter repair applied: %d/%d suggestions", len(changes), len(accepted_suggestions)) return { diff --git a/klausur-service/backend/tests/test_gutter_repair.py b/klausur-service/backend/tests/test_gutter_repair.py index 353f8d1..931c19e 100644 --- a/klausur-service/backend/tests/test_gutter_repair.py +++ b/klausur-service/backend/tests/test_gutter_repair.py @@ -310,8 +310,8 @@ class TestApplySuggestions: assert result["applied_count"] == 1 # Current row: "ve" replaced with "ver-" assert grid["zones"][0]["cells"][0]["text"] == "ver-" - # Next row: "künden" removed, "und" remains - assert grid["zones"][0]["cells"][1]["text"] == "und" + # Next row: UNCHANGED — "künden" stays in its original row + assert grid["zones"][0]["cells"][1]["text"] == "künden und" def test_apply_nothing_when_no_accepted(self): grid = _make_grid([])