From 2e21a4b6d07ce8edcb37b3781f325e4066d22b25 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Wed, 11 Mar 2026 23:40:18 +0100 Subject: [PATCH] =?UTF-8?q?fix:=20IPA=20nur=20einf=C3=BCgen=20wenn=20word?= =?UTF-8?q?=5Fboxes=20Gap=20>80px=20zeigen=20(kein=20falsches=20IPA)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _has_ipa_gap() prüft ob Tesseract eine IPA-Klammer übersehen hat anhand des physischen Abstands zwischen Headword und nächstem Wort. Ohne Gap (z.B. "be good at sth.", "Focus on language") wird kein IPA eingefügt. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_ocr_engines.py | 55 ++++++++++++++++++++--- 1 file changed, 49 insertions(+), 6 deletions(-) diff --git a/klausur-service/backend/cv_ocr_engines.py b/klausur-service/backend/cv_ocr_engines.py index ee7fad5..9081cd9 100644 --- a/klausur-service/backend/cv_ocr_engines.py +++ b/klausur-service/backend/cv_ocr_engines.py @@ -1036,13 +1036,16 @@ def fix_cell_phonetics( else: # column_text: replace garbled IPA, no orphan stripping new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=False) - # Insert headword IPA for long mixed-language lines AND sync - # word_boxes so overlay positioning stays consistent. + # Insert headword IPA ONLY if there's a gap in word_boxes + # suggesting Tesseract missed an IPA bracket on the page. + # Without gap evidence, the original page had no IPA. if new_text == text: - inserted = _insert_headword_ipa(text, pronunciation) - if inserted != text: - new_text = inserted - _sync_word_boxes_after_ipa_insert(cell, text, new_text) + wb = cell.get('word_boxes', []) + if _has_ipa_gap(text, wb): + inserted = _insert_headword_ipa(text, pronunciation) + if inserted != text: + new_text = inserted + _sync_word_boxes_after_ipa_insert(cell, text, new_text) if new_text != text: logger.debug(f"fix_cell_phonetics: '{text}' → '{new_text}'") @@ -1054,6 +1057,46 @@ def fix_cell_phonetics( return cells +def _has_ipa_gap(text: str, word_boxes: List[Dict]) -> bool: + """Check if word_boxes show a gap where IPA brackets should be. + + On a typical vocab page, the layout is: + headword [ipa] German translation + + If Tesseract missed the IPA bracket, the gap between the headword + and the next word (German translation) is unusually large (>80px) + because the IPA occupied physical space on the page. + + If no IPA was on the page (e.g. "be good at sth."), the words are + close together (<30px). + """ + if not word_boxes or len(word_boxes) < 2: + return False + + tokens = text.split() + if not tokens: + return False + + # Find the headword index: skip numeric prefixes like "».55", "0.56" + hw_box_idx = 0 + for i, wb in enumerate(word_boxes): + wt = wb.get('text', '') + clean = re.sub(r'[^a-zA-ZäöüÄÖÜß]', '', wt) + if len(clean) >= 2: + hw_box_idx = i + break + + if hw_box_idx >= len(word_boxes) - 1: + return False + + # Check gap between headword and the next word_box + hw = word_boxes[hw_box_idx] + next_wb = word_boxes[hw_box_idx + 1] + gap = next_wb['left'] - (hw['left'] + hw['width']) + + return gap > 80 + + def _sync_word_boxes_after_ipa_insert( cell: Dict[str, Any], old_text: str,