fix: IPA nur einfügen wenn word_boxes Gap >80px zeigen (kein falsches IPA)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 55s
CI / test-go-edu-search (push) Successful in 48s
CI / test-python-klausur (push) Failing after 2m11s
CI / test-python-agent-core (push) Successful in 23s
CI / test-nodejs-website (push) Successful in 26s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 55s
CI / test-go-edu-search (push) Successful in 48s
CI / test-python-klausur (push) Failing after 2m11s
CI / test-python-agent-core (push) Successful in 23s
CI / test-nodejs-website (push) Successful in 26s
_has_ipa_gap() prüft anhand des physischen Abstands zwischen Headword und nächstem Wort, ob Tesseract eine IPA-Klammer übersehen hat. Ohne Gap (z.B. "be good at sth.", "Focus on language") wird kein IPA eingefügt.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1036,13 +1036,16 @@ def fix_cell_phonetics(
|
||||
else:
|
||||
# column_text: replace garbled IPA, no orphan stripping
|
||||
new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=False)
|
||||
# Insert headword IPA for long mixed-language lines AND sync
|
||||
# word_boxes so overlay positioning stays consistent.
|
||||
# Insert headword IPA ONLY if there's a gap in word_boxes
|
||||
# suggesting Tesseract missed an IPA bracket on the page.
|
||||
# Without gap evidence, the original page had no IPA.
|
||||
if new_text == text:
|
||||
inserted = _insert_headword_ipa(text, pronunciation)
|
||||
if inserted != text:
|
||||
new_text = inserted
|
||||
_sync_word_boxes_after_ipa_insert(cell, text, new_text)
|
||||
wb = cell.get('word_boxes', [])
|
||||
if _has_ipa_gap(text, wb):
|
||||
inserted = _insert_headword_ipa(text, pronunciation)
|
||||
if inserted != text:
|
||||
new_text = inserted
|
||||
_sync_word_boxes_after_ipa_insert(cell, text, new_text)
|
||||
|
||||
if new_text != text:
|
||||
logger.debug(f"fix_cell_phonetics: '{text}' → '{new_text}'")
|
||||
@@ -1054,6 +1057,46 @@ def fix_cell_phonetics(
|
||||
return cells
|
||||
|
||||
|
||||
def _has_ipa_gap(text: str, word_boxes: List[Dict]) -> bool:
|
||||
"""Check if word_boxes show a gap where IPA brackets should be.
|
||||
|
||||
On a typical vocab page, the layout is:
|
||||
headword [ipa] German translation
|
||||
|
||||
If Tesseract missed the IPA bracket, the gap between the headword
|
||||
and the next word (German translation) is unusually large (>80px)
|
||||
because the IPA occupied physical space on the page.
|
||||
|
||||
If no IPA was on the page (e.g. "be good at sth."), the words are
|
||||
close together (<30px).
|
||||
"""
|
||||
if not word_boxes or len(word_boxes) < 2:
|
||||
return False
|
||||
|
||||
tokens = text.split()
|
||||
if not tokens:
|
||||
return False
|
||||
|
||||
# Find the headword index: skip numeric prefixes like "».55", "0.56"
|
||||
hw_box_idx = 0
|
||||
for i, wb in enumerate(word_boxes):
|
||||
wt = wb.get('text', '')
|
||||
clean = re.sub(r'[^a-zA-ZäöüÄÖÜß]', '', wt)
|
||||
if len(clean) >= 2:
|
||||
hw_box_idx = i
|
||||
break
|
||||
|
||||
if hw_box_idx >= len(word_boxes) - 1:
|
||||
return False
|
||||
|
||||
# Check gap between headword and the next word_box
|
||||
hw = word_boxes[hw_box_idx]
|
||||
next_wb = word_boxes[hw_box_idx + 1]
|
||||
gap = next_wb['left'] - (hw['left'] + hw['width'])
|
||||
|
||||
return gap > 80
|
||||
|
||||
|
||||
def _sync_word_boxes_after_ipa_insert(
|
||||
cell: Dict[str, Any],
|
||||
old_text: str,
|
||||
|
||||
Reference in New Issue
Block a user