Fix: Remove broken getKlausurApiUrl and clean up empty lines
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
sed replacement left orphaned hostname references in story page and empty lines in getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
189
klausur-service/backend/cv_ocr_cell_phonetics.py
Normal file
189
klausur-service/backend/cv_ocr_cell_phonetics.py
Normal file
@@ -0,0 +1,189 @@
|
||||
"""Cell-level IPA phonetic fixes for overlay mode.
|
||||
|
||||
In the normal pipeline, _fix_phonetic_brackets operates on vocab entries
|
||||
(entry['english']). But the overlay reads cell['text'] directly, so
|
||||
phonetic fixes must be applied to cells too.
|
||||
|
||||
Split from cv_ocr_engines.py — contains fix_cell_phonetics() and helpers.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from cv_vocab_types import IPA_AVAILABLE
|
||||
|
||||
from cv_ocr_ipa_lookup import (
|
||||
_insert_missing_ipa,
|
||||
_replace_phonetics_in_text,
|
||||
_text_has_garbled_ipa,
|
||||
)
|
||||
from cv_ocr_ipa_repair import (
|
||||
_has_non_dict_trailing,
|
||||
_insert_headword_ipa,
|
||||
_strip_post_bracket_garbled,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def fix_cell_phonetics(
    cells: List[Dict[str, Any]],
    pronunciation: str = 'british',
) -> List[Dict[str, Any]]:
    """Apply IPA phonetic fixes directly to cell texts (overlay mode).

    The regular pipeline runs _fix_phonetic_brackets on vocab entries
    (entry['english']), but the overlay consumes cell['text'] as-is, so
    the same repairs have to happen at the cell level.

    Treatment differs by column type:
      * column_en   -- full repair: replace garbled IPA, strip orphan
        brackets, insert missing IPA. Safe because these cells contain
        only English headwords.
      * column_text -- conservative repair: replace garbled IPA only.
        Orphan brackets are kept (they may be German content such as
        "(probieren)") and no IPA is inserted blindly, since extra
        tokens would shift the overlay positioning.
    """
    if not IPA_AVAILABLE:
        return cells

    fix_count = 0

    for cell in cells:
        kind = cell.get('col_type', '')
        if kind not in ('column_en', 'column_text'):
            continue

        original = cell.get('text', '') or ''
        if not original.strip():
            continue

        if kind == 'column_en':
            # Aggressive path: garbled-IPA replacement plus orphan-bracket
            # stripping is safe for pure English headword cells.
            repaired = _replace_phonetics_in_text(original, pronunciation, strip_orphans=True)
            if repaired == original and (
                _text_has_garbled_ipa(original)
                or _has_non_dict_trailing(original, pronunciation)
            ):
                # Nothing was replaced, yet the text still looks like it
                # hides garbled phonetics (possibly plain ASCII) — insert.
                repaired = _insert_missing_ipa(original, pronunciation)
            if ']' in repaired:
                # Drop garbled leftovers trailing a proper [IPA] bracket
                # (e.g. "sea [sˈiː] si:" → "sea [sˈiː]")
                repaired = _strip_post_bracket_garbled(repaired, pronunciation)
        else:
            # column_text: only swap garbled IPA in place, keep brackets.
            repaired = _replace_phonetics_in_text(original, pronunciation, strip_orphans=False)
            if repaired == original:
                # Insert a headword IPA only when word_boxes show a large
                # horizontal gap — evidence that Tesseract dropped an IPA
                # bracket that physically existed on the page. Without gap
                # evidence, the original page simply had no IPA.
                if _has_ipa_gap(original, cell.get('word_boxes', [])):
                    candidate = _insert_headword_ipa(original, pronunciation)
                    if candidate != original:
                        repaired = candidate
                        _sync_word_boxes_after_ipa_insert(cell, original, repaired)

        if repaired != original:
            logger.debug(f"fix_cell_phonetics: '{original}' → '{repaired}'")
            cell['text'] = repaired
            fix_count += 1

    if fix_count:
        logger.info(f"fix_cell_phonetics: {fix_count} IPA fixes in {len(cells)} cells")
    return cells
|
||||
|
||||
|
||||
def _has_ipa_gap(text: str, word_boxes: List[Dict]) -> bool:
|
||||
"""Check if word_boxes show a gap where IPA brackets should be.
|
||||
|
||||
On a typical vocab page, the layout is:
|
||||
headword [ipa] German translation
|
||||
|
||||
If Tesseract missed the IPA bracket, the gap between the headword
|
||||
and the next word (German translation) is unusually large (>80px)
|
||||
because the IPA occupied physical space on the page.
|
||||
|
||||
If no IPA was on the page (e.g. "be good at sth."), the words are
|
||||
close together (<30px).
|
||||
"""
|
||||
if not word_boxes or len(word_boxes) < 2:
|
||||
return False
|
||||
|
||||
tokens = text.split()
|
||||
if not tokens:
|
||||
return False
|
||||
|
||||
# Find the headword index: skip numeric prefixes like "».55", "0.56"
|
||||
hw_box_idx = 0
|
||||
for i, wb in enumerate(word_boxes):
|
||||
wt = wb.get('text', '')
|
||||
clean = re.sub(r'[^a-zA-ZäöüÄÖÜß]', '', wt)
|
||||
if len(clean) >= 2:
|
||||
hw_box_idx = i
|
||||
break
|
||||
|
||||
if hw_box_idx >= len(word_boxes) - 1:
|
||||
return False
|
||||
|
||||
# Check gap between headword and the next word_box
|
||||
hw = word_boxes[hw_box_idx]
|
||||
next_wb = word_boxes[hw_box_idx + 1]
|
||||
gap = next_wb['left'] - (hw['left'] + hw['width'])
|
||||
|
||||
return gap > 80
|
||||
|
||||
|
||||
def _sync_word_boxes_after_ipa_insert(
|
||||
cell: Dict[str, Any],
|
||||
old_text: str,
|
||||
new_text: str,
|
||||
) -> None:
|
||||
"""Insert a synthetic word_box for an IPA token added by IPA insertion.
|
||||
|
||||
E.g. "challenge ..." → "challenge [tʃælɪndʒ] ..."
|
||||
Adds a new word_box right after the headword's box so the 1:1
|
||||
token-to-box mapping in the frontend overlay stays consistent.
|
||||
"""
|
||||
word_boxes = cell.get('word_boxes')
|
||||
if not word_boxes:
|
||||
return
|
||||
|
||||
old_tokens = old_text.split()
|
||||
new_tokens = new_text.split()
|
||||
|
||||
if len(new_tokens) != len(old_tokens) + 1:
|
||||
return # unexpected change, skip
|
||||
|
||||
# Find the inserted token by walking both lists in parallel.
|
||||
# One token in new_tokens won't match — that's the inserted IPA.
|
||||
insert_idx = -1
|
||||
j = 0 # index into old_tokens
|
||||
for i in range(len(new_tokens)):
|
||||
if j < len(old_tokens) and new_tokens[i] == old_tokens[j]:
|
||||
j += 1
|
||||
else:
|
||||
insert_idx = i
|
||||
break
|
||||
|
||||
if insert_idx < 0 or insert_idx >= len(new_tokens):
|
||||
return
|
||||
|
||||
ipa_token = new_tokens[insert_idx]
|
||||
|
||||
# The headword is at insert_idx - 1 in old_tokens (and word_boxes)
|
||||
ref_idx = insert_idx - 1
|
||||
if ref_idx < 0 or ref_idx >= len(word_boxes):
|
||||
return
|
||||
|
||||
ref_box = word_boxes[ref_idx]
|
||||
ipa_box = {
|
||||
'text': ipa_token,
|
||||
'left': ref_box['left'] + ref_box['width'] + 2,
|
||||
'top': ref_box['top'],
|
||||
'width': ref_box['width'],
|
||||
'height': ref_box['height'],
|
||||
'conf': ref_box.get('conf', 90),
|
||||
}
|
||||
word_boxes.insert(insert_idx, ipa_box)
|
||||
Reference in New Issue
Block a user