Fix: Remove broken getKlausurApiUrl and clean up empty lines
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s

sed replacement left orphaned hostname references in story page
and empty lines in getApiBase functions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-24 16:02:04 +02:00
parent b07f802c24
commit 9ba420fa91
150 changed files with 30231 additions and 32053 deletions

View File

@@ -0,0 +1,189 @@
"""Cell-level IPA phonetic fixes for overlay mode.
In the normal pipeline, _fix_phonetic_brackets operates on vocab entries
(entry['english']). But the overlay reads cell['text'] directly, so
phonetic fixes must be applied to cells too.
Split from cv_ocr_engines.py — contains fix_cell_phonetics() and helpers.
"""
import logging
import re
from typing import Any, Dict, List
from cv_vocab_types import IPA_AVAILABLE
from cv_ocr_ipa_lookup import (
_insert_missing_ipa,
_replace_phonetics_in_text,
_text_has_garbled_ipa,
)
from cv_ocr_ipa_repair import (
_has_non_dict_trailing,
_insert_headword_ipa,
_strip_post_bracket_garbled,
)
logger = logging.getLogger(__name__)
def fix_cell_phonetics(
    cells: List[Dict[str, Any]],
    pronunciation: str = 'british',
) -> List[Dict[str, Any]]:
    """Apply IPA phonetic fixes to cell texts for overlay mode.

    In the normal pipeline, _fix_phonetic_brackets operates on vocab entries
    (entry['english']). But the overlay reads cell['text'] directly, so
    phonetic fixes must be applied to cells too.

    Processing depends on column type:

    - column_en: Full processing (replace garbled IPA + strip orphan brackets
      + insert missing IPA + strip garbled fragments that trail a bracket).
      Safe because these cells contain only English headwords.
    - column_text: Light processing. Garbled IPA is replaced WITHOUT orphan
      bracket stripping (brackets may be German content like "(probieren)").
      Headword IPA is inserted only when the replacement changed nothing AND
      the cell's word_boxes show a gap after the headword; the word_boxes
      are then synced so the overlay's token-to-box mapping stays aligned.

    Cells with any other col_type, and cells whose text is empty or
    whitespace-only, are skipped.

    Args:
        cells: Cell dicts. The keys 'col_type', 'text' and 'word_boxes' are
            read; 'text' (and possibly 'word_boxes') are updated in place.
        pronunciation: Pronunciation variant passed through to the IPA
            helpers.

    Returns:
        The same ``cells`` list that was passed in.
    """
    if not IPA_AVAILABLE:
        return cells

    ipa_col_types = {'column_en', 'column_text'}
    replaced = 0

    for cell in cells:
        col_type = cell.get('col_type', '')
        if col_type not in ipa_col_types:
            continue
        # `or ''` also covers an explicit None stored under 'text'.
        text = cell.get('text', '') or ''
        if not text.strip():
            continue

        if col_type == 'column_en':
            # Full processing: replace garbled IPA, strip orphan brackets.
            new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=True)
            if new_text == text:
                # Insert IPA when garbled phonetics exist OR when trailing
                # non-dictionary words suggest garbled IPA in plain ASCII.
                if _text_has_garbled_ipa(text) or _has_non_dict_trailing(text, pronunciation):
                    new_text = _insert_missing_ipa(text, pronunciation)
            # Strip trailing garbled fragments after proper [IPA] brackets
            # (e.g. "sea [sˈiː] si:" → "sea [sˈiː]")
            # NOTE(review): applied to every column_en result, whether the
            # bracket came from replacement or insertion — confirm nesting.
            if ']' in new_text:
                new_text = _strip_post_bracket_garbled(new_text, pronunciation)
        else:
            # column_text: replace garbled IPA, no orphan stripping
            new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=False)
            # Insert headword IPA ONLY if there's a gap in word_boxes
            # suggesting Tesseract missed an IPA bracket on the page.
            # Without gap evidence, the original page had no IPA.
            if new_text == text:
                wb = cell.get('word_boxes', [])
                if _has_ipa_gap(text, wb):
                    inserted = _insert_headword_ipa(text, pronunciation)
                    if inserted != text:
                        new_text = inserted
                        # Keep word_boxes aligned with the extra token.
                        _sync_word_boxes_after_ipa_insert(cell, text, new_text)

        if new_text != text:
            # Deferred %-formatting: the message is only built when DEBUG
            # logging is enabled; old and new text are clearly separated.
            logger.debug("fix_cell_phonetics: '%s' -> '%s'", text, new_text)
            cell['text'] = new_text
            replaced += 1

    if replaced:
        logger.info("fix_cell_phonetics: %d IPA fixes in %d cells", replaced, len(cells))
    return cells
def _has_ipa_gap(
    text: str,
    word_boxes: List[Dict],
    *,
    min_gap_px: int = 80,
) -> bool:
    """Check if word_boxes show a gap where IPA brackets should be.

    On a typical vocab page, the layout is:
        headword [ipa] German translation

    If Tesseract missed the IPA bracket, the gap between the headword
    and the next word (German translation) is unusually large
    because the IPA occupied physical space on the page.
    If no IPA was on the page (e.g. "be good at sth."), the words are
    close together.

    Args:
        text: Cell text; an empty or whitespace-only text yields False.
        word_boxes: Box dicts in reading order. 'text', 'left' and 'width'
            are read.
        min_gap_px: The horizontal distance between the headword box's right
            edge and the next box's left edge must be strictly greater than
            this value to count as a gap.

    Returns:
        True if the gap after the headword box exceeds ``min_gap_px``.
        False if there are fewer than two boxes, the headword is the last
        box, or the geometry needed to measure the gap is missing.
    """
    if not word_boxes or len(word_boxes) < 2:
        return False
    if not text or not text.split():
        return False

    # Find the headword index: skip numeric prefixes like "».55", "0.56".
    # The headword is the first box with at least two letters; if no box
    # qualifies, fall back to the first box. `or ''` tolerates text=None.
    hw_box_idx = next(
        (
            i
            for i, wb in enumerate(word_boxes)
            if len(re.sub(r'[^a-zA-ZäöüÄÖÜß]', '', wb.get('text') or '')) >= 2
        ),
        0,
    )
    if hw_box_idx >= len(word_boxes) - 1:
        return False

    # Check gap between headword and the next word_box.
    hw = word_boxes[hw_box_idx]
    next_wb = word_boxes[hw_box_idx + 1]
    try:
        gap = next_wb['left'] - (hw['left'] + hw['width'])
    except (KeyError, TypeError):
        # Without usable geometry there is no evidence of a gap.
        return False
    return gap > min_gap_px
def _sync_word_boxes_after_ipa_insert(
    cell: Dict[str, Any],
    old_text: str,
    new_text: str,
) -> None:
    """Insert a synthetic word_box for an IPA token added by IPA insertion.

    E.g. "challenge ..." -> "challenge [tʃælɪndʒ] ..."

    Adds a new word_box right after the headword's box so the 1:1
    token-to-box mapping in the frontend overlay stays consistent.

    The function does nothing when the cell has no word_boxes, when
    ``new_text`` does not have exactly one more whitespace-separated token
    than ``old_text``, when the new token is the very first token (there is
    no preceding box to anchor on), or when the anchor box lacks the
    geometry needed to place the synthetic box.

    Args:
        cell: Cell dict whose 'word_boxes' list is modified in place.
        old_text: Cell text before IPA insertion.
        new_text: Cell text after IPA insertion.
    """
    word_boxes = cell.get('word_boxes')
    if not word_boxes:
        return

    old_tokens = old_text.split()
    new_tokens = new_text.split()
    if len(new_tokens) != len(old_tokens) + 1:
        return  # unexpected change, skip

    # The inserted token sits at the first position where the two token
    # sequences diverge; if every old token matches, it was appended.
    insert_idx = next(
        (
            i
            for i, (new_tok, old_tok) in enumerate(zip(new_tokens, old_tokens))
            if new_tok != old_tok
        ),
        len(old_tokens),
    )
    ipa_token = new_tokens[insert_idx]

    # The headword is at insert_idx - 1 in old_tokens (and word_boxes).
    ref_idx = insert_idx - 1
    if ref_idx < 0 or ref_idx >= len(word_boxes):
        return

    ref_box = word_boxes[ref_idx]
    try:
        ipa_box = {
            'text': ipa_token,
            # Place the synthetic box 2px right of the headword box and
            # reuse the headword's vertical extent and width.
            'left': ref_box['left'] + ref_box['width'] + 2,
            'top': ref_box['top'],
            'width': ref_box['width'],
            'height': ref_box['height'],
            'conf': ref_box.get('conf', 90),
        }
    except (KeyError, TypeError):
        # Anchor box has no usable geometry; skip rather than abort the
        # caller's whole fix pass.
        return
    word_boxes.insert(insert_idx, ipa_box)