Fix: Remove broken getKlausurApiUrl and clean up empty lines
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s

sed replacement left orphaned hostname references in story page
and empty lines in getApiBase functions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-24 16:02:04 +02:00
parent b07f802c24
commit 9ba420fa91
150 changed files with 30231 additions and 32053 deletions

View File

@@ -0,0 +1,476 @@
"""
IPA lookup and phonetic bracket handling for OCR-extracted vocabulary.
Tesseract and other OCR engines frequently garble IPA phonetic transcriptions
in vocabulary tables (e.g. [ˈdɑːns] → {'tfatno] or (cy)). This module
provides functions to:
- Look up correct IPA pronunciations (British/American) for English words.
- Detect and replace garbled phonetic brackets with dictionary IPA.
- Insert missing IPA for headwords where OCR destroyed the brackets entirely.
- Strip orphan brackets and post-bracket garbled fragments.
- Handle IPA continuation cells (phonetics on a separate row from headword).
All IPA data comes from open-source dictionaries:
- Britfone (MIT) for British English
- eng_to_ipa / CMU (MIT) for American English
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
import re
from typing import Any, Dict, List, Optional
from cv_vocab_types import (
IPA_AVAILABLE,
_britfone_dict,
_ipa_convert_american,
)
logger = logging.getLogger(__name__)
# --- D. Phonetic Bracket IPA Replacement ---
# Pattern: word followed by any bracket type containing phonetic content.
# Tesseract often garbles IPA brackets: [ˈdɑːns] → {'tfatno] or (cy) etc.
# Match any opener ([, {, () with any closer (], }, )) — even mixed pairs.
# This intentionally matches mixed brackets (e.g. {content]) because
# Tesseract frequently misrecognizes bracket characters.
_PHONETIC_BRACKET_RE = re.compile(
r'(\b[a-zA-ZäöüÄÖÜß]+)\s*[\[\{\(]([^\]\}\)]*?)[\]\}\)]'
)
# Unicode IPA characters — used to distinguish correct IPA (from dictionary
# lookup) from garbled OCR content when stripping orphan brackets.
_IPA_CHARS = frozenset('ˈˌːɑɒæɛəɜɪɔʊʌðŋθʃʒɹɡɾʔɐ')
# Minimum word confidence for full-page Tesseract results (0-100).
# Words below this threshold are OCR noise (scanner shadows, borders).
_MIN_WORD_CONF = 30
def _lookup_ipa(word: str, pronunciation: str = 'british') -> Optional[str]:
    """Return the IPA transcription for *word*, or None when none is known.

    Two sources are consulted in order of preference and the first usable
    answer wins:

    - ``pronunciation == 'american'`` (and the American converter is
      available): converter first, Britfone dictionary second.
    - anything else: Britfone dictionary first, converter second.

    Args:
        word: Word to look up; it is lower-cased and stripped first.
        pronunciation: 'british' (Britfone, MIT) or 'american' (eng_to_ipa, MIT).

    Returns:
        IPA string or None if not found.
    """
    key = word.lower().strip()
    if not key:
        return None

    def from_britfone() -> Optional[str]:
        # A missing/empty dictionary simply yields no answer.
        if not _britfone_dict:
            return None
        return _britfone_dict.get(key) or None

    def from_american() -> Optional[str]:
        if not _ipa_convert_american:
            return None
        candidate = _ipa_convert_american(key)
        # Results containing '*' are rejected — presumably the converter's
        # marker for a word it could not transcribe (verify against the
        # converter's documentation).
        if candidate and '*' not in candidate:
            return candidate
        return None

    if pronunciation == 'american' and _ipa_convert_american:
        sources = (from_american, from_britfone)
    else:
        sources = (from_britfone, from_american)

    for source in sources:
        found = source()
        if found:
            return found
    return None
def _fix_phonetic_brackets(
    entries: List[Dict[str, Any]],
    pronunciation: str = 'british',
) -> List[Dict[str, Any]]:
    """Rewrite bracketed phonetics in each entry's 'english' field in place.

    Only the 'english' key is touched.  German and example fields carry
    meaningful parentheticals such as "Eis (gefrorenes Wasser)" or
    "(sich beschweren)" and must never be treated as phonetic transcriptions.

    Args:
        entries: Vocabulary rows (dicts); rows are mutated in place.
        pronunciation: Passed through to the text-level replacement.

    Returns:
        The same ``entries`` list that was passed in.
    """
    if not IPA_AVAILABLE:
        return entries

    changed = 0
    for row in entries:
        original = row.get('english', '') or ''
        # Nothing to do unless some opening bracket is present.
        if set('[{(').isdisjoint(original):
            continue
        updated = _replace_phonetics_in_text(original, pronunciation)
        if updated == original:
            continue
        # NOTE(review): the separator between the two quoted values in this
        # message looks like it was lost at some point — confirm.
        logger.debug(f"_fix_phonetic_brackets: '{original}''{updated}'")
        changed += 1
        row['english'] = updated

    if changed:
        logger.info(f"_fix_phonetic_brackets: {changed} IPA replacements in {len(entries)} entries")
    return entries
# Grammar particles that appear in brackets after English words:
# cross (with), complain (about/of), agree (on/with), look (sth) up
# These must NOT be replaced with IPA. Only used for the English field
# (German/example fields are never processed for IPA replacement).
_GRAMMAR_BRACKET_WORDS = frozenset({
# English prepositions/particles commonly in vocab tables
'with', 'about', 'of', 'for', 'to', 'from', 'in', 'on', 'at', 'by',
'up', 'out', 'off', 'into', 'over', 'down', 'away', 'back', 'through',
# English grammar abbreviations used in vocab tables
'sth', 'sb', 'adj', 'adv',
# Number/plural/grammar annotations
'pl', 'sg', 'sing', 'no', 'also', 'auch',
# Regional English markers
'ae', 'be', 'ame', 'bre',
})
def _is_grammar_bracket_content(content: str) -> bool:
"""Return True if bracket content is grammar info in the ENGLISH field.
Grammar info: cross (with), complain (about/of), agree (on/with)
NOT grammar: [breik], [maus], {'tfatno], (cy), ['kju:kambo], [test]
Since we only process the English field, we only need to recognize
English grammar particles. Everything else is (garbled) IPA.
"""
if not content:
return False
# Split on / and spaces for patterns like (about/of), (no pl)
tokens = re.split(r'[/\s]+', content.strip().lower())
tokens = [t for t in tokens if t]
if not tokens:
return False
# ALL tokens must be known grammar words
return all(token in _GRAMMAR_BRACKET_WORDS for token in tokens)
def _replace_phonetics_in_text(
    text: str,
    pronunciation: str = 'british',
    strip_orphans: bool = True,
) -> str:
    """Swap bracketed phonetics that follow a word for dictionary IPA.

    Pass 1: every "word [..]" / "word {..}" / "word (..)" match is rewritten
    to "word [ipa]" when the word has a dictionary entry, unless the bracket
    content has more than three whitespace-separated tokens or is pure
    grammar info such as "(with)".

    Pass 2 (optional): brackets that remain are removed when they look like
    garbled phonetics, i.e. they are not grammar info, contain no Unicode IPA
    symbol, and hold fewer than four letters.

    Args:
        text: Cell text to process.
        pronunciation: Dictionary preference passed to the IPA lookup.
        strip_orphans: If True, run pass 2.  Set to False for column_text
            where brackets may be German content.

    Returns:
        The processed text with surrounding whitespace removed; ``text``
        unchanged when IPA support is unavailable.
    """
    if not IPA_AVAILABLE:
        return text

    def _swap_in_dictionary_ipa(hit):
        whole = hit.group(0)
        headword = hit.group(1)
        inner = hit.group(2).strip()

        # More than three tokens reads like ordinary text, not phonetics.
        if len(inner.split()) > 3:
            return whole

        dictionary_ipa = _lookup_ipa(headword, pronunciation)
        if not dictionary_ipa:
            # Unknown word: leave the bracket untouched.
            return whole
        # Known word, but the bracket is a grammar particle: cross (with).
        if _is_grammar_bracket_content(inner):
            return whole

        logger.debug(f"phonetic: '{whole}''{headword} [{dictionary_ipa}]'")
        return f"{headword} [{dictionary_ipa}]"

    def _drop_garbled_orphan(hit):
        whole = hit.group(0)
        inner = hit.group(1).strip()

        # Grammar info such as (about/of) stays.
        if _is_grammar_bracket_content(inner):
            return whole
        # Correct IPA (contains Unicode IPA symbols) stays.
        if not _IPA_CHARS.isdisjoint(inner):
            return whole
        # Content with at least four letters is taken to be a real word,
        # e.g. (probieren), (Profit), (Geld); short fragments like (kros),
        # (cy) are treated as garbled phonetics.
        letters = re.sub(r'[^a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]', '', inner)
        if len(letters) >= 4:
            return whole

        logger.debug(f"phonetic: stripping orphan bracket '{whole}'")
        return ''

    result = _PHONETIC_BRACKET_RE.sub(_swap_in_dictionary_ipa, text)
    if strip_orphans:
        result = re.sub(r'[\[\{\(]([^\]\}\)]*)[\]\}\)]', _drop_garbled_orphan, result)
    # NOTE(review): the nesting of this strip() could not be recovered from
    # the flattened source; it is applied unconditionally here — confirm it
    # was not meant to run only when strip_orphans is True.
    return result.strip()
def _text_has_garbled_ipa(text: str) -> bool:
"""Check if text contains garbled IPA-like fragments from OCR.
Returns True if there is evidence of OCR-mangled phonetic
transcription, e.g. stress marks, length marks, or IPA special chars.
This is used to decide whether ``_insert_missing_ipa`` should run:
it must only insert IPA to *replace* garbled phonetics that are already
in the text — never to ADD phonetics where none existed on the page.
"""
# Bracketed text that doesn't contain valid IPA symbols is garbled OCR
# of a phonetic transcription, e.g. "[n, nn]" or "[1uedtX,1]".
stripped = text.strip()
if stripped.startswith('[') and stripped.endswith(']'):
inner = stripped[1:-1]
# Real IPA brackets contain IPA symbols (ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ)
if not any(c in inner for c in 'ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ'):
# Not a valid dictionary-style bracket like "(no pl)" — those
# use parentheses, not square brackets. Square brackets with
# no IPA chars are garbled phonetics.
return True
for w in text.strip().split():
# Skip delimiters and very short tokens
if len(w) <= 1 or w in ('', '', '-', '/', '|', ',', ';'):
continue
# Starts with stress mark (OCR read IPA stress ' as apostrophe)
if w.startswith("'") and len(w) > 1 and not w[1:].istitle():
return True
if w.startswith("\u02c8") or w.startswith("\u02cc"): # ˈ ˌ
return True
# Contains IPA length mark ':' in a short non-word fragment
if ':' in w and len(w) < 12:
# But not things like "3:00" (time) or common words
stripped = re.sub(r'[^a-zA-Z:]', '', w)
if ':' in stripped and not stripped.replace(':', '').isalpha():
continue
return True
# Contains IPA special characters
if any(c in w for c in 'əɪɛɒʊʌæɑɔʃʒθðŋ'):
return True
# Embedded apostrophe suggesting merged garbled IPA with stress mark.
# E.g. "Scotland'skotland" — OCR reads ˈ as '.
# Guard: apostrophe must be after ≥3 chars and before ≥3 lowercase
# chars to avoid contractions (don't, won't, o'clock).
if "'" in w and not w.startswith("'"):
apos_idx = w.index("'")
after = w[apos_idx + 1:]
if apos_idx >= 3 and len(after) >= 3 and after[0].islower():
return True
return False
def _decompose_compound(word: str, pronunciation: str = 'british') -> Optional[str]:
    """Build IPA for a compound word by joining the IPA of its two parts.

    E.g. "schoolbag" is split into "school" + "bag" and the two
    transcriptions are concatenated.  Both parts must have an IPA entry and
    each part must be at least 3 characters long.  When several splits work,
    the one with the longest first part is used.

    Args:
        word: Candidate compound; it is lower-cased and stripped first.
        pronunciation: Dictionary preference passed to the IPA lookup.

    Returns:
        Concatenated IPA of both parts, or None when IPA support is
        unavailable, the word is shorter than 6 characters, or no split
        has IPA for both halves.
    """
    if not IPA_AVAILABLE:
        return None
    lower = word.lower().strip()
    if len(lower) < 6:
        return None  # too short for a compound

    # Scan from the longest first part downwards so the first hit is the
    # preferred split; this avoids looking up every remaining split.
    for split_pos in range(len(lower) - 3, 2, -1):
        ipa_first = _lookup_ipa(lower[:split_pos], pronunciation)
        if not ipa_first:
            continue  # no point looking up the second half
        ipa_second = _lookup_ipa(lower[split_pos:], pronunciation)
        if ipa_second:
            return ipa_first + ipa_second
    return None
# Characters that mark the start of OCR'd phonetics inside a merged token.
_MERGED_IPA_MARKER_CHARS = frozenset(':əɪɛɒʊʌæɑɔʃʒθðŋˈˌ')

# Stand-alone tokens after which everything is kept verbatim.
# NOTE(review): the flattened source showed two empty-string entries in this
# list; they are assumed to have been the en dash and em dash — confirm.
_HEADWORD_TAIL_DELIMITERS = frozenset({'\u2013', '\u2014', '-', '/', '|', ',', ';'})


def _split_merged_headword_ipa(token: str, pronunciation: str) -> Optional[str]:
    """Split a token where OCR fused headword and phonetics ("schoolbagsku:lbæg").

    The first IPA marker character is located and the split point is moved
    back over up to three lower-case letters (the onset consonant cluster).

    Returns:
        "headword [ipa]" — dictionary IPA if the headword (or its compound
        decomposition) is known, otherwise the OCR'd phonetics — or None
        when the token does not look like a merged headword.
    """
    first_marker = next(
        (pos for pos, ch in enumerate(token) if ch in _MERGED_IPA_MARKER_CHARS), -1,
    )
    if first_marker < 3:
        return None

    split = first_marker
    while (split > 0
            and split > first_marker - 3
            and token[split - 1].isalpha()
            and token[split - 1].islower()):
        split -= 1
    if split < 2:
        return None

    headword = token[:split]
    ocr_ipa = token[split:]
    hw_ipa = (_lookup_ipa(headword, pronunciation)
              or _decompose_compound(headword, pronunciation))
    # Word not in dictionary — fall back to the OCR'd phonetics.
    return f"{headword} [{hw_ipa or ocr_ipa}]"


def _keep_tail_after_headword(words: List[str], start: int, pronunciation: str) -> List[str]:
    """Return the part of ``words[start:]`` worth keeping after an inserted IPA.

    Tokens are skipped as garbled phonetics until one of these is met, at
    which point that token and everything after it is kept:
    a delimiter, numbering ("1", "2.", "3)"), a token starting upper-case
    (likely German or a proper noun), or a word with an IPA entry.
    A long token (>= 7 ASCII letters) ends the scan: if it starts with a
    dictionary word of >= 4 letters with >= 3 letters left over, only that
    word plus its IPA is kept; the rest is considered garbled.
    """
    for j in range(start, len(words)):
        token = words[j]
        if token in _HEADWORD_TAIL_DELIMITERS:
            return words[j:]
        if re.match(r'^[\d.)\-]+$', token):
            return words[j:]

        letters = re.sub(r'[^a-zA-Z]', '', token)
        # Also test the first alphabetic character of the raw token so that
        # nouns starting with an umlaut ("Ärger", "Übung") count as upper-case.
        first_alpha = next((ch for ch in token if ch.isalpha()), '')
        if (letters and letters[0].isupper()) or first_alpha.isupper():
            return words[j:]
        if not letters:
            continue

        if len(letters) >= 2 and _lookup_ipa(letters, pronunciation):
            return words[j:]

        if len(letters) >= 7:
            # Merged token: dictionary word + garbled IPA stuck together,
            # e.g. "fictionsalans'fIkfn" starts with "fiction".
            for cut in range(min(len(letters) - 3, 15), 3, -1):
                head = letters[:cut]
                head_ipa = _lookup_ipa(head, pronunciation)
                if head_ipa:
                    return [f"{head} [{head_ipa}]"]
            return []  # rest of this token is garbled
        # Otherwise — likely garbled phonetics, skip
    return []


def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
    """Insert [IPA] after the headword of a bracket-free vocabulary cell.

    OCR sometimes turns the printed transcription into plain-text fragments
    (e.g. "scare skea").  The first token that yields IPA gets "[ipa]"
    appended and trailing garbled fragments are dropped.

    IPA for a token is sought in this order: direct lookup, lookup without
    hyphens, compound decomposition, split of a merged headword+phonetics
    token, and finally the longest dictionary prefix (>= 4 letters) of a
    token with at least 6 ASCII letters.

    The text is returned unchanged when IPA support is unavailable, when it
    is empty/blank, already contains an opening bracket, or has more than
    six whitespace-separated tokens.

    Args:
        text: Cell text to process.
        pronunciation: Dictionary preference passed to the IPA lookup.

    Returns:
        The rewritten text, tokens joined by single spaces, or ``text`` as is.
    """
    if not IPA_AVAILABLE:
        return text
    if not text or not text.strip():
        return text
    # Bracketed cells are handled by the bracket replacement instead.
    if any(ch in text for ch in '[{('):
        return text
    # Only short fragments (typical vocab cells) are processed.
    words = text.strip().split()
    if len(words) > 6:
        return text

    for i, token in enumerate(words):
        clean = re.sub(r'[^a-zA-ZäöüÄÖÜß\'-]', '', token)
        if not clean or len(clean) < 2:
            continue
        if clean.lower() in _GRAMMAR_BRACKET_WORDS:
            continue

        ipa = _lookup_ipa(clean, pronunciation)
        # Fallback: try without hyphens (e.g. "second-hand" → "secondhand")
        if not ipa and '-' in clean:
            ipa = _lookup_ipa(clean.replace('-', ''), pronunciation)
        # Fallback: compound word decomposition
        if not ipa:
            ipa = _decompose_compound(clean, pronunciation)

        # Fallback 1: headword fused with its OCR'd phonetics.  Everything
        # after this token is discarded.
        if not ipa:
            merged = _split_merged_headword_ipa(token, pronunciation)
            if merged is not None:
                words[i] = merged
                return ' '.join(words[:i + 1])

        # Fallback 2: longest dictionary prefix of a merged token without
        # IPA markers (e.g. "Scotland'skotland"); only ASCII letters are
        # considered so punctuation cannot match.
        headword = token
        if not ipa:
            alpha = re.sub(r'[^a-zA-Z]', '', clean)
            if len(alpha) > 5:
                for end in range(len(alpha), 3, -1):
                    prefix_ipa = _lookup_ipa(alpha[:end], pronunciation)
                    if prefix_ipa:
                        ipa = prefix_ipa
                        headword = alpha[:end]
                        break

        if ipa:
            words[i] = f"{headword} [{ipa}]"
            words = words[:i + 1] + _keep_tail_after_headword(words, i + 1, pronunciation)
            break

    return ' '.join(words)