Fix: Remove broken getKlausurApiUrl and clean up empty lines
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
sed replacement left orphaned hostname references in story page and empty lines in getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
476
klausur-service/backend/cv_ocr_ipa_lookup.py
Normal file
476
klausur-service/backend/cv_ocr_ipa_lookup.py
Normal file
@@ -0,0 +1,476 @@
|
||||
"""
|
||||
IPA lookup and phonetic bracket handling for OCR-extracted vocabulary.
|
||||
|
||||
Tesseract and other OCR engines frequently garble IPA phonetic transcriptions
|
||||
in vocabulary tables (e.g. [ˈdɑːns] → {'tfatno] or (cy)). This module
|
||||
provides functions to:
|
||||
|
||||
- Look up correct IPA pronunciations (British/American) for English words.
|
||||
- Detect and replace garbled phonetic brackets with dictionary IPA.
|
||||
- Insert missing IPA for headwords where OCR destroyed the brackets entirely.
|
||||
- Strip orphan brackets and post-bracket garbled fragments.
|
||||
- Handle IPA continuation cells (phonetics on a separate row from headword).
|
||||
|
||||
All IPA data comes from open-source dictionaries:
|
||||
- Britfone (MIT) for British English
|
||||
- eng_to_ipa / CMU (MIT) for American English
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from cv_vocab_types import (
|
||||
IPA_AVAILABLE,
|
||||
_britfone_dict,
|
||||
_ipa_convert_american,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# --- D. Phonetic Bracket IPA Replacement ---

# Pattern: word followed by any bracket type containing phonetic content.
# Tesseract often garbles IPA brackets: [ˈdɑːns] → {'tfatno] or (cy) etc.
# Match any opener ([, {, () with any closer (], }, )) — even mixed pairs.
# This intentionally matches mixed brackets (e.g. {content]) because
# Tesseract frequently misrecognizes bracket characters.
# Group 1: the headword before the bracket (ASCII letters plus German
# umlauts/ß). Group 2: the bracket interior (non-greedy, stops at the
# first closer of any type).
_PHONETIC_BRACKET_RE = re.compile(
    r'(\b[a-zA-ZäöüÄÖÜß]+)\s*[\[\{\(]([^\]\}\)]*?)[\]\}\)]'
)

# Unicode IPA characters — used to distinguish correct IPA (from dictionary
# lookup) from garbled OCR content when stripping orphan brackets.
_IPA_CHARS = frozenset('ˈˌːɑɒæɛəɜɪɔʊʌðŋθʃʒɹɡɾʔɐ')

# Minimum word confidence for full-page Tesseract results (0-100).
# Words below this threshold are OCR noise (scanner shadows, borders).
# NOTE(review): not referenced anywhere in this module's visible code —
# presumably imported by the OCR pipeline; confirm before removing.
_MIN_WORD_CONF = 30
|
||||
|
||||
|
||||
def _lookup_ipa(word: str, pronunciation: str = 'british') -> Optional[str]:
|
||||
"""Look up IPA for a word using the selected pronunciation dictionary.
|
||||
|
||||
Args:
|
||||
word: English word to look up.
|
||||
pronunciation: 'british' (Britfone, MIT) or 'american' (eng_to_ipa, MIT).
|
||||
|
||||
Returns:
|
||||
IPA string or None if not found.
|
||||
"""
|
||||
word_lower = word.lower().strip()
|
||||
if not word_lower:
|
||||
return None
|
||||
|
||||
if pronunciation == 'british' and _britfone_dict:
|
||||
ipa = _britfone_dict.get(word_lower)
|
||||
if ipa:
|
||||
return ipa
|
||||
# Fallback to American if not in Britfone
|
||||
if _ipa_convert_american:
|
||||
result = _ipa_convert_american(word_lower)
|
||||
if result and '*' not in result:
|
||||
return result
|
||||
return None
|
||||
|
||||
if pronunciation == 'american' and _ipa_convert_american:
|
||||
result = _ipa_convert_american(word_lower)
|
||||
if result and '*' not in result:
|
||||
return result
|
||||
# Fallback to Britfone if not in CMU
|
||||
if _britfone_dict:
|
||||
ipa = _britfone_dict.get(word_lower)
|
||||
if ipa:
|
||||
return ipa
|
||||
return None
|
||||
|
||||
# Try any available source
|
||||
if _britfone_dict:
|
||||
ipa = _britfone_dict.get(word_lower)
|
||||
if ipa:
|
||||
return ipa
|
||||
if _ipa_convert_american:
|
||||
result = _ipa_convert_american(word_lower)
|
||||
if result and '*' not in result:
|
||||
return result
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def _fix_phonetic_brackets(
    entries: List[Dict[str, Any]],
    pronunciation: str = 'british',
) -> List[Dict[str, Any]]:
    """Swap OCR'd phonetic transcriptions for dictionary IPA, in place.

    Detects patterns like "dance [du:ns]" and replaces with correct IPA:
    - British: "dance [dˈɑːns]" (Britfone, MIT)
    - American: "dance [dæns]" (eng_to_ipa/CMU, MIT)

    Only the 'english' field of each entry is touched. German and example
    fields carry meaningful parenthetical content — "Eis (gefrorenes
    Wasser)", "(sich beschweren)" — and must never be treated as
    phonetics. Entries are mutated in place; the same list is returned.
    No-op when no IPA dictionary is available.
    """
    if not IPA_AVAILABLE:
        return entries

    n_replaced = 0
    for row in entries:
        original = row.get('english', '') or ''
        # Cheap pre-filter: without an opening bracket there is nothing
        # the replacement pass could match.
        if not any(opener in original for opener in '[{('):
            continue
        fixed = _replace_phonetics_in_text(original, pronunciation)
        if fixed == original:
            continue
        logger.debug(f"_fix_phonetic_brackets: '{original}' → '{fixed}'")
        n_replaced += 1
        row['english'] = fixed

    if n_replaced:
        logger.info(f"_fix_phonetic_brackets: {n_replaced} IPA replacements in {len(entries)} entries")
    return entries
|
||||
|
||||
|
||||
# Grammar particles that appear in brackets after English words:
|
||||
# cross (with), complain (about/of), agree (on/with), look (sth) up
|
||||
# These must NOT be replaced with IPA. Only used for the English field
|
||||
# (German/example fields are never processed for IPA replacement).
|
||||
_GRAMMAR_BRACKET_WORDS = frozenset({
|
||||
# English prepositions/particles commonly in vocab tables
|
||||
'with', 'about', 'of', 'for', 'to', 'from', 'in', 'on', 'at', 'by',
|
||||
'up', 'out', 'off', 'into', 'over', 'down', 'away', 'back', 'through',
|
||||
# English grammar abbreviations used in vocab tables
|
||||
'sth', 'sb', 'adj', 'adv',
|
||||
# Number/plural/grammar annotations
|
||||
'pl', 'sg', 'sing', 'no', 'also', 'auch',
|
||||
# Regional English markers
|
||||
'ae', 'be', 'ame', 'bre',
|
||||
})
|
||||
|
||||
|
||||
def _is_grammar_bracket_content(content: str) -> bool:
|
||||
"""Return True if bracket content is grammar info in the ENGLISH field.
|
||||
|
||||
Grammar info: cross (with), complain (about/of), agree (on/with)
|
||||
NOT grammar: [breik], [maus], {'tfatno], (cy), ['kju:kambo], [test]
|
||||
|
||||
Since we only process the English field, we only need to recognize
|
||||
English grammar particles. Everything else is (garbled) IPA.
|
||||
"""
|
||||
if not content:
|
||||
return False
|
||||
|
||||
# Split on / and spaces for patterns like (about/of), (no pl)
|
||||
tokens = re.split(r'[/\s]+', content.strip().lower())
|
||||
tokens = [t for t in tokens if t]
|
||||
if not tokens:
|
||||
return False
|
||||
|
||||
# ALL tokens must be known grammar words
|
||||
return all(token in _GRAMMAR_BRACKET_WORDS for token in tokens)
|
||||
|
||||
|
||||
def _replace_phonetics_in_text(
    text: str,
    pronunciation: str = 'british',
    strip_orphans: bool = True,
) -> str:
    """Replace [phonetic] / {phonetic} / (phonetic) after words with dictionary IPA.

    Tesseract garbles IPA brackets, e.g. China [ˈtʃaɪnə] → China {'tfatno].
    We match any bracket type and replace with dictionary IPA if found.
    Legitimate parenthetical content like (zer)brechen or (veranstaltung) is preserved.

    Two passes over *text*:
    1. ``_PHONETIC_BRACKET_RE``: every "word [content]" pair. If the word
       has a dictionary IPA entry, the bracket content is replaced by that
       IPA — unless the content is grammar info like "cross (with)".
    2. Optionally, remaining brackets ("orphans") are stripped when they
       look like garbled IPA rather than real parenthetical text.

    Args:
        text: Cell text from an OCR'd vocabulary table.
        pronunciation: 'british' (Britfone) or 'american' (eng_to_ipa/CMU),
            forwarded to ``_lookup_ipa``.
        strip_orphans: If True, strip orphan brackets that look like garbled IPA.
            Set to False for column_text where brackets may be German content.

    Returns:
        The processed text, stripped of surrounding whitespace; the input
        unchanged when no IPA dictionary is available.
    """
    if not IPA_AVAILABLE:
        return text

    def replacer(match):
        # Groups from _PHONETIC_BRACKET_RE: (1) headword, (2) bracket interior.
        word = match.group(1)
        bracket_content = match.group(2).strip()
        full_match = match.group(0)

        # Skip if bracket content looks like regular text (multiple words)
        if len(bracket_content.split()) > 3:
            return full_match

        # Look up IPA for the word before brackets
        ipa = _lookup_ipa(word, pronunciation)

        if ipa:
            # Word has IPA → bracket content is phonetic (garbled or correct).
            # Exception: grammar particles like cross (with) — keep those.
            if _is_grammar_bracket_content(bracket_content):
                return full_match
            logger.debug(f"phonetic: '{full_match}' → '{word} [{ipa}]'")
            return f"{word} [{ipa}]"

        # No IPA for this word — keep as-is
        return full_match

    text = _PHONETIC_BRACKET_RE.sub(replacer, text)

    if strip_orphans:
        # Second pass: strip remaining orphan brackets that are garbled IPA.
        # These have no word before them (the main regex requires \b word \s*
        # bracket); brackets kept by pass 1 because the preceding word had
        # no dictionary IPA are re-examined here as well.
        # Examples: "[mais]", "{'mani setva]", trailing "(kros]"
        # Keep: grammar parens "(sich beschweren)", correct IPA "[dˈɑːns]"
        def _strip_orphan_bracket(m):
            content = m.group(1).strip()
            # Keep grammar info: (sich beschweren), (about/of)
            if _is_grammar_bracket_content(content):
                return m.group(0)
            # Keep correct IPA (contains Unicode IPA characters)
            if any(ch in _IPA_CHARS for ch in content):
                return m.group(0)
            # Keep real-word parentheticals like (probieren), (Profit), (Geld).
            # Garbled IPA fragments are short nonsense like (kros), (cy), (mais)
            # — they never contain a real word ≥4 letters with proper casing.
            # Letter classes cover German umlauts/ß and common French accents.
            content_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]', '', content)
            if len(content_alpha) >= 4:
                return m.group(0)
            logger.debug(f"phonetic: stripping orphan bracket '{m.group(0)}'")
            return ''

        text = re.sub(r'[\[\{\(]([^\]\}\)]*)[\]\}\)]', _strip_orphan_bracket, text)

    text = text.strip()

    return text
|
||||
|
||||
|
||||
def _text_has_garbled_ipa(text: str) -> bool:
|
||||
"""Check if text contains garbled IPA-like fragments from OCR.
|
||||
|
||||
Returns True if there is evidence of OCR-mangled phonetic
|
||||
transcription, e.g. stress marks, length marks, or IPA special chars.
|
||||
This is used to decide whether ``_insert_missing_ipa`` should run:
|
||||
it must only insert IPA to *replace* garbled phonetics that are already
|
||||
in the text — never to ADD phonetics where none existed on the page.
|
||||
"""
|
||||
# Bracketed text that doesn't contain valid IPA symbols is garbled OCR
|
||||
# of a phonetic transcription, e.g. "[n, nn]" or "[1uedtX,1]".
|
||||
stripped = text.strip()
|
||||
if stripped.startswith('[') and stripped.endswith(']'):
|
||||
inner = stripped[1:-1]
|
||||
# Real IPA brackets contain IPA symbols (ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ)
|
||||
if not any(c in inner for c in 'ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ'):
|
||||
# Not a valid dictionary-style bracket like "(no pl)" — those
|
||||
# use parentheses, not square brackets. Square brackets with
|
||||
# no IPA chars are garbled phonetics.
|
||||
return True
|
||||
|
||||
for w in text.strip().split():
|
||||
# Skip delimiters and very short tokens
|
||||
if len(w) <= 1 or w in ('–', '—', '-', '/', '|', ',', ';'):
|
||||
continue
|
||||
# Starts with stress mark (OCR read IPA stress ' as apostrophe)
|
||||
if w.startswith("'") and len(w) > 1 and not w[1:].istitle():
|
||||
return True
|
||||
if w.startswith("\u02c8") or w.startswith("\u02cc"): # ˈ ˌ
|
||||
return True
|
||||
# Contains IPA length mark ':' in a short non-word fragment
|
||||
if ':' in w and len(w) < 12:
|
||||
# But not things like "3:00" (time) or common words
|
||||
stripped = re.sub(r'[^a-zA-Z:]', '', w)
|
||||
if ':' in stripped and not stripped.replace(':', '').isalpha():
|
||||
continue
|
||||
return True
|
||||
# Contains IPA special characters
|
||||
if any(c in w for c in 'əɪɛɒʊʌæɑɔʃʒθðŋ'):
|
||||
return True
|
||||
# Embedded apostrophe suggesting merged garbled IPA with stress mark.
|
||||
# E.g. "Scotland'skotland" — OCR reads ˈ as '.
|
||||
# Guard: apostrophe must be after ≥3 chars and before ≥3 lowercase
|
||||
# chars to avoid contractions (don't, won't, o'clock).
|
||||
if "'" in w and not w.startswith("'"):
|
||||
apos_idx = w.index("'")
|
||||
after = w[apos_idx + 1:]
|
||||
if apos_idx >= 3 and len(after) >= 3 and after[0].islower():
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _decompose_compound(word: str, pronunciation: str = 'british') -> Optional[str]:
    """Concatenate dictionary IPA for the two halves of a compound word.

    E.g. "schoolbag" → "school" + "bag" → both IPA strings joined.
    Candidate splits keep at least 3 characters on each side, and the
    split with the longest first part wins. Returns None unless BOTH
    halves are found in the dictionary, or when IPA support is missing.
    """
    if not IPA_AVAILABLE:
        return None

    normalized = word.lower().strip()
    if len(normalized) < 6:
        # Too short to be a compound of two ≥3-char parts.
        return None

    # Walk split points right-to-left, so the first full hit already has
    # the longest possible first part — the same winner an exhaustive
    # scan with max-tracking would pick.
    for cut in range(len(normalized) - 3, 2, -1):
        head_ipa = _lookup_ipa(normalized[:cut], pronunciation)
        if not head_ipa:
            continue
        tail_ipa = _lookup_ipa(normalized[cut:], pronunciation)
        if tail_ipa:
            return head_ipa + tail_ipa

    return None
|
||||
|
||||
|
||||
def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
    """Insert IPA pronunciation for English words that have no brackets at all.

    OCR sometimes garbles the phonetic transcription into plain-text fragments
    (e.g. "scare skea" where "skea" is garbled /skɛə/). This scans the text
    for the headword, inserts correct [IPA], and strips the garbled fragments.

    Only inserts for words that:
    - are standalone (not already followed by a bracket),
    - have an IPA entry in the dictionary,
    - appear in short cell-like fragments (≤6 whitespace tokens).

    Args:
        text: Cell text from an OCR'd vocabulary table.
        pronunciation: 'british' (Britfone) or 'american' (eng_to_ipa/CMU),
            forwarded to ``_lookup_ipa``.

    Returns:
        The text with "[IPA]" inserted after the first recognized headword
        and trailing garbled phonetics removed; the input unchanged when
        IPA support is unavailable, the text already contains brackets,
        is too long, or no headword could be resolved.
    """
    if not IPA_AVAILABLE:
        return text
    if not text or not text.strip():
        return text

    # Skip if already has brackets (IPA replacement handles those)
    if any(ch in text for ch in '[{('):
        return text

    # Only process short text fragments (typical vocab cells).
    # Long sentences / paragraphs should not get IPA insertions.
    words = text.strip().split()
    if len(words) > 6:
        return text

    # IPA marker characters used by Fallback 1 to locate the point where
    # OCR merged a headword with its transcription. Hoisted out of the
    # loop (it was rebuilt every iteration although loop-invariant).
    ipa_split_chars = frozenset(':əɪɛɒʊʌæɑɔʃʒθðŋˈˌ')

    # Try to insert IPA for the first suitable word.
    # Typical patterns: "challenge", "profit", "film", "badge"
    for i, w in enumerate(words):
        # Clean punctuation for lookup
        clean = re.sub(r'[^a-zA-ZäöüÄÖÜß\'-]', '', w)
        if not clean or len(clean) < 2:
            continue
        # Skip German/grammar words
        if clean.lower() in _GRAMMAR_BRACKET_WORDS:
            continue
        ipa = _lookup_ipa(clean, pronunciation)
        # Fallback 0a: try without hyphens (e.g. "second-hand" → "secondhand")
        if not ipa and '-' in clean:
            ipa = _lookup_ipa(clean.replace('-', ''), pronunciation)
        # Fallback 0b: compound word decomposition
        # E.g. "schoolbag" → "school"+"bag" → concatenated IPA
        if not ipa:
            ipa = _decompose_compound(clean, pronunciation)
        # Fallback 1: IPA-marker split for merged tokens where OCR
        # joined headword with its IPA (e.g. "schoolbagsku:lbæg").
        # Find the first IPA marker character (:, æ, ɪ, etc.), walk
        # backwards ≤3 chars for the onset consonant cluster, and
        # split into headword + OCR IPA.
        if not ipa:
            first_marker = next(
                (p for p, ch in enumerate(w) if ch in ipa_split_chars), -1,
            )
            if first_marker >= 3:
                split = first_marker
                while (split > 0
                       and split > first_marker - 3
                       and w[split - 1].isalpha()
                       and w[split - 1].islower()):
                    split -= 1
                if split >= 2:
                    headword = w[:split]
                    ocr_ipa = w[split:]
                    hw_ipa = _lookup_ipa(headword, pronunciation)
                    if not hw_ipa:
                        # Try compound decomposition for the headword part
                        hw_ipa = _decompose_compound(headword, pronunciation)
                    if hw_ipa:
                        words[i] = f"{headword} [{hw_ipa}]"
                    else:
                        # Word not in dictionary — keep the OCR'd IPA
                        words[i] = f"{headword} [{ocr_ipa}]"
                    # Everything after the merged token is garbled residue.
                    words = words[:i + 1]
                    ipa = True  # signal that we handled it
                    break
        # Fallback 2: prefix matching for merged tokens WITHOUT IPA
        # markers (e.g. "Scotland'skotland"). Find longest dictionary
        # prefix using only alpha chars to avoid punctuation matches.
        if not ipa:
            alpha = re.sub(r'[^a-zA-Z]', '', clean)
            if len(alpha) > 5:  # need at least 6 chars for meaningful split
                for end in range(len(alpha), 3, -1):  # min prefix 4 chars
                    prefix = alpha[:end]
                    test_ipa = _lookup_ipa(prefix, pronunciation)
                    if test_ipa:
                        ipa = test_ipa
                        w = prefix
                        words[i] = prefix
                        break
        if ipa:
            words[i] = f"{w} [{ipa}]"
            # Strip garbled OCR phonetics after the IPA bracket.
            # On scanned vocab pages, printed IPA is read as garbled
            # text (e.g. "scare skea" where "skea" is garbled /skɛə/).
            # After inserting correct IPA, remove remaining words that
            # aren't real English words, delimiters, or German text.
            kept = words[:i + 1]
            for j in range(i + 1, len(words)):
                wj = words[j]
                # Delimiter — keep this and everything after
                if wj in ('–', '—', '-', '/', '|', ',', ';'):
                    kept.extend(words[j:])
                    break
                # Pure digits or numbering (e.g. "1", "2.", "3)") — keep
                if re.match(r'^[\d.)\-]+$', wj):
                    kept.extend(words[j:])
                    break
                # Starts with uppercase — likely German or proper noun
                clean_j = re.sub(r'[^a-zA-Z]', '', wj)
                if clean_j and clean_j[0].isupper():
                    kept.extend(words[j:])
                    break
                # Known English word (≥2 chars) — keep it and the rest
                if clean_j and len(clean_j) >= 2:
                    if _lookup_ipa(clean_j, pronunciation):
                        kept.extend(words[j:])
                        break
                # Merged token: dictionary word + garbled IPA stuck together.
                # E.g. "fictionsalans'fIkfn" starts with "fiction".
                # Extract the dictionary prefix (≥4 chars) and add it with
                # IPA, but only if enough chars remain after the prefix (≥3)
                # to look like garbled IPA, not just a plural 's'.
                if clean_j and len(clean_j) >= 7:
                    for pend in range(min(len(clean_j) - 3, 15), 3, -1):
                        prefix_j = clean_j[:pend]
                        prefix_ipa = _lookup_ipa(prefix_j, pronunciation)
                        if prefix_ipa:
                            kept.append(f"{prefix_j} [{prefix_ipa}]")
                            break
                    break  # rest of this token is garbled
                # Otherwise — likely garbled phonetics, skip this token
            words = kept
            break

    return ' '.join(words)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user