overflow-hidden → overflow-y-auto so all nav items are reachable. Added /parent (Eltern-Portal) link with people icon. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
477 lines
19 KiB
Python
477 lines
19 KiB
Python
"""
|
||
IPA lookup and phonetic bracket handling for OCR-extracted vocabulary.

Tesseract and other OCR engines frequently garble IPA phonetic transcriptions
in vocabulary tables (e.g. [ˈdɑːns] → {'tfatno] or (cy)). This module
provides functions to:

- Look up correct IPA pronunciations (British/American) for English words.
- Detect and replace garbled phonetic brackets with dictionary IPA.
- Insert missing IPA for headwords where OCR destroyed the brackets entirely.
- Strip orphan brackets and post-bracket garbled fragments.
- Handle IPA continuation cells (phonetics on a separate row from headword).

All IPA data comes from open-source dictionaries:
- Britfone (MIT) for British English
- eng_to_ipa / CMU (MIT) for American English

License: Apache 2.0 (commercial use permitted)
PRIVACY: All processing is performed locally.
|
||
"""
|
||
|
||
import logging
|
||
import re
|
||
from typing import Any, Dict, List, Optional
|
||
|
||
from cv_vocab_types import (
|
||
IPA_AVAILABLE,
|
||
_britfone_dict,
|
||
_ipa_convert_american,
|
||
)
|
||
|
||
# Module-level logger; the functions in this module report replacements
# and stripped fragments through it.
logger = logging.getLogger(__name__)


# --- D. Phonetic Bracket IPA Replacement ---

# Pattern: word followed by any bracket type containing phonetic content.
# Tesseract often garbles IPA brackets: [ˈdɑːns] → {'tfatno] or (cy) etc.
# Match any opener ([, {, () with any closer (], }, )) — even mixed pairs.
# This intentionally matches mixed brackets (e.g. {content]) because
# Tesseract frequently misrecognizes bracket characters.
#
# Group 1 is the word directly before the bracket (ASCII letters plus the
# German umlauts and ß); group 2 is the bracket content, matched lazily and
# never containing a closing bracket character.
_PHONETIC_BRACKET_RE = re.compile(
    r'(\b[a-zA-ZäöüÄÖÜß]+)\s*[\[\{\(]([^\]\}\)]*?)[\]\}\)]'
)

# Unicode IPA characters — used to distinguish correct IPA (from dictionary
# lookup) from garbled OCR content when stripping orphan brackets.
# A frozenset so that per-character membership tests are cheap.
_IPA_CHARS = frozenset('ˈˌːɑɒæɛəɜɪɔʊʌðŋθʃʒɹɡɾʔɐ')

# Minimum word confidence for full-page Tesseract results (0-100).
# Words below this threshold are OCR noise (scanner shadows, borders).
# NOTE(review): not referenced by any function visible in this module —
# presumably consumed by importers; confirm before removing.
_MIN_WORD_CONF = 30
|
||
|
||
|
||
def _lookup_ipa(word: str, pronunciation: str = 'british') -> Optional[str]:
|
||
"""Look up IPA for a word using the selected pronunciation dictionary.
|
||
|
||
Args:
|
||
word: English word to look up.
|
||
pronunciation: 'british' (Britfone, MIT) or 'american' (eng_to_ipa, MIT).
|
||
|
||
Returns:
|
||
IPA string or None if not found.
|
||
"""
|
||
word_lower = word.lower().strip()
|
||
if not word_lower:
|
||
return None
|
||
|
||
if pronunciation == 'british' and _britfone_dict:
|
||
ipa = _britfone_dict.get(word_lower)
|
||
if ipa:
|
||
return ipa
|
||
# Fallback to American if not in Britfone
|
||
if _ipa_convert_american:
|
||
result = _ipa_convert_american(word_lower)
|
||
if result and '*' not in result:
|
||
return result
|
||
return None
|
||
|
||
if pronunciation == 'american' and _ipa_convert_american:
|
||
result = _ipa_convert_american(word_lower)
|
||
if result and '*' not in result:
|
||
return result
|
||
# Fallback to Britfone if not in CMU
|
||
if _britfone_dict:
|
||
ipa = _britfone_dict.get(word_lower)
|
||
if ipa:
|
||
return ipa
|
||
return None
|
||
|
||
# Try any available source
|
||
if _britfone_dict:
|
||
ipa = _britfone_dict.get(word_lower)
|
||
if ipa:
|
||
return ipa
|
||
if _ipa_convert_american:
|
||
result = _ipa_convert_american(word_lower)
|
||
if result and '*' not in result:
|
||
return result
|
||
|
||
return None
|
||
|
||
|
||
def _fix_phonetic_brackets(
    entries: List[Dict[str, Any]],
    pronunciation: str = 'british',
) -> List[Dict[str, Any]]:
    """Swap OCR'd phonetic transcriptions in the 'english' field for dictionary IPA.

    Example: "dance [du:ns]" becomes
    - British: "dance [dˈɑːns]" (Britfone, MIT)
    - American: "dance [dæns]" (eng_to_ipa/CMU, MIT)

    A bracket is only replaced when the word in front of it is found in the
    dictionary. Entries are updated in place and the same list is returned.

    Args:
        entries: Vocabulary rows; only the 'english' key is read and written.
        pronunciation: Passed through to the IPA lookup.

    Returns:
        The *entries* list that was passed in.
    """
    if not IPA_AVAILABLE:
        return entries

    # Only the ENGLISH field carries phonetics. The German and example
    # fields hold meaningful parentheticals that must never be touched:
    #   german:  "Eis (gefrorenes Wasser)", "(Salat-)Gurke", "sauer (auf)"
    #   example: "(sich beschweren)", "(brauchen)", "(jammern)"
    changed = 0
    for row in entries:
        before = row.get('english', '') or ''
        # Cheap pre-filter: without an opening bracket there is nothing to fix.
        if not ('[' in before or '{' in before or '(' in before):
            continue
        after = _replace_phonetics_in_text(before, pronunciation)
        if after == before:
            continue
        logger.debug(f"_fix_phonetic_brackets: '{before}' → '{after}'")
        changed += 1
        row['english'] = after

    if changed:
        logger.info(f"_fix_phonetic_brackets: {changed} IPA replacements in {len(entries)} entries")
    return entries
|
||
|
||
|
||
# Grammar particles that appear in brackets after English words:
#   cross (with), complain (about/of), agree (on/with), look (sth) up
# These must NOT be replaced with IPA. Only used for the English field
# (German/example fields are never processed for IPA replacement).
# All members are lowercase; the functions in this module lowercase their
# tokens before testing membership.
_GRAMMAR_BRACKET_WORDS = frozenset({
    # English prepositions/particles commonly in vocab tables
    'with', 'about', 'of', 'for', 'to', 'from', 'in', 'on', 'at', 'by',
    'up', 'out', 'off', 'into', 'over', 'down', 'away', 'back', 'through',
    # English grammar abbreviations used in vocab tables
    'sth', 'sb', 'adj', 'adv',
    # Number/plural/grammar annotations ('auch' is German for "also")
    'pl', 'sg', 'sing', 'no', 'also', 'auch',
    # Regional English markers
    'ae', 'be', 'ame', 'bre',
})
|
||
|
||
|
||
def _is_grammar_bracket_content(content: str) -> bool:
|
||
"""Return True if bracket content is grammar info in the ENGLISH field.
|
||
|
||
Grammar info: cross (with), complain (about/of), agree (on/with)
|
||
NOT grammar: [breik], [maus], {'tfatno], (cy), ['kju:kambo], [test]
|
||
|
||
Since we only process the English field, we only need to recognize
|
||
English grammar particles. Everything else is (garbled) IPA.
|
||
"""
|
||
if not content:
|
||
return False
|
||
|
||
# Split on / and spaces for patterns like (about/of), (no pl)
|
||
tokens = re.split(r'[/\s]+', content.strip().lower())
|
||
tokens = [t for t in tokens if t]
|
||
if not tokens:
|
||
return False
|
||
|
||
# ALL tokens must be known grammar words
|
||
return all(token in _GRAMMAR_BRACKET_WORDS for token in tokens)
|
||
|
||
|
||
def _replace_phonetics_in_text(
    text: str,
    pronunciation: str = 'british',
    strip_orphans: bool = True,
) -> str:
    """Replace [phonetic] / {phonetic} / (phonetic) after words with dictionary IPA.

    Tesseract garbles IPA brackets, e.g. China [ˈtʃaɪnə] → China {'tfatno].
    Any bracket type (even mixed pairs) directly after a word is matched and
    rewritten as ``word [ipa]`` when the word has a dictionary entry.
    Brackets holding more than three words, or only grammar particles such
    as ``(with)`` / ``(about/of)``, are left untouched.

    Args:
        text: Cell text to process.
        pronunciation: Passed through to the IPA lookup.
        strip_orphans: If True, a second pass removes remaining brackets
            that look like garbled IPA, and the result is stripped of
            leading/trailing whitespace. Set to False for column_text where
            brackets may be German content.

    Returns:
        The rewritten text; *text* unchanged when IPA support is unavailable.
    """
    if not IPA_AVAILABLE:
        return text

    # Transcriptions written by the first pass. The orphan pass below sees
    # these brackets too and must never remove them: a dictionary
    # transcription is not guaranteed to contain a character from _IPA_CHARS
    # (short ones can consist of plain ASCII letters only), in which case the
    # "looks garbled" heuristics would delete the IPA we just inserted.
    inserted_ipa = set()

    def replacer(match):
        word = match.group(1)
        bracket_content = match.group(2).strip()
        full_match = match.group(0)

        # Skip if bracket content looks like regular text (multiple words)
        if len(bracket_content.split()) > 3:
            return full_match

        # Look up IPA for the word before brackets
        ipa = _lookup_ipa(word, pronunciation)
        if not ipa:
            # No IPA for this word — keep as-is
            return full_match

        # Word has IPA → bracket content is phonetic (garbled or correct).
        # Exception: grammar particles like cross (with) — keep those.
        if _is_grammar_bracket_content(bracket_content):
            return full_match

        logger.debug(f"phonetic: '{full_match}' → '{word} [{ipa}]'")
        inserted_ipa.add(ipa.strip())
        return f"{word} [{ipa}]"

    text = _PHONETIC_BRACKET_RE.sub(replacer, text)

    if strip_orphans:
        # Second pass: strip remaining orphan brackets that are garbled IPA.
        # Examples: "[mais]", "{'mani setva]", trailing "(kros]"
        # Keep: grammar parens "(about/of)", correct IPA "[dˈɑːns]"
        def _strip_orphan_bracket(m):
            content = m.group(1).strip()
            # Keep dictionary IPA inserted by the first pass, whatever
            # characters it is made of.
            if content in inserted_ipa:
                return m.group(0)
            # Keep grammar info: (about/of), (no pl)
            if _is_grammar_bracket_content(content):
                return m.group(0)
            # Keep correct IPA (contains Unicode IPA characters)
            if any(ch in _IPA_CHARS for ch in content):
                return m.group(0)
            # Keep real-word parentheticals like (probieren), (Profit), (Geld).
            # Garbled IPA fragments are short nonsense like (kros), (cy), (mais);
            # anything with at least four letters is assumed to be a real word.
            content_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]', '', content)
            if len(content_alpha) >= 4:
                return m.group(0)
            logger.debug(f"phonetic: stripping orphan bracket '{m.group(0)}'")
            return ''

        text = re.sub(r'[\[\{\(]([^\]\}\)]*)[\]\}\)]', _strip_orphan_bracket, text)

        text = text.strip()

    return text
|
||
|
||
|
||
def _text_has_garbled_ipa(text: str) -> bool:
|
||
"""Check if text contains garbled IPA-like fragments from OCR.
|
||
|
||
Returns True if there is evidence of OCR-mangled phonetic
|
||
transcription, e.g. stress marks, length marks, or IPA special chars.
|
||
This is used to decide whether ``_insert_missing_ipa`` should run:
|
||
it must only insert IPA to *replace* garbled phonetics that are already
|
||
in the text — never to ADD phonetics where none existed on the page.
|
||
"""
|
||
# Bracketed text that doesn't contain valid IPA symbols is garbled OCR
|
||
# of a phonetic transcription, e.g. "[n, nn]" or "[1uedtX,1]".
|
||
stripped = text.strip()
|
||
if stripped.startswith('[') and stripped.endswith(']'):
|
||
inner = stripped[1:-1]
|
||
# Real IPA brackets contain IPA symbols (ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ)
|
||
if not any(c in inner for c in 'ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ'):
|
||
# Not a valid dictionary-style bracket like "(no pl)" — those
|
||
# use parentheses, not square brackets. Square brackets with
|
||
# no IPA chars are garbled phonetics.
|
||
return True
|
||
|
||
for w in text.strip().split():
|
||
# Skip delimiters and very short tokens
|
||
if len(w) <= 1 or w in ('–', '—', '-', '/', '|', ',', ';'):
|
||
continue
|
||
# Starts with stress mark (OCR read IPA stress ' as apostrophe)
|
||
if w.startswith("'") and len(w) > 1 and not w[1:].istitle():
|
||
return True
|
||
if w.startswith("\u02c8") or w.startswith("\u02cc"): # ˈ ˌ
|
||
return True
|
||
# Contains IPA length mark ':' in a short non-word fragment
|
||
if ':' in w and len(w) < 12:
|
||
# But not things like "3:00" (time) or common words
|
||
stripped = re.sub(r'[^a-zA-Z:]', '', w)
|
||
if ':' in stripped and not stripped.replace(':', '').isalpha():
|
||
continue
|
||
return True
|
||
# Contains IPA special characters
|
||
if any(c in w for c in 'əɪɛɒʊʌæɑɔʃʒθðŋ'):
|
||
return True
|
||
# Embedded apostrophe suggesting merged garbled IPA with stress mark.
|
||
# E.g. "Scotland'skotland" — OCR reads ˈ as '.
|
||
# Guard: apostrophe must be after ≥3 chars and before ≥3 lowercase
|
||
# chars to avoid contractions (don't, won't, o'clock).
|
||
if "'" in w and not w.startswith("'"):
|
||
apos_idx = w.index("'")
|
||
after = w[apos_idx + 1:]
|
||
if apos_idx >= 3 and len(after) >= 3 and after[0].islower():
|
||
return True
|
||
return False
|
||
|
||
|
||
def _decompose_compound(word: str, pronunciation: str = 'british') -> Optional[str]:
    """Try to decompose a compound word and concatenate IPA for each part.

    E.g. "schoolbag" → "school"+"bag" → IPA for both concatenated.
    Only two-part splits are considered, each part at least 3 characters
    long, and IPA is returned only if BOTH parts are found in the
    dictionary. When several splits work, the one with the longest first
    part wins.

    Args:
        word: Candidate compound (case and surrounding whitespace ignored).
        pronunciation: Passed through to the IPA lookup.

    Returns:
        The concatenated IPA of both parts, or None when IPA support is
        unavailable, the word is shorter than 6 characters, or no split
        yields two dictionary words.
    """
    if not IPA_AVAILABLE:
        return None
    lower = word.lower().strip()
    if len(lower) < 6:
        return None  # too short for a compound

    # Walk from the longest possible first part down to the shortest, so
    # the first successful split is the preferred one and the search can
    # stop immediately.
    for split_pos in range(len(lower) - 3, 2, -1):
        ipa_first = _lookup_ipa(lower[:split_pos], pronunciation)
        if not ipa_first:
            continue  # no need to look up the second part
        ipa_second = _lookup_ipa(lower[split_pos:], pronunciation)
        if ipa_second:
            return ipa_first + ipa_second

    return None
|
||
|
||
|
||
def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
    """Insert IPA pronunciation for English words that have no brackets at all.

    OCR sometimes garbles the phonetic transcription into plain-text fragments
    (e.g. "scare skea" where "skea" is garbled /skɛə/). This scans the
    whitespace-separated tokens in order, appends ``[IPA]`` to the first
    token for which a transcription can be found, and then drops following
    tokens that look like garbled phonetics.

    The text is returned unchanged when:
    - IPA support is unavailable or the text is empty/blank,
    - it already contains an opening bracket ``[``, ``{`` or ``(``,
    - it has more than 6 tokens (sentences are not vocab cells).

    For each token the lookup chain is: direct dictionary lookup → lookup
    without hyphens → compound decomposition → split at the first IPA marker
    character (merged headword + OCR IPA) → longest dictionary prefix.
    Tokens shorter than 2 cleaned characters and grammar particles are
    skipped.

    Args:
        text: Cell text to process.
        pronunciation: Passed through to the IPA lookup.

    Returns:
        The processed tokens joined by single spaces (so whitespace is
        normalised whenever the token loop is reached), or *text* itself for
        the early exits listed above.
    """
    if not IPA_AVAILABLE:
        return text
    if not text or not text.strip():
        return text

    # Skip if already has brackets (IPA replacement handles those)
    if any(ch in text for ch in '[{('):
        return text

    # Only process short text fragments (typical vocab cells).
    # Long sentences / paragraphs should not get IPA insertions.
    words = text.strip().split()
    if len(words) > 6:
        return text

    # Insert IPA for the first token that yields a transcription.
    # Typical patterns: "challenge", "profit", "film", "badge"
    for i, w in enumerate(words):
        # Clean punctuation for lookup (keeps letters, apostrophe, hyphen)
        clean = re.sub(r'[^a-zA-ZäöüÄÖÜß\'-]', '', w)
        if not clean or len(clean) < 2:
            continue
        # Skip German/grammar words
        if clean.lower() in _GRAMMAR_BRACKET_WORDS:
            continue
        ipa = _lookup_ipa(clean, pronunciation)
        # Fallback: try without hyphens (e.g. "second-hand" → "secondhand")
        if not ipa and '-' in clean:
            ipa = _lookup_ipa(clean.replace('-', ''), pronunciation)
        # Fallback 0b: compound word decomposition
        # E.g. "schoolbag" → "school"+"bag" → concatenated IPA
        if not ipa:
            ipa = _decompose_compound(clean, pronunciation)
        # Fallback 1: IPA-marker split for merged tokens where OCR
        # joined headword with its IPA (e.g. "schoolbagsku:lbæg").
        # Find the first IPA marker character (:, æ, ɪ, etc.), walk
        # backwards ≤3 chars for the onset consonant cluster, and
        # split into headword + OCR IPA.
        _IPA_SPLIT_CHARS = set(':əɪɛɒʊʌæɑɔʃʒθðŋˈˌ')
        if not ipa:
            # Index of the first marker in the raw token, or -1 if none.
            first_marker = next(
                (p for p, ch in enumerate(w) if ch in _IPA_SPLIT_CHARS), -1,
            )
            if first_marker >= 3:
                split = first_marker
                # Step back over at most three lowercase letters.
                while (split > 0
                       and split > first_marker - 3
                       and w[split - 1].isalpha()
                       and w[split - 1].islower()):
                    split -= 1
                if split >= 2:
                    headword = w[:split]
                    ocr_ipa = w[split:]
                    hw_ipa = _lookup_ipa(headword, pronunciation)
                    if not hw_ipa:
                        # Try compound decomposition for the headword part
                        hw_ipa = _decompose_compound(headword, pronunciation)
                    if hw_ipa:
                        words[i] = f"{headword} [{hw_ipa}]"
                    else:
                        # Word not in dictionary — use OCR IPA
                        words[i] = f"{headword} [{ocr_ipa}]"
                    # Everything after the merged token is discarded.
                    words = words[:i + 1]
                    ipa = True  # signal that we handled it
                    # NOTE(review): nesting reconstructed from a flattened
                    # listing — confirm this break belongs to `if split >= 2`.
                    break
        # Fallback 2: prefix matching for merged tokens WITHOUT IPA
        # markers (e.g. "Scotland'skotland"). Find longest dictionary
        # prefix using only alpha chars to avoid punctuation matches.
        if not ipa:
            alpha = re.sub(r'[^a-zA-Z]', '', clean)
            if len(alpha) > 5:  # need at least 6 chars for meaningful split
                for end in range(len(alpha), 3, -1):  # min prefix 4 chars
                    prefix = alpha[:end]
                    test_ipa = _lookup_ipa(prefix, pronunciation)
                    if test_ipa:
                        ipa = test_ipa
                        # The token is replaced by its dictionary prefix.
                        w = prefix
                        words[i] = prefix
                        break
        if ipa:
            words[i] = f"{w} [{ipa}]"
            # Strip garbled OCR phonetics after the IPA bracket.
            # On scanned vocab pages, printed IPA is read as garbled
            # text (e.g. "scare skea" where "skea" is garbled /skɛə/).
            # After inserting correct IPA, remove remaining words that
            # aren't real English words, delimiters, or German text.
            kept = words[:i + 1]
            for j in range(i + 1, len(words)):
                wj = words[j]
                # Delimiter — keep this and everything after
                if wj in ('–', '—', '-', '/', '|', ',', ';'):
                    kept.extend(words[j:])
                    break
                # Pure digits or numbering (e.g. "1", "2.", "3)") — keep
                if re.match(r'^[\d.)\-]+$', wj):
                    kept.extend(words[j:])
                    break
                # Starts with uppercase — likely German or proper noun
                clean_j = re.sub(r'[^a-zA-Z]', '', wj)
                if clean_j and clean_j[0].isupper():
                    kept.extend(words[j:])
                    break
                # Known English word (≥2 chars) — keep it and rest
                if clean_j and len(clean_j) >= 2:
                    if _lookup_ipa(clean_j, pronunciation):
                        kept.extend(words[j:])
                        break
                # Merged token: dictionary word + garbled IPA stuck together.
                # E.g. "fictionsalans'fIkfn" starts with "fiction".
                # Extract the dictionary prefix (≥4 chars) and add it with
                # IPA, but only if enough chars remain after the prefix (≥3)
                # to look like garbled IPA, not just a plural 's'.
                if clean_j and len(clean_j) >= 7:
                    for pend in range(min(len(clean_j) - 3, 15), 3, -1):
                        prefix_j = clean_j[:pend]
                        prefix_ipa = _lookup_ipa(prefix_j, pronunciation)
                        if prefix_ipa:
                            kept.append(f"{prefix_j} [{prefix_ipa}]")
                            break
                    # NOTE(review): nesting reconstructed from a flattened
                    # listing — as placed here, any unknown token of ≥7
                    # letters ends the scan and drops all later tokens.
                    break  # rest of this token is garbled
                # Otherwise — likely garbled phonetics, skip
            words = kept
            break

    return ' '.join(words)
|
||
|
||
|