Files
breakpilot-lehrer/klausur-service/backend/cv_ocr_ipa_repair.py
Benjamin Admin 9ba420fa91
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
Fix: Remove broken getKlausurApiUrl and clean up empty lines
sed replacement left orphaned hostname references in story page
and empty lines in getApiBase functions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-24 16:02:04 +02:00

288 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Advanced IPA repair for OCR-extracted vocabulary.
Functions that detect and fix garbled IPA fragments trailing after
headwords or in continuation cells. Split from cv_ocr_ipa_lookup.py
to stay within the 500 LOC budget.
Contains:
- _has_non_dict_trailing: detect non-dictionary trailing words
- _strip_post_bracket_garbled: strip garbled IPA after [brackets]
- fix_ipa_continuation_cell: replace garbled IPA in continuation rows
- _insert_headword_ipa: insert IPA for first headword in mixed-lang lines
"""
import logging
import re
from typing import Any, Dict, List, Optional
from cv_vocab_types import IPA_AVAILABLE
from cv_ocr_ipa_lookup import (
_lookup_ipa,
_GRAMMAR_BRACKET_WORDS,
)
logger = logging.getLogger(__name__)
def _has_non_dict_trailing(text: str, pronunciation: str = 'british') -> bool:
    """Check if text has a headword followed by non-dictionary trailing words.

    Used as an additional trigger for ``_insert_missing_ipa`` when
    ``_text_has_garbled_ipa`` returns False because the garbled IPA
    happens to look like plain ASCII (e.g. "skea" for /skɛə/).

    Args:
        text: Cell text; only texts of 2 to 6 whitespace-separated tokens
            are considered.
        pronunciation: Passed through unchanged to ``_lookup_ipa``.

    Returns:
        True when a dictionary headword is found and none of the tokens
        after it is a delimiter, a number, a capitalised word or another
        dictionary word. False otherwise, including when IPA support is
        unavailable.
    """
    if not IPA_AVAILABLE:
        return False

    words = text.strip().split()
    if len(words) < 2 or len(words) > 6:
        return False

    # Find the first dictionary word; it is treated as the headword.
    hw_idx = -1
    for i, w in enumerate(words):
        clean = re.sub(r'[^a-zA-Z\'-]', '', w)
        if not clean or len(clean) < 2:
            continue
        if clean.lower() in _GRAMMAR_BRACKET_WORDS:
            continue
        if _lookup_ipa(clean, pronunciation):
            hw_idx = i
            break

    # No headword at all, or nothing trails after it.
    if hw_idx < 0 or hw_idx >= len(words) - 1:
        return False

    # Stand-alone delimiter tokens. The en dash and em dash are written as
    # escapes so they cannot be lost or confused with the ASCII hyphen;
    # they match the characters the headword splitter in this module uses.
    delimiters = ('\u2013', '\u2014', '-', '/', '|', ',', ';')

    # Check ALL remaining words: if none are dictionary/delimiter/German,
    # they are likely garbled IPA.
    for wj in words[hw_idx + 1:]:
        if wj in delimiters:
            return False
        # Pure digits or numbering (e.g. "1", "2.", "3)") is not garbled IPA.
        if re.match(r'^[\d.)\-]+$', wj):
            return False
        clean_j = re.sub(r'[^a-zA-Z]', '', wj)
        # A capitalised token is taken to be a German word.
        if clean_j and clean_j[0].isupper():
            return False
        if clean_j and len(clean_j) >= 2 and _lookup_ipa(clean_j, pronunciation):
            return False
    return True
def _strip_post_bracket_garbled(
text: str, pronunciation: str = 'british',
) -> str:
"""Strip garbled IPA fragments that trail after proper [IPA] brackets.
E.g. ``sea [sˈiː] si:`` → ``sea [sˈiː]``
``seat [sˈiːt] si:t`` → ``seat [sˈiːt]``
``seat [sˈiːt] belt si:t belt`` → ``seat [sˈiːt] belt``
For multi-word headwords like "seat belt", a real English word ("belt")
may be followed by garbled IPA duplicates. We detect this by checking
whether the sequence after a real word contains IPA markers (`:`, `ə`,
etc.) — if so, everything from the first garbled token onward is stripped.
"""
if ']' not in text:
return text
last_bracket = text.rfind(']')
if last_bracket >= len(text) - 1:
return text
before = text[:last_bracket + 1].rstrip()
after = text[last_bracket + 1:].strip()
if not after:
return text
_IPA_MARKER_CHARS = set(':əɪɛɒʊʌæɑɔʃʒθðŋˈˌ')
after_words = after.split()
kept: List[str] = []
for idx, w in enumerate(after_words):
# Delimiter — keep rest
if w in ('', '', '-', '/', '|', ',', ';'):
kept.extend(after_words[idx:])
break
# Contains IPA markers (length mark, IPA chars) — garbled, skip
if any(c in w for c in _IPA_MARKER_CHARS):
# Everything from here is garbled IPA — stop scanning
# but look ahead: if any remaining words are real English
# words WITHOUT IPA markers, they might be a different headword
# following. Only skip the contiguous garbled run.
continue
clean = re.sub(r'[^a-zA-Z]', '', w)
# Uppercase — likely German, keep rest
if clean and clean[0].isupper():
kept.extend(after_words[idx:])
break
# Known English word — keep it, but check if followed by garbled IPA
# (multi-word headword case like "seat [siːt] belt si:t belt")
if clean and len(clean) >= 2 and _lookup_ipa(clean, pronunciation):
# Peek ahead: if next word has IPA markers, the rest is garbled
remaining = after_words[idx + 1:]
has_garbled_after = any(
any(c in rw for c in _IPA_MARKER_CHARS)
for rw in remaining
)
if has_garbled_after:
# Keep this real word but stop — rest is garbled duplication
kept.append(w)
# Still scan for delimiters/German in the remaining words
for ridx, rw in enumerate(remaining):
if rw in ('', '', '-', '/', '|', ',', ';'):
kept.extend(remaining[ridx:])
break
rclean = re.sub(r'[^a-zA-Z]', '', rw)
if rclean and rclean[0].isupper():
kept.extend(remaining[ridx:])
break
break
else:
kept.extend(after_words[idx:])
break
# Unknown short word — likely garbled, skip
if kept:
return before + ' ' + ' '.join(kept)
return before
def fix_ipa_continuation_cell(
    garbled_text: str,
    headword_text: str,
    pronunciation: str = 'british',
) -> str:
    """Replace garbled IPA in a continuation row with proper IPA.

    A continuation row sits below the headword row and holds only the
    printed phonetic transcription, which OCR turns into fragments such
    as ``ska:f ska:vz`` (should be ``[skˈɑːf] [skˈɑːvz]``).

    Args:
        garbled_text: The OCR-garbled IPA text from the continuation row.
        headword_text: The headword text from the previous row
            (e.g. ``scarf scarves``).
        pronunciation: ``'british'`` or ``'american'``; passed through to
            ``_lookup_ipa``.

    Returns:
        One ``[ipa]`` group per headword part, joined by single spaces.
        If the headword row ends with its own ``[...]`` bracket, that
        bracket text is returned instead. ``garbled_text`` is returned
        unchanged when IPA support is unavailable, an argument is empty,
        or no IPA could be looked up.
    """
    if not (IPA_AVAILABLE and garbled_text and headword_text):
        return garbled_text

    # Words that already carry inline IPA in the headword row, e.g. the
    # first "beat" in "beat [bˈiːt] , beat, beaten".
    already_covered: set = set()

    if re.search(r'\[[^\]]*\]', headword_text):
        # Everything in front of the first bracket is already transcribed.
        open_pos = headword_text.index('[')
        for token in headword_text[:open_pos].strip().split():
            key = re.sub(r'[^a-zA-Z\'-]', '', token).lower()
            if len(key) >= 2:
                already_covered.add(key)

        close_pos = headword_text.rfind(']')
        tail = headword_text[close_pos + 1:].strip()

        if not re.search(r'[a-zA-Z]{2,}', tail):
            # The bracket closes the row (e.g. "the Highlands [ˈhaɪləndz]"),
            # so the continuation merely repeats it: hand back the inline
            # IPA itself.
            return headword_text[headword_text.rfind('['):close_pos + 1]

        # From here on only the words after the last bracket matter.
        headword_text = tail

    # Drop IPA brackets and parenthetical grammar notes such as "(no pl)",
    # "(sth)", "(sb)".
    without_brackets = re.sub(r'\[[^\]]*\]', '', headword_text)
    stripped_hw = re.sub(r'\([^)]*\)', '', without_brackets).strip()
    if not stripped_hw:
        return garbled_text

    # Break the headword at dash delimiters:
    #   "scarf scarves"    -> ["scarf scarves"]
    #   "see - saw - seen" -> ["see", "saw", "seen"]
    segments = [
        piece.strip()
        for piece in re.split(r'\s*[–—]\s*|\s+-\s+', stripped_hw)
        if piece.strip()
    ]
    if not segments:
        return garbled_text

    # Articles never get IPA in vocab books. Other function words such as
    # "down" or "up" stay, since they belong to phrasal verbs
    # (e.g. "close down").
    articles = {'the', 'a', 'an'}

    bracketed: List[str] = []
    for segment in segments:
        # A segment may hold several words, like "secondary school".
        found: List[str] = []
        for token in segment.split():
            candidate = re.sub(r'[^a-zA-Z\'-]', '', token)
            if len(candidate) < 2:
                continue
            lowered = candidate.lower()
            if lowered in already_covered or lowered in articles:
                continue
            ipa = _lookup_ipa(candidate, pronunciation)
            if ipa:
                found.append(ipa)
        if found:
            bracketed.append('[' + ' '.join(found) + ']')

    if not bracketed:
        return garbled_text

    # One bracket group per segment, separated by a single space.
    result = ' '.join(bracketed)
    logger.debug(
        "fix_ipa_continuation: '%s''%s' (headwords: '%s')",
        garbled_text, result, headword_text,
    )
    return result
def _insert_headword_ipa(text: str, pronunciation: str = 'british') -> str:
    """Insert IPA for the first English headword in a long mixed-language line.

    Unlike _insert_missing_ipa (for short column_en cells), this handles
    column_text lines of any length. It only inserts IPA for the FIRST word
    if that word:
    - has no bracket following it already
    - has an IPA entry in the dictionary
    - is not a number/symbol prefix like "».55"

    Args:
        text: The line to process.
        pronunciation: Passed through unchanged to ``_lookup_ipa``.

    Returns:
        The text with ``[ipa]`` inserted after the first word, with tokens
        re-joined by single spaces; or the original text unchanged when
        nothing was inserted.
    """
    if not IPA_AVAILABLE:
        return text
    if not text or not text.strip():
        return text

    words = text.strip().split()
    bracket_openers = ('[', '{', '(')

    # Check if text already starts with a bracket (IPA already present).
    if len(words) > 1 and words[1].startswith(bracket_openers):
        return text

    # Try the first few words (skip numeric prefixes like "».55", "0.56").
    for i, w in enumerate(words[:3]):
        clean = re.sub(r'[^a-zA-ZäöüÄÖÜß\'-]', '', w)
        if not clean or len(clean) < 2:
            continue
        if clean.lower() in _GRAMMAR_BRACKET_WORDS:
            continue

        # After a skipped prefix the headword is not at index 0, so the
        # check above missed its bracket; repeat it for the real candidate
        # to avoid "0.56 sea [ipa] [si:] Meer".
        if i + 1 < len(words) and words[i + 1].startswith(bracket_openers):
            return text

        ipa = _lookup_ipa(clean, pronunciation)
        if ipa:
            words[i] = f"{w} [{ipa}]"
            return ' '.join(words)
        # Stop at first real word even if no IPA found.
        break

    return text