Fix: Remove broken getKlausurApiUrl and clean up empty lines
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
sed replacement left orphaned hostname references in story page and empty lines in getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
287
klausur-service/backend/cv_ocr_ipa_repair.py
Normal file
287
klausur-service/backend/cv_ocr_ipa_repair.py
Normal file
@@ -0,0 +1,287 @@
|
||||
"""
|
||||
Advanced IPA repair for OCR-extracted vocabulary.
|
||||
|
||||
Functions that detect and fix garbled IPA fragments trailing after
|
||||
headwords or in continuation cells. Split from cv_ocr_ipa_lookup.py
|
||||
to stay within the 500 LOC budget.
|
||||
|
||||
Contains:
|
||||
- _has_non_dict_trailing: detect non-dictionary trailing words
|
||||
- _strip_post_bracket_garbled: strip garbled IPA after [brackets]
|
||||
- fix_ipa_continuation_cell: replace garbled IPA in continuation rows
|
||||
- _insert_headword_ipa: insert IPA for first headword in mixed-lang lines
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from cv_vocab_types import IPA_AVAILABLE
|
||||
from cv_ocr_ipa_lookup import (
|
||||
_lookup_ipa,
|
||||
_GRAMMAR_BRACKET_WORDS,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _has_non_dict_trailing(text: str, pronunciation: str = 'british') -> bool:
    """Report whether a headword is followed only by non-dictionary words.

    Serves as an extra trigger for ``_insert_missing_ipa`` in cases where
    ``_text_has_garbled_ipa`` returns False because the garbled IPA
    happens to look like plain ASCII (e.g. "skea" for /skɛə/).

    Args:
        text: Cell text; only texts of 2 to 6 whitespace-separated
            tokens are considered.
        pronunciation: Variant forwarded unchanged to ``_lookup_ipa``.

    Returns:
        True when a dictionary headword is found and none of the tokens
        after it is a delimiter, a number, a capitalised word or a
        dictionary word. False otherwise, and always False when
        ``IPA_AVAILABLE`` is falsy.
    """
    if not IPA_AVAILABLE:
        return False

    tokens = text.strip().split()
    if not 2 <= len(tokens) <= 6:
        return False

    def is_headword(token: str) -> bool:
        # Letters, apostrophes and hyphens only; grammar markers never count.
        letters = re.sub(r'[^a-zA-Z\'-]', '', token)
        if len(letters) < 2:
            return False
        if letters.lower() in _GRAMMAR_BRACKET_WORDS:
            return False
        return bool(_lookup_ipa(letters, pronunciation))

    # Position of the first token with a dictionary entry (lazy: the scan
    # stops at the first hit).
    headword_pos = next(
        (pos for pos, token in enumerate(tokens) if is_headword(token)),
        -1,
    )
    # Missing headword, or headword is the last token: nothing trails it.
    if headword_pos < 0 or headword_pos >= len(tokens) - 1:
        return False

    # Every trailing token must look "unknown"; a single recognisable
    # token (delimiter, number, capitalised word, dictionary word) means
    # the tail is real content rather than garbled IPA.
    for trailing in tokens[headword_pos + 1:]:
        if trailing in ('–', '—', '-', '/', '|', ',', ';'):
            return False
        # Pure digits or numbering such as "1", "2.", "3)".
        if re.match(r'^[\d.)\-]+$', trailing):
            return False
        letters = re.sub(r'[^a-zA-Z]', '', trailing)
        if not letters:
            continue
        if letters[0].isupper():
            return False
        if len(letters) >= 2 and _lookup_ipa(letters, pronunciation):
            return False
    return True
|
||||
|
||||
|
||||
def _strip_post_bracket_garbled(
|
||||
text: str, pronunciation: str = 'british',
|
||||
) -> str:
|
||||
"""Strip garbled IPA fragments that trail after proper [IPA] brackets.
|
||||
|
||||
E.g. ``sea [sˈiː] si:`` → ``sea [sˈiː]``
|
||||
``seat [sˈiːt] si:t`` → ``seat [sˈiːt]``
|
||||
``seat [sˈiːt] belt si:t belt`` → ``seat [sˈiːt] belt``
|
||||
|
||||
For multi-word headwords like "seat belt", a real English word ("belt")
|
||||
may be followed by garbled IPA duplicates. We detect this by checking
|
||||
whether the sequence after a real word contains IPA markers (`:`, `ə`,
|
||||
etc.) — if so, everything from the first garbled token onward is stripped.
|
||||
"""
|
||||
if ']' not in text:
|
||||
return text
|
||||
last_bracket = text.rfind(']')
|
||||
if last_bracket >= len(text) - 1:
|
||||
return text
|
||||
before = text[:last_bracket + 1].rstrip()
|
||||
after = text[last_bracket + 1:].strip()
|
||||
if not after:
|
||||
return text
|
||||
|
||||
_IPA_MARKER_CHARS = set(':əɪɛɒʊʌæɑɔʃʒθðŋˈˌ')
|
||||
after_words = after.split()
|
||||
kept: List[str] = []
|
||||
for idx, w in enumerate(after_words):
|
||||
# Delimiter — keep rest
|
||||
if w in ('–', '—', '-', '/', '|', ',', ';'):
|
||||
kept.extend(after_words[idx:])
|
||||
break
|
||||
# Contains IPA markers (length mark, IPA chars) — garbled, skip
|
||||
if any(c in w for c in _IPA_MARKER_CHARS):
|
||||
# Everything from here is garbled IPA — stop scanning
|
||||
# but look ahead: if any remaining words are real English
|
||||
# words WITHOUT IPA markers, they might be a different headword
|
||||
# following. Only skip the contiguous garbled run.
|
||||
continue
|
||||
clean = re.sub(r'[^a-zA-Z]', '', w)
|
||||
# Uppercase — likely German, keep rest
|
||||
if clean and clean[0].isupper():
|
||||
kept.extend(after_words[idx:])
|
||||
break
|
||||
# Known English word — keep it, but check if followed by garbled IPA
|
||||
# (multi-word headword case like "seat [siːt] belt si:t belt")
|
||||
if clean and len(clean) >= 2 and _lookup_ipa(clean, pronunciation):
|
||||
# Peek ahead: if next word has IPA markers, the rest is garbled
|
||||
remaining = after_words[idx + 1:]
|
||||
has_garbled_after = any(
|
||||
any(c in rw for c in _IPA_MARKER_CHARS)
|
||||
for rw in remaining
|
||||
)
|
||||
if has_garbled_after:
|
||||
# Keep this real word but stop — rest is garbled duplication
|
||||
kept.append(w)
|
||||
# Still scan for delimiters/German in the remaining words
|
||||
for ridx, rw in enumerate(remaining):
|
||||
if rw in ('–', '—', '-', '/', '|', ',', ';'):
|
||||
kept.extend(remaining[ridx:])
|
||||
break
|
||||
rclean = re.sub(r'[^a-zA-Z]', '', rw)
|
||||
if rclean and rclean[0].isupper():
|
||||
kept.extend(remaining[ridx:])
|
||||
break
|
||||
break
|
||||
else:
|
||||
kept.extend(after_words[idx:])
|
||||
break
|
||||
# Unknown short word — likely garbled, skip
|
||||
if kept:
|
||||
return before + ' ' + ' '.join(kept)
|
||||
return before
|
||||
|
||||
|
||||
def fix_ipa_continuation_cell(
    garbled_text: str,
    headword_text: str,
    pronunciation: str = 'british',
) -> str:
    """Replace garbled IPA in a continuation row with proper IPA.

    Continuation rows appear below the headword and contain only the
    printed phonetic transcription, which OCR garbles into fragments
    like ``ska:f – ska:vz`` (should be ``[skˈɑːf] – [skˈɑːvz]``).

    Args:
        garbled_text: The OCR-garbled IPA text from the continuation row.
        headword_text: The headword text from the previous row
            (e.g. ``scarf – scarves``).
        pronunciation: ``'british'`` or ``'american'``.

    Returns:
        Corrected IPA text, or the original ``garbled_text`` if no fix
        could be applied (empty input, IPA support unavailable, or no
        headword part has a dictionary entry).
    """
    # Cheap input checks first; every branch returns the input untouched.
    if not garbled_text or not headword_text or not IPA_AVAILABLE:
        return garbled_text

    # If headword already has inline IPA like "beat [bˈiːt] , beat, beaten",
    # only generate continuation IPA for words NOT already covered.
    covered_words: set = set()
    if re.search(r'\[[^\]]*\]', headword_text):
        # Words before the first bracket already have their IPA shown.
        first_bracket = headword_text.index('[')
        for w in headword_text[:first_bracket].split():
            clean = re.sub(r'[^a-zA-Z\'-]', '', w).lower()
            if len(clean) >= 2:
                covered_words.add(clean)

        last_bracket_end = headword_text.rfind(']')
        tail = headword_text[last_bracket_end + 1:].strip()

        if not re.search(r'[a-zA-Z]{2,}', tail):
            # Bracket is at the end (e.g. "the Highlands [ˈhaɪləndz]")
            # — return the inline IPA directly (continuation duplicates it).
            # The search for '[' is bounded by the last ']' so that a stray
            # '[' after it (e.g. "word [ipa] [") cannot produce an empty
            # slice and wipe the cell.
            last_bracket_start = headword_text.rfind('[', 0, last_bracket_end)
            return headword_text[last_bracket_start:last_bracket_end + 1]

        # Only the tail words need continuation IPA.
        headword_text = tail

    # Strip existing IPA brackets and parenthetical grammar annotations
    # like "(no pl)", "(sth)", "(sb)" from headword text.
    clean_hw = re.sub(r'\[[^\]]*\]', '', headword_text)
    clean_hw = re.sub(r'\([^)]*\)', '', clean_hw).strip()
    if not clean_hw:
        return garbled_text

    # Split headword by delimiters (– — -)
    #   "scarf – scarves"   → ["scarf", "scarves"]
    #   "see - saw - seen"  → ["see", "saw", "seen"]
    # A plain hyphen only splits when surrounded by whitespace, so
    # hyphenated words stay intact.
    parts = [
        p.strip()
        for p in re.split(r'\s*[–—]\s*|\s+-\s+', clean_hw)
        if p.strip()
    ]
    if not parts:
        return garbled_text

    # Look up IPA for each headword part.
    # Skip articles (the, a, an) — they never get IPA in vocab books.
    # Other function words like "down", "up" are kept because they are
    # integral parts of phrasal verbs (e.g. "close down").
    # Skip words that already have inline IPA in the headword row.
    _ARTICLES = {'the', 'a', 'an'}
    ipa_parts: List[str] = []
    for part in parts:
        # A part may be multi-word like "secondary school".
        word_ipas: List[str] = []
        for w in part.split():
            clean_w = re.sub(r'[^a-zA-Z\'-]', '', w)
            if len(clean_w) < 2:
                continue
            lowered = clean_w.lower()
            if lowered in covered_words:
                continue  # Already has IPA inline in the headword
            if lowered in _ARTICLES:
                continue  # Articles never get IPA in vocab books
            ipa = _lookup_ipa(clean_w, pronunciation)
            if ipa:
                word_ipas.append(ipa)
        if word_ipas:
            ipa_parts.append('[' + ' '.join(word_ipas) + ']')

    if not ipa_parts:
        return garbled_text

    # Join with delimiter
    result = ' – '.join(ipa_parts)
    logger.debug(
        "fix_ipa_continuation: '%s' → '%s' (headwords: '%s')",
        garbled_text, result, headword_text,
    )
    return result
|
||||
|
||||
|
||||
def _insert_headword_ipa(text: str, pronunciation: str = 'british') -> str:
|
||||
"""Insert IPA for the first English headword in a long mixed-language line.
|
||||
|
||||
Unlike _insert_missing_ipa (for short column_en cells), this handles
|
||||
column_text lines of any length. It only inserts IPA for the FIRST word
|
||||
if that word:
|
||||
- has no bracket following it already
|
||||
- has an IPA entry in the dictionary
|
||||
- is not a number/symbol prefix like "».55"
|
||||
|
||||
Returns the text with [ipa] inserted after the first word, or unchanged.
|
||||
"""
|
||||
if not IPA_AVAILABLE:
|
||||
return text
|
||||
if not text or not text.strip():
|
||||
return text
|
||||
|
||||
words = text.strip().split()
|
||||
if not words:
|
||||
return text
|
||||
|
||||
# Check if text already starts with a bracket (IPA already present)
|
||||
if len(words) > 1 and words[1].startswith(('[', '{', '(')):
|
||||
return text
|
||||
|
||||
# Try the first few words (skip numeric prefixes like "».55", "0.56")
|
||||
for i in range(min(3, len(words))):
|
||||
w = words[i]
|
||||
clean = re.sub(r'[^a-zA-ZäöüÄÖÜß\'-]', '', w)
|
||||
if not clean or len(clean) < 2:
|
||||
continue
|
||||
if clean.lower() in _GRAMMAR_BRACKET_WORDS:
|
||||
continue
|
||||
ipa = _lookup_ipa(clean, pronunciation)
|
||||
if ipa:
|
||||
words[i] = f"{w} [{ipa}]"
|
||||
return ' '.join(words)
|
||||
# Stop at first real word even if no IPA found
|
||||
break
|
||||
|
||||
return text
|
||||
Reference in New Issue
Block a user