feat: Sprint 1 — IPA hardening, regression framework, ground-truth review
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 19s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 19s
Track A (Backend): - Compound word IPA decomposition (schoolbag→school+bag) - Trailing garbled IPA fragment removal after brackets (R21 fix) - Regression runner with DB persistence, history endpoints - Page crop determinism verified with tests Track B (Frontend): - OCR Regression dashboard (/ai/ocr-regression) - Ground Truth Review workflow (/ai/ocr-ground-truth) with split-view, confidence highlighting, inline edit, batch mark, progress tracking Track C (Docs): - OCR-Pipeline.md v5.0 (Steps 5e-5h) - Regression testing guide - mkdocs.yml nav update Track D (Infra): - TrOCR baseline benchmark script - run-regression.sh shell script - Migration 008: regression_runs table Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1032,6 +1032,37 @@ def _text_has_garbled_ipa(text: str) -> bool:
|
||||
return False
|
||||
|
||||
|
||||
def _decompose_compound(word: str, pronunciation: str = 'british') -> Optional[str]:
    """Try to decompose a compound word and concatenate IPA for each part.

    E.g. "schoolbag" → "school"+"bag" → IPA for both concatenated.
    Only returns IPA if ALL parts are found in the dictionary.

    Tries splits at every position (min 3 chars per part) and picks the
    split where the first part is longest.

    Args:
        word: Candidate compound word. Lowercased and stripped before
            lookup, so matching is case-insensitive.
        pronunciation: Dictionary variant forwarded to ``_lookup_ipa``
            (default ``'british'``).

    Returns:
        The concatenated IPA of the two parts, or ``None`` when IPA support
        is unavailable, the word is too short, or no split yields dictionary
        hits for both parts.
    """
    if not IPA_AVAILABLE:
        return None
    lower = word.lower().strip()
    if len(lower) < 6:
        return None  # too short for a compound

    # Scan split positions from the longest possible first part downward:
    # the first split where both halves resolve is by construction the one
    # with the longest first part, so we can return immediately instead of
    # scanning every position and tracking a "best" candidate.
    for split_pos in range(len(lower) - 3, 2, -1):  # min 3 chars each part
        ipa_first = _lookup_ipa(lower[:split_pos], pronunciation)
        if not ipa_first:
            # Skip the second lookup entirely when the first part misses.
            continue
        ipa_second = _lookup_ipa(lower[split_pos:], pronunciation)
        if ipa_second:
            return ipa_first + ipa_second

    return None
|
||||
|
||||
|
||||
def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
|
||||
"""Insert IPA pronunciation for English words that have no brackets at all.
|
||||
|
||||
@@ -1077,6 +1108,10 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
|
||||
# Fallback: try without hyphens (e.g. "second-hand" → "secondhand")
|
||||
if not ipa and '-' in clean:
|
||||
ipa = _lookup_ipa(clean.replace('-', ''), pronunciation)
|
||||
# Fallback 0b: compound word decomposition
|
||||
# E.g. "schoolbag" → "school"+"bag" → concatenated IPA
|
||||
if not ipa:
|
||||
ipa = _decompose_compound(clean, pronunciation)
|
||||
# Fallback 1: IPA-marker split for merged tokens where OCR
|
||||
# joined headword with its IPA (e.g. "schoolbagsku:lbæg").
|
||||
# Find the first IPA marker character (:, æ, ɪ, etc.), walk
|
||||
@@ -1098,6 +1133,9 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
|
||||
headword = w[:split]
|
||||
ocr_ipa = w[split:]
|
||||
hw_ipa = _lookup_ipa(headword, pronunciation)
|
||||
if not hw_ipa:
|
||||
# Try compound decomposition for the headword part
|
||||
hw_ipa = _decompose_compound(headword, pronunciation)
|
||||
if hw_ipa:
|
||||
words[i] = f"{headword} [{hw_ipa}]"
|
||||
else:
|
||||
@@ -1197,6 +1235,12 @@ def _strip_post_bracket_garbled(
|
||||
|
||||
E.g. ``sea [sˈiː] si:`` → ``sea [sˈiː]``
|
||||
``seat [sˈiːt] si:t`` → ``seat [sˈiːt]``
|
||||
``seat [sˈiːt] belt si:t belt`` → ``seat [sˈiːt] belt``
|
||||
|
||||
For multi-word headwords like "seat belt", a real English word ("belt")
|
||||
may be followed by garbled IPA duplicates. We detect this by checking
|
||||
whether the sequence after a real word contains IPA markers (`:`, `ə`,
|
||||
etc.) — if so, everything from the first garbled token onward is stripped.
|
||||
"""
|
||||
if ']' not in text:
|
||||
return text
|
||||
@@ -1207,6 +1251,8 @@ def _strip_post_bracket_garbled(
|
||||
after = text[last_bracket + 1:].strip()
|
||||
if not after:
|
||||
return text
|
||||
|
||||
_IPA_MARKER_CHARS = set(':əɪɛɒʊʌæɑɔʃʒθðŋˈˌ')
|
||||
after_words = after.split()
|
||||
kept: List[str] = []
|
||||
for idx, w in enumerate(after_words):
|
||||
@@ -1215,17 +1261,42 @@ def _strip_post_bracket_garbled(
|
||||
kept.extend(after_words[idx:])
|
||||
break
|
||||
# Contains IPA markers (length mark, IPA chars) — garbled, skip
|
||||
if ':' in w or any(c in w for c in 'əɪɛɒʊʌæɑɔʃʒθðŋˈˌ'):
|
||||
if any(c in w for c in _IPA_MARKER_CHARS):
|
||||
# Everything from here is garbled IPA — stop scanning
|
||||
# but look ahead: if any remaining words are real English
|
||||
# words WITHOUT IPA markers, they might be a different headword
|
||||
# following. Only skip the contiguous garbled run.
|
||||
continue
|
||||
clean = re.sub(r'[^a-zA-Z]', '', w)
|
||||
# Uppercase — likely German, keep rest
|
||||
if clean and clean[0].isupper():
|
||||
kept.extend(after_words[idx:])
|
||||
break
|
||||
# Known English word — keep rest
|
||||
# Known English word — keep it, but check if followed by garbled IPA
|
||||
# (multi-word headword case like "seat [siːt] belt si:t belt")
|
||||
if clean and len(clean) >= 2 and _lookup_ipa(clean, pronunciation):
|
||||
kept.extend(after_words[idx:])
|
||||
break
|
||||
# Peek ahead: if next word has IPA markers, the rest is garbled
|
||||
remaining = after_words[idx + 1:]
|
||||
has_garbled_after = any(
|
||||
any(c in rw for c in _IPA_MARKER_CHARS)
|
||||
for rw in remaining
|
||||
)
|
||||
if has_garbled_after:
|
||||
# Keep this real word but stop — rest is garbled duplication
|
||||
kept.append(w)
|
||||
# Still scan for delimiters/German in the remaining words
|
||||
for ridx, rw in enumerate(remaining):
|
||||
if rw in ('–', '—', '-', '/', '|', ',', ';'):
|
||||
kept.extend(remaining[ridx:])
|
||||
break
|
||||
rclean = re.sub(r'[^a-zA-Z]', '', rw)
|
||||
if rclean and rclean[0].isupper():
|
||||
kept.extend(remaining[ridx:])
|
||||
break
|
||||
break
|
||||
else:
|
||||
kept.extend(after_words[idx:])
|
||||
break
|
||||
# Unknown short word — likely garbled, skip
|
||||
if kept:
|
||||
return before + ' ' + ' '.join(kept)
|
||||
|
||||
Reference in New Issue
Block a user