Compare commits

...

3 Commits

Author SHA1 Message Date
Benjamin Admin
6ad4b84584 fix: broaden phonetic bracket regex to catch Tesseract-garbled IPA
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m52s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 16s
Tesseract mangles IPA square brackets into curly braces or parentheses
(e.g. China [ˈtʃaɪnə] → China {'tfatno]). The previous regex only
matched [...], missing all garbled variants.

- Match any bracket type: [...], {...}, (...) including mixed pairs
- Add _is_meaningful_bracket_content() to preserve legitimate German
  prefixes like (zer)brechen and Tanz(veranstaltung)
- Trigger IPA replacement on any bracket character, not just [

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-04 22:53:50 +01:00
Benjamin Admin
f94a3836f8 fix: use Tesseract as default engine for cell-first OCR instead of RapidOCR
RapidOCR (PaddleOCR) is optimized for full-page scene text and produces
artifacts on small isolated cell crops: extra characters ("Tanz z",
"er r wollte"), missing punctuation, garbled phonetic transcriptions.

Tesseract works much better on isolated binarized crops with upscaling,
which is exactly what cell-first OCR provides. RapidOCR remains available
as explicit engine choice via the dropdown.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-04 22:30:34 +01:00
Benjamin Admin
34c649c8be fix: send SSE keepalive events every 5s during batch OCR
Batch OCR takes 30-60s with 3x upscaling. Without keepalive events,
proxy servers (Nginx) drop the SSE connection after their read timeout.
Now sends keepalive events every 5s to prevent timeout, with elapsed
time for debugging. Also checks for client disconnect between keepalives.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-04 22:21:14 +01:00
2 changed files with 82 additions and 16 deletions

View File

@@ -4201,9 +4201,11 @@ def _attach_example_sentences(entries: List[Dict[str, Any]]) -> List[Dict[str, A
# --- D. Phonetic Bracket IPA Replacement ---
# Pattern: word [phonetic] or word (phonetic) — capture the word before brackets
# Pattern: word followed by any bracket type containing phonetic content.
# Tesseract often garbles IPA brackets: [ˈdɑːns] → {'tfatno] or (cy) etc.
# Match any opener ([, {, () with any closer (], }, )) — even mixed pairs.
_PHONETIC_BRACKET_RE = re.compile(
r'(\b[a-zA-ZäöüÄÖÜß]+)\s*\[([^\]]*)\]'
r'(\b[a-zA-ZäöüÄÖÜß]+)\s*[\[\{\(]([^\]\}\)]*?)[\]\}\)]'
)
@@ -4274,25 +4276,64 @@ def _fix_phonetic_brackets(
for entry in entries:
for field in ('english', 'german', 'example'):
text = entry.get(field, '') or ''
if '[' not in text:
# Check for any bracket type — Tesseract garbles [ into { or (
if not any(ch in text for ch in '[{('):
continue
entry[field] = _replace_phonetics_in_text(text, pronunciation)
return entries
# German prefixes/words commonly in parentheses in vocab tables: (zer)brechen, Tanz(veranstaltung)
_GERMAN_BRACKET_PREFIXES = frozenset({
'ab', 'an', 'auf', 'aus', 'be', 'bei', 'dar', 'ein', 'emp', 'ent',
'er', 'ge', 'her', 'hin', 'los', 'mit', 'nach', 'um', 'un', 'unter',
'ver', 'vor', 'weg', 'zer', 'zu', 'zurück',
})
def _is_meaningful_bracket_content(content: str) -> bool:
"""Return True if bracket content is a meaningful word/prefix, not garbled IPA.
Meaningful: (zer)brechen, (veranstaltung), (Haupt)stadt
Garbled IPA: {'tfatno, (cy, 1u], 'daens
"""
if not content:
return False
# Must be pure letters (no digits, punctuation, IPA symbols)
if not re.match(r'^[a-zA-ZäöüÄÖÜßé]+$', content):
return False
# Known German prefix
if content.lower() in _GERMAN_BRACKET_PREFIXES:
return True
# Long enough to be a real word (not 1-2 char garbled IPA like "cy")
if len(content) >= 4:
return True
return False
def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str:
"""Replace [phonetic] after words with dictionary IPA."""
"""Replace [phonetic] / {phonetic} / (phonetic) after words with dictionary IPA.
Tesseract garbles IPA brackets, e.g. China [ˈtʃaɪnə] → China {'tfatno].
We match any bracket type and replace with dictionary IPA if found.
Legitimate parenthetical content like (zer)brechen or (veranstaltung) is preserved.
"""
if not IPA_AVAILABLE:
return text
def replacer(match):
word = match.group(1)
ocr_phonetic = match.group(2)
bracket_content = match.group(2).strip()
# Skip if bracket content looks like regular text (has spaces + capitals)
if len(ocr_phonetic.split()) > 3:
return match.group(0) # Keep original
# Skip if bracket content looks like regular text (more than three words)
if len(bracket_content.split()) > 3:
return match.group(0)
# Skip if bracket content is a meaningful word/prefix — e.g. (zer)brechen,
# Tanz(veranstaltung). These are real German morphemes, not garbled IPA.
if _is_meaningful_bracket_content(bracket_content):
return match.group(0)
# Look up in IPA dictionary
ipa = _lookup_ipa(word, pronunciation)
@@ -4886,13 +4927,15 @@ def build_cell_grid_v2(
Drop-in replacement for build_cell_grid() — same signature & return type.
No full-page word assignment; each cell is OCR'd from its own crop.
"""
# Resolve engine
# Resolve engine — default to Tesseract for cell-first OCR.
# Tesseract excels at isolated text crops (binarized, upscaled).
# RapidOCR is optimized for full-page scene-text and produces artifacts
# on small cell crops (extra chars, missing punctuation, garbled IPA).
use_rapid = False
if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
engine_name = ocr_engine
elif ocr_engine == "auto":
use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
engine_name = "rapid" if use_rapid else "tesseract"
engine_name = "tesseract"
elif ocr_engine == "rapid":
if not RAPIDOCR_AVAILABLE:
logger.warning("RapidOCR requested but not available, falling back to Tesseract")
@@ -5034,13 +5077,15 @@ def build_cell_grid_v2_streaming(
Yields:
(cell_dict, columns_meta, total_cells)
"""
# Resolve engine
# Resolve engine — default to Tesseract for cell-first OCR.
# Tesseract excels at isolated text crops (binarized, upscaled).
# RapidOCR is optimized for full-page scene-text and produces artifacts
# on small cell crops (extra chars, missing punctuation, garbled IPA).
use_rapid = False
if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
engine_name = ocr_engine
elif ocr_engine == "auto":
use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
engine_name = "rapid" if use_rapid else "tesseract"
engine_name = "tesseract"
elif ocr_engine == "rapid":
if not RAPIDOCR_AVAILABLE:
logger.warning("RapidOCR requested but not available, falling back to Tesseract")

View File

@@ -1416,9 +1416,11 @@ async def _word_batch_stream_generator(
# 2. Send preparing event (keepalive for proxy)
yield f"data: {json.dumps({'type': 'preparing', 'message': 'Cell-First OCR laeuft parallel...'})}\n\n"
# 3. Run batch OCR in thread pool (CPU-bound, don't block event loop)
# 3. Run batch OCR in thread pool with periodic keepalive events.
# The OCR takes 30-60s and proxy servers (Nginx) may drop idle SSE
# connections after 30-60s. Send keepalive every 5s to prevent this.
loop = asyncio.get_event_loop()
cells, columns_meta = await loop.run_in_executor(
ocr_future = loop.run_in_executor(
None,
lambda: build_cell_grid_v2(
ocr_img, col_regions, row_geoms, img_w, img_h,
@@ -1426,6 +1428,25 @@ async def _word_batch_stream_generator(
),
)
# Send keepalive events every 5 seconds while OCR runs
keepalive_count = 0
while not ocr_future.done():
try:
cells, columns_meta = await asyncio.wait_for(
asyncio.shield(ocr_future), timeout=5.0,
)
break # OCR finished
except asyncio.TimeoutError:
keepalive_count += 1
elapsed = int(time.time() - t0)
yield f"data: {json.dumps({'type': 'keepalive', 'elapsed': elapsed, 'message': f'OCR laeuft... ({elapsed}s)'})}\n\n"
if await request.is_disconnected():
logger.info(f"SSE batch: client disconnected during OCR for {session_id}")
ocr_future.cancel()
return
else:
cells, columns_meta = ocr_future.result()
if await request.is_disconnected():
logger.info(f"SSE batch: client disconnected after OCR for {session_id}")
return