Compare commits
3 Commits
dd16c88007
...
6ad4b84584
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6ad4b84584 | ||
|
|
f94a3836f8 | ||
|
|
34c649c8be |
@@ -4201,9 +4201,11 @@ def _attach_example_sentences(entries: List[Dict[str, Any]]) -> List[Dict[str, A
|
|||||||
|
|
||||||
# --- D. Phonetic Bracket IPA Replacement ---
|
# --- D. Phonetic Bracket IPA Replacement ---
|
||||||
|
|
||||||
# Pattern: word [phonetic] or word (phonetic) — capture the word before brackets
|
# Pattern: word followed by any bracket type containing phonetic content.
|
||||||
|
# Tesseract often garbles IPA brackets: [ˈdɑːns] → {'tfatno] or (cy) etc.
|
||||||
|
# Match any opener ([, {, () with any closer (], }, )) — even mixed pairs.
|
||||||
_PHONETIC_BRACKET_RE = re.compile(
|
_PHONETIC_BRACKET_RE = re.compile(
|
||||||
r'(\b[a-zA-ZäöüÄÖÜß]+)\s*\[([^\]]*)\]'
|
r'(\b[a-zA-ZäöüÄÖÜß]+)\s*[\[\{\(]([^\]\}\)]*?)[\]\}\)]'
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -4274,25 +4276,64 @@ def _fix_phonetic_brackets(
|
|||||||
for entry in entries:
|
for entry in entries:
|
||||||
for field in ('english', 'german', 'example'):
|
for field in ('english', 'german', 'example'):
|
||||||
text = entry.get(field, '') or ''
|
text = entry.get(field, '') or ''
|
||||||
if '[' not in text:
|
# Check for any bracket type — Tesseract garbles [ into { or (
|
||||||
|
if not any(ch in text for ch in '[{('):
|
||||||
continue
|
continue
|
||||||
entry[field] = _replace_phonetics_in_text(text, pronunciation)
|
entry[field] = _replace_phonetics_in_text(text, pronunciation)
|
||||||
|
|
||||||
return entries
|
return entries
|
||||||
|
|
||||||
|
|
||||||
|
# German prefixes/words commonly in parentheses in vocab tables: (zer)brechen, Tanz(veranstaltung)
|
||||||
|
_GERMAN_BRACKET_PREFIXES = frozenset({
|
||||||
|
'ab', 'an', 'auf', 'aus', 'be', 'bei', 'dar', 'ein', 'emp', 'ent',
|
||||||
|
'er', 'ge', 'her', 'hin', 'los', 'mit', 'nach', 'um', 'un', 'unter',
|
||||||
|
'ver', 'vor', 'weg', 'zer', 'zu', 'zurück',
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
def _is_meaningful_bracket_content(content: str) -> bool:
|
||||||
|
"""Return True if bracket content is a meaningful word/prefix, not garbled IPA.
|
||||||
|
|
||||||
|
Meaningful: (zer)brechen, (veranstaltung), (Haupt)stadt
|
||||||
|
Garbled IPA: {'tfatno, (cy, 1u], 'daens
|
||||||
|
"""
|
||||||
|
if not content:
|
||||||
|
return False
|
||||||
|
# Must be pure letters (no digits, punctuation, IPA symbols)
|
||||||
|
if not re.match(r'^[a-zA-ZäöüÄÖÜßé]+$', content):
|
||||||
|
return False
|
||||||
|
# Known German prefix
|
||||||
|
if content.lower() in _GERMAN_BRACKET_PREFIXES:
|
||||||
|
return True
|
||||||
|
# Long enough to be a real word (not 1-2 char garbled IPA like "cy")
|
||||||
|
if len(content) >= 4:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str:
|
def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str:
|
||||||
"""Replace [phonetic] after words with dictionary IPA."""
|
"""Replace [phonetic] / {phonetic} / (phonetic) after words with dictionary IPA.
|
||||||
|
|
||||||
|
Tesseract garbles IPA brackets, e.g. China [ˈtʃaɪnə] → China {'tfatno].
|
||||||
|
We match any bracket type and replace with dictionary IPA if found.
|
||||||
|
Legitimate parenthetical content like (zer)brechen or (veranstaltung) is preserved.
|
||||||
|
"""
|
||||||
if not IPA_AVAILABLE:
|
if not IPA_AVAILABLE:
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def replacer(match):
|
def replacer(match):
|
||||||
word = match.group(1)
|
word = match.group(1)
|
||||||
ocr_phonetic = match.group(2)
|
bracket_content = match.group(2).strip()
|
||||||
|
|
||||||
# Skip if bracket content looks like regular text (has spaces + capitals)
|
# Skip if bracket content looks like regular text (more than three words)
|
||||||
if len(ocr_phonetic.split()) > 3:
|
if len(bracket_content.split()) > 3:
|
||||||
return match.group(0) # Keep original
|
return match.group(0)
|
||||||
|
|
||||||
|
# Skip if bracket content is a meaningful word/prefix — e.g. (zer)brechen,
|
||||||
|
# Tanz(veranstaltung). These are real German morphemes, not garbled IPA.
|
||||||
|
if _is_meaningful_bracket_content(bracket_content):
|
||||||
|
return match.group(0)
|
||||||
|
|
||||||
# Look up in IPA dictionary
|
# Look up in IPA dictionary
|
||||||
ipa = _lookup_ipa(word, pronunciation)
|
ipa = _lookup_ipa(word, pronunciation)
|
||||||
@@ -4886,13 +4927,15 @@ def build_cell_grid_v2(
|
|||||||
Drop-in replacement for build_cell_grid() — same signature & return type.
|
Drop-in replacement for build_cell_grid() — same signature & return type.
|
||||||
No full-page word assignment; each cell is OCR'd from its own crop.
|
No full-page word assignment; each cell is OCR'd from its own crop.
|
||||||
"""
|
"""
|
||||||
# Resolve engine
|
# Resolve engine — default to Tesseract for cell-first OCR.
|
||||||
|
# Tesseract excels at isolated text crops (binarized, upscaled).
|
||||||
|
# RapidOCR is optimized for full-page scene-text and produces artifacts
|
||||||
|
# on small cell crops (extra chars, missing punctuation, garbled IPA).
|
||||||
use_rapid = False
|
use_rapid = False
|
||||||
if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
|
if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
|
||||||
engine_name = ocr_engine
|
engine_name = ocr_engine
|
||||||
elif ocr_engine == "auto":
|
elif ocr_engine == "auto":
|
||||||
use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
|
engine_name = "tesseract"
|
||||||
engine_name = "rapid" if use_rapid else "tesseract"
|
|
||||||
elif ocr_engine == "rapid":
|
elif ocr_engine == "rapid":
|
||||||
if not RAPIDOCR_AVAILABLE:
|
if not RAPIDOCR_AVAILABLE:
|
||||||
logger.warning("RapidOCR requested but not available, falling back to Tesseract")
|
logger.warning("RapidOCR requested but not available, falling back to Tesseract")
|
||||||
@@ -5034,13 +5077,15 @@ def build_cell_grid_v2_streaming(
|
|||||||
Yields:
|
Yields:
|
||||||
(cell_dict, columns_meta, total_cells)
|
(cell_dict, columns_meta, total_cells)
|
||||||
"""
|
"""
|
||||||
# Resolve engine
|
# Resolve engine — default to Tesseract for cell-first OCR.
|
||||||
|
# Tesseract excels at isolated text crops (binarized, upscaled).
|
||||||
|
# RapidOCR is optimized for full-page scene-text and produces artifacts
|
||||||
|
# on small cell crops (extra chars, missing punctuation, garbled IPA).
|
||||||
use_rapid = False
|
use_rapid = False
|
||||||
if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
|
if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
|
||||||
engine_name = ocr_engine
|
engine_name = ocr_engine
|
||||||
elif ocr_engine == "auto":
|
elif ocr_engine == "auto":
|
||||||
use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
|
engine_name = "tesseract"
|
||||||
engine_name = "rapid" if use_rapid else "tesseract"
|
|
||||||
elif ocr_engine == "rapid":
|
elif ocr_engine == "rapid":
|
||||||
if not RAPIDOCR_AVAILABLE:
|
if not RAPIDOCR_AVAILABLE:
|
||||||
logger.warning("RapidOCR requested but not available, falling back to Tesseract")
|
logger.warning("RapidOCR requested but not available, falling back to Tesseract")
|
||||||
|
|||||||
@@ -1416,9 +1416,11 @@ async def _word_batch_stream_generator(
|
|||||||
# 2. Send preparing event (keepalive for proxy)
|
# 2. Send preparing event (keepalive for proxy)
|
||||||
yield f"data: {json.dumps({'type': 'preparing', 'message': 'Cell-First OCR laeuft parallel...'})}\n\n"
|
yield f"data: {json.dumps({'type': 'preparing', 'message': 'Cell-First OCR laeuft parallel...'})}\n\n"
|
||||||
|
|
||||||
# 3. Run batch OCR in thread pool (CPU-bound, don't block event loop)
|
# 3. Run batch OCR in thread pool with periodic keepalive events.
|
||||||
|
# The OCR takes 30-60s and proxy servers (Nginx) may drop idle SSE
|
||||||
|
# connections after 30-60s. Send keepalive every 5s to prevent this.
|
||||||
loop = asyncio.get_event_loop()
|
loop = asyncio.get_event_loop()
|
||||||
cells, columns_meta = await loop.run_in_executor(
|
ocr_future = loop.run_in_executor(
|
||||||
None,
|
None,
|
||||||
lambda: build_cell_grid_v2(
|
lambda: build_cell_grid_v2(
|
||||||
ocr_img, col_regions, row_geoms, img_w, img_h,
|
ocr_img, col_regions, row_geoms, img_w, img_h,
|
||||||
@@ -1426,6 +1428,25 @@ async def _word_batch_stream_generator(
|
|||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Send keepalive events every 5 seconds while OCR runs
|
||||||
|
keepalive_count = 0
|
||||||
|
while not ocr_future.done():
|
||||||
|
try:
|
||||||
|
cells, columns_meta = await asyncio.wait_for(
|
||||||
|
asyncio.shield(ocr_future), timeout=5.0,
|
||||||
|
)
|
||||||
|
break # OCR finished
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
keepalive_count += 1
|
||||||
|
elapsed = int(time.time() - t0)
|
||||||
|
yield f"data: {json.dumps({'type': 'keepalive', 'elapsed': elapsed, 'message': f'OCR laeuft... ({elapsed}s)'})}\n\n"
|
||||||
|
if await request.is_disconnected():
|
||||||
|
logger.info(f"SSE batch: client disconnected during OCR for {session_id}")
|
||||||
|
ocr_future.cancel()
|
||||||
|
return
|
||||||
|
else:
|
||||||
|
cells, columns_meta = ocr_future.result()
|
||||||
|
|
||||||
if await request.is_disconnected():
|
if await request.is_disconnected():
|
||||||
logger.info(f"SSE batch: client disconnected after OCR for {session_id}")
|
logger.info(f"SSE batch: client disconnected after OCR for {session_id}")
|
||||||
return
|
return
|
||||||
|
|||||||
Reference in New Issue
Block a user