Compare commits
5 Commits
6ad4b84584
...
fd99d4f875
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
fd99d4f875 | ||
|
|
1e0c6bb4b5 | ||
|
|
e6dc3fcdd7 | ||
|
|
edbdac3203 | ||
|
|
99573a46ef |
@@ -4204,10 +4204,20 @@ def _attach_example_sentences(entries: List[Dict[str, Any]]) -> List[Dict[str, A
|
||||
# Pattern: word followed by any bracket type containing phonetic content.
|
||||
# Tesseract often garbles IPA brackets: [ˈdɑːns] → {'tfatno] or (cy) etc.
|
||||
# Match any opener ([, {, () with any closer (], }, )) — even mixed pairs.
|
||||
# This intentionally matches mixed brackets (e.g. {content]) because
|
||||
# Tesseract frequently misrecognizes bracket characters.
|
||||
_PHONETIC_BRACKET_RE = re.compile(
|
||||
r'(\b[a-zA-ZäöüÄÖÜß]+)\s*[\[\{\(]([^\]\}\)]*?)[\]\}\)]'
|
||||
)
|
||||
|
||||
# Unicode IPA characters — used to distinguish correct IPA (from dictionary
|
||||
# lookup) from garbled OCR content when stripping orphan brackets.
|
||||
_IPA_CHARS = frozenset('ˈˌːɑɒæɛəɜɪɔʊʌðŋθʃʒɹɡɾʔɐ')
|
||||
|
||||
# Minimum word confidence for full-page Tesseract results (0-100).
|
||||
# Words below this threshold are OCR noise (scanner shadows, borders).
|
||||
_MIN_WORD_CONF = 30
|
||||
|
||||
|
||||
def _lookup_ipa(word: str, pronunciation: str = 'british') -> Optional[str]:
|
||||
"""Look up IPA for a word using the selected pronunciation dictionary.
|
||||
@@ -4273,43 +4283,59 @@ def _fix_phonetic_brackets(
|
||||
if not IPA_AVAILABLE:
|
||||
return entries
|
||||
|
||||
# IPA phonetics only appear in the ENGLISH field of vocab tables.
|
||||
# German and example fields contain meaningful parenthetical content:
|
||||
# german: "Eis (gefrorenes Wasser)", "(Salat-)Gurke", "sauer (auf)"
|
||||
# example: "(sich beschweren)", "(brauchen)", "(jammern)"
|
||||
# These must NEVER be processed as phonetic transcriptions.
|
||||
replaced_count = 0
|
||||
for entry in entries:
|
||||
for field in ('english', 'german', 'example'):
|
||||
text = entry.get(field, '') or ''
|
||||
# Check for any bracket type — Tesseract garbles [ into { or (
|
||||
if not any(ch in text for ch in '[{('):
|
||||
continue
|
||||
entry[field] = _replace_phonetics_in_text(text, pronunciation)
|
||||
text = entry.get('english', '') or ''
|
||||
if not any(ch in text for ch in '[{('):
|
||||
continue
|
||||
new_text = _replace_phonetics_in_text(text, pronunciation)
|
||||
if new_text != text:
|
||||
logger.debug(f"_fix_phonetic_brackets: '{text}' → '{new_text}'")
|
||||
replaced_count += 1
|
||||
entry['english'] = new_text
|
||||
|
||||
if replaced_count:
|
||||
logger.info(f"_fix_phonetic_brackets: {replaced_count} IPA replacements in {len(entries)} entries")
|
||||
return entries
|
||||
|
||||
|
||||
# Grammar particles that appear in brackets after English words:
#   cross (with), complain (about/of), agree (on/with), look (sth) up
# These must NOT be replaced with IPA. Only used for the English field
# (German/example fields are never processed for IPA replacement).
_GRAMMAR_BRACKET_WORDS = frozenset({
    # English prepositions/particles commonly in vocab tables
    'with', 'about', 'of', 'for', 'to', 'from', 'in', 'on', 'at', 'by',
    'up', 'out', 'off', 'into', 'over', 'down', 'away', 'back', 'through',
    # English grammar abbreviations used in vocab tables
    'sth', 'sb', 'adj', 'adv',
})


def _is_grammar_bracket_content(content: str) -> bool:
    """Return True if bracket content is grammar info in the ENGLISH field.

    Grammar info: cross (with), complain (about/of), agree (on/with)
    NOT grammar:  [breik], [maus], {'tfatno], (cy), ['kju:kambo], [test]

    The content is split on '/' and every non-empty piece (stripped and
    lower-cased) must be a member of ``_GRAMMAR_BRACKET_WORDS``. Anything
    else, including multi-word content such as "sich beschweren", is
    reported as not-grammar, i.e. treated by callers as (garbled) IPA.

    Args:
        content: Text found between a bracket opener and closer, without
            the bracket characters themselves.

    Returns:
        True only when the content is non-empty and consists solely of
        known grammar particles/abbreviations separated by '/'.
    """
    if not content:
        return False

    # Split on / for patterns like (about/of), (on/with); drop empty pieces
    # so that a bare "/" or trailing slash yields no tokens.
    tokens = [piece.strip().lower() for piece in content.split('/') if piece.strip()]
    if not tokens:
        return False

    # ALL tokens must be known grammar words
    return all(token in _GRAMMAR_BRACKET_WORDS for token in tokens)
|
||||
|
||||
|
||||
def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str:
|
||||
@@ -4325,24 +4351,47 @@ def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str
|
||||
def replacer(match):
|
||||
word = match.group(1)
|
||||
bracket_content = match.group(2).strip()
|
||||
full_match = match.group(0)
|
||||
|
||||
# Skip if bracket content looks like regular text (multiple words)
|
||||
if len(bracket_content.split()) > 3:
|
||||
return match.group(0)
|
||||
return full_match
|
||||
|
||||
# Skip if bracket content is a meaningful word/prefix — e.g. (zer)brechen,
|
||||
# Tanz(veranstaltung). These are real German morphemes, not garbled IPA.
|
||||
if _is_meaningful_bracket_content(bracket_content):
|
||||
return match.group(0)
|
||||
|
||||
# Look up in IPA dictionary
|
||||
# Look up IPA for the word before brackets
|
||||
ipa = _lookup_ipa(word, pronunciation)
|
||||
if not ipa:
|
||||
return match.group(0) # Keep original
|
||||
|
||||
return f"{word} [{ipa}]"
|
||||
if ipa:
|
||||
# Word has IPA → bracket content is phonetic (garbled or correct).
|
||||
# Exception: grammar particles like cross (with) — keep those.
|
||||
if _is_grammar_bracket_content(bracket_content):
|
||||
return full_match
|
||||
logger.debug(f"phonetic: '{full_match}' → '{word} [{ipa}]'")
|
||||
return f"{word} [{ipa}]"
|
||||
|
||||
return _PHONETIC_BRACKET_RE.sub(replacer, text)
|
||||
# No IPA for this word — keep as-is
|
||||
return full_match
|
||||
|
||||
text = _PHONETIC_BRACKET_RE.sub(replacer, text)
|
||||
|
||||
# Second pass: strip remaining orphan brackets that are garbled IPA.
|
||||
# These have no word before them (the main regex requires \b word \s* bracket).
|
||||
# Examples: "[mais]", "{'mani setva]", trailing "(kros]"
|
||||
# Keep: grammar parens "(about/of)", correct IPA "[dˈɑːns]"
|
||||
def _strip_orphan_bracket(m):
|
||||
content = m.group(1).strip()
|
||||
# Keep grammar info: (with), (about/of)
|
||||
if _is_grammar_bracket_content(content):
|
||||
return m.group(0)
|
||||
# Keep correct IPA (contains Unicode IPA characters)
|
||||
if any(ch in _IPA_CHARS for ch in content):
|
||||
return m.group(0)
|
||||
logger.debug(f"phonetic: stripping orphan bracket '{m.group(0)}'")
|
||||
return ''
|
||||
|
||||
text = re.sub(r'[\[\{\(]([^\]\}\)]*)[\]\}\)]', _strip_orphan_bracket, text)
|
||||
text = text.strip()
|
||||
|
||||
return text
|
||||
|
||||
|
||||
def _assign_row_words_to_columns(
|
||||
@@ -4748,6 +4797,7 @@ def _ocr_cell_crop(
|
||||
# Crop boundaries: add small internal padding (3px each side) to avoid
|
||||
# clipping characters near column/row edges (e.g. parentheses, descenders).
|
||||
# Stays within image bounds but may extend slightly beyond strict cell.
|
||||
# 3px is small enough to avoid neighbour content at typical scan DPI (200-300).
|
||||
_PAD = 3
|
||||
cx = max(0, disp_x - _PAD)
|
||||
cy = max(0, disp_y - _PAD)
|
||||
@@ -4774,7 +4824,7 @@ def _ocr_cell_crop(
|
||||
}
|
||||
|
||||
if cw <= 0 or ch <= 0:
|
||||
logger.info("_ocr_cell_crop R%02d_C%d: zero-size crop (%dx%d)", row_idx, col_idx, cw, ch)
|
||||
logger.debug("_ocr_cell_crop R%02d_C%d: zero-size crop (%dx%d)", row_idx, col_idx, cw, ch)
|
||||
return empty_cell
|
||||
|
||||
# --- Pixel-density check: skip truly empty cells ---
|
||||
@@ -4783,7 +4833,7 @@ def _ocr_cell_crop(
|
||||
if crop.size > 0:
|
||||
dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
|
||||
if dark_ratio < 0.005:
|
||||
logger.info("_ocr_cell_crop R%02d_C%d: skip empty (dark_ratio=%.4f, crop=%dx%d)",
|
||||
logger.debug("_ocr_cell_crop R%02d_C%d: skip empty (dark_ratio=%.4f, crop=%dx%d)",
|
||||
row_idx, col_idx, dark_ratio, cw, ch)
|
||||
return empty_cell
|
||||
|
||||
@@ -4824,7 +4874,7 @@ def _ocr_cell_crop(
|
||||
scale_x = up_w / max(crop_w, 1)
|
||||
scale_y = up_h / max(crop_h, 1)
|
||||
was_scaled = (up_w != crop_w or up_h != crop_h)
|
||||
logger.info("_ocr_cell_crop R%02d_C%d: rapid %dx%d -> %dx%d (scale=%.1fx)",
|
||||
logger.debug("_ocr_cell_crop R%02d_C%d: rapid %dx%d -> %dx%d (scale=%.1fx)",
|
||||
row_idx, col_idx, crop_w, crop_h, up_w, up_h, scale_y)
|
||||
tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
|
||||
words = ocr_region_rapid(bgr_up, tmp_region)
|
||||
@@ -4872,10 +4922,10 @@ def _ocr_cell_crop(
|
||||
y_tol = max(15, ch)
|
||||
text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
|
||||
avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
|
||||
logger.info("_ocr_cell_crop R%02d_C%d: OCR raw text=%r conf=%.1f nwords=%d crop=%dx%d psm=%s engine=%s",
|
||||
logger.debug("_ocr_cell_crop R%02d_C%d: OCR raw text=%r conf=%.1f nwords=%d crop=%dx%d psm=%s engine=%s",
|
||||
row_idx, col_idx, text, avg_conf, len(words), cw, ch, psm, engine_name)
|
||||
else:
|
||||
logger.info("_ocr_cell_crop R%02d_C%d: OCR returned NO words (crop=%dx%d psm=%s engine=%s)",
|
||||
logger.debug("_ocr_cell_crop R%02d_C%d: OCR returned NO words (crop=%dx%d psm=%s engine=%s)",
|
||||
row_idx, col_idx, cw, ch, psm, engine_name)
|
||||
|
||||
# --- PSM 7 fallback for still-empty Tesseract cells ---
|
||||
@@ -4901,7 +4951,7 @@ def _ocr_cell_crop(
|
||||
pre_filter = text
|
||||
text = _clean_cell_text_lite(text)
|
||||
if not text:
|
||||
logger.info("_ocr_cell_crop R%02d_C%d: _clean_cell_text_lite REMOVED %r",
|
||||
logger.debug("_ocr_cell_crop R%02d_C%d: _clean_cell_text_lite REMOVED %r",
|
||||
row_idx, col_idx, pre_filter)
|
||||
avg_conf = 0.0
|
||||
|
||||
@@ -4912,6 +4962,23 @@ def _ocr_cell_crop(
|
||||
return result
|
||||
|
||||
|
||||
# Threshold: columns narrower than this (% of image width) use single-cell
|
||||
# crop OCR instead of full-page word assignment.
|
||||
#
|
||||
# Broad columns (>= threshold): Full-page Tesseract word assignment.
|
||||
# Better for multi-word content (sentences, IPA brackets, punctuation).
|
||||
# Examples: EN vocabulary, DE translation, example sentences.
|
||||
#
|
||||
# Narrow columns (< threshold): Isolated cell-crop OCR.
|
||||
# Prevents neighbour bleeding from adjacent broad columns.
|
||||
# Examples: page_ref, marker, numbering columns.
|
||||
#
|
||||
# 15% was empirically validated across vocab table scans with 3-5 columns.
|
||||
# Typical broad columns: 20-40% width. Typical narrow columns: 3-12% width.
|
||||
# The 15% boundary cleanly separates the two groups.
|
||||
_NARROW_COL_THRESHOLD_PCT = 15.0
|
||||
|
||||
|
||||
def build_cell_grid_v2(
|
||||
ocr_img: np.ndarray,
|
||||
column_regions: List[PageRegion],
|
||||
@@ -4922,30 +4989,24 @@ def build_cell_grid_v2(
|
||||
ocr_engine: str = "auto",
|
||||
img_bgr: Optional[np.ndarray] = None,
|
||||
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
|
||||
"""Cell-First Grid: crop each cell in isolation, then OCR.
|
||||
"""Hybrid Grid: full-page OCR for broad columns, cell-crop for narrow ones.
|
||||
|
||||
Drop-in replacement for build_cell_grid() — same signature & return type.
|
||||
No full-page word assignment; each cell is OCR'd from its own crop.
|
||||
|
||||
Strategy:
|
||||
- Broad columns (>=15% image width): Use pre-assigned full-page Tesseract
|
||||
words (from row.words). Handles IPA brackets, punctuation, sentence
|
||||
continuity correctly.
|
||||
- Narrow columns (<15% image width): Use isolated cell-crop OCR to prevent
|
||||
neighbour bleeding from adjacent broad columns.
|
||||
"""
|
||||
# Resolve engine — default to Tesseract for cell-first OCR.
|
||||
# Tesseract excels at isolated text crops (binarized, upscaled).
|
||||
# RapidOCR is optimized for full-page scene-text and produces artifacts
|
||||
# on small cell crops (extra chars, missing punctuation, garbled IPA).
|
||||
use_rapid = False
|
||||
engine_name = "tesseract"
|
||||
if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
|
||||
engine_name = ocr_engine
|
||||
elif ocr_engine == "auto":
|
||||
engine_name = "tesseract"
|
||||
elif ocr_engine == "rapid":
|
||||
if not RAPIDOCR_AVAILABLE:
|
||||
logger.warning("RapidOCR requested but not available, falling back to Tesseract")
|
||||
else:
|
||||
use_rapid = True
|
||||
engine_name = "rapid" if use_rapid else "tesseract"
|
||||
else:
|
||||
engine_name = "tesseract"
|
||||
elif ocr_engine == "rapid" and RAPIDOCR_AVAILABLE:
|
||||
engine_name = "rapid"
|
||||
|
||||
logger.info(f"build_cell_grid_v2: using OCR engine '{engine_name}'")
|
||||
logger.info(f"build_cell_grid_v2: using OCR engine '{engine_name}' (hybrid mode)")
|
||||
|
||||
# Filter to content rows only
|
||||
content_rows = [r for r in row_geometries if r.row_type == 'content']
|
||||
@@ -4980,9 +5041,7 @@ def build_cell_grid_v2(
|
||||
logger.warning("build_cell_grid_v2: no usable columns found")
|
||||
return [], []
|
||||
|
||||
# Heal row gaps — use header/footer boundaries (NOT column bounds!)
|
||||
# In Cell-First OCR, the crop IS the OCR input, so extending into
|
||||
# header/footer means OCR'ing header/footer text ("VOCABULARY", page nums).
|
||||
# Heal row gaps — use header/footer boundaries
|
||||
content_rows.sort(key=lambda r: r.y)
|
||||
header_rows = [r for r in row_geometries if r.row_type == 'header']
|
||||
footer_rows = [r for r in row_geometries if r.row_type == 'footer']
|
||||
@@ -5010,38 +5069,91 @@ def build_cell_grid_v2(
|
||||
'column_example': 'eng+deu',
|
||||
}
|
||||
|
||||
# --- Parallel OCR with ThreadPoolExecutor ---
|
||||
# Tesseract is single-threaded per call, so we benefit from parallelism.
|
||||
# ~40 rows × 4 cols = 160 cells, ~50% empty (density skip) → ~80 OCR calls.
|
||||
# --- Classify columns as broad vs narrow ---
|
||||
narrow_col_indices = set()
|
||||
for ci, col in enumerate(relevant_cols):
|
||||
col_pct = (col.width / img_w * 100) if img_w > 0 else 0
|
||||
if col_pct < _NARROW_COL_THRESHOLD_PCT:
|
||||
narrow_col_indices.add(ci)
|
||||
|
||||
broad_col_count = len(relevant_cols) - len(narrow_col_indices)
|
||||
logger.info(f"build_cell_grid_v2: {broad_col_count} broad columns (full-page), "
|
||||
f"{len(narrow_col_indices)} narrow columns (cell-crop)")
|
||||
|
||||
# --- Phase 1: Broad columns via full-page word assignment ---
|
||||
cells: List[Dict[str, Any]] = []
|
||||
cell_tasks = []
|
||||
|
||||
for row_idx, row in enumerate(content_rows):
|
||||
# Assign full-page words to columns for this row
|
||||
col_words = _assign_row_words_to_columns(row, relevant_cols)
|
||||
|
||||
for col_idx, col in enumerate(relevant_cols):
|
||||
cell_tasks.append((row_idx, col_idx, row, col))
|
||||
if col_idx not in narrow_col_indices:
|
||||
# BROAD column: use pre-assigned full-page words
|
||||
words = col_words.get(col_idx, [])
|
||||
# Filter low-confidence words
|
||||
words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]
|
||||
|
||||
max_workers = 4 if engine_name == "tesseract" else 2
|
||||
if words:
|
||||
y_tol = max(15, row.height)
|
||||
text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
|
||||
avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
|
||||
else:
|
||||
text = ''
|
||||
avg_conf = 0.0
|
||||
|
||||
with ThreadPoolExecutor(max_workers=max_workers) as pool:
|
||||
futures = {
|
||||
pool.submit(
|
||||
_ocr_cell_crop,
|
||||
ri, ci, row, col,
|
||||
ocr_img, img_bgr, img_w, img_h,
|
||||
engine_name, lang, lang_map,
|
||||
): (ri, ci)
|
||||
for ri, ci, row, col in cell_tasks
|
||||
}
|
||||
# Apply noise filter
|
||||
text = _clean_cell_text(text)
|
||||
|
||||
for future in as_completed(futures):
|
||||
try:
|
||||
cell = future.result()
|
||||
cell = {
|
||||
'cell_id': f"R{row_idx:02d}_C{col_idx}",
|
||||
'row_index': row_idx,
|
||||
'col_index': col_idx,
|
||||
'col_type': col.type,
|
||||
'text': text,
|
||||
'confidence': avg_conf,
|
||||
'bbox_px': {
|
||||
'x': col.x, 'y': row.y,
|
||||
'w': col.width, 'h': row.height,
|
||||
},
|
||||
'bbox_pct': {
|
||||
'x': round(col.x / img_w * 100, 2) if img_w else 0,
|
||||
'y': round(row.y / img_h * 100, 2) if img_h else 0,
|
||||
'w': round(col.width / img_w * 100, 2) if img_w else 0,
|
||||
'h': round(row.height / img_h * 100, 2) if img_h else 0,
|
||||
},
|
||||
'ocr_engine': 'word_lookup',
|
||||
}
|
||||
cells.append(cell)
|
||||
except Exception as e:
|
||||
ri, ci = futures[future]
|
||||
logger.error(f"build_cell_grid_v2: cell R{ri:02d}_C{ci} failed: {e}")
|
||||
|
||||
# Sort cells by (row_index, col_index) since futures complete out of order
|
||||
# --- Phase 2: Narrow columns via cell-crop OCR (parallel) ---
|
||||
narrow_tasks = []
|
||||
for row_idx, row in enumerate(content_rows):
|
||||
for col_idx, col in enumerate(relevant_cols):
|
||||
if col_idx in narrow_col_indices:
|
||||
narrow_tasks.append((row_idx, col_idx, row, col))
|
||||
|
||||
if narrow_tasks:
|
||||
max_workers = 4 if engine_name == "tesseract" else 2
|
||||
with ThreadPoolExecutor(max_workers=max_workers) as pool:
|
||||
futures = {
|
||||
pool.submit(
|
||||
_ocr_cell_crop,
|
||||
ri, ci, row, col,
|
||||
ocr_img, img_bgr, img_w, img_h,
|
||||
engine_name, lang, lang_map,
|
||||
): (ri, ci)
|
||||
for ri, ci, row, col in narrow_tasks
|
||||
}
|
||||
for future in as_completed(futures):
|
||||
try:
|
||||
cell = future.result()
|
||||
cells.append(cell)
|
||||
except Exception as e:
|
||||
ri, ci = futures[future]
|
||||
logger.error(f"build_cell_grid_v2: narrow cell R{ri:02d}_C{ci} failed: {e}")
|
||||
|
||||
# Sort cells by (row_index, col_index)
|
||||
cells.sort(key=lambda c: (c['row_index'], c['col_index']))
|
||||
|
||||
# Remove all-empty rows
|
||||
@@ -5057,7 +5169,7 @@ def build_cell_grid_v2(
|
||||
|
||||
logger.info(f"build_cell_grid_v2: {len(cells)} cells from "
|
||||
f"{len(content_rows)} rows × {len(relevant_cols)} columns, "
|
||||
f"engine={engine_name}")
|
||||
f"engine={engine_name} (hybrid)")
|
||||
|
||||
return cells, columns_meta
|
||||
|
||||
|
||||
Reference in New Issue
Block a user