cleanup: remove sheet-specific code, reduce logging, document constants
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m59s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 17s

Genericity audit findings:
- Remove German prefixes from _GRAMMAR_BRACKET_WORDS (only the English
  field is processed, so the German prefixes were unreachable dead code)
- Move _IPA_CHARS to a module-level constant and introduce _MIN_WORD_CONF to replace the hard-coded word-confidence threshold (30)
- Document _NARROW_COL_THRESHOLD_PCT with empirical rationale
- Document _PAD=3 with DPI context
- Document _PHONETIC_BRACKET_RE intentional mixed-bracket matching
- Reduce all diagnostic logger.info() to logger.debug() in:
  _ocr_cell_crop, _replace_phonetics_in_text, _fix_phonetic_brackets
- Keep only summary-level info logging

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-05 00:04:02 +01:00
parent 1e0c6bb4b5
commit fd99d4f875

View File

@@ -4204,10 +4204,20 @@ def _attach_example_sentences(entries: List[Dict[str, Any]]) -> List[Dict[str, A
# Pattern: word followed by any bracket type containing phonetic content. # Pattern: word followed by any bracket type containing phonetic content.
# Tesseract often garbles IPA brackets: [ˈdɑːns] → {'tfatno] or (cy) etc. # Tesseract often garbles IPA brackets: [ˈdɑːns] → {'tfatno] or (cy) etc.
# Match any opener ([, {, () with any closer (], }, )) — even mixed pairs. # Match any opener ([, {, () with any closer (], }, )) — even mixed pairs.
# This intentionally matches mixed brackets (e.g. {content]) because
# Tesseract frequently misrecognizes bracket characters.
_PHONETIC_BRACKET_RE = re.compile( _PHONETIC_BRACKET_RE = re.compile(
r'(\b[a-zA-ZäöüÄÖÜß]+)\s*[\[\{\(]([^\]\}\)]*?)[\]\}\)]' r'(\b[a-zA-ZäöüÄÖÜß]+)\s*[\[\{\(]([^\]\}\)]*?)[\]\}\)]'
) )
# Unicode IPA characters — used to distinguish correct IPA (from dictionary
# lookup) from garbled OCR content when stripping orphan brackets.
_IPA_CHARS = frozenset('ˈˌːɑɒæɛəɜɪɔʊʌðŋθʃʒɹɡɾʔɐ')
# Minimum word confidence for full-page Tesseract results (0-100).
# Words below this threshold are OCR noise (scanner shadows, borders).
_MIN_WORD_CONF = 30
def _lookup_ipa(word: str, pronunciation: str = 'british') -> Optional[str]: def _lookup_ipa(word: str, pronunciation: str = 'british') -> Optional[str]:
"""Look up IPA for a word using the selected pronunciation dictionary. """Look up IPA for a word using the selected pronunciation dictionary.
@@ -4285,27 +4295,24 @@ def _fix_phonetic_brackets(
continue continue
new_text = _replace_phonetics_in_text(text, pronunciation) new_text = _replace_phonetics_in_text(text, pronunciation)
if new_text != text: if new_text != text:
logger.info(f"_fix_phonetic_brackets: english '{text}''{new_text}'") logger.debug(f"_fix_phonetic_brackets: '{text}''{new_text}'")
replaced_count += 1 replaced_count += 1
entry['english'] = new_text entry['english'] = new_text
logger.info(f"_fix_phonetic_brackets: {replaced_count} replacements in {len(entries)} entries") if replaced_count:
logger.info(f"_fix_phonetic_brackets: {replaced_count} IPA replacements in {len(entries)} entries")
return entries return entries
# Bracket content that is grammar info, not phonetic: cross (with), complain (about/of) # Grammar particles that appear in brackets after English words:
# Also German prefixes: (zer)brechen, Tanz(veranstaltung), Schild(chen) # cross (with), complain (about/of), agree (on/with), look (sth) up
# These should NEVER be replaced with IPA. # These must NOT be replaced with IPA. Only used for the English field
# (German/example fields are never processed for IPA replacement).
_GRAMMAR_BRACKET_WORDS = frozenset({ _GRAMMAR_BRACKET_WORDS = frozenset({
# English prepositions/particles commonly in vocab tables # English prepositions/particles commonly in vocab tables
'with', 'about', 'of', 'for', 'to', 'from', 'in', 'on', 'at', 'by', 'with', 'about', 'of', 'for', 'to', 'from', 'in', 'on', 'at', 'by',
'up', 'out', 'off', 'into', 'over', 'down', 'away', 'back', 'through', 'up', 'out', 'off', 'into', 'over', 'down', 'away', 'back', 'through',
# German prepositions/particles # English grammar abbreviations used in vocab tables
'ab', 'an', 'auf', 'aus', 'bei', 'dar', 'ein', 'für', 'her', 'hin',
'los', 'mit', 'nach', 'um', 'unter', 'von', 'vor', 'weg', 'zu', 'zurück',
# German verb prefixes (in parentheses before verb stems)
'be', 'emp', 'ent', 'er', 'ge', 'un', 'ver', 'zer',
# Abbreviations
'sth', 'sb', 'adj', 'adv', 'sth', 'sb', 'adj', 'adv',
}) })
@@ -4348,7 +4355,6 @@ def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str
# Skip if bracket content looks like regular text (multiple words) # Skip if bracket content looks like regular text (multiple words)
if len(bracket_content.split()) > 3: if len(bracket_content.split()) > 3:
logger.info(f" phonetic replacer: SKIP (too many words) '{full_match}'")
return full_match return full_match
# Look up IPA for the word before brackets # Look up IPA for the word before brackets
@@ -4358,19 +4364,11 @@ def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str
# Word has IPA → bracket content is phonetic (garbled or correct). # Word has IPA → bracket content is phonetic (garbled or correct).
# Exception: grammar particles like cross (with) — keep those. # Exception: grammar particles like cross (with) — keep those.
if _is_grammar_bracket_content(bracket_content): if _is_grammar_bracket_content(bracket_content):
# Grammar info followed by garbled IPA? E.g. "cross (with) [kros]"
# Keep the grammar part, IPA will be handled as orphan bracket.
logger.info(f" phonetic replacer: SKIP (grammar info) '{full_match}'")
return full_match return full_match
logger.info(f" phonetic replacer: REPLACE '{full_match}''{word} [{ipa}]'") logger.debug(f"phonetic: '{full_match}''{word} [{ipa}]'")
return f"{word} [{ipa}]" return f"{word} [{ipa}]"
# No IPA for this word — keep grammar info, strip garbled IPA # No IPA for this word — keep as-is
if _is_grammar_bracket_content(bracket_content):
logger.info(f" phonetic replacer: SKIP (grammar, no IPA) '{full_match}'")
return full_match
logger.info(f" phonetic replacer: SKIP (no IPA for '{word}') '{full_match}'")
return full_match return full_match
text = _PHONETIC_BRACKET_RE.sub(replacer, text) text = _PHONETIC_BRACKET_RE.sub(replacer, text)
@@ -4379,17 +4377,15 @@ def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str
# These have no word before them (the main regex requires \b word \s* bracket). # These have no word before them (the main regex requires \b word \s* bracket).
# Examples: "[mais]", "{'mani setva]", trailing "(kros]" # Examples: "[mais]", "{'mani setva]", trailing "(kros]"
# Keep: grammar parens "(sich beschweren)", correct IPA "[dˈɑːns]" # Keep: grammar parens "(sich beschweren)", correct IPA "[dˈɑːns]"
_IPA_CHARS = set('ˈˌːɑɒæɛəɜɪɔʊʌðŋθʃʒɹɡɾʔ̩̃ʊɐ')
def _strip_orphan_bracket(m): def _strip_orphan_bracket(m):
content = m.group(1).strip() content = m.group(1).strip()
# Keep grammar info: (sich beschweren), (auf), (about/of) # Keep grammar info: (sich beschweren), (about/of)
if _is_grammar_bracket_content(content): if _is_grammar_bracket_content(content):
return m.group(0) return m.group(0)
# Keep correct IPA (contains Unicode IPA characters) # Keep correct IPA (contains Unicode IPA characters)
if any(ch in _IPA_CHARS for ch in content): if any(ch in _IPA_CHARS for ch in content):
return m.group(0) return m.group(0)
logger.info(f" phonetic: stripping orphan bracket '{m.group(0)}'") logger.debug(f"phonetic: stripping orphan bracket '{m.group(0)}'")
return '' return ''
text = re.sub(r'[\[\{\(]([^\]\}\)]*)[\]\}\)]', _strip_orphan_bracket, text) text = re.sub(r'[\[\{\(]([^\]\}\)]*)[\]\}\)]', _strip_orphan_bracket, text)
@@ -4801,6 +4797,7 @@ def _ocr_cell_crop(
# Crop boundaries: add small internal padding (3px each side) to avoid # Crop boundaries: add small internal padding (3px each side) to avoid
# clipping characters near column/row edges (e.g. parentheses, descenders). # clipping characters near column/row edges (e.g. parentheses, descenders).
# Stays within image bounds but may extend slightly beyond strict cell. # Stays within image bounds but may extend slightly beyond strict cell.
# 3px is small enough to avoid neighbour content at typical scan DPI (200-300).
_PAD = 3 _PAD = 3
cx = max(0, disp_x - _PAD) cx = max(0, disp_x - _PAD)
cy = max(0, disp_y - _PAD) cy = max(0, disp_y - _PAD)
@@ -4827,7 +4824,7 @@ def _ocr_cell_crop(
} }
if cw <= 0 or ch <= 0: if cw <= 0 or ch <= 0:
logger.info("_ocr_cell_crop R%02d_C%d: zero-size crop (%dx%d)", row_idx, col_idx, cw, ch) logger.debug("_ocr_cell_crop R%02d_C%d: zero-size crop (%dx%d)", row_idx, col_idx, cw, ch)
return empty_cell return empty_cell
# --- Pixel-density check: skip truly empty cells --- # --- Pixel-density check: skip truly empty cells ---
@@ -4836,7 +4833,7 @@ def _ocr_cell_crop(
if crop.size > 0: if crop.size > 0:
dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
if dark_ratio < 0.005: if dark_ratio < 0.005:
logger.info("_ocr_cell_crop R%02d_C%d: skip empty (dark_ratio=%.4f, crop=%dx%d)", logger.debug("_ocr_cell_crop R%02d_C%d: skip empty (dark_ratio=%.4f, crop=%dx%d)",
row_idx, col_idx, dark_ratio, cw, ch) row_idx, col_idx, dark_ratio, cw, ch)
return empty_cell return empty_cell
@@ -4877,7 +4874,7 @@ def _ocr_cell_crop(
scale_x = up_w / max(crop_w, 1) scale_x = up_w / max(crop_w, 1)
scale_y = up_h / max(crop_h, 1) scale_y = up_h / max(crop_h, 1)
was_scaled = (up_w != crop_w or up_h != crop_h) was_scaled = (up_w != crop_w or up_h != crop_h)
logger.info("_ocr_cell_crop R%02d_C%d: rapid %dx%d -> %dx%d (scale=%.1fx)", logger.debug("_ocr_cell_crop R%02d_C%d: rapid %dx%d -> %dx%d (scale=%.1fx)",
row_idx, col_idx, crop_w, crop_h, up_w, up_h, scale_y) row_idx, col_idx, crop_w, crop_h, up_w, up_h, scale_y)
tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h) tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
words = ocr_region_rapid(bgr_up, tmp_region) words = ocr_region_rapid(bgr_up, tmp_region)
@@ -4925,10 +4922,10 @@ def _ocr_cell_crop(
y_tol = max(15, ch) y_tol = max(15, ch)
text = _words_to_reading_order_text(words, y_tolerance_px=y_tol) text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
avg_conf = round(sum(w['conf'] for w in words) / len(words), 1) avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
logger.info("_ocr_cell_crop R%02d_C%d: OCR raw text=%r conf=%.1f nwords=%d crop=%dx%d psm=%s engine=%s", logger.debug("_ocr_cell_crop R%02d_C%d: OCR raw text=%r conf=%.1f nwords=%d crop=%dx%d psm=%s engine=%s",
row_idx, col_idx, text, avg_conf, len(words), cw, ch, psm, engine_name) row_idx, col_idx, text, avg_conf, len(words), cw, ch, psm, engine_name)
else: else:
logger.info("_ocr_cell_crop R%02d_C%d: OCR returned NO words (crop=%dx%d psm=%s engine=%s)", logger.debug("_ocr_cell_crop R%02d_C%d: OCR returned NO words (crop=%dx%d psm=%s engine=%s)",
row_idx, col_idx, cw, ch, psm, engine_name) row_idx, col_idx, cw, ch, psm, engine_name)
# --- PSM 7 fallback for still-empty Tesseract cells --- # --- PSM 7 fallback for still-empty Tesseract cells ---
@@ -4954,7 +4951,7 @@ def _ocr_cell_crop(
pre_filter = text pre_filter = text
text = _clean_cell_text_lite(text) text = _clean_cell_text_lite(text)
if not text: if not text:
logger.info("_ocr_cell_crop R%02d_C%d: _clean_cell_text_lite REMOVED %r", logger.debug("_ocr_cell_crop R%02d_C%d: _clean_cell_text_lite REMOVED %r",
row_idx, col_idx, pre_filter) row_idx, col_idx, pre_filter)
avg_conf = 0.0 avg_conf = 0.0
@@ -4966,10 +4963,19 @@ def _ocr_cell_crop(
# Threshold: columns narrower than this (% of image width) use single-cell # Threshold: columns narrower than this (% of image width) use single-cell
# crop OCR instead of full-page word assignment. Broad columns (EN, DE, # crop OCR instead of full-page word assignment.
# Example) get full-page Tesseract which handles IPA brackets, punctuation, #
# and sentence flow much better. Narrow columns (page_ref, marker) use # Broad columns (>= threshold): Full-page Tesseract word assignment.
# isolated cell crops to prevent neighbour bleeding. # Better for multi-word content (sentences, IPA brackets, punctuation).
# Examples: EN vocabulary, DE translation, example sentences.
#
# Narrow columns (< threshold): Isolated cell-crop OCR.
# Prevents neighbour bleeding from adjacent broad columns.
# Examples: page_ref, marker, numbering columns.
#
# 15% was empirically validated across vocab table scans with 3-5 columns.
# Typical broad columns: 20-40% width. Typical narrow columns: 3-12% width.
# The 15% boundary cleanly separates the two groups.
_NARROW_COL_THRESHOLD_PCT = 15.0 _NARROW_COL_THRESHOLD_PCT = 15.0
@@ -5086,7 +5092,7 @@ def build_cell_grid_v2(
# BROAD column: use pre-assigned full-page words # BROAD column: use pre-assigned full-page words
words = col_words.get(col_idx, []) words = col_words.get(col_idx, [])
# Filter low-confidence words # Filter low-confidence words
words = [w for w in words if w.get('conf', 0) >= 30] words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]
if words: if words:
y_tol = max(15, row.height) y_tol = max(15, row.height)