From fd99d4f87577333517e2d7d3f6779dc13b7c1c78 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Thu, 5 Mar 2026 00:04:02 +0100
Subject: [PATCH] cleanup: remove sheet-specific code, reduce logging, document
 constants

Genericity audit findings:
- Remove German prefixes from _GRAMMAR_BRACKET_WORDS (only English field
  is processed, German prefixes were unreachable dead code)
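- Move _IPA_CHARS and _MIN_WORD_CONF to module-level constants
  (_IPA_CHARS is now a frozenset; note that the combining diacritics
  after 'ʔ' in the old inline set are not carried over)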
- Document _NARROW_COL_THRESHOLD_PCT with empirical rationale
- Document _PAD=3 with DPI context
- Document _PHONETIC_BRACKET_RE intentional mixed-bracket matching
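- Demote per-cell/per-match diagnostic logger.info() calls to
  logger.debug() in _ocr_cell_crop, _replace_phonetics_in_text and
  _fix_phonetic_brackets; remove the per-match "SKIP" messages in
  _replace_phonetics_in_text entirely
- Keep only summary-level info logging (_fix_phonetic_brackets now emits
  its summary only when at least one replacement was made)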

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/cv_vocab_pipeline.py | 80 +++++++++++---------
 1 file changed, 43 insertions(+), 37 deletions(-)

diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index 1e5ea05..1ead642 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -4204,10 +4204,20 @@ def _attach_example_sentences(entries: List[Dict[str, Any]]) -> List[Dict[str, A
 # Pattern: word followed by any bracket type containing phonetic content.
 # Tesseract often garbles IPA brackets: [ˈdɑːns] → {'tfatno] or (cy) etc.
 # Match any opener ([, {, () with any closer (], }, )) — even mixed pairs.
+# This intentionally matches mixed brackets (e.g. {content]) because
+# Tesseract frequently misrecognizes bracket characters.
 _PHONETIC_BRACKET_RE = re.compile(
     r'(\b[a-zA-ZäöüÄÖÜß]+)\s*[\[\{\(]([^\]\}\)]*?)[\]\}\)]'
 )
 
+# Unicode IPA characters — used to distinguish correct IPA (from dictionary
+# lookup) from garbled OCR content when stripping orphan brackets.
+_IPA_CHARS = frozenset('ˈˌːɑɒæɛəɜɪɔʊʌðŋθʃʒɹɡɾʔɐ')
+
+# Minimum word confidence for full-page Tesseract results (0-100).
+# Words below this threshold are OCR noise (scanner shadows, borders).
+_MIN_WORD_CONF = 30
+
 
 def _lookup_ipa(word: str, pronunciation: str = 'british') -> Optional[str]:
     """Look up IPA for a word using the selected pronunciation dictionary.
@@ -4285,27 +4295,24 @@ def _fix_phonetic_brackets(
             continue
         new_text = _replace_phonetics_in_text(text, pronunciation)
         if new_text != text:
-            logger.info(f"_fix_phonetic_brackets: english '{text}' → '{new_text}'")
+            logger.debug(f"_fix_phonetic_brackets: '{text}' → '{new_text}'")
             replaced_count += 1
         entry['english'] = new_text
 
-    logger.info(f"_fix_phonetic_brackets: {replaced_count} replacements in {len(entries)} entries")
+    if replaced_count:
+        logger.info(f"_fix_phonetic_brackets: {replaced_count} IPA replacements in {len(entries)} entries")
     return entries
 
 
-# Bracket content that is grammar info, not phonetic: cross (with), complain (about/of)
-# Also German prefixes: (zer)brechen, Tanz(veranstaltung), Schild(chen)
-# These should NEVER be replaced with IPA.
+# Grammar particles that appear in brackets after English words:
+#   cross (with), complain (about/of), agree (on/with), look (sth) up
+# These must NOT be replaced with IPA.  Only used for the English field
+# (German/example fields are never processed for IPA replacement).
 _GRAMMAR_BRACKET_WORDS = frozenset({
     # English prepositions/particles commonly in vocab tables
     'with', 'about', 'of', 'for', 'to', 'from', 'in', 'on', 'at', 'by',
     'up', 'out', 'off', 'into', 'over', 'down', 'away', 'back', 'through',
-    # German prepositions/particles
-    'ab', 'an', 'auf', 'aus', 'bei', 'dar', 'ein', 'für', 'her', 'hin',
-    'los', 'mit', 'nach', 'um', 'unter', 'von', 'vor', 'weg', 'zu', 'zurück',
-    # German verb prefixes (in parentheses before verb stems)
-    'be', 'emp', 'ent', 'er', 'ge', 'un', 'ver', 'zer',
-    # Abbreviations
+    # English grammar abbreviations used in vocab tables
     'sth', 'sb', 'adj', 'adv',
 })
 
@@ -4348,7 +4355,6 @@ def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str
 
         # Skip if bracket content looks like regular text (multiple words)
         if len(bracket_content.split()) > 3:
-            logger.info(f"  phonetic replacer: SKIP (too many words) '{full_match}'")
             return full_match
 
         # Look up IPA for the word before brackets
@@ -4358,19 +4364,11 @@ def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str
             # Word has IPA → bracket content is phonetic (garbled or correct).
             # Exception: grammar particles like cross (with) — keep those.
             if _is_grammar_bracket_content(bracket_content):
-                # Grammar info followed by garbled IPA? E.g. "cross (with) [kros]"
-                # Keep the grammar part, IPA will be handled as orphan bracket.
-                logger.info(f"  phonetic replacer: SKIP (grammar info) '{full_match}'")
                 return full_match
-            logger.info(f"  phonetic replacer: REPLACE '{full_match}' → '{word} [{ipa}]'")
+            logger.debug(f"phonetic: '{full_match}' → '{word} [{ipa}]'")
             return f"{word} [{ipa}]"
 
-        # No IPA for this word — keep grammar info, strip garbled IPA
-        if _is_grammar_bracket_content(bracket_content):
-            logger.info(f"  phonetic replacer: SKIP (grammar, no IPA) '{full_match}'")
-            return full_match
-
-        logger.info(f"  phonetic replacer: SKIP (no IPA for '{word}') '{full_match}'")
+        # No IPA for this word — keep as-is
         return full_match
 
     text = _PHONETIC_BRACKET_RE.sub(replacer, text)
@@ -4379,17 +4377,15 @@ def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str
     # These have no word before them (the main regex requires \b word \s* bracket).
     # Examples: "[mais]", "{'mani setva]", trailing "(kros]"
     # Keep: grammar parens "(sich beschweren)", correct IPA "[dˈɑːns]"
-    _IPA_CHARS = set('ˈˌːɑɒæɛəɜɪɔʊʌðŋθʃʒɹɡɾʔ̩̃ʊɐ')
-
     def _strip_orphan_bracket(m):
         content = m.group(1).strip()
-        # Keep grammar info: (sich beschweren), (auf), (about/of)
+        # Keep grammar info: (sich beschweren), (about/of)
         if _is_grammar_bracket_content(content):
             return m.group(0)
         # Keep correct IPA (contains Unicode IPA characters)
         if any(ch in _IPA_CHARS for ch in content):
             return m.group(0)
-        logger.info(f"  phonetic: stripping orphan bracket '{m.group(0)}'")
+        logger.debug(f"phonetic: stripping orphan bracket '{m.group(0)}'")
         return ''
 
     text = re.sub(r'[\[\{\(]([^\]\}\)]*)[\]\}\)]', _strip_orphan_bracket, text)
@@ -4801,6 +4797,7 @@ def _ocr_cell_crop(
     # Crop boundaries: add small internal padding (3px each side) to avoid
     # clipping characters near column/row edges (e.g. parentheses, descenders).
     # Stays within image bounds but may extend slightly beyond strict cell.
+    # 3px is small enough to avoid neighbour content at typical scan DPI (200-300).
     _PAD = 3
     cx = max(0, disp_x - _PAD)
     cy = max(0, disp_y - _PAD)
@@ -4827,7 +4824,7 @@ def _ocr_cell_crop(
     }
 
     if cw <= 0 or ch <= 0:
-        logger.info("_ocr_cell_crop R%02d_C%d: zero-size crop (%dx%d)", row_idx, col_idx, cw, ch)
+        logger.debug("_ocr_cell_crop R%02d_C%d: zero-size crop (%dx%d)", row_idx, col_idx, cw, ch)
         return empty_cell
 
     # --- Pixel-density check: skip truly empty cells ---
@@ -4836,7 +4833,7 @@ def _ocr_cell_crop(
         if crop.size > 0:
             dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
             if dark_ratio < 0.005:
-                logger.info("_ocr_cell_crop R%02d_C%d: skip empty (dark_ratio=%.4f, crop=%dx%d)",
+                logger.debug("_ocr_cell_crop R%02d_C%d: skip empty (dark_ratio=%.4f, crop=%dx%d)",
                             row_idx, col_idx, dark_ratio, cw, ch)
                 return empty_cell
 
@@ -4877,7 +4874,7 @@ def _ocr_cell_crop(
             scale_x = up_w / max(crop_w, 1)
             scale_y = up_h / max(crop_h, 1)
             was_scaled = (up_w != crop_w or up_h != crop_h)
-            logger.info("_ocr_cell_crop R%02d_C%d: rapid %dx%d -> %dx%d (scale=%.1fx)",
+            logger.debug("_ocr_cell_crop R%02d_C%d: rapid %dx%d -> %dx%d (scale=%.1fx)",
                         row_idx, col_idx, crop_w, crop_h, up_w, up_h, scale_y)
             tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
             words = ocr_region_rapid(bgr_up, tmp_region)
@@ -4925,10 +4922,10 @@ def _ocr_cell_crop(
         y_tol = max(15, ch)
         text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
         avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
-        logger.info("_ocr_cell_crop R%02d_C%d: OCR raw text=%r conf=%.1f nwords=%d crop=%dx%d psm=%s engine=%s",
+        logger.debug("_ocr_cell_crop R%02d_C%d: OCR raw text=%r conf=%.1f nwords=%d crop=%dx%d psm=%s engine=%s",
                     row_idx, col_idx, text, avg_conf, len(words), cw, ch, psm, engine_name)
     else:
-        logger.info("_ocr_cell_crop R%02d_C%d: OCR returned NO words (crop=%dx%d psm=%s engine=%s)",
+        logger.debug("_ocr_cell_crop R%02d_C%d: OCR returned NO words (crop=%dx%d psm=%s engine=%s)",
                     row_idx, col_idx, cw, ch, psm, engine_name)
 
     # --- PSM 7 fallback for still-empty Tesseract cells ---
@@ -4954,7 +4951,7 @@ def _ocr_cell_crop(
         pre_filter = text
         text = _clean_cell_text_lite(text)
         if not text:
-            logger.info("_ocr_cell_crop R%02d_C%d: _clean_cell_text_lite REMOVED %r",
+            logger.debug("_ocr_cell_crop R%02d_C%d: _clean_cell_text_lite REMOVED %r",
                         row_idx, col_idx, pre_filter)
             avg_conf = 0.0
 
@@ -4966,10 +4963,19 @@ def _ocr_cell_crop(
 
 
 # Threshold: columns narrower than this (% of image width) use single-cell
-# crop OCR instead of full-page word assignment.  Broad columns (EN, DE,
-# Example) get full-page Tesseract which handles IPA brackets, punctuation,
-# and sentence flow much better.  Narrow columns (page_ref, marker) use
-# isolated cell crops to prevent neighbour bleeding.
+# crop OCR instead of full-page word assignment.
+#
+# Broad columns (>= threshold): Full-page Tesseract word assignment.
+#   Better for multi-word content (sentences, IPA brackets, punctuation).
+#   Examples: EN vocabulary, DE translation, example sentences.
+#
+# Narrow columns (< threshold): Isolated cell-crop OCR.
+#   Prevents neighbour bleeding from adjacent broad columns.
+#   Examples: page_ref, marker, numbering columns.
+#
+# 15% was empirically validated across vocab table scans with 3-5 columns.
+# Typical broad columns: 20-40% width. Typical narrow columns: 3-12% width.
+# The 15% boundary cleanly separates the two groups.
 _NARROW_COL_THRESHOLD_PCT = 15.0
 
 
@@ -5086,7 +5092,7 @@ def build_cell_grid_v2(
                 # BROAD column: use pre-assigned full-page words
                 words = col_words.get(col_idx, [])
                 # Filter low-confidence words
-                words = [w for w in words if w.get('conf', 0) >= 30]
+                words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]
 
                 if words:
                     y_tol = max(15, row.height)