From e3f939a6282f5dd5e53cd66ffbd02dd295e83793 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Mon, 2 Mar 2026 09:27:30 +0100 Subject: [PATCH] refactor(ocr-pipeline): make post-processing fully generic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three non-generic solutions replaced with universal heuristics: 1. Cell-OCR fallback: instead of restricting to column_en/column_de, now checks pixel density (>2% dark pixels) for ANY column type. Truly empty cells are skipped without running Tesseract. 2. Example-sentence detection: instead of checking for example-column text (worksheet-specific), now uses sentence heuristics (>=4 words or ends with sentence punctuation). Short EN text without DE is kept as a vocab entry (OCR may have missed the translation). 3. Comma-split: re-enabled with singular/plural detection. Pairs like "mouse, mice" / "Maus, Mäuse" are kept together. Verb forms like "break, broke, broken" are still split into individual entries. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_vocab_pipeline.py | 91 ++++++++++++++++---- klausur-service/backend/ocr_pipeline_api.py | 8 +- 2 files changed, 76 insertions(+), 23 deletions(-) diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index eea7a64..87353be 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -2713,14 +2713,54 @@ def _fix_character_confusion(entries: List[Dict[str, Any]]) -> List[Dict[str, An # --- B. Comma-Separated Word Form Splitting --- +def _is_singular_plural_pair(parts: List[str]) -> bool: + """Detect if comma-separated parts are singular/plural forms of the same word. + + E.g. "mouse, mice" or "Maus, Mäuse" → True (should NOT be split). + "break, broke, broken" → False (different verb forms, OK to split). 
+ + Heuristic: exactly 2 parts that share a common prefix of >= 50% length, + OR one part is a known plural suffix of the other (e.g. +s, +es, +en). + """ + if len(parts) != 2: + return False + + a, b = parts[0].lower().strip(), parts[1].lower().strip() + if not a or not b: + return False + + # Common prefix heuristic: if words share >= 50% of the shorter word, + # they are likely forms of the same word (Maus/Mäuse, child/children). + min_len = min(len(a), len(b)) + common = 0 + for ca, cb in zip(a, b): + if ca == cb: + common += 1 + else: + break + if common >= max(2, min_len * 0.5): + return True + + # Umlaut relation: one form adds umlaut (a→ä, o→ö, u→ü) + umlaut_map = str.maketrans('aou', 'äöü') + if a.translate(umlaut_map) == b or b.translate(umlaut_map) == a: + return True + + return False + + def _split_comma_entries(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """Split entries with comma-separated word forms into individual entries. E.g. EN: "break, broke, broken" / DE: "brechen, brach, gebrochen" → 3 entries: break/brechen, broke/brach, broken/gebrochen + Does NOT split singular/plural pairs like "mouse, mice" / "Maus, Mäuse" + because those are forms of the same vocabulary entry. + Only splits when both EN and DE have the same number of comma-parts, - or when one side has multiple and the other has exactly one. + parts are short (word forms, not sentences), and the parts are not a + singular/plural pair of the same word (those belong together as one entry). 
""" result: List[Dict[str, Any]] = [] @@ -2732,13 +2772,17 @@ def _split_comma_entries(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]: en_parts = _split_by_comma(en) de_parts = _split_by_comma(de) - # Only split if we have multiple parts and counts match or one side is single + # Only split if we have multiple parts and counts match should_split = False if len(en_parts) > 1 and len(de_parts) > 1 and len(en_parts) == len(de_parts): - # Both have same count — each part is a word form - # But only if parts are short (word forms, not sentences) + # All parts must be short (word forms, not sentences) if all(len(p.split()) <= 3 for p in en_parts) and all(len(p.split()) <= 3 for p in de_parts): - should_split = True + # Do NOT split singular/plural pairs (2 parts that are + # forms of the same word) + if _is_singular_plural_pair(en_parts) or _is_singular_plural_pair(de_parts): + should_split = False + else: + should_split = True if not should_split: result.append(entry) @@ -2872,13 +2916,18 @@ def _attach_example_sentences(entries: List[Dict[str, Any]]) -> List[Dict[str, A # "Ei" (2 chars) is a valid German word, so threshold is 1. has_de = len(de) > 1 has_en = bool(en) - has_ex = bool(ex) - # A row is an example candidate ONLY if it has EN text but - # NO DE translation AND NO example-column text. Rows with - # text in the example column are real vocab entries (e.g. - # continuation lines like "stand ..." / "German: Ich möchte..."). - is_example_candidate = has_en and not has_de and not has_ex and vocab_entries + # Heuristic: a row without DE is an "example sentence" only if + # the EN text looks like a sentence (>= 4 words, or contains + # typical sentence punctuation). Short EN text (1-3 words) is + # more likely a vocab entry whose DE was missed by OCR. 
+ _looks_like_sentence = ( + len(en.split()) >= 4 + or en.rstrip().endswith(('.', '!', '?')) + ) + is_example_candidate = ( + has_en and not has_de and _looks_like_sentence and vocab_entries + ) if is_example_candidate: # This is an example sentence — find best matching vocab entry @@ -3127,12 +3176,20 @@ def _ocr_single_cell( # --- FALLBACK: Cell-OCR for empty cells --- # Full-page Tesseract can miss small or isolated words (e.g. "Ei"). # Re-run OCR on the cell crop to catch what word-lookup missed. - # Only run fallback for EN/DE columns (where vocab words are expected). - # Example columns are often legitimately empty and running Tesseract on - # all of them wastes ~10s. column_example cells stay empty if word-lookup - # found nothing. - _fallback_col_types = {'column_en', 'column_de'} - if not text.strip() and cell_w > 0 and cell_h > 0 and col.type in _fallback_col_types: + # To avoid wasting time on truly empty cells, check pixel density first: + # only run Tesseract if the cell crop contains enough dark pixels to + # plausibly contain text. + _run_fallback = False + if not text.strip() and cell_w > 0 and cell_h > 0: + # Quick pixel-density check: binarise the cell crop and count + # dark pixels. Text cells typically have >2% ink coverage. 
+ if ocr_img is not None: + crop = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w] + if crop.size > 0: + # Threshold: pixels darker than 180 (on 0-255 grayscale) + dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size + _run_fallback = dark_ratio > 0.02 + if _run_fallback: cell_region = PageRegion( type=col.type, x=cell_x, y=cell_y, diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py index e900e9c..24f1d4a 100644 --- a/klausur-service/backend/ocr_pipeline_api.py +++ b/klausur-service/backend/ocr_pipeline_api.py @@ -1179,9 +1179,7 @@ async def detect_words( entries = _cells_to_vocab_entries(cells, columns_meta) entries = _fix_character_confusion(entries) entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation) - # NOTE: _split_comma_entries disabled — word forms like "mouse, mice" - # / "Maus, Mäuse" belong together in one entry. - # entries = _split_comma_entries(entries) + entries = _split_comma_entries(entries) entries = _attach_example_sentences(entries) word_result["vocab_entries"] = entries # Also keep "entries" key for backwards compatibility @@ -1310,9 +1308,7 @@ async def _word_stream_generator( entries = _cells_to_vocab_entries(all_cells, columns_meta) entries = _fix_character_confusion(entries) entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation) - # NOTE: _split_comma_entries disabled — word forms like "mouse, mice" - # / "Maus, Mäuse" belong together in one entry. - # entries = _split_comma_entries(entries) + entries = _split_comma_entries(entries) entries = _attach_example_sentences(entries) word_result["vocab_entries"] = entries word_result["entries"] = entries