Fix: Remove broken getKlausurApiUrl and clean up empty lines
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
sed replacement left orphaned hostname references in story page and empty lines in getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
318
klausur-service/backend/cv_ocr_vocab_postprocess.py
Normal file
318
klausur-service/backend/cv_ocr_vocab_postprocess.py
Normal file
@@ -0,0 +1,318 @@
|
||||
"""
|
||||
Vocab postprocessing: deterministic quality fixes for OCR-extracted vocabulary.
|
||||
|
||||
- Character confusion fix (I/1/l/|)
|
||||
- Comma-separated word form splitting
|
||||
- Example sentence attachment to matching vocab entries
|
||||
|
||||
Split from cv_ocr_engines.py for maintainability.
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Any, Dict, List
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Post-Processing: Deterministic Quality Fixes
|
||||
# =============================================================================
|
||||
|
||||
# --- A. Character Confusion Fix (I/1/l) ---
|
||||
|
||||
# Common OCR confusion pairs in vocabulary context
|
||||
_CHAR_CONFUSION_RULES = [
|
||||
# "1" at word start followed by lowercase → likely "I" or "l"
|
||||
# Exception: NOT before "." or "," (numbered list prefix: "1. Kreuz", "1, 2, 3")
|
||||
(re.compile(r'\b1([a-z])'), r'I\1'), # 1ch → Ich, 1want → Iwant
|
||||
# Standalone "1" → "I" (English pronoun), but NOT "1." or "1," (list number)
|
||||
(re.compile(r'(?<!\d)\b1\b(?![\d.,])'), 'I'), # "1 want" → "I want"
|
||||
# "|" → "I", but NOT when embedded between letters (syllable divider: Ka|me|rad)
|
||||
# and NOT "|." or "|," (those are "1." list prefixes → spell-checker handles them)
|
||||
(re.compile(r'(?<![a-zA-ZäöüÄÖÜß])\|(?!\||[.,])'), 'I'), # |ch → Ich, | want → I want
|
||||
]
|
||||
|
||||
# Cross-language indicators: if DE has these, EN "1" is almost certainly "I"
|
||||
_DE_INDICATORS_FOR_EN_I = {'ich', 'mich', 'mir', 'mein', 'meine', 'meiner', 'meinem'}
|
||||
|
||||
|
||||
def _fix_character_confusion(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||
"""Fix common OCR character confusions using context.
|
||||
|
||||
Deterministic rules:
|
||||
- "1" at word start → "I" or "l" based on context
|
||||
- Cross-reference EN↔DE: if DE contains "ich/mich/mir", EN "1" → "I"
|
||||
- "y " artifact at word boundaries → remove (e.g. "y you" → "you")
|
||||
"""
|
||||
for entry in entries:
|
||||
en = entry.get('english', '') or ''
|
||||
de = entry.get('german', '') or ''
|
||||
ex = entry.get('example', '') or ''
|
||||
|
||||
# Apply general rules to all fields
|
||||
for pattern, replacement in _CHAR_CONFUSION_RULES:
|
||||
en = pattern.sub(replacement, en)
|
||||
de = pattern.sub(replacement, de)
|
||||
ex = pattern.sub(replacement, ex)
|
||||
|
||||
# Cross-reference: if DE has "ich"/"mich" indicators, fix EN "1" → "I"
|
||||
de_lower_words = set(de.lower().replace(',', ' ').split())
|
||||
if de_lower_words & _DE_INDICATORS_FOR_EN_I:
|
||||
# Any remaining "1" in EN that looks like "I"
|
||||
en = re.sub(r'\b1\b(?![\d.,])', 'I', en)
|
||||
|
||||
# Fix "y " artifact before repeated word: "y you" → "you"
|
||||
en = re.sub(r'\by\s+([a-z])', r'\1', en)
|
||||
ex = re.sub(r'\by\s+([a-z])', r'\1', ex)
|
||||
|
||||
entry['english'] = en.strip()
|
||||
entry['german'] = de.strip()
|
||||
entry['example'] = ex.strip()
|
||||
|
||||
return entries
|
||||
|
||||
|
||||
# --- B. Comma-Separated Word Form Splitting ---
|
||||
|
||||
def _is_singular_plural_pair(parts: List[str]) -> bool:
|
||||
"""Detect if comma-separated parts are singular/plural forms of the same word.
|
||||
|
||||
E.g. "mouse, mice" or "Maus, Mäuse" → True (should NOT be split).
|
||||
"break, broke, broken" → False (different verb forms, OK to split).
|
||||
|
||||
Heuristic: exactly 2 parts that share a common prefix of >= 50% length,
|
||||
OR one part is a known plural suffix of the other (e.g. +s, +es, +en).
|
||||
"""
|
||||
if len(parts) != 2:
|
||||
return False
|
||||
|
||||
a, b = parts[0].lower().strip(), parts[1].lower().strip()
|
||||
if not a or not b:
|
||||
return False
|
||||
|
||||
# Common prefix heuristic: if words share >= 50% of the shorter word,
|
||||
# they are likely forms of the same word (Maus/Mäuse, child/children).
|
||||
min_len = min(len(a), len(b))
|
||||
common = 0
|
||||
for ca, cb in zip(a, b):
|
||||
if ca == cb:
|
||||
common += 1
|
||||
else:
|
||||
break
|
||||
if common >= max(2, min_len * 0.5):
|
||||
return True
|
||||
|
||||
# Umlaut relation: one form adds umlaut (a→ä, o→ö, u→ü)
|
||||
umlaut_map = str.maketrans('aou', 'äöü')
|
||||
if a.translate(umlaut_map) == b or b.translate(umlaut_map) == a:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def _split_comma_entries(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Expand entries whose EN and DE sides hold parallel comma-separated forms.

    E.g. EN "break, broke, broken" / DE "brechen, brach, gebrochen" becomes
    three entries: break/brechen, broke/brach, broken/gebrochen.

    An entry is split only when ALL of the following hold:
    - EN and DE split (via ``_split_by_comma``) into the same number (>1) of parts,
    - every part on both sides is at most 3 words (word forms, not sentences),
    - neither side is a singular/plural pair per ``_is_singular_plural_pair``
      ("mouse, mice" / "Maus, Mäuse" is ONE vocabulary entry, not two).

    Split-off entries get ``example`` cleared (examples are re-attached in a
    later pass) and are flagged with ``split_from_comma``. Finally every entry
    is re-numbered via ``row_index``.
    """
    expanded: List[Dict[str, Any]] = []

    for entry in entries:
        en_text = (entry.get('english', '') or '').strip()
        de_text = (entry.get('german', '') or '').strip()

        # Top-level comma split (commas inside [...] / (...) are ignored).
        en_parts = _split_by_comma(en_text)
        de_parts = _split_by_comma(de_text)

        splittable = (
            len(en_parts) > 1
            and len(en_parts) == len(de_parts)
            # All parts must be short word forms, not sentences.
            and all(len(p.split()) <= 3 for p in en_parts)
            and all(len(p.split()) <= 3 for p in de_parts)
            # Never split forms of the same word (singular/plural pairs).
            and not _is_singular_plural_pair(en_parts)
            and not _is_singular_plural_pair(de_parts)
        )

        if not splittable:
            expanded.append(entry)
            continue

        # Fan the entry out into one row per parallel EN/DE form.
        for en_form, de_form in zip(en_parts, de_parts):
            piece = dict(entry)  # shallow copy keeps unrelated metadata
            piece['english'] = en_form.strip()
            piece['german'] = de_form.strip()
            piece['example'] = ''  # examples get attached later
            piece['split_from_comma'] = True
            expanded.append(piece)

    # Re-number rows sequentially.
    for idx, item in enumerate(expanded):
        item['row_index'] = idx

    return expanded
|
||||
|
||||
|
||||
def _split_by_comma(text: str) -> List[str]:
|
||||
"""Split text by commas, but not inside brackets [...] or parens (...)."""
|
||||
if ',' not in text:
|
||||
return [text]
|
||||
|
||||
parts = []
|
||||
depth_bracket = 0
|
||||
depth_paren = 0
|
||||
current = []
|
||||
|
||||
for ch in text:
|
||||
if ch == '[':
|
||||
depth_bracket += 1
|
||||
elif ch == ']':
|
||||
depth_bracket = max(0, depth_bracket - 1)
|
||||
elif ch == '(':
|
||||
depth_paren += 1
|
||||
elif ch == ')':
|
||||
depth_paren = max(0, depth_paren - 1)
|
||||
elif ch == ',' and depth_bracket == 0 and depth_paren == 0:
|
||||
parts.append(''.join(current).strip())
|
||||
current = []
|
||||
continue
|
||||
current.append(ch)
|
||||
|
||||
if current:
|
||||
parts.append(''.join(current).strip())
|
||||
|
||||
# Filter empty parts
|
||||
return [p for p in parts if p]
|
||||
|
||||
|
||||
# --- C. Example Sentence Attachment ---
|
||||
|
||||
def _find_best_vocab_match(example_text: str, vocab_entries: List[Dict[str, Any]]) -> int:
|
||||
"""Find the vocab entry whose English word(s) best match the example sentence.
|
||||
|
||||
Returns index into vocab_entries, or -1 if no match found.
|
||||
Uses word stem overlap: "a broken arm" matches "broken" or "break".
|
||||
"""
|
||||
if not vocab_entries or not example_text:
|
||||
return -1
|
||||
|
||||
example_lower = example_text.lower()
|
||||
example_words = set(re.findall(r'[a-zäöüß]+', example_lower))
|
||||
|
||||
best_idx = -1
|
||||
best_score = 0
|
||||
|
||||
for i, entry in enumerate(vocab_entries):
|
||||
en = (entry.get('english', '') or '').lower()
|
||||
if not en:
|
||||
continue
|
||||
|
||||
# Extract vocab words (split on space, comma, newline)
|
||||
vocab_words = set(re.findall(r'[a-zäöüß]+', en))
|
||||
|
||||
# Score: how many vocab words appear in the example?
|
||||
# Also check if example words share a common stem (first 4 chars)
|
||||
direct_matches = vocab_words & example_words
|
||||
score = len(direct_matches) * 10
|
||||
|
||||
# Stem matching: "broken" matches "break" via shared prefix "bro"/"bre"
|
||||
if score == 0:
|
||||
for vw in vocab_words:
|
||||
if len(vw) < 3:
|
||||
continue
|
||||
stem = vw[:4] if len(vw) >= 4 else vw[:3]
|
||||
for ew in example_words:
|
||||
if len(ew) >= len(stem) and ew[:len(stem)] == stem:
|
||||
score += 5
|
||||
break
|
||||
|
||||
if score > best_score:
|
||||
best_score = score
|
||||
best_idx = i
|
||||
|
||||
return best_idx if best_score > 0 else -1
|
||||
|
||||
|
||||
def _attach_example_sentences(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Attach rows with EN text but no DE translation as examples to matching vocab entries.

    Vocabulary worksheets often have:
        Row 1: break, broke, broken / brechen, brach, gebrochen
        Row 2: a broken arm (no DE → example for "broken")
        Row 3: a broken plate (no DE → example for "broken")
        Row 4: egg / Ei (has DE → new vocab entry)

    Rules (deterministic, generic):
    - A row is an "example row" if it has EN text but NO DE text (DE of <= 1
      char counts as OCR noise) AND the EN text looks like a sentence
    - Find the best matching vocab entry by checking which entry's English words
      appear in the example sentence (word-overlap matching)
    - Fall back to the nearest preceding entry if no word match is found
    - Multiple examples get joined with " | "

    Returns the filtered list (example rows removed, their text merged into the
    matched entries' ``example`` field) with ``row_index`` renumbered.
    """
    if not entries:
        return entries

    # Separate into vocab entries (have DE) and example candidates (no DE).
    vocab_entries: List[Dict[str, Any]] = []
    examples_for: Dict[int, List[str]] = {}  # vocab_index → list of example texts

    for entry in entries:
        en = (entry.get('english', '') or '').strip()
        de = (entry.get('german', '') or '').strip()
        # NOTE(review): `ex` is computed but never read below — the attachment
        # step re-reads entry['example'] directly; likely leftover.
        ex = (entry.get('example', '') or '').strip()

        # Treat single-char DE as OCR noise, not a real translation.
        # "Ei" (2 chars) is a valid German word, so the threshold is 1.
        has_de = len(de) > 1
        has_en = bool(en)

        # Heuristic: a row without DE is an "example sentence" only if the EN
        # text looks like a sentence (>= 4 words, or ends with sentence
        # punctuation). Short EN text (1-3 words) is more likely a vocab entry
        # whose DE was simply missed by OCR, so it is kept as a vocab row.
        _looks_like_sentence = (
            len(en.split()) >= 4
            or en.rstrip().endswith(('.', '!', '?'))
        )
        # `and vocab_entries`: an example-looking row appearing BEFORE any
        # vocab entry has nothing to attach to and is kept as a regular entry.
        is_example_candidate = (
            has_en and not has_de and _looks_like_sentence and vocab_entries
        )

        if is_example_candidate:
            # This is an example sentence — find the best matching vocab entry.
            example_text = en

            match_idx = _find_best_vocab_match(en, vocab_entries)
            if match_idx < 0:
                # No word match → fall back to the nearest preceding entry.
                match_idx = len(vocab_entries) - 1

            if match_idx not in examples_for:
                examples_for[match_idx] = []
            examples_for[match_idx].append(example_text)
        else:
            vocab_entries.append(entry)

    # Attach the collected examples to their matched vocab entries,
    # appending to any example text the entry already carried.
    for idx, example_list in examples_for.items():
        if 0 <= idx < len(vocab_entries):
            entry = vocab_entries[idx]
            existing_ex = (entry.get('example', '') or '').strip()
            new_examples = ' | '.join(example_list)
            entry['example'] = f"{existing_ex} | {new_examples}" if existing_ex else new_examples

    # Re-number the surviving rows sequentially.
    for i, e in enumerate(vocab_entries):
        e['row_index'] = i

    return vocab_entries
|
||||
Reference in New Issue
Block a user