diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index 1a27762..6f65d67 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -5426,6 +5426,7 @@ NUR diese Korrekturen sind erlaubt:
 - Ziffer 1 statt l oder I: "1ong" → "long", "Ber1in" → "Berlin"
 - Ziffer 5 statt S oder s: "5tadt" → "Stadt", "5ee" → "See"
 - Ziffer 6 statt G oder g: "6eld" → "Geld"
+- Senkrechter Strich | statt I oder l: "| want" → "I want", "|ong" → "long", "he| p" → "help"
 
 ABSOLUT VERBOTEN — aendere NIEMALS:
 - Woerter die korrekt geschrieben sind — auch wenn du eine andere Schreibweise kennst
@@ -5477,27 +5478,34 @@ def _is_spurious_change(old_val: str, new_val: str) -> bool:
     # letter. If the change doesn't include such a substitution, reject it.
     # Build a set of (old_char, new_char) pairs that differ between old and new.
     # Use character-level diff heuristic: if lengths are close, zip and compare.
-    _DIGIT_TO_LETTER = {
+    # Map of characters that OCR commonly misreads → set of correct replacements
+    _OCR_CHAR_MAP = {
+        # Digits mistaken for letters
         '0': set('oOgG'),
         '1': set('lLiI'),
         '5': set('sS'),
         '6': set('gG'),
         '8': set('bB'),
+        # Non-letter symbols mistaken for letters
+        '|': set('lLiI'),  # pipe → lowercase l or capital I
+        'l': set('iI|'),  # lowercase l → capital I (and reverse)
     }
 
-    has_valid_digit_fix = False
+    has_valid_fix = False
     if len(old_val) == len(new_val):
         for oc, nc in zip(old_val, new_val):
             if oc != nc:
-                if oc in _DIGIT_TO_LETTER and nc in _DIGIT_TO_LETTER[oc]:
-                    has_valid_digit_fix = True
-            # Any other single-char change is suspicious (could be translation)
+                if oc in _OCR_CHAR_MAP and nc in _OCR_CHAR_MAP[oc]:
+                    has_valid_fix = True
+                elif nc in _OCR_CHAR_MAP and oc in _OCR_CHAR_MAP[nc]:
+                    # Reverse check (e.g. l→I where new is the "correct" char)
+                    has_valid_fix = True
     else:
-        # Length changed: only accept if the difference is one char and
-        # the old contained a digit where new has a letter
-        if abs(len(old_val) - len(new_val)) <= 1 and _OCR_DIGIT_IN_WORD_RE.search(old_val):
-            has_valid_digit_fix = True
+        # Length changed by 1: accept if old had a suspicious char sequence
+        _OCR_SUSPICIOUS_RE = _re.compile(r'[|01568]')
+        if abs(len(old_val) - len(new_val)) <= 1 and _OCR_SUSPICIOUS_RE.search(old_val):
+            has_valid_fix = True
 
-    if not has_valid_digit_fix:
+    if not has_valid_fix:
         return True  # Reject — looks like translation or hallucination
     return False
@@ -5700,6 +5708,17 @@ async def llm_review_entries_streaming(
 }
 
 
+def _sanitize_for_json(text: str) -> str:
+    """Remove or escape control characters that break JSON parsing.
+
+    Keeps tab (\\t), newline (\\n), carriage return (\\r) which are valid
+    JSON whitespace. Removes all other ASCII control characters (0x00-0x1f)
+    that are only valid inside JSON strings when properly escaped.
+    """
+    # Replace literal control chars (except \\t \\n \\r) with a space
+    return _re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', ' ', text)
+
+
 def _parse_llm_json_array(text: str) -> List[Dict]:
     """Extract JSON array from LLM response (handles markdown fences and qwen3 think-tags)."""
     # Strip qwen3 ... blocks (present even with think=False on some builds)
@@ -5707,7 +5726,9 @@
     # Strip markdown code fences
     text = _re.sub(r'```json\s*', '', text)
     text = _re.sub(r'```\s*', '', text)
-    # Find first [ ... last ] (non-greedy would miss nested structures, greedy is correct here)
+    # Sanitize control characters before JSON parsing
+    text = _sanitize_for_json(text)
+    # Find first [ ... last ]
     match = _re.search(r'\[.*\]', text, _re.DOTALL)
     if match:
         try: