From 1ac47cd9b7c2cca4edb91e779c1ed03c9ed65722 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Tue, 3 Mar 2026 14:37:16 +0100
Subject: [PATCH] fix(llm-review): JSON-Parse-Fehler durch Control-Zeichen
 beheben
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Log zeigte: "Invalid control character at: line 28 column 27"
Das Pipe-Zeichen | in OCR-Texten (z.B. "| want" statt "I want")
bricht den JSON-Parser, wenn es als Literal im LLM-Response steht.

Fixes:
- _sanitize_for_json(): ersetzt ASCII Control-Chars 0x00-0x1f und 0x7f durch
  Leerzeichen (außer Tab/LF/CR, die in JSON valid sind)
- | → I als erlaubte OCR-Korrektur in _is_spurious_change und Prompt
- Reverse-Check in _is_spurious_change (l→I etc.)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/cv_vocab_pipeline.py | 43 +++++++++++++++-----
 1 file changed, 32 insertions(+), 11 deletions(-)

diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index 1a27762..6f65d67 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -5426,6 +5426,7 @@ NUR diese Korrekturen sind erlaubt:
 - Ziffer 1 statt l oder I: "1ong" → "long", "Ber1in" → "Berlin"
 - Ziffer 5 statt S oder s: "5tadt" → "Stadt", "5ee" → "See"
 - Ziffer 6 statt G oder g: "6eld" → "Geld"
+- Senkrechter Strich | statt I oder l: "| want" → "I want", "|ong" → "long", "he| p" → "help"
 
 ABSOLUT VERBOTEN — aendere NIEMALS:
 - Woerter die korrekt geschrieben sind — auch wenn du eine andere Schreibweise kennst
@@ -5477,27 +5478,34 @@ def _is_spurious_change(old_val: str, new_val: str) -> bool:
     # letter. If the change doesn't include such a substitution, reject it.
     # Build a set of (old_char, new_char) pairs that differ between old and new.
     # Use character-level diff heuristic: if lengths are close, zip and compare.
-    _DIGIT_TO_LETTER = {
+    # Map of characters that OCR commonly misreads → set of correct replacements
+    _OCR_CHAR_MAP = {
+        # Digits mistaken for letters
         '0': set('oOgG'),
         '1': set('lLiI'),
         '5': set('sS'),
         '6': set('gG'),
         '8': set('bB'),
+        # Non-letter symbols mistaken for letters
+        '|': set('lLiI'),   # pipe → lowercase l or capital I
+        'l': set('iI|'),    # lowercase l → capital I (and reverse)
     }
-    has_valid_digit_fix = False
+    has_valid_fix = False
     if len(old_val) == len(new_val):
         for oc, nc in zip(old_val, new_val):
             if oc != nc:
-                if oc in _DIGIT_TO_LETTER and nc in _DIGIT_TO_LETTER[oc]:
-                    has_valid_digit_fix = True
-                # Any other single-char change is suspicious (could be translation)
+                if oc in _OCR_CHAR_MAP and nc in _OCR_CHAR_MAP[oc]:
+                    has_valid_fix = True
+                elif nc in _OCR_CHAR_MAP and oc in _OCR_CHAR_MAP[nc]:
+                    # Reverse lookup: new char is a map key and old char is one of its targets (e.g. I→l)
+                    has_valid_fix = True
     else:
-        # Length changed: only accept if the difference is one char and
-        # the old contained a digit where new has a letter
-        if abs(len(old_val) - len(new_val)) <= 1 and _OCR_DIGIT_IN_WORD_RE.search(old_val):
-            has_valid_digit_fix = True
+        # Length differs: accept only if it differs by one char and old contains an OCR-suspect char (| 0 1 5 6 8)
+        _OCR_SUSPICIOUS_RE = _re.compile(r'[|01568]')
+        if abs(len(old_val) - len(new_val)) <= 1 and _OCR_SUSPICIOUS_RE.search(old_val):
+            has_valid_fix = True
 
-    if not has_valid_digit_fix:
+    if not has_valid_fix:
         return True  # Reject — looks like translation or hallucination
 
     return False
@@ -5700,6 +5708,17 @@ async def llm_review_entries_streaming(
     }
 
 
+def _sanitize_for_json(text: str) -> str:
+    """Replace control characters that break JSON parsing with spaces.
+
+    Keeps tab (\\t), newline (\\n) and carriage return (\\r), which are valid
+    JSON whitespace. Every other ASCII control character (0x00-0x1f), which
+    JSON accepts inside strings only when escaped, and DEL (0x7f) is replaced.
+    """
+    # Replace literal control chars (except \\t \\n \\r) with a space
+    return _re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', ' ', text)
+
+
 def _parse_llm_json_array(text: str) -> List[Dict]:
     """Extract JSON array from LLM response (handles markdown fences and qwen3 think-tags)."""
     # Strip qwen3 <think>...</think> blocks (present even with think=False on some builds)
@@ -5707,7 +5726,9 @@ def _parse_llm_json_array(text: str) -> List[Dict]:
     # Strip markdown code fences
     text = _re.sub(r'```json\s*', '', text)
     text = _re.sub(r'```\s*', '', text)
-    # Find first [ ... last ]  (non-greedy would miss nested structures, greedy is correct here)
+    # Sanitize control characters before JSON parsing
+    text = _sanitize_for_json(text)
+    # Find first [ ... last ]
     match = _re.search(r'\[.*\]', text, _re.DOTALL)
     if match:
         try: