fix(llm-review): JSON-Parse-Fehler durch Control-Zeichen beheben
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m48s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 18s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m48s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 18s
Log zeigte: "Invalid control character at: line 28 column 27".

Das Pipe-Zeichen | in OCR-Texten (z.B. "| want" statt "I want") bricht den JSON-Parser, wenn es als Literal im LLM-Response steht.

Fixes:
- _sanitize_for_json(): entfernt ASCII-Control-Chars 0x00-0x1f (außer Tab/LF/CR, die in JSON valid sind)
- | → I als erlaubte OCR-Korrektur in _is_spurious_change und Prompt
- Reverse-Check in _is_spurious_change (l→I etc.)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -5426,6 +5426,7 @@ NUR diese Korrekturen sind erlaubt:
|
||||
- Ziffer 1 statt l oder I: "1ong" → "long", "Ber1in" → "Berlin"
|
||||
- Ziffer 5 statt S oder s: "5tadt" → "Stadt", "5ee" → "See"
|
||||
- Ziffer 6 statt G oder g: "6eld" → "Geld"
|
||||
- Senkrechter Strich | statt I oder l: "| want" → "I want", "|ong" → "long", "he| p" → "help"
|
||||
|
||||
ABSOLUT VERBOTEN — aendere NIEMALS:
|
||||
- Woerter die korrekt geschrieben sind — auch wenn du eine andere Schreibweise kennst
|
||||
@@ -5477,27 +5478,34 @@ def _is_spurious_change(old_val: str, new_val: str) -> bool:
|
||||
# letter. If the change doesn't include such a substitution, reject it.
|
||||
# Build a set of (old_char, new_char) pairs that differ between old and new.
|
||||
# Use character-level diff heuristic: if lengths are close, zip and compare.
|
||||
_DIGIT_TO_LETTER = {
|
||||
# Map of characters that OCR commonly misreads → set of correct replacements
|
||||
_OCR_CHAR_MAP = {
|
||||
# Digits mistaken for letters
|
||||
'0': set('oOgG'),
|
||||
'1': set('lLiI'),
|
||||
'5': set('sS'),
|
||||
'6': set('gG'),
|
||||
'8': set('bB'),
|
||||
# Non-letter symbols mistaken for letters
|
||||
'|': set('lLiI'), # pipe → lowercase l or capital I
|
||||
'l': set('iI|'), # lowercase l → capital I (and reverse)
|
||||
}
|
||||
has_valid_digit_fix = False
|
||||
has_valid_fix = False
|
||||
if len(old_val) == len(new_val):
|
||||
for oc, nc in zip(old_val, new_val):
|
||||
if oc != nc:
|
||||
if oc in _DIGIT_TO_LETTER and nc in _DIGIT_TO_LETTER[oc]:
|
||||
has_valid_digit_fix = True
|
||||
# Any other single-char change is suspicious (could be translation)
|
||||
if oc in _OCR_CHAR_MAP and nc in _OCR_CHAR_MAP[oc]:
|
||||
has_valid_fix = True
|
||||
elif nc in _OCR_CHAR_MAP and oc in _OCR_CHAR_MAP[nc]:
|
||||
# Reverse check (e.g. l→I where new is the "correct" char)
|
||||
has_valid_fix = True
|
||||
else:
|
||||
# Length changed: only accept if the difference is one char and
|
||||
# the old contained a digit where new has a letter
|
||||
if abs(len(old_val) - len(new_val)) <= 1 and _OCR_DIGIT_IN_WORD_RE.search(old_val):
|
||||
has_valid_digit_fix = True
|
||||
# Length changed by 1: accept if old had a suspicious char sequence
|
||||
_OCR_SUSPICIOUS_RE = _re.compile(r'[|01568]')
|
||||
if abs(len(old_val) - len(new_val)) <= 1 and _OCR_SUSPICIOUS_RE.search(old_val):
|
||||
has_valid_fix = True
|
||||
|
||||
if not has_valid_digit_fix:
|
||||
if not has_valid_fix:
|
||||
return True # Reject — looks like translation or hallucination
|
||||
|
||||
return False
|
||||
@@ -5700,6 +5708,17 @@ async def llm_review_entries_streaming(
|
||||
}
|
||||
|
||||
|
||||
# Translation table mapping every ASCII control character except tab (0x09),
# line feed (0x0a) and carriage return (0x0d) -- plus DEL (0x7f) -- to a space.
# Built once at import time so each call is a single C-level pass.
_JSON_UNSAFE_CONTROL_TABLE = {
    codepoint: ' '
    for codepoint in (*range(0x00, 0x09), 0x0b, 0x0c, *range(0x0e, 0x20), 0x7f)
}


def _sanitize_for_json(text: str) -> str:
    """Replace stray control characters with spaces before JSON parsing.

    Each ASCII control character in 0x00-0x1f other than tab, line feed and
    carriage return, as well as DEL (0x7f), is replaced by a single space.
    Characters are substituted rather than dropped, so the length of the
    text and the offsets of all other characters stay the same.

    Tab, line feed and carriage return are left untouched because they are
    legal JSON whitespace between tokens.

    NOTE(review): a raw tab/LF/CR *inside* a JSON string literal is still
    rejected by ``json.loads`` in its default strict mode ("Invalid control
    character"). This function does not handle that case -- confirm whether
    the caller needs ``strict=False`` or explicit escaping as well.

    Args:
        text: Raw text (e.g. an LLM response) that will be parsed as JSON.

    Returns:
        A copy of ``text`` with the affected characters replaced by spaces.
    """
    return text.translate(_JSON_UNSAFE_CONTROL_TABLE)
|
||||
|
||||
|
||||
def _parse_llm_json_array(text: str) -> List[Dict]:
|
||||
"""Extract JSON array from LLM response (handles markdown fences and qwen3 think-tags)."""
|
||||
# Strip qwen3 <think>...</think> blocks (present even with think=False on some builds)
|
||||
@@ -5707,7 +5726,9 @@ def _parse_llm_json_array(text: str) -> List[Dict]:
|
||||
# Strip markdown code fences
|
||||
text = _re.sub(r'```json\s*', '', text)
|
||||
text = _re.sub(r'```\s*', '', text)
|
||||
# Find first [ ... last ] (non-greedy would miss nested structures, greedy is correct here)
|
||||
# Sanitize control characters before JSON parsing
|
||||
text = _sanitize_for_json(text)
|
||||
# Find first [ ... last ]
|
||||
match = _re.search(r'\[.*\]', text, _re.DOTALL)
|
||||
if match:
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user