diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 90b1caa..1a27762 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -5399,23 +5399,19 @@ _OCR_DIGIT_IN_WORD_RE = _re.compile(r'(?<=[A-Za-zÄÖÜäöüß])[01568]|[01568] def _entry_needs_review(entry: Dict) -> bool: """Check if an entry should be sent to the LLM for review. - Only sends entries that actually contain OCR digit↔letter confusion - patterns (e.g. "8en" instead of "Ben", "L0ndon" instead of "London"). - This prevents the LLM from touching correct entries. + Sends all non-empty entries that don't have IPA phonetic transcriptions. + The LLM prompt and _is_spurious_change() guard against unwanted changes. """ en = entry.get("english", "") or "" de = entry.get("german", "") or "" - ex = entry.get("example", "") or "" # Skip completely empty entries if not en.strip() and not de.strip(): return False - # Skip entries with IPA/phonetic brackets — dictionary-corrected, no OCR digits expected + # Skip entries with IPA/phonetic brackets — dictionary-corrected, LLM must not touch them if _HAS_PHONETIC_RE.search(en) or _HAS_PHONETIC_RE.search(de): return False - # Only review if at least one field has a digit-in-word pattern - combined = f"{en} {de} {ex}" - return bool(_OCR_DIGIT_IN_WORD_RE.search(combined)) + return True def _build_llm_prompt(table_lines: List[Dict]) -> str: