From fa8e38db2d6bee34d986fc24b9fd80ed3f8a501c Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Tue, 3 Mar 2026 14:29:46 +0100 Subject: [PATCH] =?UTF-8?q?fix(llm-review):=20Pre-Filter=20entfernt=20?= =?UTF-8?q?=E2=80=94=20alle=20Eintr=C3=A4ge=20ans=20LLM=20senden?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Der digit-in-word Pre-Filter hat alle 41 Einträge geblockt (skipped=41 im Log). OCR-Fehler können nicht im Voraus erkannt werden. Zurück zum ursprünglichen Ansatz: alle nicht-leeren Einträge ohne IPA-Klammern werden ans LLM gesendet. Schutz gegen Übersetzungen erfolgt ausschließlich über den strikten Prompt und _is_spurious_change(). Co-Authored-By: Claude Sonnet 4.6 --- klausur-service/backend/cv_vocab_pipeline.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 90b1caa..1a27762 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -5399,23 +5399,19 @@ _OCR_DIGIT_IN_WORD_RE = _re.compile(r'(?<=[A-Za-zÄÖÜäöüß])[01568]|[01568] def _entry_needs_review(entry: Dict) -> bool: """Check if an entry should be sent to the LLM for review. - Only sends entries that actually contain OCR digit↔letter confusion - patterns (e.g. "8en" instead of "Ben", "L0ndon" instead of "London"). - This prevents the LLM from touching correct entries. + Sends all non-empty entries that don't have IPA phonetic transcriptions. + The LLM prompt and _is_spurious_change() guard against unwanted changes. 
""" en = entry.get("english", "") or "" de = entry.get("german", "") or "" - ex = entry.get("example", "") or "" # Skip completely empty entries if not en.strip() and not de.strip(): return False - # Skip entries with IPA/phonetic brackets — dictionary-corrected, no OCR digits expected + # Skip entries with IPA/phonetic brackets — dictionary-corrected, LLM must not touch them if _HAS_PHONETIC_RE.search(en) or _HAS_PHONETIC_RE.search(de): return False - # Only review if at least one field has a digit-in-word pattern - combined = f"{en} {de} {ex}" - return bool(_OCR_DIGIT_IN_WORD_RE.search(combined)) + return True def _build_llm_prompt(table_lines: List[Dict]) -> str: