Fix: Remove broken getKlausurApiUrl and clean up empty lines
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
sed replacement left orphaned hostname references in story page and empty lines in getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
318
klausur-service/backend/cv_ocr_vocab_postprocess.py
Normal file
318
klausur-service/backend/cv_ocr_vocab_postprocess.py
Normal file
@@ -0,0 +1,318 @@
|
||||
"""
|
||||
Vocab postprocessing: deterministic quality fixes for OCR-extracted vocabulary.
|
||||
|
||||
- Character confusion fix (I/1/l/|)
|
||||
- Comma-separated word form splitting
|
||||
- Example sentence attachment to matching vocab entries
|
||||
|
||||
Split from cv_ocr_engines.py for maintainability.
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Any, Dict, List
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Post-Processing: Deterministic Quality Fixes
|
||||
# =============================================================================
|
||||
|
||||
# --- A. Character Confusion Fix (I/1/l) ---
|
||||
|
||||
# Common OCR confusion pairs in vocabulary context
|
||||
_CHAR_CONFUSION_RULES = [
|
||||
# "1" at word start followed by lowercase → likely "I" or "l"
|
||||
# Exception: NOT before "." or "," (numbered list prefix: "1. Kreuz", "1, 2, 3")
|
||||
(re.compile(r'\b1([a-z])'), r'I\1'), # 1ch → Ich, 1want → Iwant
|
||||
# Standalone "1" → "I" (English pronoun), but NOT "1." or "1," (list number)
|
||||
(re.compile(r'(?<!\d)\b1\b(?![\d.,])'), 'I'), # "1 want" → "I want"
|
||||
# "|" → "I", but NOT when embedded between letters (syllable divider: Ka|me|rad)
|
||||
# and NOT "|." or "|," (those are "1." list prefixes → spell-checker handles them)
|
||||
(re.compile(r'(?<![a-zA-ZäöüÄÖÜß])\|(?!\||[.,])'), 'I'), # |ch → Ich, | want → I want
|
||||
]
|
||||
|
||||
# Cross-language indicators: if DE has these, EN "1" is almost certainly "I"
|
||||
_DE_INDICATORS_FOR_EN_I = {'ich', 'mich', 'mir', 'mein', 'meine', 'meiner', 'meinem'}
|
||||
|
||||
|
||||
def _fix_character_confusion(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||
"""Fix common OCR character confusions using context.
|
||||
|
||||
Deterministic rules:
|
||||
- "1" at word start → "I" or "l" based on context
|
||||
- Cross-reference EN↔DE: if DE contains "ich/mich/mir", EN "1" → "I"
|
||||
- "y " artifact at word boundaries → remove (e.g. "y you" → "you")
|
||||
"""
|
||||
for entry in entries:
|
||||
en = entry.get('english', '') or ''
|
||||
de = entry.get('german', '') or ''
|
||||
ex = entry.get('example', '') or ''
|
||||
|
||||
# Apply general rules to all fields
|
||||
for pattern, replacement in _CHAR_CONFUSION_RULES:
|
||||
en = pattern.sub(replacement, en)
|
||||
de = pattern.sub(replacement, de)
|
||||
ex = pattern.sub(replacement, ex)
|
||||
|
||||
# Cross-reference: if DE has "ich"/"mich" indicators, fix EN "1" → "I"
|
||||
de_lower_words = set(de.lower().replace(',', ' ').split())
|
||||
if de_lower_words & _DE_INDICATORS_FOR_EN_I:
|
||||
# Any remaining "1" in EN that looks like "I"
|
||||
en = re.sub(r'\b1\b(?![\d.,])', 'I', en)
|
||||
|
||||
# Fix "y " artifact before repeated word: "y you" → "you"
|
||||
en = re.sub(r'\by\s+([a-z])', r'\1', en)
|
||||
ex = re.sub(r'\by\s+([a-z])', r'\1', ex)
|
||||
|
||||
entry['english'] = en.strip()
|
||||
entry['german'] = de.strip()
|
||||
entry['example'] = ex.strip()
|
||||
|
||||
return entries
|
||||
|
||||
|
||||
# --- B. Comma-Separated Word Form Splitting ---
|
||||
|
||||
def _is_singular_plural_pair(parts: List[str]) -> bool:
|
||||
"""Detect if comma-separated parts are singular/plural forms of the same word.
|
||||
|
||||
E.g. "mouse, mice" or "Maus, Mäuse" → True (should NOT be split).
|
||||
"break, broke, broken" → False (different verb forms, OK to split).
|
||||
|
||||
Heuristic: exactly 2 parts that share a common prefix of >= 50% length,
|
||||
OR one part is a known plural suffix of the other (e.g. +s, +es, +en).
|
||||
"""
|
||||
if len(parts) != 2:
|
||||
return False
|
||||
|
||||
a, b = parts[0].lower().strip(), parts[1].lower().strip()
|
||||
if not a or not b:
|
||||
return False
|
||||
|
||||
# Common prefix heuristic: if words share >= 50% of the shorter word,
|
||||
# they are likely forms of the same word (Maus/Mäuse, child/children).
|
||||
min_len = min(len(a), len(b))
|
||||
common = 0
|
||||
for ca, cb in zip(a, b):
|
||||
if ca == cb:
|
||||
common += 1
|
||||
else:
|
||||
break
|
||||
if common >= max(2, min_len * 0.5):
|
||||
return True
|
||||
|
||||
# Umlaut relation: one form adds umlaut (a→ä, o→ö, u→ü)
|
||||
umlaut_map = str.maketrans('aou', 'äöü')
|
||||
if a.translate(umlaut_map) == b or b.translate(umlaut_map) == a:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def _split_comma_entries(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Expand entries whose EN and DE sides hold parallel comma-separated forms.

    E.g. EN "break, broke, broken" / DE "brechen, brach, gebrochen" becomes
    three entries: break/brechen, broke/brach, broken/gebrochen.

    An entry is split only when ALL of the following hold:
    - EN and DE split (via ``_split_by_comma``) into the same number (>1) of parts,
    - every part on both sides is at most 3 words (word forms, not sentences),
    - neither side is a singular/plural pair per ``_is_singular_plural_pair``
      ("mouse, mice" / "Maus, Mäuse" is ONE vocabulary entry, not two).

    Split-off entries get ``example`` cleared (examples are re-attached in a
    later pass) and are flagged with ``split_from_comma``. Finally every entry
    is re-numbered via ``row_index``.
    """
    expanded: List[Dict[str, Any]] = []

    for entry in entries:
        en_text = (entry.get('english', '') or '').strip()
        de_text = (entry.get('german', '') or '').strip()

        # Top-level comma split (commas inside [...] / (...) are ignored).
        en_parts = _split_by_comma(en_text)
        de_parts = _split_by_comma(de_text)

        splittable = (
            len(en_parts) > 1
            and len(en_parts) == len(de_parts)
            # All parts must be short word forms, not sentences.
            and all(len(p.split()) <= 3 for p in en_parts)
            and all(len(p.split()) <= 3 for p in de_parts)
            # Never split forms of the same word (singular/plural pairs).
            and not _is_singular_plural_pair(en_parts)
            and not _is_singular_plural_pair(de_parts)
        )

        if not splittable:
            expanded.append(entry)
            continue

        # Fan the entry out into one row per parallel EN/DE form.
        for en_form, de_form in zip(en_parts, de_parts):
            piece = dict(entry)  # shallow copy keeps unrelated metadata
            piece['english'] = en_form.strip()
            piece['german'] = de_form.strip()
            piece['example'] = ''  # examples get attached later
            piece['split_from_comma'] = True
            expanded.append(piece)

    # Re-number rows sequentially.
    for idx, item in enumerate(expanded):
        item['row_index'] = idx

    return expanded
|
||||
|
||||
|
||||
def _split_by_comma(text: str) -> List[str]:
|
||||
"""Split text by commas, but not inside brackets [...] or parens (...)."""
|
||||
if ',' not in text:
|
||||
return [text]
|
||||
|
||||
parts = []
|
||||
depth_bracket = 0
|
||||
depth_paren = 0
|
||||
current = []
|
||||
|
||||
for ch in text:
|
||||
if ch == '[':
|
||||
depth_bracket += 1
|
||||
elif ch == ']':
|
||||
depth_bracket = max(0, depth_bracket - 1)
|
||||
elif ch == '(':
|
||||
depth_paren += 1
|
||||
elif ch == ')':
|
||||
depth_paren = max(0, depth_paren - 1)
|
||||
elif ch == ',' and depth_bracket == 0 and depth_paren == 0:
|
||||
parts.append(''.join(current).strip())
|
||||
current = []
|
||||
continue
|
||||
current.append(ch)
|
||||
|
||||
if current:
|
||||
parts.append(''.join(current).strip())
|
||||
|
||||
# Filter empty parts
|
||||
return [p for p in parts if p]
|
||||
|
||||
|
||||
# --- C. Example Sentence Attachment ---
|
||||
|
||||
def _find_best_vocab_match(example_text: str, vocab_entries: List[Dict[str, Any]]) -> int:
|
||||
"""Find the vocab entry whose English word(s) best match the example sentence.
|
||||
|
||||
Returns index into vocab_entries, or -1 if no match found.
|
||||
Uses word stem overlap: "a broken arm" matches "broken" or "break".
|
||||
"""
|
||||
if not vocab_entries or not example_text:
|
||||
return -1
|
||||
|
||||
example_lower = example_text.lower()
|
||||
example_words = set(re.findall(r'[a-zäöüß]+', example_lower))
|
||||
|
||||
best_idx = -1
|
||||
best_score = 0
|
||||
|
||||
for i, entry in enumerate(vocab_entries):
|
||||
en = (entry.get('english', '') or '').lower()
|
||||
if not en:
|
||||
continue
|
||||
|
||||
# Extract vocab words (split on space, comma, newline)
|
||||
vocab_words = set(re.findall(r'[a-zäöüß]+', en))
|
||||
|
||||
# Score: how many vocab words appear in the example?
|
||||
# Also check if example words share a common stem (first 4 chars)
|
||||
direct_matches = vocab_words & example_words
|
||||
score = len(direct_matches) * 10
|
||||
|
||||
# Stem matching: "broken" matches "break" via shared prefix "bro"/"bre"
|
||||
if score == 0:
|
||||
for vw in vocab_words:
|
||||
if len(vw) < 3:
|
||||
continue
|
||||
stem = vw[:4] if len(vw) >= 4 else vw[:3]
|
||||
for ew in example_words:
|
||||
if len(ew) >= len(stem) and ew[:len(stem)] == stem:
|
||||
score += 5
|
||||
break
|
||||
|
||||
if score > best_score:
|
||||
best_score = score
|
||||
best_idx = i
|
||||
|
||||
return best_idx if best_score > 0 else -1
|
||||
|
||||
|
||||
def _attach_example_sentences(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Attach rows with EN text but no DE translation as examples to matching vocab entries.

    Vocabulary worksheets often have:
        Row 1: break, broke, broken / brechen, brach, gebrochen
        Row 2: a broken arm (no DE → example for "broken")
        Row 3: a broken plate (no DE → example for "broken")
        Row 4: egg / Ei (has DE → new vocab entry)

    Rules (deterministic, generic):
    - A row is an "example row" if it has EN text but NO DE text (DE of <= 1
      char counts as OCR noise) AND the EN text looks like a sentence
    - Find the best matching vocab entry by checking which entry's English words
      appear in the example sentence (word-overlap matching)
    - Fall back to the nearest preceding entry if no word match is found
    - Multiple examples get joined with " | "

    Returns the filtered list (example rows removed, their text merged into the
    matched entries' ``example`` field) with ``row_index`` renumbered.
    """
    if not entries:
        return entries

    # Separate into vocab entries (have DE) and example candidates (no DE).
    vocab_entries: List[Dict[str, Any]] = []
    examples_for: Dict[int, List[str]] = {}  # vocab_index → list of example texts

    for entry in entries:
        en = (entry.get('english', '') or '').strip()
        de = (entry.get('german', '') or '').strip()
        # NOTE(review): `ex` is computed but never read below — the attachment
        # step re-reads entry['example'] directly; likely leftover.
        ex = (entry.get('example', '') or '').strip()

        # Treat single-char DE as OCR noise, not a real translation.
        # "Ei" (2 chars) is a valid German word, so the threshold is 1.
        has_de = len(de) > 1
        has_en = bool(en)

        # Heuristic: a row without DE is an "example sentence" only if the EN
        # text looks like a sentence (>= 4 words, or ends with sentence
        # punctuation). Short EN text (1-3 words) is more likely a vocab entry
        # whose DE was simply missed by OCR, so it is kept as a vocab row.
        _looks_like_sentence = (
            len(en.split()) >= 4
            or en.rstrip().endswith(('.', '!', '?'))
        )
        # `and vocab_entries`: an example-looking row appearing BEFORE any
        # vocab entry has nothing to attach to and is kept as a regular entry.
        is_example_candidate = (
            has_en and not has_de and _looks_like_sentence and vocab_entries
        )

        if is_example_candidate:
            # This is an example sentence — find the best matching vocab entry.
            example_text = en

            match_idx = _find_best_vocab_match(en, vocab_entries)
            if match_idx < 0:
                # No word match → fall back to the nearest preceding entry.
                match_idx = len(vocab_entries) - 1

            if match_idx not in examples_for:
                examples_for[match_idx] = []
            examples_for[match_idx].append(example_text)
        else:
            vocab_entries.append(entry)

    # Attach the collected examples to their matched vocab entries,
    # appending to any example text the entry already carried.
    for idx, example_list in examples_for.items():
        if 0 <= idx < len(vocab_entries):
            entry = vocab_entries[idx]
            existing_ex = (entry.get('example', '') or '').strip()
            new_examples = ' | '.join(example_list)
            entry['example'] = f"{existing_ex} | {new_examples}" if existing_ex else new_examples

    # Re-number the surviving rows sequentially.
    for i, e in enumerate(vocab_entries):
        e['row_index'] = i

    return vocab_entries
|
||||
Reference in New Issue
Block a user