diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 5a02616..3c4d0f1 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -3177,6 +3177,55 @@ _COMMON_SHORT_WORDS: set = { 'wut', 'zum', 'zur', } +# Known abbreviations found in EN/DE textbooks and dictionaries. +# Stored lowercase with NO periods at all: lookups compare only the lowercased letters ("e.g." -> "eg"). +# These rescue tokens like "sth." / "sb." / "usw." from being deleted. +_KNOWN_ABBREVIATIONS: set = { + # EN dictionary meta-words + 'sth', 'sb', 'smth', 'smb', 'sbd', + # EN general + 'etc', 'eg', 'ie', 'esp', 'approx', 'dept', 'govt', 'corp', + 'inc', 'ltd', 'vs', 'cf', 'ibid', 'nb', 'ps', 'asap', + # EN references / textbook + 'p', 'pp', 'ch', 'chap', 'fig', 'figs', 'no', 'nos', 'nr', + 'vol', 'vols', 'ed', 'eds', 'rev', 'repr', 'trans', 'ff', + 'fn', 'sec', 'par', 'para', 'app', 'abbr', 'ex', 'exs', + 'ans', 'wb', 'tb', 'vocab', + # EN parts of speech / grammar + 'adj', 'adv', 'prep', 'conj', 'pron', 'det', 'art', 'interj', + 'aux', 'mod', 'inf', 'pt', 'pres', 'pret', 'ger', + 'sg', 'pl', 'sing', 'irreg', 'reg', 'intr', 'intrans', + 'refl', 'pass', 'imper', 'subj', 'ind', 'perf', 'fut', + 'attr', 'pred', 'comp', 'superl', 'pos', 'neg', + 'lit', 'colloq', 'sl', 'dial', 'arch', 'obs', 'fml', 'infml', + 'syn', 'ant', 'opp', 'var', 'orig', + # EN titles + 'mr', 'mrs', 'ms', 'dr', 'prof', 'st', 'jr', 'sr', + # EN pronunciation + 'br', 'am', 'brit', 'amer', + # EN units + 'hr', 'hrs', 'min', 'km', 'cm', 'mm', 'kg', 'mg', 'ml', + # DE general + 'usw', 'bzw', 'evtl', 'ggf', 'ggfs', 'sog', 'eigtl', 'allg', + 'bes', 'insb', 'insbes', 'bspw', 'ca', + 'od', 'ua', 'sa', 'vgl', 'zb', 'dh', 'zt', 'idr', + 'inkl', 'exkl', 'zzgl', 'abzgl', + # DE references + 'abs', 'abschn', 'abt', 'anm', 'ausg', 'aufl', 'bd', 'bde', + 'bearb', 'ebd', 'hrsg', 'hg', 'jg', 'jh', 'jhd', 'kap', + 's', 'sp', 'zit', 'zs', 'vlg', + # DE grammar + 'nom', 'akk', 
'dat', 'gen', 'konj', 'subst', 'obj', + 'praet', 'imp', 'part', 'mask', 'fem', 'neutr', + 'trennb', 'untrennb', 'ugs', 'geh', 'pej', + # DE regional + 'nordd', 'österr', 'schweiz', + # Linguistic + 'lex', 'morph', 'phon', 'phonet', 'sem', 'synt', 'etym', + 'deriv', 'pref', 'suf', 'suff', 'dim', 'coll', + 'count', 'uncount', 'indef', 'def', 'poss', 'demon', +} + def _is_noise_tail_token(token: str) -> bool: """Check if a token at the END of cell text is trailing OCR noise. @@ -3209,6 +3258,10 @@ def _is_noise_tail_token(token: str) -> bool: # Extract only alpha characters for dictionary lookup cleaned = ''.join(alpha_chars) + # Known abbreviations (e.g. "sth.", "usw.", "adj.") — always keep + if cleaned.lower() in _KNOWN_ABBREVIATIONS: + return False + # Strip normal trailing punctuation before checking for internal noise. stripped_punct = re.sub(r'[.,;:!?]+$', '', t) # "cupcakes." → "cupcakes" t_check = stripped_punct if stripped_punct else t @@ -3248,12 +3301,16 @@ def _is_garbage_text(text: str) -> bool: """ words = _RE_REAL_WORD.findall(text) if not words: + # Check if the text as a whole is a known dotted abbreviation (e.g. "e.g.") + alpha_only = ''.join(_RE_ALPHA.findall(text)).lower() + if alpha_only in _KNOWN_ABBREVIATIONS: + return False return True for w in words: wl = w.lower() - # Known short word → not garbage - if wl in _COMMON_SHORT_WORDS: + # Known short word or abbreviation → not garbage + if wl in _COMMON_SHORT_WORDS or wl in _KNOWN_ABBREVIATIONS: return False # Long word (>= 4 chars): check vowel/consonant ratio. # Real EN/DE words have 20-60% vowels. Garbage like "uanoaain" @@ -3280,7 +3337,10 @@ def _clean_cell_text(text: str) -> str: # --- Filter 1: No real word at all --- if not _RE_REAL_WORD.search(stripped): - return '' + # Exception: dotted abbreviations like "e.g.", "z.B.", "i.e." 
+ alpha_only = ''.join(_RE_ALPHA.findall(stripped)).lower() + if alpha_only not in _KNOWN_ABBREVIATIONS: + return '' # --- Filter 2: Entire text is garbage --- if _is_garbage_text(stripped):