From e9f368d3ecaee971cab66b971d70cbfd0a8338ae Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Mon, 2 Mar 2026 10:46:54 +0100 Subject: [PATCH] feat(ocr-pipeline): add abbreviation allowlist to noise filter Add _KNOWN_ABBREVIATIONS set with ~200 common EN/DE abbreviations (sth, sb, etc, eg, ie, usw, bzw, vgl, adj, adv, prep, sg, pl, ...). Tokens matching known abbreviations are never stripped as noise. Also handle dotted abbreviations (e.g., z.B., i.e.) that have no 2+ consecutive alpha chars by checking the abbreviation set before the _RE_REAL_WORD filter. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_vocab_pipeline.py | 66 +++++++++++++++++++- 1 file changed, 63 insertions(+), 3 deletions(-) diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 5a02616..3c4d0f1 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -3177,6 +3177,55 @@ _COMMON_SHORT_WORDS: set = { 'wut', 'zum', 'zur', } +# Known abbreviations found in EN/DE textbooks and dictionaries. +# Stored lowercase and WITHOUT periods (lookups use only the alpha chars, so "z.B." -> 'zb'). +# These rescue tokens like "sth." / "sb." / "usw." from being deleted. 
+_KNOWN_ABBREVIATIONS: set = { + # EN dictionary meta-words + 'sth', 'sb', 'smth', 'smb', 'sbd', + # EN general + 'etc', 'eg', 'ie', 'esp', 'approx', 'dept', 'govt', 'corp', + 'inc', 'ltd', 'vs', 'cf', 'ibid', 'nb', 'ps', 'asap', + # EN references / textbook + 'p', 'pp', 'ch', 'chap', 'fig', 'figs', 'no', 'nos', 'nr', + 'vol', 'vols', 'ed', 'eds', 'rev', 'repr', 'trans', 'ff', + 'fn', 'sec', 'par', 'para', 'app', 'abbr', 'ex', 'exs', + 'ans', 'wb', 'tb', 'vocab', + # EN parts of speech / grammar + 'adj', 'adv', 'prep', 'conj', 'pron', 'det', 'art', 'interj', + 'aux', 'mod', 'inf', 'pt', 'pres', 'pret', 'ger', + 'sg', 'pl', 'sing', 'irreg', 'reg', 'intr', 'intrans', + 'refl', 'pass', 'imper', 'subj', 'ind', 'perf', 'fut', + 'attr', 'pred', 'comp', 'superl', 'pos', 'neg', + 'lit', 'colloq', 'sl', 'dial', 'arch', 'obs', 'fml', 'infml', + 'syn', 'ant', 'opp', 'var', 'orig', + # EN titles + 'mr', 'mrs', 'ms', 'dr', 'prof', 'st', 'jr', 'sr', + # EN pronunciation + 'br', 'am', 'brit', 'amer', + # EN units + 'hr', 'hrs', 'min', 'km', 'cm', 'mm', 'kg', 'mg', 'ml', + # DE general + 'usw', 'bzw', 'evtl', 'ggf', 'ggfs', 'sog', 'eigtl', 'allg', + 'bes', 'insb', 'insbes', 'bspw', 'ca', + 'od', 'ua', 'sa', 'vgl', 'zb', 'dh', 'zt', 'idr', + 'inkl', 'exkl', 'zzgl', 'abzgl', + # DE references + 'abs', 'abschn', 'abt', 'anm', 'ausg', 'aufl', 'bd', 'bde', + 'bearb', 'ebd', 'hrsg', 'hg', 'jg', 'jh', 'jhd', 'kap', + 's', 'sp', 'zit', 'zs', 'vlg', + # DE grammar + 'nom', 'akk', 'dat', 'gen', 'konj', 'subst', 'obj', + 'praet', 'imp', 'part', 'mask', 'fem', 'neutr', + 'trennb', 'untrennb', 'ugs', 'geh', 'pej', + # DE regional + 'nordd', 'österr', 'schweiz', + # Linguistic + 'lex', 'morph', 'phon', 'phonet', 'sem', 'synt', 'etym', + 'deriv', 'pref', 'suf', 'suff', 'dim', 'coll', + 'count', 'uncount', 'indef', 'def', 'poss', 'demon', +} + def _is_noise_tail_token(token: str) -> bool: """Check if a token at the END of cell text is trailing OCR noise. 
@@ -3209,6 +3258,10 @@ def _is_noise_tail_token(token: str) -> bool: # Extract only alpha characters for dictionary lookup cleaned = ''.join(alpha_chars) + # Known abbreviations (e.g. "sth.", "usw.", "adj.") — always keep + if cleaned.lower() in _KNOWN_ABBREVIATIONS: + return False + # Strip normal trailing punctuation before checking for internal noise. stripped_punct = re.sub(r'[.,;:!?]+$', '', t) # "cupcakes." → "cupcakes" t_check = stripped_punct if stripped_punct else t @@ -3248,12 +3301,16 @@ def _is_garbage_text(text: str) -> bool: """ words = _RE_REAL_WORD.findall(text) if not words: + # Check if any token is a known abbreviation (e.g. "e.g.") + alpha_only = ''.join(_RE_ALPHA.findall(text)).lower() + if alpha_only in _KNOWN_ABBREVIATIONS: + return False return True for w in words: wl = w.lower() - # Known short word → not garbage - if wl in _COMMON_SHORT_WORDS: + # Known short word or abbreviation → not garbage + if wl in _COMMON_SHORT_WORDS or wl in _KNOWN_ABBREVIATIONS: return False # Long word (>= 4 chars): check vowel/consonant ratio. # Real EN/DE words have 20-60% vowels. Garbage like "uanoaain" @@ -3280,7 +3337,10 @@ def _clean_cell_text(text: str) -> str: # --- Filter 1: No real word at all --- if not _RE_REAL_WORD.search(stripped): - return '' + # Exception: dotted abbreviations like "e.g.", "z.B.", "i.e." + alpha_only = ''.join(_RE_ALPHA.findall(stripped)).lower() + if alpha_only not in _KNOWN_ABBREVIATIONS: + return '' # --- Filter 2: Entire text is garbage --- if _is_garbage_text(stripped):