feat(ocr-pipeline): add abbreviation allowlist to noise filter

Add a _KNOWN_ABBREVIATIONS set with ~200 common EN/DE abbreviations (sth, sb, etc, eg, ie, usw, bzw, vgl, adj, adv, prep, sg, pl, ...). Tokens that match a known abbreviation are never stripped as noise. Also handle dotted abbreviations such as "e.g.", "z.B." and "i.e.", which contain no run of two or more consecutive letters and were therefore rejected by the _RE_REAL_WORD filter: the abbreviation set is now consulted before that filter discards the text.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-02 10:46:54 +01:00
parent 3028f421b4
commit e9f368d3ec
1 changed files with 63 additions and 3 deletions
@@ -3177,6 +3177,55 @@ _COMMON_SHORT_WORDS: set = {
    'wut', 'zum', 'zur',
 }

+# Known abbreviations found in EN/DE textbooks and dictionaries.
+# Stored lowercase with NO periods at all (lookups compare letters only), e.g. "z.B." -> 'zb'.
+# These rescue tokens like "sth." / "sb." / "usw." from being deleted.
+_KNOWN_ABBREVIATIONS: set = {
+    # EN dictionary meta-words
+    'sth', 'sb', 'smth', 'smb', 'sbd',
+    # EN general
+    'etc', 'eg', 'ie', 'esp', 'approx', 'dept', 'govt', 'corp',
+    'inc', 'ltd', 'vs', 'cf', 'ibid', 'nb', 'ps', 'asap',
+    # EN references / textbook
+    'p', 'pp', 'ch', 'chap', 'fig', 'figs', 'no', 'nos', 'nr',
+    'vol', 'vols', 'ed', 'eds', 'rev', 'repr', 'trans', 'ff',
+    'fn', 'sec', 'par', 'para', 'app', 'abbr', 'ex', 'exs',
+    'ans', 'wb', 'tb', 'vocab',
+    # EN parts of speech / grammar
+    'adj', 'adv', 'prep', 'conj', 'pron', 'det', 'art', 'interj',
+    'aux', 'mod', 'inf', 'pt', 'pres', 'pret', 'ger',
+    'sg', 'pl', 'sing', 'irreg', 'reg', 'intr', 'intrans',
+    'refl', 'pass', 'imper', 'subj', 'ind', 'perf', 'fut',
+    'attr', 'pred', 'comp', 'superl', 'pos', 'neg',
+    'lit', 'colloq', 'sl', 'dial', 'arch', 'obs', 'fml', 'infml',
+    'syn', 'ant', 'opp', 'var', 'orig',
+    # EN titles
+    'mr', 'mrs', 'ms', 'dr', 'prof', 'st', 'jr', 'sr',
+    # EN pronunciation
+    'br', 'am', 'brit', 'amer',
+    # EN units
+    'hr', 'hrs', 'min', 'km', 'cm', 'mm', 'kg', 'mg', 'ml',
+    # DE general
+    'usw', 'bzw', 'evtl', 'ggf', 'ggfs', 'sog', 'eigtl', 'allg',
+    'bes', 'insb', 'insbes', 'bspw', 'ca',
+    'od', 'ua', 'sa', 'vgl', 'zb', 'dh', 'zt', 'idr',
+    'inkl', 'exkl', 'zzgl', 'abzgl',
+    # DE references
+    'abs', 'abschn', 'abt', 'anm', 'ausg', 'aufl', 'bd', 'bde',
+    'bearb', 'ebd', 'hrsg', 'hg', 'jg', 'jh', 'jhd', 'kap',
+    's', 'sp', 'zit', 'zs', 'vlg',
+    # DE grammar
+    'nom', 'akk', 'dat', 'gen', 'konj', 'subst', 'obj',
+    'praet', 'imp', 'part', 'mask', 'fem', 'neutr',
+    'trennb', 'untrennb', 'ugs', 'geh', 'pej',
+    # DE regional
+    'nordd', 'österr', 'schweiz',
+    # Linguistic
+    'lex', 'morph', 'phon', 'phonet', 'sem', 'synt', 'etym',
+    'deriv', 'pref', 'suf', 'suff', 'dim', 'coll',
+    'count', 'uncount', 'indef', 'def', 'poss', 'demon',
+}
+

 def _is_noise_tail_token(token: str) -> bool:
    """Check if a token at the END of cell text is trailing OCR noise.
@@ -3209,6 +3258,10 @@ def _is_noise_tail_token(token: str) -> bool:
    # Extract only alpha characters for dictionary lookup
    cleaned = ''.join(alpha_chars)

+    # Known abbreviations (e.g. "sth.", "usw.", "adj.") — always keep
+    if cleaned.lower() in _KNOWN_ABBREVIATIONS:
+        return False
+
    # Strip normal trailing punctuation before checking for internal noise.
    stripped_punct = re.sub(r'[.,;:!?]+$', '', t)  # "cupcakes." → "cupcakes"
    t_check = stripped_punct if stripped_punct else t
@@ -3248,12 +3301,16 @@ def _is_garbage_text(text: str) -> bool:
    """
    words = _RE_REAL_WORD.findall(text)
    if not words:
+        # No real word: keep the text if all its letters, joined, form a known abbreviation ("e.g." -> "eg")
+        alpha_only = ''.join(_RE_ALPHA.findall(text)).lower()
+        if alpha_only in _KNOWN_ABBREVIATIONS:
+            return False
        return True

    for w in words:
        wl = w.lower()
-        # Known short word → not garbage
-        if wl in _COMMON_SHORT_WORDS:
+        # Known short word or abbreviation → not garbage
+        if wl in _COMMON_SHORT_WORDS or wl in _KNOWN_ABBREVIATIONS:
            return False
        # Long word (>= 4 chars): check vowel/consonant ratio.
        # Real EN/DE words have 20-60% vowels.  Garbage like "uanoaain"
@@ -3280,7 +3337,10 @@ def _clean_cell_text(text: str) -> str:

    # --- Filter 1: No real word at all ---
    if not _RE_REAL_WORD.search(stripped):
-        return ''
+        # Exception: dotted abbreviations like "e.g.", "z.B.", "i.e."
+        alpha_only = ''.join(_RE_ALPHA.findall(stripped)).lower()
+        if alpha_only not in _KNOWN_ABBREVIATIONS:
+            return ''

    # --- Filter 2: Entire text is garbage ---
    if _is_garbage_text(stripped):