""" Language scoring, role scoring, and dictionary detection/classification. Extracted from cv_layout.py to keep modules under 500 LOC. Lizenz: Apache 2.0 (kommerziell nutzbar) DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. """ import logging from collections import Counter from typing import Any, Dict, List, Optional from cv_vocab_types import ( ColumnGeometry, ENGLISH_FUNCTION_WORDS, GERMAN_FUNCTION_WORDS, PageRegion, ) logger = logging.getLogger(__name__) # --- Dictionary / Wörterbuch Detection --- # Article words that appear as a dedicated column in dictionaries _DICT_ARTICLE_WORDS = { # German articles "die", "der", "das", "dem", "den", "des", "ein", "eine", "einem", "einer", # English articles / infinitive marker "the", "a", "an", "to", } # --- Phase B: Content-Based Classification --- def _score_language(words: List[Dict]) -> Dict[str, float]: """Score the language of a column's words. Analyzes function words, umlauts, and capitalization patterns to determine whether text is English or German. Args: words: List of word dicts with 'text' and 'conf' keys. Returns: Dict with 'eng' and 'deu' scores (0.0-1.0). """ if not words: return {'eng': 0.0, 'deu': 0.0} # Only consider words with decent confidence good_words = [w['text'].lower() for w in words if w.get('conf', 0) > 40 and len(w['text']) > 0] if not good_words: return {'eng': 0.0, 'deu': 0.0} total = len(good_words) en_hits = sum(1 for w in good_words if w in ENGLISH_FUNCTION_WORDS) de_hits = sum(1 for w in good_words if w in GERMAN_FUNCTION_WORDS) # Check for umlauts (strong German signal) raw_texts = [w['text'] for w in words if w.get('conf', 0) > 40] umlaut_count = sum(1 for t in raw_texts for c in t if c in 'äöüÄÖÜß') # German capitalization: nouns are capitalized mid-sentence # Count words that start with uppercase but aren't at position 0 cap_words = sum(1 for t in raw_texts if t[0].isupper() and len(t) > 2) en_score = en_hits / total if total > 0 else 0.0 de_score = de_hits / total if total > 0 else 0.0 # Boost German score for umlauts if umlaut_count > 0: de_score = min(1.0, de_score + 0.15 * min(umlaut_count, 5)) # Boost German score for high capitalization ratio (typical for German nouns) if total > 5: cap_ratio = cap_words / total if cap_ratio > 0.3: de_score = min(1.0, de_score + 0.1) return {'eng': round(en_score, 3), 'deu': round(de_score, 3)} def _score_role(geom: ColumnGeometry) -> Dict[str, float]: """Score the role of a column based on its geometry and content patterns. Args: geom: ColumnGeometry with words and dimensions. Returns: Dict with role scores: 'reference', 'marker', 'sentence', 'vocabulary'. """ scores = {'reference': 0.0, 'marker': 0.0, 'sentence': 0.0, 'vocabulary': 0.0} if not geom.words: return scores texts = [w['text'] for w in geom.words if w.get('conf', 0) > 40] if not texts: return scores avg_word_len = sum(len(t) for t in texts) / len(texts) has_punctuation = sum(1 for t in texts if any(c in t for c in '.!?;:,')) digit_words = sum(1 for t in texts if any(c.isdigit() for c in t)) digit_ratio = digit_words / len(texts) if texts else 0.0 # Reference: narrow + mostly numbers/page references if geom.width_ratio < 0.12: scores['reference'] = 0.5 if digit_ratio > 0.4: scores['reference'] = min(1.0, 0.5 + digit_ratio * 0.5) # Marker: narrow + few short entries if geom.width_ratio < 0.06 and geom.word_count <= 15: scores['marker'] = 0.7 if avg_word_len < 4: scores['marker'] = 0.9 # Very narrow non-edge column → strong marker regardless of word count if geom.width_ratio < 0.04 and geom.index > 0: scores['marker'] = max(scores['marker'], 0.9) # Sentence: longer words + punctuation present if geom.width_ratio > 0.15 and has_punctuation > 2: scores['sentence'] = 0.3 + min(0.5, has_punctuation / len(texts)) if avg_word_len > 4: scores['sentence'] = min(1.0, scores['sentence'] + 0.2) # Vocabulary: medium width + medium word length if 0.10 < geom.width_ratio < 0.45: scores['vocabulary'] = 0.4 if 3 < avg_word_len < 8: scores['vocabulary'] = min(1.0, scores['vocabulary'] + 0.3) return {k: round(v, 3) for k, v in scores.items()} def _score_dictionary_signals( geometries: List[ColumnGeometry], document_category: Optional[str] = None, margin_strip_detected: bool = False, ) -> Dict[str, Any]: """Score dictionary-specific patterns across all columns. Combines 4 independent signals to determine if the page is a dictionary: 1. Alphabetical ordering of words in each column 2. Article column detection (der/die/das, to) 3. First-letter uniformity (most headwords share a letter) 4. Decorative A-Z margin strip (detected upstream) Args: geometries: List of ColumnGeometry with words. document_category: User-selected category (e.g. 'woerterbuch'). margin_strip_detected: Whether a decorative A-Z margin strip was found. Returns: Dict with 'is_dictionary', 'confidence', 'article_col_index', 'headword_col_index', and 'signals' sub-dict. """ result: Dict[str, Any] = { "is_dictionary": False, "confidence": 0.0, "article_col_index": None, "headword_col_index": None, "signals": {}, } if not geometries or len(geometries) < 2: return result # --- Signal 1: Alphabetical ordering per column (weight 0.35) --- best_alpha_score = 0.0 best_alpha_col = -1 for geom in geometries: texts = [ w["text"].strip().lower() for w in sorted(geom.words, key=lambda w: w.get("top", 0)) if w.get("conf", 0) > 30 and len(w["text"].strip()) >= 2 ] if len(texts) < 5: continue # Deduplicate consecutive identical words (OCR double-reads) deduped = [texts[0]] for t in texts[1:]: if t != deduped[-1]: deduped.append(t) if len(deduped) < 5: continue # Count consecutive pairs in alphabetical order ordered_pairs = sum( 1 for i in range(len(deduped) - 1) if deduped[i] <= deduped[i + 1] ) alpha_score = ordered_pairs / (len(deduped) - 1) if alpha_score > best_alpha_score: best_alpha_score = alpha_score best_alpha_col = geom.index result["signals"]["alphabetical_score"] = round(best_alpha_score, 3) result["signals"]["alphabetical_col"] = best_alpha_col # --- Signal 2: Article detection (weight 0.25) --- # Check three patterns: # (a) Dedicated narrow article column (der/die/das only) # (b) Inline articles: multi-word texts starting with "der X", "die X" # (c) High article word frequency: many individual words ARE articles # (common when OCR splits "der Zustand" into separate word_boxes) best_article_density = 0.0 best_article_col = -1 best_inline_article_ratio = 0.0 best_article_word_ratio = 0.0 for geom in geometries: texts = [ w["text"].strip().lower() for w in geom.words if w.get("conf", 0) > 30 and len(w["text"].strip()) > 0 ] if len(texts) < 3: continue # (a) Dedicated article column: narrow, mostly article words article_count = sum(1 for t in texts if t in _DICT_ARTICLE_WORDS) if geom.width_ratio <= 0.20: density = article_count / len(texts) if density > best_article_density: best_article_density = density best_article_col = geom.index # (b) Inline articles: "der Zustand", "die Zutat", etc. inline_count = sum( 1 for t in texts if any(t.startswith(art + " ") for art in _DICT_ARTICLE_WORDS) ) inline_ratio = inline_count / len(texts) if inline_ratio > best_inline_article_ratio: best_inline_article_ratio = inline_ratio # (c) Article word frequency in any column (for OCR-split word_boxes) # In dictionaries, articles appear frequently among headwords # Require at least 10% articles and >= 3 article words if article_count >= 3: art_ratio = article_count / len(texts) # Only count if column has enough non-article words too # (pure article column is handled by (a)) non_art = len(texts) - article_count if non_art >= 3 and art_ratio > best_article_word_ratio: best_article_word_ratio = art_ratio # Use the strongest signal effective_article_score = max( best_article_density, best_inline_article_ratio, best_article_word_ratio * 0.8, # slight discount for raw word ratio ) result["signals"]["article_density"] = round(best_article_density, 3) result["signals"]["inline_article_ratio"] = round(best_inline_article_ratio, 3) result["signals"]["article_word_ratio"] = round(best_article_word_ratio, 3) result["signals"]["article_col"] = best_article_col # --- Signal 3: First-letter uniformity (weight 0.25) --- best_uniformity = 0.0 best_uniform_col = -1 has_letter_transition = False for geom in geometries: texts = [ w["text"].strip().lower() for w in sorted(geom.words, key=lambda w: w.get("top", 0)) if w.get("conf", 0) > 30 and len(w["text"].strip()) >= 2 ] if len(texts) < 5: continue # Count first letters first_letters = [t[0] for t in texts if t[0].isalpha()] if not first_letters: continue letter_counts = Counter(first_letters) most_common_letter, most_common_count = letter_counts.most_common(1)[0] uniformity = most_common_count / len(first_letters) # Check for orderly letter transitions (A→B or Y→Z) # Group consecutive words by first letter, check if groups are in order groups = [] current_letter = first_letters[0] for fl in first_letters: if fl != current_letter: groups.append(current_letter) current_letter = fl groups.append(current_letter) if len(groups) >= 2 and len(groups) <= 5: # Check if groups are alphabetically ordered if all(groups[i] <= groups[i + 1] for i in range(len(groups) - 1)): has_letter_transition = True # Boost uniformity for orderly transitions uniformity = max(uniformity, 0.70) if uniformity > best_uniformity: best_uniformity = uniformity best_uniform_col = geom.index result["signals"]["first_letter_uniformity"] = round(best_uniformity, 3) result["signals"]["uniform_col"] = best_uniform_col result["signals"]["has_letter_transition"] = has_letter_transition # --- Signal 4: Decorative margin strip (weight 0.15) --- result["signals"]["margin_strip_detected"] = margin_strip_detected # --- Combine signals --- s1 = min(best_alpha_score, 1.0) * 0.35 s2 = min(effective_article_score, 1.0) * 0.25 s3 = min(best_uniformity, 1.0) * 0.25 s4 = (1.0 if margin_strip_detected else 0.0) * 0.15 combined = s1 + s2 + s3 + s4 # Boost if user set document_category to 'woerterbuch' if document_category == "woerterbuch": combined = min(1.0, combined + 0.20) result["signals"]["category_boost"] = True result["confidence"] = round(combined, 3) # Threshold: combined >= 0.40 to classify as dictionary # (at least 2 strong signals or 3 moderate ones) if combined >= 0.40: result["is_dictionary"] = True # Identify headword column: best alphabetical OR best uniform if best_alpha_col >= 0 and best_alpha_score >= 0.60: result["headword_col_index"] = best_alpha_col elif best_uniform_col >= 0 and best_uniformity >= 0.50: result["headword_col_index"] = best_uniform_col if best_article_col >= 0 and best_article_density >= 0.30: result["article_col_index"] = best_article_col # If inline articles are strong but no dedicated column, note it if best_inline_article_ratio >= 0.30 and result["article_col_index"] is None: result["signals"]["inline_articles_detected"] = True logger.info( "DictionaryDetection: combined=%.3f is_dict=%s signals=%s", combined, result["is_dictionary"], result["signals"], ) return result def _classify_dictionary_columns( geometries: List[ColumnGeometry], dict_signals: Dict[str, Any], lang_scores: List[Dict[str, float]], content_h: int, ) -> Optional[List[PageRegion]]: """Classify columns for a detected dictionary page. Assigns column_headword, column_article, column_ipa, and column_de/column_en based on dictionary signals and language scores. Returns None if classification fails. """ if not dict_signals.get("is_dictionary"): return None regions: List[PageRegion] = [] assigned = set() article_idx = dict_signals.get("article_col_index") headword_idx = dict_signals.get("headword_col_index") # 1. Assign article column if detected if article_idx is not None: for geom in geometries: if geom.index == article_idx: regions.append(PageRegion( type="column_article", x=geom.x, y=geom.y, width=geom.width, height=content_h, classification_confidence=round( dict_signals["signals"].get("article_density", 0.5), 2), classification_method="dictionary", )) assigned.add(geom.index) break # 2. Assign headword column if headword_idx is not None and headword_idx not in assigned: for geom in geometries: if geom.index == headword_idx: regions.append(PageRegion( type="column_headword", x=geom.x, y=geom.y, width=geom.width, height=content_h, classification_confidence=round( dict_signals["confidence"], 2), classification_method="dictionary", )) assigned.add(geom.index) break # 3. Assign remaining columns by language + content remaining = [g for g in geometries if g.index not in assigned] for geom in remaining: ls = lang_scores[geom.index] if geom.index < len(lang_scores) else {"eng": 0, "deu": 0} # Check if column contains IPA (brackets like [, /, ˈ) ipa_chars = sum( 1 for w in geom.words if any(c in (w.get("text") or "") for c in "[]/ˈˌːɪəɒʊæɑɔ") ) ipa_ratio = ipa_chars / max(len(geom.words), 1) if ipa_ratio > 0.25: col_type = "column_ipa" conf = round(min(1.0, ipa_ratio), 2) elif ls["deu"] > ls["eng"] and ls["deu"] > 0.05: col_type = "column_de" conf = round(ls["deu"], 2) elif ls["eng"] > ls["deu"] and ls["eng"] > 0.05: col_type = "column_en" conf = round(ls["eng"], 2) else: # Positional fallback: leftmost unassigned = EN, next = DE left_unassigned = sorted( [g for g in remaining if g.index not in assigned], key=lambda g: g.x, ) if geom == left_unassigned[0] if left_unassigned else None: col_type = "column_en" else: col_type = "column_de" conf = 0.4 regions.append(PageRegion( type=col_type, x=geom.x, y=geom.y, width=geom.width, height=content_h, classification_confidence=conf, classification_method="dictionary", )) assigned.add(geom.index) regions.sort(key=lambda r: r.x) return regions