# breakpilot-lehrer/klausur-service/backend/cv_layout_scoring.py

"""
Language scoring, role scoring, and dictionary detection/classification.

Extracted from cv_layout.py to keep modules under 500 LOC.

Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""

import logging
from collections import Counter
from typing import Any, Dict, List, Optional

from cv_vocab_types import (
    ColumnGeometry,
    ENGLISH_FUNCTION_WORDS,
    GERMAN_FUNCTION_WORDS,
    PageRegion,
)

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# --- Dictionary (Wörterbuch) detection ---

# Article words that appear as a dedicated column in dictionaries.
# All entries are lower-case; the dictionary-signal scoring in this module
# lower-cases OCR text before testing membership in this set.
_DICT_ARTICLE_WORDS = {
    # German articles (definite and indefinite, including inflected forms)
    "die", "der", "das", "dem", "den", "des", "ein", "eine", "einem", "einer",
    # English articles / infinitive marker
    "the", "a", "an", "to",
}


# --- Phase B: Content-Based Classification ---

def _score_language(words: List[Dict]) -> Dict[str, float]:
    """Score how English or German a column's words look.

    Only words with an OCR confidence above 40 and a non-empty text are
    scored.  The base score per language is the share of those words found in
    ``ENGLISH_FUNCTION_WORDS`` / ``GERMAN_FUNCTION_WORDS`` (compared
    lower-cased).  The German score is then boosted by:

    * umlauts / sharp s: +0.15 per occurrence, counting at most 5 occurrences;
    * capitalization: +0.1 when more than 5 words are scored and over 30 % of
      them are longer than two characters and start with an upper-case
      letter.  The position of a word within its sentence is not considered.

    Args:
        words: List of word dicts with 'text' and 'conf' keys.  A missing or
            ``None`` text is treated like an empty one and skipped.

    Returns:
        Dict with 'eng' and 'deu' scores (0.0-1.0), rounded to 3 decimals.
    """
    if not words:
        return {'eng': 0.0, 'deu': 0.0}

    # One filtered list drives every metric.  Dropping empty texts here (and
    # not only for the function-word lookup) keeps the first-character test
    # further down from raising IndexError on blank OCR tokens.
    raw_texts = []
    for w in words:
        if w.get('conf', 0) > 40:
            text = w.get('text') or ''
            if text:
                raw_texts.append(text)
    if not raw_texts:
        return {'eng': 0.0, 'deu': 0.0}

    total = len(raw_texts)
    lowered = [t.lower() for t in raw_texts]
    en_score = sum(1 for t in lowered if t in ENGLISH_FUNCTION_WORDS) / total
    de_score = sum(1 for t in lowered if t in GERMAN_FUNCTION_WORDS) / total

    # Umlauts are a strong German signal; every occurrence counts, capped at 5.
    umlaut_count = sum(1 for t in raw_texts for c in t if c in 'äöüÄÖÜß')
    if umlaut_count > 0:
        de_score = min(1.0, de_score + 0.15 * min(umlaut_count, 5))

    # German nouns are capitalized, so a high share of capitalized words
    # (longer than two characters) nudges the German score upwards.
    if total > 5:
        cap_words = sum(1 for t in raw_texts if len(t) > 2 and t[0].isupper())
        if cap_words / total > 0.3:
            de_score = min(1.0, de_score + 0.1)

    return {'eng': round(en_score, 3), 'deu': round(de_score, 3)}


def _score_role(geom: ColumnGeometry) -> Dict[str, float]:
    """Rate how well a column fits each structural role.

    The rating uses the column's relative width, its word count and index,
    and simple text statistics over words with an OCR confidence above 40
    (average length, punctuation, digits).

    Args:
        geom: ColumnGeometry with words and dimensions.

    Returns:
        Dict with the keys 'reference', 'marker', 'sentence' and
        'vocabulary', each rounded to 3 decimals.  All four are 0.0 when the
        column has no sufficiently confident words.
    """
    role_scores = dict.fromkeys(
        ('reference', 'marker', 'sentence', 'vocabulary'), 0.0)

    confident = [w['text'] for w in (geom.words or []) if w.get('conf', 0) > 40]
    if not confident:
        return role_scores

    n_texts = len(confident)
    mean_len = sum(map(len, confident)) / n_texts
    punctuated = sum(
        1 for text in confident if any(ch in '.!?;:,' for ch in text))
    with_digits = sum(
        1 for text in confident if any(ch.isdigit() for ch in text))
    digit_share = with_digits / n_texts
    width = geom.width_ratio

    # Reference: a narrow column, rated higher when digits dominate.
    if width < 0.12:
        if digit_share > 0.4:
            role_scores['reference'] = min(1.0, 0.5 + digit_share * 0.5)
        else:
            role_scores['reference'] = 0.5

    # Marker: a very narrow column with few, short entries ...
    marker = 0.0
    if width < 0.06 and geom.word_count <= 15:
        marker = 0.9 if mean_len < 4 else 0.7
    # ... or an extremely narrow column that is not the first one,
    # irrespective of how many words it holds.
    if width < 0.04 and geom.index > 0:
        marker = max(marker, 0.9)
    role_scores['marker'] = marker

    # Sentence: a wide column with punctuation, boosted for longer words.
    if width > 0.15 and punctuated > 2:
        sentence = 0.3 + min(0.5, punctuated / n_texts)
        if mean_len > 4:
            sentence = min(1.0, sentence + 0.2)
        role_scores['sentence'] = sentence

    # Vocabulary: medium width, boosted for medium word length.
    if 0.10 < width < 0.45:
        vocabulary = 0.4
        if 3 < mean_len < 8:
            vocabulary = min(1.0, vocabulary + 0.3)
        role_scores['vocabulary'] = vocabulary

    return {role: round(value, 3) for role, value in role_scores.items()}


# Folding applied before alphabetical comparisons so that German umlauts sort
# next to their base vowel (ä -> a, ö -> o, ü -> u, ß -> ss) instead of after
# "z", which is where a raw code-point comparison would place them.
_DICT_SORT_FOLD = str.maketrans({"ä": "a", "ö": "o", "ü": "u", "ß": "ss"})


def _dict_column_texts(geom, min_len: int, by_top: bool = False) -> List[str]:
    """Return a column's usable word texts, stripped and lower-cased.

    A word is kept when its OCR confidence is above 30 and its stripped text
    has at least ``min_len`` characters.  A missing or ``None`` text counts as
    empty.  With ``by_top`` the words are first ordered by their 'top' value.
    """
    words = geom.words or []
    if by_top:
        words = sorted(words, key=lambda w: w.get("top", 0))
    texts = []
    for w in words:
        if w.get("conf", 0) > 30:
            text = (w.get("text") or "").strip()
            if len(text) >= min_len:
                texts.append(text.lower())
    return texts


def _dict_alphabetical_signal(geometries) -> tuple:
    """Return (score, column index) of the most alphabetically ordered column.

    The score is the share of consecutive word pairs (top to bottom) that are
    in non-descending order.  Columns with fewer than 5 usable words are
    skipped; the index is -1 when no column qualifies.
    """
    best_score, best_col = 0.0, -1
    for geom in geometries:
        texts = _dict_column_texts(geom, min_len=2, by_top=True)
        if len(texts) < 5:
            continue
        # Deduplicate consecutive identical words (OCR double-reads).
        deduped = [texts[0]]
        for t in texts[1:]:
            if t != deduped[-1]:
                deduped.append(t)
        if len(deduped) < 5:
            continue
        # Compare umlaut-folded keys so "ärger" sorts between "apfel" and "arm".
        keys = [t.translate(_DICT_SORT_FOLD) for t in deduped]
        ordered_pairs = sum(1 for a, b in zip(keys, keys[1:]) if a <= b)
        score = ordered_pairs / (len(keys) - 1)
        if score > best_score:
            best_score, best_col = score, geom.index
    return best_score, best_col


def _dict_article_signal(geometries) -> tuple:
    """Return (density, density column, inline ratio, word ratio) for articles.

    Three patterns are measured over columns with at least 3 usable words:
      (a) density of article words in a narrow column (width_ratio <= 0.20),
      (b) share of multi-word texts that start with an article ("der X"),
      (c) share of article words in a column that holds at least 3 article
          words and at least 3 other words (OCR split "der Zustand" into
          separate word boxes).
    The density column is -1 when no narrow column contained an article.
    """
    best_density, best_col = 0.0, -1
    best_inline_ratio = 0.0
    best_word_ratio = 0.0
    inline_prefixes = tuple(art + " " for art in _DICT_ARTICLE_WORDS)

    for geom in geometries:
        texts = _dict_column_texts(geom, min_len=1)
        if len(texts) < 3:
            continue
        article_count = sum(1 for t in texts if t in _DICT_ARTICLE_WORDS)
        article_share = article_count / len(texts)

        # (a) Dedicated article column: narrow, mostly article words.
        if geom.width_ratio <= 0.20 and article_share > best_density:
            best_density, best_col = article_share, geom.index

        # (b) Inline articles: "der Zustand", "die Zutat", etc.
        inline_ratio = (
            sum(1 for t in texts if t.startswith(inline_prefixes)) / len(texts)
        )
        if inline_ratio > best_inline_ratio:
            best_inline_ratio = inline_ratio

        # (c) Articles mixed in among headwords.  A pure article column is
        # already covered by (a), hence the minimum of 3 non-article words.
        if (
            article_count >= 3
            and len(texts) - article_count >= 3
            and article_share > best_word_ratio
        ):
            best_word_ratio = article_share

    return best_density, best_col, best_inline_ratio, best_word_ratio


def _dict_first_letter_signal(geometries) -> tuple:
    """Return (uniformity, column index, has_letter_transition).

    Uniformity is the share of words starting with the column's most common
    initial letter.  When a column consists of 2 to 5 runs of initial letters
    in alphabetical order (e.g. A -> B), its uniformity is raised to at least
    0.70 and the transition flag is set.  Initial umlauts are folded to their
    base vowel.  Columns with fewer than 5 usable words are skipped.
    """
    best_uniformity, best_col = 0.0, -1
    has_transition = False
    for geom in geometries:
        texts = _dict_column_texts(geom, min_len=2, by_top=True)
        if len(texts) < 5:
            continue
        first_letters = [
            t[0].translate(_DICT_SORT_FOLD)[0] for t in texts if t[0].isalpha()
        ]
        if not first_letters:
            continue
        top_count = Counter(first_letters).most_common(1)[0][1]
        uniformity = top_count / len(first_letters)

        # Collapse consecutive words with the same initial into one run.
        runs = [first_letters[0]]
        for letter in first_letters[1:]:
            if letter != runs[-1]:
                runs.append(letter)
        if 2 <= len(runs) <= 5 and all(a <= b for a, b in zip(runs, runs[1:])):
            has_transition = True
            # Boost uniformity for orderly transitions.
            uniformity = max(uniformity, 0.70)

        if uniformity > best_uniformity:
            best_uniformity, best_col = uniformity, geom.index
    return best_uniformity, best_col, has_transition


def _score_dictionary_signals(
    geometries: List[ColumnGeometry],
    document_category: Optional[str] = None,
    margin_strip_detected: bool = False,
) -> Dict[str, Any]:
    """Score dictionary-specific patterns across all columns.

    Combines 4 signals into one confidence value:
      1. Alphabetical ordering of words in a column (weight 0.35)
      2. Article detection: dedicated column, inline "der X" texts, or a high
         share of article words (weight 0.25)
      3. First-letter uniformity (weight 0.25)
      4. Decorative A-Z margin strip, detected upstream (weight 0.15)

    A ``document_category`` of 'woerterbuch' adds 0.20 (capped at 1.0).  The
    page is classified as a dictionary when the combined value reaches 0.40.
    Alphabetical and first-letter comparisons treat ä/ö/ü as a/o/u and ß as
    ss.  Words without a text are ignored.

    Args:
        geometries: List of ColumnGeometry with words.
        document_category: User-selected category (e.g. 'woerterbuch').
        margin_strip_detected: Whether a decorative A-Z margin strip was found.

    Returns:
        Dict with 'is_dictionary', 'confidence', 'article_col_index',
        'headword_col_index', and 'signals' sub-dict.  With fewer than two
        columns the defaults (not a dictionary, empty signals) are returned.
    """
    result: Dict[str, Any] = {
        "is_dictionary": False,
        "confidence": 0.0,
        "article_col_index": None,
        "headword_col_index": None,
        "signals": {},
    }

    if not geometries or len(geometries) < 2:
        return result

    signals = result["signals"]

    # --- Signal 1: Alphabetical ordering per column (weight 0.35) ---
    best_alpha_score, best_alpha_col = _dict_alphabetical_signal(geometries)
    signals["alphabetical_score"] = round(best_alpha_score, 3)
    signals["alphabetical_col"] = best_alpha_col

    # --- Signal 2: Article detection (weight 0.25) ---
    (
        best_article_density,
        best_article_col,
        best_inline_article_ratio,
        best_article_word_ratio,
    ) = _dict_article_signal(geometries)
    # Use the strongest of the three article patterns.
    effective_article_score = max(
        best_article_density,
        best_inline_article_ratio,
        best_article_word_ratio * 0.8,  # slight discount for raw word ratio
    )
    signals["article_density"] = round(best_article_density, 3)
    signals["inline_article_ratio"] = round(best_inline_article_ratio, 3)
    signals["article_word_ratio"] = round(best_article_word_ratio, 3)
    signals["article_col"] = best_article_col

    # --- Signal 3: First-letter uniformity (weight 0.25) ---
    best_uniformity, best_uniform_col, has_letter_transition = (
        _dict_first_letter_signal(geometries)
    )
    signals["first_letter_uniformity"] = round(best_uniformity, 3)
    signals["uniform_col"] = best_uniform_col
    signals["has_letter_transition"] = has_letter_transition

    # --- Signal 4: Decorative margin strip (weight 0.15) ---
    signals["margin_strip_detected"] = margin_strip_detected

    # --- Combine signals ---
    s1 = min(best_alpha_score, 1.0) * 0.35
    s2 = min(effective_article_score, 1.0) * 0.25
    s3 = min(best_uniformity, 1.0) * 0.25
    s4 = (1.0 if margin_strip_detected else 0.0) * 0.15

    combined = s1 + s2 + s3 + s4

    # Boost if user set document_category to 'woerterbuch'
    if document_category == "woerterbuch":
        combined = min(1.0, combined + 0.20)
        signals["category_boost"] = True

    result["confidence"] = round(combined, 3)

    # Threshold: combined >= 0.40 to classify as dictionary
    # (at least 2 strong signals or 3 moderate ones)
    if combined >= 0.40:
        result["is_dictionary"] = True
        # Identify headword column: best alphabetical OR best uniform
        if best_alpha_col >= 0 and best_alpha_score >= 0.60:
            result["headword_col_index"] = best_alpha_col
        elif best_uniform_col >= 0 and best_uniformity >= 0.50:
            result["headword_col_index"] = best_uniform_col
        if best_article_col >= 0 and best_article_density >= 0.30:
            result["article_col_index"] = best_article_col
        # If inline articles are strong but no dedicated column, note it
        if best_inline_article_ratio >= 0.30 and result["article_col_index"] is None:
            signals["inline_articles_detected"] = True

    logger.info(
        "DictionaryDetection: combined=%.3f is_dict=%s signals=%s",
        combined, result["is_dictionary"], result["signals"],
    )

    return result


def _classify_dictionary_columns(
    geometries: List[ColumnGeometry],
    dict_signals: Dict[str, Any],
    lang_scores: List[Dict[str, float]],
    content_h: int,
) -> Optional[List[PageRegion]]:
    """Classify columns for a detected dictionary page.

    Assignment order:
      1. The article column named by ``dict_signals`` -> ``column_article``.
      2. The headword column named by ``dict_signals`` -> ``column_headword``
         (skipped when it is the same column as the article column).
      3. Every other column by content: ``column_ipa`` when more than 25 % of
         its words contain IPA/bracket characters, otherwise ``column_de`` or
         ``column_en`` according to the higher language score (which must
         exceed 0.05).
      4. Columns without a usable language signal fall back to position: the
         leftmost of them becomes ``column_en``, all others ``column_de``
         (confidence 0.4).  The outcome does not depend on the order of
         ``geometries``.

    Args:
        geometries: Columns of the page.
        dict_signals: Result of the dictionary signal scoring; must contain
            'confidence' and 'signals' when 'is_dictionary' is true.
        lang_scores: Language scores looked up by column index; a column
            whose index is beyond the list is treated as having no signal.
        content_h: Height assigned to every returned region.

    Returns:
        Regions sorted by x, or None when the page is not flagged as a
        dictionary.
    """
    if not dict_signals.get("is_dictionary"):
        return None

    def make_region(geom, col_type, confidence):
        # Every region spans the full content height of the page.
        return PageRegion(
            type=col_type,
            x=geom.x, y=geom.y,
            width=geom.width, height=content_h,
            classification_confidence=confidence,
            classification_method="dictionary",
        )

    regions: List[PageRegion] = []
    assigned = set()
    article_idx = dict_signals.get("article_col_index")
    headword_idx = dict_signals.get("headword_col_index")

    # 1. Assign article column if detected
    if article_idx is not None:
        article_geom = next(
            (g for g in geometries if g.index == article_idx), None)
        if article_geom is not None:
            regions.append(make_region(
                article_geom, "column_article",
                round(dict_signals["signals"].get("article_density", 0.5), 2),
            ))
            assigned.add(article_geom.index)

    # 2. Assign headword column
    if headword_idx is not None and headword_idx not in assigned:
        headword_geom = next(
            (g for g in geometries if g.index == headword_idx), None)
        if headword_geom is not None:
            regions.append(make_region(
                headword_geom, "column_headword",
                round(dict_signals["confidence"], 2),
            ))
            assigned.add(headword_geom.index)

    # 3. Classify remaining columns by content; a type of None marks columns
    #    that need the positional fallback.
    remaining = [g for g in geometries if g.index not in assigned]
    decided = []
    for geom in remaining:
        ls = lang_scores[geom.index] if geom.index < len(lang_scores) else {"eng": 0, "deu": 0}

        # Share of words that contain IPA symbols or brackets/slashes.
        ipa_words = sum(
            1 for w in geom.words
            if any(c in (w.get("text") or "") for c in "[]/ˈˌːɪəɒʊæɑɔ")
        )
        ipa_ratio = ipa_words / max(len(geom.words), 1)

        if ipa_ratio > 0.25:
            decided.append((geom, "column_ipa", round(min(1.0, ipa_ratio), 2)))
        elif ls["deu"] > ls["eng"] and ls["deu"] > 0.05:
            decided.append((geom, "column_de", round(ls["deu"], 2)))
        elif ls["eng"] > ls["deu"] and ls["eng"] > 0.05:
            decided.append((geom, "column_en", round(ls["eng"], 2)))
        else:
            decided.append((geom, None, 0.4))

    # 4. Positional fallback: leftmost undecided column = EN, the rest = DE.
    #    The leftmost column is picked once, up front.  Re-evaluating it per
    #    column (after earlier ones were already assigned) would label every
    #    fallback column EN when columns arrive in left-to-right order.
    undecided = [geom for geom, col_type, _ in decided if col_type is None]
    leftmost = min(undecided, key=lambda g: g.x) if undecided else None

    for geom, col_type, conf in decided:
        if col_type is None:
            col_type = "column_en" if geom is leftmost else "column_de"
        regions.append(make_region(geom, col_type, conf))

    regions.sort(key=lambda r: r.x)
    return regions