# breakpilot-lehrer/klausur-service/backend/ocr/layout/classify_position.py

"""
Position-based column type classification for OCR layout analysis.

Contains Level 2 and Level 3 classification functions:
  Level 2 – _classify_by_position_enhanced: Position + language confirmation
  Level 3 – _classify_by_position_fallback: Pure positional (no regression)

Extracted from cv_layout_classify.py during file-size split.
"""

import logging
from typing import Dict, List, Optional

from cv_vocab_types import ColumnGeometry, PageRegion

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Level 2: Position-Enhanced Classification
# ---------------------------------------------------------------------------

def _classify_by_position_enhanced(geometries: List[ColumnGeometry],
                                    lang_scores: List[Dict[str, float]],
                                    content_w: int,
                                    content_h: int) -> Optional[List[PageRegion]]:
    """Level 2: Position-based rules enhanced with language confirmation.

    Applies the positional heuristics in a fixed order and consults the
    language scores in two places: a narrow first column is only treated
    as a page reference when it shows no strong language signal, and the
    default EN/DE assignment is swapped when both columns' scores point
    the other way.

    Rules, in order:
      1. First column narrow, inside the left 20% and without a strong
         language signal -> ``page_ref``.
      2. Very narrow columns with few words -> ``column_marker``.
      3. With three or more columns still untyped, the last one
         -> ``column_example``.
      4. The first two untyped columns -> ``column_en`` / ``column_de``;
         a single leftover column -> ``column_en``.
      5. Anything still untyped -> ``column_example``.

    Args:
        geometries: Detected columns. Index 0 is treated as the leftmost
            column (presumably the list is ordered left to right; verify
            against the caller).
        lang_scores: One score dict per entry in ``geometries``, read via
            the keys ``'eng'`` and ``'deu'``.
        content_w: Content width, used to derive the left-20% threshold.
        content_h: Content height, used as the height of every region.

    Returns:
        The classified regions sorted by ``x``. An empty list is returned
        when ``geometries`` is empty. This implementation never returns
        ``None``; the ``Optional`` annotation is kept for callers.
    """
    # Nothing to classify; also avoids indexing geometries[0] below.
    if not geometries:
        return []

    def _region(region_type: str, geom: ColumnGeometry,
                confidence: float) -> PageRegion:
        # All regions span the full content height and share the method tag.
        return PageRegion(
            type=region_type, x=geom.x, y=geom.y,
            width=geom.width, height=content_h,
            classification_confidence=confidence,
            classification_method='position_enhanced',
        )

    regions: List[PageRegion] = []
    untyped = list(range(len(geometries)))
    first = geometries[0]
    left_20_threshold = first.x + content_w * 0.20

    # Rule 1: Leftmost narrow column -> page_ref (only if in left 20%, no strong language)
    first_scores = lang_scores[0]
    has_strong_lang_0 = first_scores['eng'] > 0.3 or first_scores['deu'] > 0.3
    if first.width_ratio < 0.12 and first.x < left_20_threshold and not has_strong_lang_0:
        regions.append(_region('page_ref', first, 0.8))
        untyped.remove(0)

    # Rule 2: Narrow columns with few words -> marker
    still_untyped = []
    for idx in untyped:
        geom = geometries[idx]
        if geom.width_ratio < 0.06 and geom.word_count <= 15:
            regions.append(_region('column_marker', geom, 0.7))
        else:
            still_untyped.append(idx)
    untyped = still_untyped

    # Rule 3: Rightmost remaining -> column_example (if 3+ remaining)
    if len(untyped) >= 3:
        last_idx = untyped.pop()
        regions.append(_region('column_example', geometries[last_idx], 0.7))

    # Rule 4: First two remaining -> EN/DE, but check language to possibly swap
    if len(untyped) >= 2:
        idx_a, idx_b = untyped[0], untyped[1]
        ls_a = lang_scores[idx_a]
        ls_b = lang_scores[idx_b]

        # Default: first=EN, second=DE (old behavior)
        en_idx, de_idx = idx_a, idx_b
        conf = 0.7

        # Swap only if BOTH columns' scores indicate the opposite order.
        if ls_a['deu'] > ls_a['eng'] and ls_b['eng'] > ls_b['deu']:
            en_idx, de_idx = idx_b, idx_a
            conf = 0.85
            logger.info("ClassifyColumns: Level 2 swapped EN/DE based on language scores")

        regions.append(_region('column_en', geometries[en_idx], conf))
        regions.append(_region('column_de', geometries[de_idx], conf))
        untyped = untyped[2:]
    elif len(untyped) == 1:
        # A lone column defaults to EN with low confidence.
        regions.append(_region('column_en', geometries[untyped[0]], 0.5))
        untyped = []

    # Remaining -> example
    for idx in untyped:
        regions.append(_region('column_example', geometries[idx], 0.5))

    regions.sort(key=lambda r: r.x)
    return regions


# ---------------------------------------------------------------------------
# Level 3: Position Fallback Classification
# ---------------------------------------------------------------------------

def _classify_by_position_fallback(geometries: List[ColumnGeometry],
                                   content_w: int,
                                   content_h: int) -> List[PageRegion]:
    """Level 3: Pure position-based fallback (same rules as the old code).

    Assigns column types from position and width alone, without looking
    at language scores. Every region gets confidence 1.0 and the method
    tag ``'position_fallback'``.

    Rules, in order:
      1. First column narrow and inside the left 20% -> ``page_ref``.
      2. Very narrow columns with few words -> ``column_marker``.
      3. With three or more columns still untyped, the last one
         -> ``column_example``.
      4. The first two untyped columns -> ``column_en`` / ``column_de``;
         a single leftover column -> ``column_en``.
      5. Anything still untyped -> ``column_example``.

    Args:
        geometries: Detected columns. Index 0 is treated as the leftmost
            column (presumably the list is ordered left to right; verify
            against the caller).
        content_w: Content width, used to derive the left-20% threshold.
        content_h: Content height, used as the height of every region.

    Returns:
        The classified regions sorted by ``x``; an empty list when
        ``geometries`` is empty.
    """
    # Nothing to classify; also avoids indexing geometries[0] below.
    if not geometries:
        return []

    def _region(region_type: str, geom: ColumnGeometry) -> PageRegion:
        # All fallback regions share height, confidence and method tag.
        return PageRegion(
            type=region_type, x=geom.x, y=geom.y,
            width=geom.width, height=content_h,
            classification_confidence=1.0,
            classification_method='position_fallback',
        )

    regions: List[PageRegion] = []
    untyped = list(range(len(geometries)))
    first = geometries[0]
    left_20_threshold = first.x + content_w * 0.20

    # Rule 1: Leftmost narrow column -> page_ref (only if in left 20%)
    if first.width_ratio < 0.12 and first.x < left_20_threshold:
        regions.append(_region('page_ref', first))
        untyped.remove(0)

    # Rule 2: Narrow + few words -> marker
    still_untyped = []
    for idx in untyped:
        geom = geometries[idx]
        if geom.width_ratio < 0.06 and geom.word_count <= 15:
            regions.append(_region('column_marker', geom))
        else:
            still_untyped.append(idx)
    untyped = still_untyped

    # Rule 3: Rightmost remaining -> example (if 3+)
    if len(untyped) >= 3:
        last_idx = untyped.pop()
        regions.append(_region('column_example', geometries[last_idx]))

    # Rule 4: First remaining -> EN, second -> DE
    if len(untyped) >= 2:
        en_idx, de_idx = untyped[0], untyped[1]
        regions.append(_region('column_en', geometries[en_idx]))
        regions.append(_region('column_de', geometries[de_idx]))
        untyped = untyped[2:]
    elif len(untyped) == 1:
        # A lone column defaults to EN.
        regions.append(_region('column_en', geometries[untyped[0]]))
        untyped = []

    # Remaining -> example
    for idx in untyped:
        regions.append(_region('column_example', geometries[idx]))

    regions.sort(key=lambda r: r.x)
    return regions