Files
breakpilot-lehrer/klausur-service/backend/cv_layout_classify_position.py
Benjamin Admin 9ba420fa91
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
Fix: Remove broken getKlausurApiUrl and clean up empty lines
sed replacement left orphaned hostname references in story page
and empty lines in getApiBase functions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-24 16:02:04 +02:00

219 lines
8.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Position-based column type classification for OCR layout analysis.
Contains Level 2 and Level 3 classification functions:
Level 2 _classify_by_position_enhanced: Position + language confirmation
Level 3 _classify_by_position_fallback: Pure positional (no regression)
Extracted from cv_layout_classify.py during file-size split.
"""
import logging
from typing import Dict, List, Optional
from cv_vocab_types import ColumnGeometry, PageRegion
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Level 2: Position-Enhanced Classification
# ---------------------------------------------------------------------------
def _classify_by_position_enhanced(geometries: List[ColumnGeometry],
lang_scores: List[Dict[str, float]],
content_w: int,
content_h: int) -> Optional[List[PageRegion]]:
"""Level 2: Position-based rules enhanced with language confirmation.
Uses the old positional heuristics but confirms EN/DE assignment
with language scores (swapping if needed).
"""
regions = []
untyped = list(range(len(geometries)))
first_x = geometries[0].x if geometries else 0
left_20_threshold = first_x + content_w * 0.20
# Rule 1: Leftmost narrow column -> page_ref (only if in left 20%, no strong language)
g0 = geometries[0]
ls0 = lang_scores[0]
has_strong_lang_0 = ls0['eng'] > 0.3 or ls0['deu'] > 0.3
if g0.width_ratio < 0.12 and g0.x < left_20_threshold and not has_strong_lang_0:
regions.append(PageRegion(
type='page_ref', x=g0.x, y=g0.y,
width=g0.width, height=content_h,
classification_confidence=0.8,
classification_method='position_enhanced',
))
untyped.remove(0)
# Rule 2: Narrow columns with few words -> marker
for i in list(untyped):
geom = geometries[i]
if geom.width_ratio < 0.06 and geom.word_count <= 15:
regions.append(PageRegion(
type='column_marker', x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=0.7,
classification_method='position_enhanced',
))
untyped.remove(i)
# Rule 3: Rightmost remaining -> column_example (if 3+ remaining)
if len(untyped) >= 3:
last_idx = untyped[-1]
geom = geometries[last_idx]
regions.append(PageRegion(
type='column_example', x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=0.7,
classification_method='position_enhanced',
))
untyped.remove(last_idx)
# Rule 4: First two remaining -> EN/DE, but check language to possibly swap
if len(untyped) >= 2:
idx_a = untyped[0]
idx_b = untyped[1]
ls_a = lang_scores[idx_a]
ls_b = lang_scores[idx_b]
# Default: first=EN, second=DE (old behavior)
en_idx, de_idx = idx_a, idx_b
conf = 0.7
# Swap if language signals clearly indicate the opposite
if ls_a['deu'] > ls_a['eng'] and ls_b['eng'] > ls_b['deu']:
en_idx, de_idx = idx_b, idx_a
conf = 0.85
logger.info(f"ClassifyColumns: Level 2 swapped EN/DE based on language scores")
regions.append(PageRegion(
type='column_en', x=geometries[en_idx].x, y=geometries[en_idx].y,
width=geometries[en_idx].width, height=content_h,
classification_confidence=conf,
classification_method='position_enhanced',
))
regions.append(PageRegion(
type='column_de', x=geometries[de_idx].x, y=geometries[de_idx].y,
width=geometries[de_idx].width, height=content_h,
classification_confidence=conf,
classification_method='position_enhanced',
))
untyped = untyped[2:]
elif len(untyped) == 1:
idx = untyped[0]
geom = geometries[idx]
regions.append(PageRegion(
type='column_en', x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=0.5,
classification_method='position_enhanced',
))
untyped = []
# Remaining -> example
for idx in untyped:
geom = geometries[idx]
regions.append(PageRegion(
type='column_example', x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=0.5,
classification_method='position_enhanced',
))
regions.sort(key=lambda r: r.x)
return regions
# ---------------------------------------------------------------------------
# Level 3: Position Fallback Classification
# ---------------------------------------------------------------------------
def _classify_by_position_fallback(geometries: List[ColumnGeometry],
content_w: int,
content_h: int) -> List[PageRegion]:
"""Level 3: Pure position-based fallback (identical to old code).
Guarantees no regression from the previous behavior.
"""
regions = []
untyped = list(range(len(geometries)))
first_x = geometries[0].x if geometries else 0
left_20_threshold = first_x + content_w * 0.20
# Rule 1: Leftmost narrow column -> page_ref (only if in left 20%)
g0 = geometries[0]
if g0.width_ratio < 0.12 and g0.x < left_20_threshold:
regions.append(PageRegion(
type='page_ref', x=g0.x, y=g0.y,
width=g0.width, height=content_h,
classification_confidence=1.0,
classification_method='position_fallback',
))
untyped.remove(0)
# Rule 2: Narrow + few words -> marker
for i in list(untyped):
geom = geometries[i]
if geom.width_ratio < 0.06 and geom.word_count <= 15:
regions.append(PageRegion(
type='column_marker', x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=1.0,
classification_method='position_fallback',
))
untyped.remove(i)
# Rule 3: Rightmost remaining -> example (if 3+)
if len(untyped) >= 3:
last_idx = untyped[-1]
geom = geometries[last_idx]
regions.append(PageRegion(
type='column_example', x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=1.0,
classification_method='position_fallback',
))
untyped.remove(last_idx)
# Rule 4: First remaining -> EN, second -> DE
if len(untyped) >= 2:
en_idx = untyped[0]
de_idx = untyped[1]
regions.append(PageRegion(
type='column_en', x=geometries[en_idx].x, y=geometries[en_idx].y,
width=geometries[en_idx].width, height=content_h,
classification_confidence=1.0,
classification_method='position_fallback',
))
regions.append(PageRegion(
type='column_de', x=geometries[de_idx].x, y=geometries[de_idx].y,
width=geometries[de_idx].width, height=content_h,
classification_confidence=1.0,
classification_method='position_fallback',
))
untyped = untyped[2:]
elif len(untyped) == 1:
idx = untyped[0]
geom = geometries[idx]
regions.append(PageRegion(
type='column_en', x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=1.0,
classification_method='position_fallback',
))
untyped = []
for idx in untyped:
geom = geometries[idx]
regions.append(PageRegion(
type='column_example', x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=1.0,
classification_method='position_fallback',
))
regions.sort(key=lambda r: r.x)
return regions