Fix: Remove broken getKlausurApiUrl and clean up empty lines

sed replacement left orphaned hostname references in story page and empty lines in getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-24 16:02:04 +02:00
parent b07f802c24
commit 9ba420fa91
150 changed files with 30231 additions and 32053 deletions
--- a/klausur-service/backend/cv_layout_classify_position.py
+++ b/klausur-service/backend/cv_layout_classify_position.py
@@ -0,0 +1,218 @@
+"""
+Position-based column type classification for OCR layout analysis.
+
+Contains Level 2 and Level 3 classification functions:
+  Level 2 – _classify_by_position_enhanced: Position + language confirmation
+  Level 3 – _classify_by_position_fallback: Pure positional (no regression)
+
+Extracted from cv_layout_classify.py during file-size split.
+"""
+
+import logging
+from typing import Dict, List, Optional
+
+from cv_vocab_types import ColumnGeometry, PageRegion
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Level 2: Position-Enhanced Classification
+# ---------------------------------------------------------------------------
+
+def _classify_by_position_enhanced(geometries: List[ColumnGeometry],
+                                    lang_scores: List[Dict[str, float]],
+                                    content_w: int,
+                                    content_h: int) -> Optional[List[PageRegion]]:
+    """Level 2: Position-based rules enhanced with language confirmation.
+
+    Uses the old positional heuristics but confirms EN/DE assignment
+    with language scores (swapping if needed).
+    """
+    regions = []
+    untyped = list(range(len(geometries)))
+    first_x = geometries[0].x if geometries else 0
+    left_20_threshold = first_x + content_w * 0.20
+
+    # Rule 1: Leftmost narrow column -> page_ref (only if in left 20%, no strong language)
+    g0 = geometries[0]
+    ls0 = lang_scores[0]
+    has_strong_lang_0 = ls0['eng'] > 0.3 or ls0['deu'] > 0.3
+    if g0.width_ratio < 0.12 and g0.x < left_20_threshold and not has_strong_lang_0:
+        regions.append(PageRegion(
+            type='page_ref', x=g0.x, y=g0.y,
+            width=g0.width, height=content_h,
+            classification_confidence=0.8,
+            classification_method='position_enhanced',
+        ))
+        untyped.remove(0)
+
+    # Rule 2: Narrow columns with few words -> marker
+    for i in list(untyped):
+        geom = geometries[i]
+        if geom.width_ratio < 0.06 and geom.word_count <= 15:
+            regions.append(PageRegion(
+                type='column_marker', x=geom.x, y=geom.y,
+                width=geom.width, height=content_h,
+                classification_confidence=0.7,
+                classification_method='position_enhanced',
+            ))
+            untyped.remove(i)
+
+    # Rule 3: Rightmost remaining -> column_example (if 3+ remaining)
+    if len(untyped) >= 3:
+        last_idx = untyped[-1]
+        geom = geometries[last_idx]
+        regions.append(PageRegion(
+            type='column_example', x=geom.x, y=geom.y,
+            width=geom.width, height=content_h,
+            classification_confidence=0.7,
+            classification_method='position_enhanced',
+        ))
+        untyped.remove(last_idx)
+
+    # Rule 4: First two remaining -> EN/DE, but check language to possibly swap
+    if len(untyped) >= 2:
+        idx_a = untyped[0]
+        idx_b = untyped[1]
+        ls_a = lang_scores[idx_a]
+        ls_b = lang_scores[idx_b]
+
+        # Default: first=EN, second=DE (old behavior)
+        en_idx, de_idx = idx_a, idx_b
+        conf = 0.7
+
+        # Swap if language signals clearly indicate the opposite
+        if ls_a['deu'] > ls_a['eng'] and ls_b['eng'] > ls_b['deu']:
+            en_idx, de_idx = idx_b, idx_a
+            conf = 0.85
+            logger.info(f"ClassifyColumns: Level 2 swapped EN/DE based on language scores")
+
+        regions.append(PageRegion(
+            type='column_en', x=geometries[en_idx].x, y=geometries[en_idx].y,
+            width=geometries[en_idx].width, height=content_h,
+            classification_confidence=conf,
+            classification_method='position_enhanced',
+        ))
+        regions.append(PageRegion(
+            type='column_de', x=geometries[de_idx].x, y=geometries[de_idx].y,
+            width=geometries[de_idx].width, height=content_h,
+            classification_confidence=conf,
+            classification_method='position_enhanced',
+        ))
+        untyped = untyped[2:]
+    elif len(untyped) == 1:
+        idx = untyped[0]
+        geom = geometries[idx]
+        regions.append(PageRegion(
+            type='column_en', x=geom.x, y=geom.y,
+            width=geom.width, height=content_h,
+            classification_confidence=0.5,
+            classification_method='position_enhanced',
+        ))
+        untyped = []
+
+    # Remaining -> example
+    for idx in untyped:
+        geom = geometries[idx]
+        regions.append(PageRegion(
+            type='column_example', x=geom.x, y=geom.y,
+            width=geom.width, height=content_h,
+            classification_confidence=0.5,
+            classification_method='position_enhanced',
+        ))
+
+    regions.sort(key=lambda r: r.x)
+    return regions
+
+
+# ---------------------------------------------------------------------------
+# Level 3: Position Fallback Classification
+# ---------------------------------------------------------------------------
+
+def _classify_by_position_fallback(geometries: List[ColumnGeometry],
+                                   content_w: int,
+                                   content_h: int) -> List[PageRegion]:
+    """Level 3: Pure position-based fallback (identical to old code).
+
+    Guarantees no regression from the previous behavior.
+    """
+    regions = []
+    untyped = list(range(len(geometries)))
+    first_x = geometries[0].x if geometries else 0
+    left_20_threshold = first_x + content_w * 0.20
+
+    # Rule 1: Leftmost narrow column -> page_ref (only if in left 20%)
+    g0 = geometries[0]
+    if g0.width_ratio < 0.12 and g0.x < left_20_threshold:
+        regions.append(PageRegion(
+            type='page_ref', x=g0.x, y=g0.y,
+            width=g0.width, height=content_h,
+            classification_confidence=1.0,
+            classification_method='position_fallback',
+        ))
+        untyped.remove(0)
+
+    # Rule 2: Narrow + few words -> marker
+    for i in list(untyped):
+        geom = geometries[i]
+        if geom.width_ratio < 0.06 and geom.word_count <= 15:
+            regions.append(PageRegion(
+                type='column_marker', x=geom.x, y=geom.y,
+                width=geom.width, height=content_h,
+                classification_confidence=1.0,
+                classification_method='position_fallback',
+            ))
+            untyped.remove(i)
+
+    # Rule 3: Rightmost remaining -> example (if 3+)
+    if len(untyped) >= 3:
+        last_idx = untyped[-1]
+        geom = geometries[last_idx]
+        regions.append(PageRegion(
+            type='column_example', x=geom.x, y=geom.y,
+            width=geom.width, height=content_h,
+            classification_confidence=1.0,
+            classification_method='position_fallback',
+        ))
+        untyped.remove(last_idx)
+
+    # Rule 4: First remaining -> EN, second -> DE
+    if len(untyped) >= 2:
+        en_idx = untyped[0]
+        de_idx = untyped[1]
+        regions.append(PageRegion(
+            type='column_en', x=geometries[en_idx].x, y=geometries[en_idx].y,
+            width=geometries[en_idx].width, height=content_h,
+            classification_confidence=1.0,
+            classification_method='position_fallback',
+        ))
+        regions.append(PageRegion(
+            type='column_de', x=geometries[de_idx].x, y=geometries[de_idx].y,
+            width=geometries[de_idx].width, height=content_h,
+            classification_confidence=1.0,
+            classification_method='position_fallback',
+        ))
+        untyped = untyped[2:]
+    elif len(untyped) == 1:
+        idx = untyped[0]
+        geom = geometries[idx]
+        regions.append(PageRegion(
+            type='column_en', x=geom.x, y=geom.y,
+            width=geom.width, height=content_h,
+            classification_confidence=1.0,
+            classification_method='position_fallback',
+        ))
+        untyped = []
+
+    for idx in untyped:
+        geom = geometries[idx]
+        regions.append(PageRegion(
+            type='column_example', x=geom.x, y=geom.y,
+            width=geom.width, height=content_h,
+            classification_confidence=1.0,
+            classification_method='position_fallback',
+        ))
+
+    regions.sort(key=lambda r: r.x)
+    return regions