From a5df2b6e15bc0e7bb5aff08873ca29c9e7ef5991 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Sat, 7 Mar 2026 17:07:11 +0100
Subject: [PATCH] fix: Spaltenklassifikation im Vocab-Worksheet durch
 positionsbasierte Zuordnung ersetzen
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Sprachbasiertes Scoring (classify_column_types) verursachte vertauschte
Spalten auf Seite 3 bei Beispielsätzen mit vielen englischen Funktionswörtern.
Die neue Funktion _positional_column_regions() ordnet Spalten rein geometrisch
(links→rechts) zu. Die Admin-OCR-Pipeline bleibt unverändert.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../backend/vocab_worksheet_api.py            | 104 +++++++++++++++---
 1 file changed, 88 insertions(+), 16 deletions(-)

diff --git a/klausur-service/backend/vocab_worksheet_api.py b/klausur-service/backend/vocab_worksheet_api.py
index 832e348..d5f42cd 100644
--- a/klausur-service/backend/vocab_worksheet_api.py
+++ b/klausur-service/backend/vocab_worksheet_api.py
@@ -70,8 +70,9 @@ try:
         detect_column_geometry, analyze_layout_by_words, analyze_layout, create_layout_image,
         detect_row_geometry, build_cell_grid_v2,
         _cells_to_vocab_entries, _detect_sub_columns, _detect_header_footer_gaps,
-        expand_narrow_columns, classify_column_types, llm_review_entries,
+        expand_narrow_columns, llm_review_entries,
         _fix_phonetic_brackets,
+        render_pdf_high_res,
         PageRegion, RowGeometry,
     )
     from ocr_pipeline_session_store import (
@@ -1269,14 +1270,12 @@ async def process_single_page(
     if page_number < 0 or page_number >= page_count:
         raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")
 
-    # Convert just this ONE page to PNG
-    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
-
-    # --- OCR Pipeline path ---
+    # --- OCR Pipeline path (use same render_pdf_high_res as admin OCR pipeline) ---
     if OCR_PIPELINE_AVAILABLE:
         try:
+            img_bgr = render_pdf_high_res(pdf_data, page_number, zoom=3.0)
             page_vocabulary = await _run_ocr_pipeline_for_page(
-                image_data, page_number, session_id,
+                img_bgr, page_number, session_id,
             )
         except Exception as e:
             logger.error(f"OCR pipeline failed for page {page_number + 1}: {e}", exc_info=True)
@@ -1291,6 +1290,7 @@ async def process_single_page(
     else:
         # Fallback to LLM vision extraction
         logger.warning("OCR pipeline not available, falling back to LLM vision")
+        image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
         vocabulary, confidence, error = await extract_vocabulary_from_image(
             image_data,
             f"page_{page_number + 1}.png",
@@ -1336,13 +1336,89 @@ async def process_single_page(
     }
 
 
+def _positional_column_regions(
+    geometries: list,
+    content_w: int,
+    content_h: int,
+    left_x: int,
+) -> list:
+    """Map detected column geometries to PageRegion objects by position alone.
+
+    No language scoring is performed.  A geometry with width_ratio < 0.12
+    whose x offset from left_x is below 20% of content_w
+    becomes a 'page_ref'; otherwise width_ratio < 0.06 with word_count <= 15
+    gives a 'column_marker'.  Every other geometry is a content column.
+
+    Args:
+        geometries: Column geometry objects; each must expose x, y, width,
+            width_ratio and word_count.
+        content_w: Width used to normalise the x offset; when it is 0 the
+            relative offset is treated as 0.
+        content_h: Height assigned to every returned region.
+        left_x: Subtracted from each geometry's x to obtain its offset.
+
+    Returns:
+        Structural regions in input order, followed by the content regions.
+        One content column yields a single 'column_text' region (confidence
+        0.9).  Two or more are sorted by x and labelled 'column_en',
+        'column_de', 'column_example'; any further column is also
+        'column_example'.  The labels are positional, not detected languages.
+    """
+
+    def make_region(geom, kind, confidence=0.95):
+        # Every region spans content_h and carries the 'positional' method tag.
+        return PageRegion(
+            type=kind, x=geom.x, y=geom.y,
+            width=geom.width, height=content_h,
+            classification_confidence=confidence,
+            classification_method='positional',
+        )
+
+    structural = []
+    content_cols = []
+    # Narrow columns are structural; the page_ref test takes precedence.
+    for geom in geometries:
+        rel_pos = (geom.x - left_x) / content_w if content_w else 0
+        if geom.width_ratio < 0.12 and rel_pos < 0.20:
+            structural.append(make_region(geom, 'page_ref'))
+        elif geom.width_ratio < 0.06 and geom.word_count <= 15:
+            structural.append(make_region(geom, 'column_marker'))
+        else:
+            content_cols.append(geom)
+
+    if not content_cols:
+        return structural
+    if len(content_cols) == 1:
+        return structural + [make_region(content_cols[0], 'column_text', 0.9)]
+
+    # Positional labels left→right; columns past the third reuse the last one.
+    labels = ('column_en', 'column_de', 'column_example')
+    ordered = sorted(content_cols, key=lambda col: col.x)
+    regions = structural + [
+        make_region(col, labels[min(idx, len(labels) - 1)])
+        for idx, col in enumerate(ordered)
+    ]
+
+    logger.info(f"PositionalColumns: {len(structural)} structural, "
+                f"{len(content_cols)} content → "
+                f"{[r.type for r in regions]}")
+    return regions
+
+
 async def _run_ocr_pipeline_for_page(
-    png_data: bytes,
+    img_bgr: np.ndarray,
     page_number: int,
     vocab_session_id: str,
 ) -> list:
     """Run the full OCR pipeline on a single page image and return vocab entries.
 
+    Uses the same pipeline as the admin OCR pipeline (ocr_pipeline_api.py).
+
+    Args:
+        img_bgr: BGR numpy array (from render_pdf_high_res, same as admin pipeline).
+        page_number: 0-indexed page number.
+        vocab_session_id: Vocab session ID for logging.
+
     Steps: deskew → dewarp → columns → rows → words → (LLM review)
     Returns list of dicts with keys: id, english, german, example_sentence, source_page
     """
@@ -1350,23 +1426,19 @@ async def _run_ocr_pipeline_for_page(
 
     t_total = _time.time()
 
-    # 1. Decode PNG → BGR numpy array
-    arr = np.frombuffer(png_data, dtype=np.uint8)
-    img_bgr = cv2.imdecode(arr, cv2.IMREAD_COLOR)
-    if img_bgr is None:
-        raise ValueError("Failed to decode page image")
-
     img_h, img_w = img_bgr.shape[:2]
     logger.info(f"OCR Pipeline page {page_number + 1}: image {img_w}x{img_h}")
 
     # 2. Create pipeline session in DB (for debugging in admin UI)
     pipeline_session_id = str(uuid.uuid4())
     try:
+        _, png_buf = cv2.imencode(".png", img_bgr)
+        original_png = png_buf.tobytes()
         await create_pipeline_session_db(
             pipeline_session_id,
             name=f"vocab-ws-{vocab_session_id[:8]}-p{page_number + 1}",
             filename=f"page_{page_number + 1}.png",
-            original_png=png_data,
+            original_png=original_png,
         )
     except Exception as e:
         logger.warning(f"Could not create pipeline session in DB: {e}")
@@ -1406,8 +1478,8 @@ async def _run_ocr_pipeline_for_page(
         geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
                                           top_y=top_y, header_y=header_y, footer_y=footer_y)
         geometries = expand_narrow_columns(geometries, content_w, left_x, word_dicts)
-        regions = classify_column_types(geometries, content_w, top_y, w, h, bottom_y,
-                                        left_x=left_x, right_x=right_x, inv=inv)
+        content_h = bottom_y - top_y
+        regions = _positional_column_regions(geometries, content_w, content_h, left_x)
         content_bounds = (left_x, right_x, top_y, bottom_y)
 
     logger.info(f"  columns: {len(regions)} detected ({_time.time() - t0:.1f}s)")