From a5df2b6e15bc0e7bb5aff08873ca29c9e7ef5991 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sat, 7 Mar 2026 17:07:11 +0100 Subject: [PATCH] fix: Spaltenklassifikation im Vocab-Worksheet durch positionsbasierte Zuordnung ersetzen MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sprachbasiertes Scoring (classify_column_types) verursachte vertauschte Spalten auf Seite 3 bei Beispielsaetzen mit vielen englischen Funktionswoertern. Neue _positional_column_regions() ordnet Spalten rein geometrisch (links→rechts) zu. OCR Pipeline Admin bleibt unveraendert. Co-Authored-By: Claude Opus 4.6 --- .../backend/vocab_worksheet_api.py | 104 +++++++++++++++--- 1 file changed, 88 insertions(+), 16 deletions(-) diff --git a/klausur-service/backend/vocab_worksheet_api.py b/klausur-service/backend/vocab_worksheet_api.py index 832e348..d5f42cd 100644 --- a/klausur-service/backend/vocab_worksheet_api.py +++ b/klausur-service/backend/vocab_worksheet_api.py @@ -70,8 +70,9 @@ try: detect_column_geometry, analyze_layout_by_words, analyze_layout, create_layout_image, detect_row_geometry, build_cell_grid_v2, _cells_to_vocab_entries, _detect_sub_columns, _detect_header_footer_gaps, - expand_narrow_columns, classify_column_types, llm_review_entries, + expand_narrow_columns, llm_review_entries, _fix_phonetic_brackets, + render_pdf_high_res, PageRegion, RowGeometry, ) from ocr_pipeline_session_store import ( @@ -1269,14 +1270,12 @@ async def process_single_page( if page_number < 0 or page_number >= page_count: raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).") - # Convert just this ONE page to PNG - image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False) - - # --- OCR Pipeline path --- + # --- OCR Pipeline path (use same render_pdf_high_res as admin OCR pipeline) --- if OCR_PIPELINE_AVAILABLE: try: + img_bgr = render_pdf_high_res(pdf_data, page_number, zoom=3.0) page_vocabulary = await _run_ocr_pipeline_for_page( - image_data, page_number, session_id, + img_bgr, page_number, session_id, ) except Exception as e: logger.error(f"OCR pipeline failed for page {page_number + 1}: {e}", exc_info=True) @@ -1291,6 +1290,7 @@ async def process_single_page( else: # Fallback to LLM vision extraction logger.warning("OCR pipeline not available, falling back to LLM vision") + image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False) vocabulary, confidence, error = await extract_vocabulary_from_image( image_data, f"page_{page_number + 1}.png", @@ -1336,13 +1336,89 @@ async def process_single_page( } +def _positional_column_regions( + geometries: list, + content_w: int, + content_h: int, + left_x: int, +) -> list: + """Classify columns by position only (no language scoring). + + Structural columns (page_ref, column_marker) are identified by geometry. + Remaining content columns are labelled left→right as column_en, column_de, + column_example. The names are purely positional – no language analysis. + """ + structural = [] + content_cols = [] + + for g in geometries: + rel_x = g.x - left_x + # page_ref: narrow column in the leftmost 20% region + if g.width_ratio < 0.12 and (rel_x / content_w if content_w else 0) < 0.20: + structural.append(PageRegion( + type='page_ref', x=g.x, y=g.y, + width=g.width, height=content_h, + classification_confidence=0.95, + classification_method='positional', + )) + # column_marker: very narrow, few words + elif g.width_ratio < 0.06 and g.word_count <= 15: + structural.append(PageRegion( + type='column_marker', x=g.x, y=g.y, + width=g.width, height=content_h, + classification_confidence=0.95, + classification_method='positional', + )) + else: + content_cols.append(g) + + # Single content column → plain text page + if len(content_cols) == 1: + g = content_cols[0] + return structural + [PageRegion( + type='column_text', x=g.x, y=g.y, + width=g.width, height=content_h, + classification_confidence=0.9, + classification_method='positional', + )] + + # No content columns + if not content_cols: + return structural + + # Sort content columns left→right and assign positional labels + content_cols.sort(key=lambda g: g.x) + labels = ['column_en', 'column_de', 'column_example'] + regions = list(structural) + for i, g in enumerate(content_cols): + label = labels[i] if i < len(labels) else 'column_example' + regions.append(PageRegion( + type=label, x=g.x, y=g.y, + width=g.width, height=content_h, + classification_confidence=0.95, + classification_method='positional', + )) + + logger.info(f"PositionalColumns: {len(structural)} structural, " + f"{len(content_cols)} content → " + f"{[r.type for r in regions]}") + return regions + + async def _run_ocr_pipeline_for_page( - png_data: bytes, + img_bgr: np.ndarray, page_number: int, vocab_session_id: str, ) -> list: """Run the full OCR pipeline on a single page image and return vocab entries. + Uses the same pipeline as the admin OCR pipeline (ocr_pipeline_api.py). + + Args: + img_bgr: BGR numpy array (from render_pdf_high_res, same as admin pipeline). + page_number: 0-indexed page number. + vocab_session_id: Vocab session ID for logging. + Steps: deskew → dewarp → columns → rows → words → (LLM review) Returns list of dicts with keys: id, english, german, example_sentence, source_page """ @@ -1350,23 +1426,19 @@ async def _run_ocr_pipeline_for_page( t_total = _time.time() - # 1. Decode PNG → BGR numpy array - arr = np.frombuffer(png_data, dtype=np.uint8) - img_bgr = cv2.imdecode(arr, cv2.IMREAD_COLOR) - if img_bgr is None: - raise ValueError("Failed to decode page image") - img_h, img_w = img_bgr.shape[:2] logger.info(f"OCR Pipeline page {page_number + 1}: image {img_w}x{img_h}") # 2. Create pipeline session in DB (for debugging in admin UI) pipeline_session_id = str(uuid.uuid4()) try: + _, png_buf = cv2.imencode(".png", img_bgr) + original_png = png_buf.tobytes() await create_pipeline_session_db( pipeline_session_id, name=f"vocab-ws-{vocab_session_id[:8]}-p{page_number + 1}", filename=f"page_{page_number + 1}.png", - original_png=png_data, + original_png=original_png, ) except Exception as e: logger.warning(f"Could not create pipeline session in DB: {e}") @@ -1406,8 +1478,8 @@ async def _run_ocr_pipeline_for_page( geometries = _detect_sub_columns(geometries, content_w, left_x=left_x, top_y=top_y, header_y=header_y, footer_y=footer_y) geometries = expand_narrow_columns(geometries, content_w, left_x, word_dicts) - regions = classify_column_types(geometries, content_w, top_y, w, h, bottom_y, - left_x=left_x, right_x=right_x, inv=inv) + content_h = bottom_y - top_y + regions = _positional_column_regions(geometries, content_w, content_h, left_x) content_bounds = (left_x, right_x, top_y, bottom_y) logger.info(f" columns: {len(regions)} detected ({_time.time() - t0:.1f}s)")