diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 031b7ff..e1c7725 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -3321,6 +3321,75 @@ def _build_margin_regions( return margins +def positional_column_regions( + geometries: List[ColumnGeometry], + content_w: int, + content_h: int, + left_x: int, +) -> List[PageRegion]: + """Classify columns by position only (no language scoring). + + Structural columns (page_ref, column_marker) are identified by geometry. + Remaining content columns are labelled left→right as column_en, column_de, + column_example. The names are purely positional – no language analysis. + """ + structural: List[PageRegion] = [] + content_cols: List[ColumnGeometry] = [] + + for g in geometries: + rel_x = g.x - left_x + # page_ref: narrow column in the leftmost 20% region + if g.width_ratio < 0.12 and (rel_x / content_w if content_w else 0) < 0.20: + structural.append(PageRegion( + type='page_ref', x=g.x, y=g.y, + width=g.width, height=content_h, + classification_confidence=0.95, + classification_method='positional', + )) + # column_marker: very narrow, few words + elif g.width_ratio < 0.06 and g.word_count <= 15: + structural.append(PageRegion( + type='column_marker', x=g.x, y=g.y, + width=g.width, height=content_h, + classification_confidence=0.95, + classification_method='positional', + )) + else: + content_cols.append(g) + + # Single content column → plain text page + if len(content_cols) == 1: + g = content_cols[0] + return structural + [PageRegion( + type='column_text', x=g.x, y=g.y, + width=g.width, height=content_h, + classification_confidence=0.9, + classification_method='positional', + )] + + # No content columns + if not content_cols: + return structural + + # Sort content columns left→right and assign positional labels + content_cols.sort(key=lambda g: g.x) + labels = ['column_en', 'column_de', 'column_example'] + regions = list(structural) + for i, g in enumerate(content_cols): + label = labels[i] if i < len(labels) else 'column_example' + regions.append(PageRegion( + type=label, x=g.x, y=g.y, + width=g.width, height=content_h, + classification_confidence=0.95, + classification_method='positional', + )) + + logger.info(f"PositionalColumns: {len(structural)} structural, " + f"{len(content_cols)} content → " + f"{[r.type for r in regions]}") + return regions + + def classify_column_types(geometries: List[ColumnGeometry], content_w: int, top_y: int, @@ -3548,6 +3617,21 @@ def _classify_by_content(geometries: List[ColumnGeometry], best_en = max(en_candidates, key=lambda x: x[2]['eng']) best_de = max(de_candidates, key=lambda x: x[2]['deu']) + # Position-aware EN selection: in typical textbooks the layout is EN | DE | Example. + # Example sentences contain English function words ("the", "a", "is") which inflate + # the eng score of the Example column. When the best EN candidate sits to the RIGHT + # of the DE column and there is another EN candidate to the LEFT, prefer the left one + # — it is almost certainly the real vocabulary column. + if best_de[2]['deu'] > 0.5 and best_en[1].x > best_de[1].x and len(en_candidates) > 1: + left_of_de = [c for c in en_candidates if c[1].x < best_de[1].x] + if left_of_de: + alt_en = max(left_of_de, key=lambda x: x[2]['eng']) + logger.info( + f"ClassifyColumns: Level 1 position fix — best EN col {best_en[0]} " + f"(eng={best_en[2]['eng']:.3f}) is right of DE col {best_de[0]}; " + f"preferring left col {alt_en[0]} (eng={alt_en[2]['eng']:.3f})") + best_en = alt_en + if best_en[0] == best_de[0]: # Same column scored highest for both — ambiguous logger.info("ClassifyColumns: Level 1 failed - same column highest for EN and DE") @@ -3996,9 +4080,9 @@ def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Li geometries = _detect_sub_columns(geometries, content_w, left_x=left_x, top_y=top_y, header_y=header_y, footer_y=footer_y) - # Phase B: Content-based classification - regions = classify_column_types(geometries, content_w, top_y, w, h, bottom_y, - left_x=left_x, right_x=right_x, inv=_inv) + # Phase B: Positional classification (no language scoring) + content_h = bottom_y - top_y + regions = positional_column_regions(geometries, content_w, content_h, left_x) col_count = len([r for r in regions if r.type.startswith('column') or r.type == 'page_ref']) methods = set(r.classification_method for r in regions if r.classification_method) diff --git a/klausur-service/backend/vocab_worksheet_api.py b/klausur-service/backend/vocab_worksheet_api.py index d5f42cd..a8e86f5 100644 --- a/klausur-service/backend/vocab_worksheet_api.py +++ b/klausur-service/backend/vocab_worksheet_api.py @@ -70,7 +70,7 @@ try: detect_column_geometry, analyze_layout_by_words, analyze_layout, create_layout_image, detect_row_geometry, build_cell_grid_v2, _cells_to_vocab_entries, _detect_sub_columns, _detect_header_footer_gaps, - expand_narrow_columns, llm_review_entries, + expand_narrow_columns, positional_column_regions, llm_review_entries, _fix_phonetic_brackets, render_pdf_high_res, PageRegion, RowGeometry, @@ -1336,75 +1336,6 @@ async def process_single_page( } -def _positional_column_regions( - geometries: list, - content_w: int, - content_h: int, - left_x: int, -) -> list: - """Classify columns by position only (no language scoring). - - Structural columns (page_ref, column_marker) are identified by geometry. - Remaining content columns are labelled left→right as column_en, column_de, - column_example. The names are purely positional – no language analysis. - """ - structural = [] - content_cols = [] - - for g in geometries: - rel_x = g.x - left_x - # page_ref: narrow column in the leftmost 20% region - if g.width_ratio < 0.12 and (rel_x / content_w if content_w else 0) < 0.20: - structural.append(PageRegion( - type='page_ref', x=g.x, y=g.y, - width=g.width, height=content_h, - classification_confidence=0.95, - classification_method='positional', - )) - # column_marker: very narrow, few words - elif g.width_ratio < 0.06 and g.word_count <= 15: - structural.append(PageRegion( - type='column_marker', x=g.x, y=g.y, - width=g.width, height=content_h, - classification_confidence=0.95, - classification_method='positional', - )) - else: - content_cols.append(g) - - # Single content column → plain text page - if len(content_cols) == 1: - g = content_cols[0] - return structural + [PageRegion( - type='column_text', x=g.x, y=g.y, - width=g.width, height=content_h, - classification_confidence=0.9, - classification_method='positional', - )] - - # No content columns - if not content_cols: - return structural - - # Sort content columns left→right and assign positional labels - content_cols.sort(key=lambda g: g.x) - labels = ['column_en', 'column_de', 'column_example'] - regions = list(structural) - for i, g in enumerate(content_cols): - label = labels[i] if i < len(labels) else 'column_example' - regions.append(PageRegion( - type=label, x=g.x, y=g.y, - width=g.width, height=content_h, - classification_confidence=0.95, - classification_method='positional', - )) - - logger.info(f"PositionalColumns: {len(structural)} structural, " - f"{len(content_cols)} content → " - f"{[r.type for r in regions]}") - return regions - - async def _run_ocr_pipeline_for_page( img_bgr: np.ndarray, page_number: int, @@ -1479,7 +1410,7 @@ async def _run_ocr_pipeline_for_page( top_y=top_y, header_y=header_y, footer_y=footer_y) geometries = expand_narrow_columns(geometries, content_w, left_x, word_dicts) content_h = bottom_y - top_y - regions = _positional_column_regions(geometries, content_w, content_h, left_x) + regions = positional_column_regions(geometries, content_w, content_h, left_x) content_bounds = (left_x, right_x, top_y, bottom_y) logger.info(f" columns: {len(regions)} detected ({_time.time() - t0:.1f}s)")