feat(ocr-pipeline): generic sub-column detection via left-edge clustering

Detects hidden sub-columns (e.g. page references like "p.59") within already-recognized columns by clustering word left-edge positions and splitting the column when a clear minority cluster exists. The sub-column is then classified as page_ref and mapped to VocabRow.source_page.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-02 18:18:02 +01:00
parent 0532b2a797
commit 1a246eb059
3 changed files with 343 additions and 2 deletions
--- a/klausur-service/backend/ocr_pipeline_api.py
+++ b/klausur-service/backend/ocr_pipeline_api.py
@@ -34,6 +34,7 @@ from cv_vocab_pipeline import (
    PageRegion,
    RowGeometry,
    _cells_to_vocab_entries,
+    _detect_sub_columns,
    _fix_character_confusion,
    _fix_phonetic_brackets,
    analyze_layout,
@@ -698,6 +699,9 @@ async def detect_columns(session_id: str):
        cached["_inv"] = inv
        cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)

+        # Split sub-columns (e.g. page references) before classification
+        geometries = _detect_sub_columns(geometries, content_w)
+
        # Phase B: Content-based classification
        regions = classify_column_types(geometries, content_w, top_y, w, h, bottom_y,
                                        left_x=left_x, right_x=right_x, inv=inv)