From 1393a994f9d3c39a52a0b0b9d46bbfbab89ac4f8 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 26 Feb 2026 23:33:35 +0100 Subject: [PATCH] Flexible inhaltsbasierte Spaltenerkennung (2-Phasen) Ersetzt hardcodierte Positionsregeln durch ein zweistufiges System: Phase A erkennt Spaltengeometrie (Clustering), Phase B klassifiziert Typen per Inhalt (Sprache/Rolle) mit 3-stufiger Fallback-Kette. Co-Authored-By: Claude Opus 4.6 --- .../app/(admin)/ai/ocr-pipeline/types.ts | 5 +- .../ocr-pipeline/ColumnControls.tsx | 18 + klausur-service/backend/cv_vocab_pipeline.py | 636 +++++++++++++++--- klausur-service/backend/ocr_pipeline_api.py | 14 +- 4 files changed, 595 insertions(+), 78 deletions(-) diff --git a/admin-lehrer/app/(admin)/ai/ocr-pipeline/types.ts b/admin-lehrer/app/(admin)/ai/ocr-pipeline/types.ts index df349b7..f836696 100644 --- a/admin-lehrer/app/(admin)/ai/ocr-pipeline/types.ts +++ b/admin-lehrer/app/(admin)/ai/ocr-pipeline/types.ts @@ -64,11 +64,14 @@ export interface DewarpGroundTruth { } export interface PageRegion { - type: 'column_en' | 'column_de' | 'column_example' | 'page_ref' | 'column_marker' | 'header' | 'footer' + type: 'column_en' | 'column_de' | 'column_example' | 'page_ref' + | 'column_marker' | 'column_text' | 'header' | 'footer' x: number y: number width: number height: number + classification_confidence?: number + classification_method?: string } export interface ColumnResult { diff --git a/admin-lehrer/components/ocr-pipeline/ColumnControls.tsx b/admin-lehrer/components/ocr-pipeline/ColumnControls.tsx index cc5a706..329842a 100644 --- a/admin-lehrer/components/ocr-pipeline/ColumnControls.tsx +++ b/admin-lehrer/components/ocr-pipeline/ColumnControls.tsx @@ -15,6 +15,7 @@ const TYPE_COLORS: Record = { column_en: 'bg-blue-100 text-blue-700 dark:bg-blue-900/30 dark:text-blue-400', column_de: 'bg-green-100 text-green-700 dark:bg-green-900/30 dark:text-green-400', column_example: 'bg-orange-100 text-orange-700 dark:bg-orange-900/30 dark:text-orange-400', + column_text: 'bg-cyan-100 text-cyan-700 dark:bg-cyan-900/30 dark:text-cyan-400', page_ref: 'bg-purple-100 text-purple-700 dark:bg-purple-900/30 dark:text-purple-400', column_marker: 'bg-red-100 text-red-700 dark:bg-red-900/30 dark:text-red-400', header: 'bg-gray-100 text-gray-600 dark:bg-gray-700/50 dark:text-gray-400', @@ -25,12 +26,19 @@ const TYPE_LABELS: Record = { column_en: 'EN', column_de: 'DE', column_example: 'Beispiel', + column_text: 'Text', page_ref: 'Seite', column_marker: 'Marker', header: 'Header', footer: 'Footer', } +const METHOD_LABELS: Record = { + content: 'Inhalt', + position_enhanced: 'Position', + position_fallback: 'Fallback', +} + export function ColumnControls({ columnResult, onRerun, onGroundTruth, onNext, isDetecting }: ColumnControlsProps) { const [gtSaved, setGtSaved] = useState(false) @@ -70,6 +78,16 @@ export function ColumnControls({ columnResult, onRerun, onGroundTruth, onNext, i {TYPE_LABELS[col.type] || col.type} + {col.classification_confidence != null && col.classification_confidence < 1.0 && ( + + {Math.round(col.classification_confidence * 100)}% + + )} + {col.classification_method && ( + + ({METHOD_LABELS[col.classification_method] || col.classification_method}) + + )} x={col.x} y={col.y} {col.width}x{col.height}px diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index ace62ab..178b268 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -48,16 +48,46 @@ except ImportError: CV_PIPELINE_AVAILABLE = CV2_AVAILABLE and TESSERACT_AVAILABLE +# --- Language Detection Constants --- + +GERMAN_FUNCTION_WORDS = {'der', 'die', 'das', 'und', 'ist', 'ein', 'eine', 'nicht', + 'von', 'zu', 'mit', 'auf', 'fuer', 'den', 'dem', 'sich', 'auch', 'wird', + 'nach', 'bei', 'aus', 'wie', 'oder', 'wenn', 'noch', 'aber', 'hat', 'nur', + 'ueber', 'kann', 'als', 'ich', 'er', 'sie', 'es', 'wir', 'ihr', 'haben', + 'sein', 'werden', 'war', 'sind', 'muss', 'soll', 'dieser', 'diese', 'diesem'} + +ENGLISH_FUNCTION_WORDS = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'to', 'of', + 'and', 'in', 'that', 'it', 'for', 'on', 'with', 'as', 'at', 'by', 'from', + 'or', 'but', 'not', 'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', + 'would', 'can', 'could', 'should', 'may', 'might', 'this', 'they', 'you', 'he', + 'she', 'we', 'my', 'your', 'his', 'her', 'its', 'our', 'their', 'which'} + + # --- Data Classes --- @dataclass class PageRegion: """A detected region on the page.""" - type: str # 'column_en', 'column_de', 'column_example', 'page_ref', 'column_marker', 'header', 'footer' + type: str # 'column_en', 'column_de', 'column_example', 'page_ref', 'column_marker', 'column_text', 'header', 'footer' x: int y: int width: int height: int + classification_confidence: float = 1.0 # 0.0-1.0 + classification_method: str = "" # 'content', 'position_enhanced', 'position_fallback' + + +@dataclass +class ColumnGeometry: + """Geometrisch erkannte Spalte vor Typ-Klassifikation.""" + index: int # 0-basiert, links->rechts + x: int + y: int + width: int + height: int + word_count: int + words: List[Dict] # Wort-Dicts aus Tesseract (text, conf, left, top, ...) + width_ratio: float # width / content_width (0.0-1.0) @dataclass @@ -840,22 +870,24 @@ def analyze_layout(layout_img: np.ndarray, ocr_img: np.ndarray) -> List[PageRegi # ============================================================================= -# Stage 5b: Word-Based Layout Analysis (5-Column Detection) +# Stage 5b: Word-Based Layout Analysis (Two-Phase Column Detection) # ============================================================================= -def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> List[PageRegion]: - """Detect columns by clustering left-aligned word positions from Tesseract. +# --- Phase A: Geometry Detection --- - This approach works better than projection profiles for vocabulary pages - with 5 columns (page_ref, EN, DE, markers, examples) because it detects - column starts where left-aligned words cluster. +def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int]]: + """Detect column geometry by clustering left-aligned word positions. + + Phase A of the two-phase column detection. Returns untyped column + geometries with their words for subsequent content-based classification. Args: ocr_img: Binarized grayscale image for layout analysis. dewarped_bgr: Original BGR image (for Tesseract word detection). Returns: - List of PageRegion objects. Falls back to analyze_layout() if < 3 clusters. + Tuple of (geometries, left_x, right_x, top_y, bottom_y) or None if + fewer than 3 clusters are found (signals fallback needed). """ h, w = ocr_img.shape[:2] @@ -870,7 +902,7 @@ def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Li top_y, bottom_y = 0, h content_w, content_h = w, h - logger.info(f"LayoutByWords: content bounds x=[{left_x}..{right_x}] ({content_w}px), " + logger.info(f"ColumnGeometry: content bounds x=[{left_x}..{right_x}] ({content_w}px), " f"y=[{top_y}..{bottom_y}] ({content_h}px)") # --- Get word bounding boxes from Tesseract --- @@ -880,13 +912,12 @@ def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Li try: data = pytesseract.image_to_data(pil_img, lang='eng+deu', output_type=pytesseract.Output.DICT) except Exception as e: - logger.warning(f"LayoutByWords: Tesseract image_to_data failed: {e}, falling back") - layout_img = create_layout_image(dewarped_bgr) - return analyze_layout(layout_img, ocr_img) + logger.warning(f"ColumnGeometry: Tesseract image_to_data failed: {e}") + return None - # Collect left edges of recognized words (confidence > 30) + # Collect words with their full info + word_dicts = [] left_edges = [] - word_info = [] # (left, top, width, height, text, conf) n_words = len(data['text']) for i in range(n_words): conf = int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1 @@ -898,20 +929,22 @@ def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Li bw = int(data['width'][i]) bh = int(data['height'][i]) left_edges.append(lx) - word_info.append((lx, ty, bw, bh, text, conf)) + word_dicts.append({ + 'text': text, 'conf': conf, + 'left': lx, 'top': ty, 'width': bw, 'height': bh, + }) if len(left_edges) < 5: - logger.warning(f"LayoutByWords: only {len(left_edges)} words detected, falling back") - layout_img = create_layout_image(dewarped_bgr) - return analyze_layout(layout_img, ocr_img) + logger.warning(f"ColumnGeometry: only {len(left_edges)} words detected") + return None - logger.info(f"LayoutByWords: {len(left_edges)} words detected in content area") + logger.info(f"ColumnGeometry: {len(left_edges)} words detected in content area") # --- Cluster left edges --- - tolerance = max(10, int(content_w * 0.01)) # ~1% of content width + tolerance = max(10, int(content_w * 0.01)) sorted_edges = sorted(left_edges) - clusters = [] # list of (center_x, count, edges) + clusters = [] current_cluster = [sorted_edges[0]] for edge in sorted_edges[1:]: if edge - current_cluster[-1] <= tolerance: @@ -925,20 +958,18 @@ def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Li significant = [(int(np.mean(c)), len(c), min(c), max(c)) for c in clusters if len(c) >= 2] significant.sort(key=lambda s: s[0]) - logger.info(f"LayoutByWords: {len(significant)} significant clusters " + logger.info(f"ColumnGeometry: {len(significant)} significant clusters " f"(from {len(clusters)} total): " f"{[(s[0]+left_x, s[1]) for s in significant[:10]]}") if len(significant) < 3: - logger.info("LayoutByWords: < 3 clusters, falling back to projection-based layout") - layout_img = create_layout_image(dewarped_bgr) - return analyze_layout(layout_img, ocr_img) + logger.info("ColumnGeometry: < 3 clusters, signaling fallback") + return None # --- Merge clusters that are very close (within 2*tolerance) --- merged = [significant[0]] for s in significant[1:]: if s[0] - merged[-1][0] < 2 * tolerance: - # Merge: weighted average position, sum counts prev = merged[-1] total = prev[1] + s[1] avg_x = (prev[0] * prev[1] + s[0] * s[1]) // total @@ -946,114 +977,562 @@ def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Li else: merged.append(s) - logger.info(f"LayoutByWords: {len(merged)} clusters after merging: " + logger.info(f"ColumnGeometry: {len(merged)} clusters after merging: " f"{[(m[0]+left_x, m[1]) for m in merged]}") if len(merged) < 3: - logger.info("LayoutByWords: < 3 merged clusters, falling back") - layout_img = create_layout_image(dewarped_bgr) - return analyze_layout(layout_img, ocr_img) + logger.info("ColumnGeometry: < 3 merged clusters, signaling fallback") + return None # --- Derive column boundaries --- - # 2mm margin before each cluster start (~8px at 100dpi, scale with image) margin_px = max(5, int(content_w * 0.005)) - col_starts = [] # (abs_x, word_count) + col_starts = [] for center_x, count, min_edge, max_edge in merged: abs_start = max(0, left_x + min_edge - margin_px) col_starts.append((abs_start, count)) - # Calculate column widths - col_defs = [] # (abs_x, width, word_count) + # Calculate column widths and assign words to columns + geometries = [] for i, (start_x, count) in enumerate(col_starts): if i + 1 < len(col_starts): col_width = col_starts[i + 1][0] - start_x else: col_width = right_x - start_x - col_defs.append((start_x, col_width, count)) - logger.info(f"LayoutByWords: column definitions: " - f"{[(d[0], d[1], d[2]) for d in col_defs]}") + # Assign words to this column based on left edge + col_left_rel = start_x - left_x + col_right_rel = col_left_rel + col_width + col_words = [w for w in word_dicts + if col_left_rel <= w['left'] < col_right_rel] - # --- Assign types based on rules --- + geometries.append(ColumnGeometry( + index=i, + x=start_x, + y=top_y, + width=col_width, + height=content_h, + word_count=len(col_words), + words=col_words, + width_ratio=col_width / content_w if content_w > 0 else 0.0, + )) + + logger.info(f"ColumnGeometry: {len(geometries)} columns: " + f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}") + + return (geometries, left_x, right_x, top_y, bottom_y) + + +# --- Phase B: Content-Based Classification --- + +def _score_language(words: List[Dict]) -> Dict[str, float]: + """Score the language of a column's words. + + Analyzes function words, umlauts, and capitalization patterns + to determine whether text is English or German. + + Args: + words: List of word dicts with 'text' and 'conf' keys. + + Returns: + Dict with 'eng' and 'deu' scores (0.0-1.0). + """ + if not words: + return {'eng': 0.0, 'deu': 0.0} + + # Only consider words with decent confidence + good_words = [w['text'].lower() for w in words if w.get('conf', 0) > 40 and len(w['text']) > 0] + if not good_words: + return {'eng': 0.0, 'deu': 0.0} + + total = len(good_words) + en_hits = sum(1 for w in good_words if w in ENGLISH_FUNCTION_WORDS) + de_hits = sum(1 for w in good_words if w in GERMAN_FUNCTION_WORDS) + + # Check for umlauts (strong German signal) + raw_texts = [w['text'] for w in words if w.get('conf', 0) > 40] + umlaut_count = sum(1 for t in raw_texts + for c in t if c in 'äöüÄÖÜß') + + # German capitalization: nouns are capitalized mid-sentence + # Count words that start with uppercase but aren't at position 0 + cap_words = sum(1 for t in raw_texts if t[0].isupper() and len(t) > 2) + + en_score = en_hits / total if total > 0 else 0.0 + de_score = de_hits / total if total > 0 else 0.0 + + # Boost German score for umlauts + if umlaut_count > 0: + de_score = min(1.0, de_score + 0.15 * min(umlaut_count, 5)) + + # Boost German score for high capitalization ratio (typical for German nouns) + if total > 5: + cap_ratio = cap_words / total + if cap_ratio > 0.3: + de_score = min(1.0, de_score + 0.1) + + return {'eng': round(en_score, 3), 'deu': round(de_score, 3)} + + +def _score_role(geom: ColumnGeometry) -> Dict[str, float]: + """Score the role of a column based on its geometry and content patterns. + + Args: + geom: ColumnGeometry with words and dimensions. + + Returns: + Dict with role scores: 'reference', 'marker', 'sentence', 'vocabulary'. + """ + scores = {'reference': 0.0, 'marker': 0.0, 'sentence': 0.0, 'vocabulary': 0.0} + + if not geom.words: + return scores + + texts = [w['text'] for w in geom.words if w.get('conf', 0) > 40] + if not texts: + return scores + + avg_word_len = sum(len(t) for t in texts) / len(texts) + has_punctuation = sum(1 for t in texts if any(c in t for c in '.!?;:,')) + digit_words = sum(1 for t in texts if any(c.isdigit() for c in t)) + digit_ratio = digit_words / len(texts) if texts else 0.0 + + # Reference: narrow + mostly numbers/page references + if geom.width_ratio < 0.12: + scores['reference'] = 0.5 + if digit_ratio > 0.4: + scores['reference'] = min(1.0, 0.5 + digit_ratio * 0.5) + + # Marker: very narrow + few short entries + if geom.width_ratio < 0.08 and geom.word_count <= 10: + scores['marker'] = 0.7 + if avg_word_len < 4: + scores['marker'] = 0.9 + + # Sentence: longer words + punctuation present + if geom.width_ratio > 0.15 and has_punctuation > 2: + scores['sentence'] = 0.3 + min(0.5, has_punctuation / len(texts)) + if avg_word_len > 4: + scores['sentence'] = min(1.0, scores['sentence'] + 0.2) + + # Vocabulary: medium width + medium word length + if 0.10 < geom.width_ratio < 0.45: + scores['vocabulary'] = 0.4 + if 3 < avg_word_len < 8: + scores['vocabulary'] = min(1.0, scores['vocabulary'] + 0.3) + + return {k: round(v, 3) for k, v in scores.items()} + + +def classify_column_types(geometries: List[ColumnGeometry], + content_w: int, + top_y: int, + img_w: int, + img_h: int, + bottom_y: int) -> List[PageRegion]: + """Classify column types using a 3-level fallback chain. + + Level 1: Content-based (language + role scoring) + Level 2: Position + language (old rules enhanced with language detection) + Level 3: Pure position (exact old code, no regression) + + Args: + geometries: List of ColumnGeometry from Phase A. + content_w: Total content width. + top_y: Top Y of content area. + img_w: Full image width. + img_h: Full image height. + bottom_y: Bottom Y of content area. + + Returns: + List of PageRegion with types, confidence, and method. + """ + content_h = bottom_y - top_y + + # Special case: single column → plain text page + if len(geometries) == 1: + geom = geometries[0] + return [PageRegion( + type='column_text', x=geom.x, y=geom.y, + width=geom.width, height=geom.height, + classification_confidence=0.9, + classification_method='content', + )] + + # --- Score all columns --- + lang_scores = [_score_language(g.words) for g in geometries] + role_scores = [_score_role(g) for g in geometries] + + logger.info(f"ClassifyColumns: language scores: " + f"{[(g.index, ls) for g, ls in zip(geometries, lang_scores)]}") + logger.info(f"ClassifyColumns: role scores: " + f"{[(g.index, rs) for g, rs in zip(geometries, role_scores)]}") + + # --- Level 1: Content-based classification --- + regions = _classify_by_content(geometries, lang_scores, role_scores, content_w, content_h) + if regions is not None: + logger.info("ClassifyColumns: Level 1 (content-based) succeeded") + _add_header_footer(regions, top_y, bottom_y, img_w, img_h) + return regions + + # --- Level 2: Position + language enhanced --- + regions = _classify_by_position_enhanced(geometries, lang_scores, content_w, content_h) + if regions is not None: + logger.info("ClassifyColumns: Level 2 (position+language) succeeded") + _add_header_footer(regions, top_y, bottom_y, img_w, img_h) + return regions + + # --- Level 3: Pure position fallback (old code, no regression) --- + logger.info("ClassifyColumns: Level 3 (position fallback)") + regions = _classify_by_position_fallback(geometries, content_w, content_h) + _add_header_footer(regions, top_y, bottom_y, img_w, img_h) + return regions + + +def _classify_by_content(geometries: List[ColumnGeometry], + lang_scores: List[Dict[str, float]], + role_scores: List[Dict[str, float]], + content_w: int, + content_h: int) -> Optional[List[PageRegion]]: + """Level 1: Classify columns purely by content analysis. + + Requires clear language signals to distinguish EN/DE columns. + Returns None if language signals are too weak. + """ regions = [] - total_content_w = right_x - left_x - untyped = list(range(len(col_defs))) # indices not yet assigned + assigned = set() - # Rule 1: Leftmost narrow column (< 12% width) → page_ref - if col_defs[0][1] < total_content_w * 0.12: + # Step 1: Assign structural roles first (reference, marker) + for i, (geom, rs) in enumerate(zip(geometries, role_scores)): + if rs['reference'] >= 0.5 and geom.width_ratio < 0.12: + regions.append(PageRegion( + type='page_ref', x=geom.x, y=geom.y, + width=geom.width, height=content_h, + classification_confidence=rs['reference'], + classification_method='content', + )) + assigned.add(i) + elif rs['marker'] >= 0.7 and geom.width_ratio < 0.08: + regions.append(PageRegion( + type='column_marker', x=geom.x, y=geom.y, + width=geom.width, height=content_h, + classification_confidence=rs['marker'], + classification_method='content', + )) + assigned.add(i) + + # Step 2: Among remaining columns, find EN and DE by language scores + remaining = [(i, geometries[i], lang_scores[i], role_scores[i]) + for i in range(len(geometries)) if i not in assigned] + + if len(remaining) < 2: + # Not enough columns for EN/DE pair + if len(remaining) == 1: + i, geom, ls, rs = remaining[0] + regions.append(PageRegion( + type='column_text', x=geom.x, y=geom.y, + width=geom.width, height=content_h, + classification_confidence=0.6, + classification_method='content', + )) + regions.sort(key=lambda r: r.x) + return regions + + # Check if we have enough language signal + en_candidates = [(i, g, ls) for i, g, ls, rs in remaining if ls['eng'] > ls['deu'] and ls['eng'] > 0.05] + de_candidates = [(i, g, ls) for i, g, ls, rs in remaining if ls['deu'] > ls['eng'] and ls['deu'] > 0.05] + + if not en_candidates or not de_candidates: + # Language signals too weak for content-based classification + logger.info("ClassifyColumns: Level 1 failed - no clear EN/DE language split") + return None + + # Pick the best EN and DE candidates + best_en = max(en_candidates, key=lambda x: x[2]['eng']) + best_de = max(de_candidates, key=lambda x: x[2]['deu']) + + if best_en[0] == best_de[0]: + # Same column scored highest for both — ambiguous + logger.info("ClassifyColumns: Level 1 failed - same column highest for EN and DE") + return None + + en_conf = best_en[2]['eng'] + de_conf = best_de[2]['deu'] + + regions.append(PageRegion( + type='column_en', x=best_en[1].x, y=best_en[1].y, + width=best_en[1].width, height=content_h, + classification_confidence=round(en_conf, 2), + classification_method='content', + )) + assigned.add(best_en[0]) + + regions.append(PageRegion( + type='column_de', x=best_de[1].x, y=best_de[1].y, + width=best_de[1].width, height=content_h, + classification_confidence=round(de_conf, 2), + classification_method='content', + )) + assigned.add(best_de[0]) + + # Step 3: Remaining columns → example or text based on role scores + for i, geom, ls, rs in remaining: + if i in assigned: + continue + if rs['sentence'] > 0.4: + regions.append(PageRegion( + type='column_example', x=geom.x, y=geom.y, + width=geom.width, height=content_h, + classification_confidence=round(rs['sentence'], 2), + classification_method='content', + )) + else: + regions.append(PageRegion( + type='column_example', x=geom.x, y=geom.y, + width=geom.width, height=content_h, + classification_confidence=0.5, + classification_method='content', + )) + + regions.sort(key=lambda r: r.x) + return regions + + +def _classify_by_position_enhanced(geometries: List[ColumnGeometry], + lang_scores: List[Dict[str, float]], + content_w: int, + content_h: int) -> Optional[List[PageRegion]]: + """Level 2: Position-based rules enhanced with language confirmation. + + Uses the old positional heuristics but confirms EN/DE assignment + with language scores (swapping if needed). + """ + regions = [] + untyped = list(range(len(geometries))) + + # Rule 1: Leftmost narrow column → page_ref + g0 = geometries[0] + if g0.width_ratio < 0.12: regions.append(PageRegion( - type='page_ref', x=col_defs[0][0], y=top_y, - width=col_defs[0][1], height=content_h + type='page_ref', x=g0.x, y=g0.y, + width=g0.width, height=content_h, + classification_confidence=0.8, + classification_method='position_enhanced', )) untyped.remove(0) - logger.info(f"LayoutByWords: col 0 → page_ref (width={col_defs[0][1]}px, " - f"{col_defs[0][1]*100/total_content_w:.1f}%)") - # Rule 2: Narrow column with few words (< 8% width, <= 8 words) → column_marker + # Rule 2: Narrow columns with few words → marker for i in list(untyped): - col_x, col_w, col_count = col_defs[i] - if col_w < total_content_w * 0.08 and col_count <= 8: + geom = geometries[i] + if geom.width_ratio < 0.08 and geom.word_count <= 10: regions.append(PageRegion( - type='column_marker', x=col_x, y=top_y, - width=col_w, height=content_h + type='column_marker', x=geom.x, y=geom.y, + width=geom.width, height=content_h, + classification_confidence=0.7, + classification_method='position_enhanced', )) untyped.remove(i) - logger.info(f"LayoutByWords: col {i} → column_marker (width={col_w}px, " - f"{col_w*100/total_content_w:.1f}%, words={col_count})") - # Rule 3: Rightmost remaining (widest or last) → column_example + # Rule 3: Rightmost remaining → column_example (if 3+ remaining) if len(untyped) >= 3: last_idx = untyped[-1] + geom = geometries[last_idx] regions.append(PageRegion( - type='column_example', x=col_defs[last_idx][0], y=top_y, - width=col_defs[last_idx][1], height=content_h + type='column_example', x=geom.x, y=geom.y, + width=geom.width, height=content_h, + classification_confidence=0.7, + classification_method='position_enhanced', )) untyped.remove(last_idx) - logger.info(f"LayoutByWords: col {last_idx} → column_example") - # Rule 4: First remaining → column_en, second → column_de + # Rule 4: First two remaining → EN/DE, but check language to possibly swap + if len(untyped) >= 2: + idx_a = untyped[0] + idx_b = untyped[1] + ls_a = lang_scores[idx_a] + ls_b = lang_scores[idx_b] + + # Default: first=EN, second=DE (old behavior) + en_idx, de_idx = idx_a, idx_b + conf = 0.7 + + # Swap if language signals clearly indicate the opposite + if ls_a['deu'] > ls_a['eng'] and ls_b['eng'] > ls_b['deu']: + en_idx, de_idx = idx_b, idx_a + conf = 0.85 + logger.info(f"ClassifyColumns: Level 2 swapped EN/DE based on language scores") + + regions.append(PageRegion( + type='column_en', x=geometries[en_idx].x, y=geometries[en_idx].y, + width=geometries[en_idx].width, height=content_h, + classification_confidence=conf, + classification_method='position_enhanced', + )) + regions.append(PageRegion( + type='column_de', x=geometries[de_idx].x, y=geometries[de_idx].y, + width=geometries[de_idx].width, height=content_h, + classification_confidence=conf, + classification_method='position_enhanced', + )) + untyped = untyped[2:] + elif len(untyped) == 1: + idx = untyped[0] + geom = geometries[idx] + regions.append(PageRegion( + type='column_en', x=geom.x, y=geom.y, + width=geom.width, height=content_h, + classification_confidence=0.5, + classification_method='position_enhanced', + )) + untyped = [] + + # Remaining → example + for idx in untyped: + geom = geometries[idx] + regions.append(PageRegion( + type='column_example', x=geom.x, y=geom.y, + width=geom.width, height=content_h, + classification_confidence=0.5, + classification_method='position_enhanced', + )) + + regions.sort(key=lambda r: r.x) + return regions + + +def _classify_by_position_fallback(geometries: List[ColumnGeometry], + content_w: int, + content_h: int) -> List[PageRegion]: + """Level 3: Pure position-based fallback (identical to old code). + + Guarantees no regression from the previous behavior. + """ + regions = [] + untyped = list(range(len(geometries))) + + # Rule 1: Leftmost narrow column → page_ref + g0 = geometries[0] + if g0.width_ratio < 0.12: + regions.append(PageRegion( + type='page_ref', x=g0.x, y=g0.y, + width=g0.width, height=content_h, + classification_confidence=1.0, + classification_method='position_fallback', + )) + untyped.remove(0) + + # Rule 2: Narrow + few words → marker + for i in list(untyped): + geom = geometries[i] + if geom.width_ratio < 0.08 and geom.word_count <= 8: + regions.append(PageRegion( + type='column_marker', x=geom.x, y=geom.y, + width=geom.width, height=content_h, + classification_confidence=1.0, + classification_method='position_fallback', + )) + untyped.remove(i) + + # Rule 3: Rightmost remaining → example (if 3+) + if len(untyped) >= 3: + last_idx = untyped[-1] + geom = geometries[last_idx] + regions.append(PageRegion( + type='column_example', x=geom.x, y=geom.y, + width=geom.width, height=content_h, + classification_confidence=1.0, + classification_method='position_fallback', + )) + untyped.remove(last_idx) + + # Rule 4: First remaining → EN, second → DE if len(untyped) >= 2: en_idx = untyped[0] de_idx = untyped[1] regions.append(PageRegion( - type='column_en', x=col_defs[en_idx][0], y=top_y, - width=col_defs[en_idx][1], height=content_h + type='column_en', x=geometries[en_idx].x, y=geometries[en_idx].y, + width=geometries[en_idx].width, height=content_h, + classification_confidence=1.0, + classification_method='position_fallback', )) regions.append(PageRegion( - type='column_de', x=col_defs[de_idx][0], y=top_y, - width=col_defs[de_idx][1], height=content_h + type='column_de', x=geometries[de_idx].x, y=geometries[de_idx].y, + width=geometries[de_idx].width, height=content_h, + classification_confidence=1.0, + classification_method='position_fallback', )) untyped = untyped[2:] - logger.info(f"LayoutByWords: col {en_idx} → column_en, col {de_idx} → column_de") elif len(untyped) == 1: - # Only one left — call it column_en idx = untyped[0] + geom = geometries[idx] regions.append(PageRegion( - type='column_en', x=col_defs[idx][0], y=top_y, - width=col_defs[idx][1], height=content_h + type='column_en', x=geom.x, y=geom.y, + width=geom.width, height=content_h, + classification_confidence=1.0, + classification_method='position_fallback', )) untyped = [] - # Any remaining untyped columns get generic column_example type for idx in untyped: + geom = geometries[idx] regions.append(PageRegion( - type='column_example', x=col_defs[idx][0], y=top_y, - width=col_defs[idx][1], height=content_h + type='column_example', x=geom.x, y=geom.y, + width=geom.width, height=content_h, + classification_confidence=1.0, + classification_method='position_fallback', )) - # Sort by x position for consistent output regions.sort(key=lambda r: r.x) + return regions - # Add header/footer + +def _add_header_footer(regions: List[PageRegion], top_y: int, bottom_y: int, + img_w: int, img_h: int) -> None: + """Add header/footer regions in-place.""" if top_y > 10: - regions.append(PageRegion(type='header', x=0, y=0, width=w, height=top_y)) - if bottom_y < h - 10: - regions.append(PageRegion(type='footer', x=0, y=bottom_y, width=w, height=h - bottom_y)) + regions.append(PageRegion(type='header', x=0, y=0, width=img_w, height=top_y)) + if bottom_y < img_h - 10: + regions.append(PageRegion(type='footer', x=0, y=bottom_y, width=img_w, height=img_h - bottom_y)) + + +# --- Main Entry Point --- + +def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> List[PageRegion]: + """Detect columns using two-phase approach: geometry then content classification. + + Phase A: detect_column_geometry() — clustering word positions into columns. + Phase B: classify_column_types() — content-based type assignment with fallback. + + Falls back to projection-based analyze_layout() if geometry detection fails. + + Args: + ocr_img: Binarized grayscale image for layout analysis. + dewarped_bgr: Original BGR image (for Tesseract word detection). + + Returns: + List of PageRegion objects with types, confidence, and method. + """ + h, w = ocr_img.shape[:2] + + # Phase A: Geometry detection + result = detect_column_geometry(ocr_img, dewarped_bgr) + + if result is None: + # Fallback to projection-based layout + logger.info("LayoutByWords: geometry detection failed, falling back to projection profiles") + layout_img = create_layout_image(dewarped_bgr) + return analyze_layout(layout_img, ocr_img) + + geometries, left_x, right_x, top_y, bottom_y = result + content_w = right_x - left_x + + # Phase B: Content-based classification + regions = classify_column_types(geometries, content_w, top_y, w, h, bottom_y) col_count = len([r for r in regions if r.type.startswith('column') or r.type == 'page_ref']) - logger.info(f"LayoutByWords: {col_count} columns detected: " - f"{[(r.type, r.x, r.width) for r in regions if r.type not in ('header','footer')]}") + methods = set(r.classification_method for r in regions if r.classification_method) + logger.info(f"LayoutByWords: {col_count} columns detected (methods: {methods}): " + f"{[(r.type, r.x, r.width, r.classification_confidence) for r in regions if r.type not in ('header','footer')]}") return regions @@ -1276,6 +1755,11 @@ def match_lines_to_vocab(ocr_results: Dict[str, List[Dict]], Returns: List of VocabRow objects. """ + # If no vocabulary columns detected (e.g. plain text page), return empty + if 'column_en' not in ocr_results and 'column_de' not in ocr_results: + logger.info("match_lines_to_vocab: no column_en/column_de in OCR results, returning empty") + return [] + # Group words into lines per column en_lines = _group_words_into_lines(ocr_results.get('column_en', []), y_tolerance_px) de_lines = _group_words_into_lines(ocr_results.get('column_de', []), y_tolerance_px) diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py index d7cfd1a..1c1f070 100644 --- a/klausur-service/backend/ocr_pipeline_api.py +++ b/klausur-service/backend/ocr_pipeline_api.py @@ -648,8 +648,16 @@ async def detect_columns(session_id: str): duration = time.time() - t0 columns = [asdict(r) for r in regions] + + # Determine classification methods used + methods = list(set( + c.get("classification_method", "") for c in columns + if c.get("classification_method") + )) + column_result = { "columns": columns, + "classification_methods": methods, "duration_seconds": round(duration, 2), } @@ -742,6 +750,7 @@ async def _get_columns_overlay(session_id: str) -> Response: "column_en": (255, 180, 0), # Blue "column_de": (0, 200, 0), # Green "column_example": (0, 140, 255), # Orange + "column_text": (200, 200, 0), # Cyan/Turquoise "page_ref": (200, 0, 200), # Purple "column_marker": (0, 0, 220), # Red "header": (128, 128, 128), # Gray @@ -760,8 +769,11 @@ async def _get_columns_overlay(session_id: str) -> Response: # Solid border cv2.rectangle(img, (x, y), (x + w, y + h), color, 3) - # Label + # Label with confidence label = col.get("type", "unknown").replace("column_", "").upper() + conf = col.get("classification_confidence") + if conf is not None and conf < 1.0: + label = f"{label} {int(conf * 100)}%" cv2.putText(img, label, (x + 10, y + 30), cv2.FONT_HERSHEY_SIMPLEX, 0.8, color, 2)