diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index f2120e5..e145565 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -2067,6 +2067,148 @@ def _detect_sub_columns(
     return result
 
 
+
+def _split_broad_columns(
+    geometries: List[ColumnGeometry],
+    content_w: int,
+    left_x: int = 0,
+    _broad_threshold: float = 0.35,
+    _min_gap_px: int = 15,
+    _min_words_per_split: int = 5,
+) -> List[ColumnGeometry]:
+    """Split overly broad columns that contain two language blocks (EN+DE).
+
+    Uses word-coverage gap analysis: builds a per-pixel coverage array from the
+    words inside each broad column, finds the largest horizontal gap, and splits
+    the column at that gap.
+
+    Args:
+        geometries: Column geometries from _detect_sub_columns.
+        content_w: Width of the content area in pixels.
+        left_x: Left edge of content ROI in absolute image coordinates.
+        _broad_threshold: Minimum width_ratio to consider a column "broad".
+        _min_gap_px: Minimum gap width (pixels) to trigger a split.
+        _min_words_per_split: Both halves must have at least this many words.
+
+    Returns:
+        Updated list of ColumnGeometry (possibly with more columns).
+    """
+    result: List[ColumnGeometry] = []
+
+    for geo in geometries:
+        if geo.width_ratio <= _broad_threshold or len(geo.words) < 10:
+            result.append(geo)
+            continue
+
+        # Build word-coverage array (per pixel within column)
+        col_left_rel = geo.x - left_x  # column left in content-relative coords
+        coverage = np.zeros(geo.width, dtype=np.float32)
+
+        for wd in geo.words:
+            # wd['left'] is relative to left_x (content ROI)
+            wl = wd['left'] - col_left_rel
+            wr = wl + wd.get('width', 0)
+            wl = max(0, int(wl))
+            wr = min(geo.width, int(wr))
+            if wr > wl:
+                coverage[wl:wr] += 1.0
+
+        # Light smoothing (kernel=3px) to avoid noise
+        if len(coverage) > 3:
+            kernel = np.ones(3, dtype=np.float32) / 3.0
+            coverage = np.convolve(coverage, kernel, mode='same')
+
+        # Normalise to [0, 1]
+        cmax = coverage.max()
+        if cmax > 0:
+            coverage /= cmax
+
+        # Find gaps where coverage < 0.5
+        low_mask = coverage < 0.5
+        gap_start = None
+        best_gap = None  # (start, end, width)
+        for px in range(len(low_mask)):
+            if low_mask[px]:
+                if gap_start is None:
+                    gap_start = px
+            else:
+                if gap_start is not None:
+                    gw = px - gap_start
+                    if best_gap is None or gw > best_gap[2]:
+                        best_gap = (gap_start, px, gw)
+                    gap_start = None
+        # Handle trailing gap
+        if gap_start is not None:
+            gw = len(low_mask) - gap_start
+            if best_gap is None or gw > best_gap[2]:
+                best_gap = (gap_start, len(low_mask), gw)
+
+        if best_gap is None or best_gap[2] < _min_gap_px:
+            result.append(geo)
+            continue
+
+        gap_center = (best_gap[0] + best_gap[1]) // 2
+
+        # Split words by midpoint relative to gap
+        left_words = []
+        right_words = []
+        for wd in geo.words:
+            wl = wd['left'] - col_left_rel
+            mid = wl + wd.get('width', 0) / 2.0
+            if mid < gap_center:
+                left_words.append(wd)
+            else:
+                right_words.append(wd)
+
+        if len(left_words) < _min_words_per_split or len(right_words) < _min_words_per_split:
+            result.append(geo)
+            continue
+
+        # Build two new ColumnGeometry objects
+        split_x_abs = geo.x + gap_center
+        left_w = gap_center
+        right_w = geo.width - gap_center
+
+        left_geo = ColumnGeometry(
+            index=0,
+            x=geo.x,
+            y=geo.y,
+            width=left_w,
+            height=geo.height,
+            word_count=len(left_words),
+            words=left_words,
+            width_ratio=left_w / content_w if content_w else 0,
+            is_sub_column=True,
+        )
+        right_geo = ColumnGeometry(
+            index=0,
+            x=split_x_abs,
+            y=geo.y,
+            width=right_w,
+            height=geo.height,
+            word_count=len(right_words),
+            words=right_words,
+            width_ratio=right_w / content_w if content_w else 0,
+            is_sub_column=True,
+        )
+
+        logger.info(
+            f"SplitBroadCols: col {geo.index} SPLIT at gap_center={gap_center} "
+            f"(gap {best_gap[2]}px @ [{best_gap[0]}..{best_gap[1]}]), "
+            f"left={len(left_words)} words (w={left_w}), "
+            f"right={len(right_words)} words (w={right_w})"
+        )
+
+        result.append(left_geo)
+        result.append(right_geo)
+
+    # Re-index left-to-right
+    result.sort(key=lambda g: g.x)
+    for i, g in enumerate(result):
+        g.index = i
+
+    return result
+
+
 def _build_geometries_from_starts(
     col_starts: List[Tuple[int, int]],
     word_dicts: List[Dict],
@@ -4128,6 +4270,9 @@ def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Li
     geometries = _detect_sub_columns(geometries, content_w, left_x=left_x, top_y=top_y,
                                      header_y=header_y, footer_y=footer_y)
 
+    # Split broad columns that mix EN and DE blocks, using word-coverage gaps
+    geometries = _split_broad_columns(geometries, content_w, left_x=left_x)
+
     # Phase B: Positional classification (no language scoring)
     content_h = bottom_y - top_y
     regions = positional_column_regions(geometries, content_w, content_h, left_x)
diff --git a/klausur-service/backend/vocab_worksheet_api.py b/klausur-service/backend/vocab_worksheet_api.py
index 90a44bf..20b70f0 100644
--- a/klausur-service/backend/vocab_worksheet_api.py
+++ b/klausur-service/backend/vocab_worksheet_api.py
@@ -70,6 +70,7 @@ try:
     detect_column_geometry, analyze_layout_by_words, analyze_layout,
     create_layout_image, detect_row_geometry, build_cell_grid_v2,
     _cells_to_vocab_entries, _detect_sub_columns, _detect_header_footer_gaps,
+    _split_broad_columns,
     expand_narrow_columns, positional_column_regions,
     llm_review_entries, detect_and_fix_orientation, _fix_phonetic_brackets,
@@ -1182,6 +1183,9 @@ async def upload_pdf_get_info(
 async def get_pdf_thumbnail(session_id: str, page_number: int, hires: bool = Query(False)):
     """Get a thumbnail image of a specific PDF page.
 
+    Uses fitz for rendering so that page_rotations (from OCR orientation
+    detection) are applied consistently.
+
     Args:
         hires: If True, return full-resolution image (zoom=2.0) instead of thumbnail (zoom=0.5).
""" @@ -1194,10 +1198,25 @@ async def get_pdf_thumbnail(session_id: str, page_number: int, hires: bool = Que if not pdf_data: raise HTTPException(status_code=400, detail="No PDF uploaded for this session") - thumbnail = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=not hires) + try: + import fitz + zoom = 2.0 if hires else 0.5 + pdf_document = fitz.open(stream=pdf_data, filetype="pdf") + page = pdf_document[page_number] + # Apply orientation correction detected during OCR processing + rot = session.get("page_rotations", {}).get(page_number, 0) + if rot: + page.set_rotation(rot) + mat = fitz.Matrix(zoom, zoom) + pix = page.get_pixmap(matrix=mat) + png_data = pix.tobytes("png") + pdf_document.close() + except Exception as e: + logger.error(f"PDF thumbnail failed: {e}") + raise HTTPException(status_code=500, detail=f"PDF rendering failed: {str(e)}") return StreamingResponse( - io.BytesIO(thumbnail), + io.BytesIO(png_data), media_type="image/png", ) @@ -1226,11 +1245,15 @@ async def get_pdf_page_image(session_id: str, page_number: int, zoom: float = Qu import fitz pdf_document = fitz.open(stream=pdf_data, filetype="pdf") page = pdf_document[page_number] + # Apply orientation correction detected during OCR processing + rot = session.get("page_rotations", {}).get(page_number, 0) + if rot: + page.set_rotation(rot) mat = fitz.Matrix(zoom, zoom) pix = page.get_pixmap(matrix=mat) png_data = pix.tobytes("png") pdf_document.close() - logger.info(f"PDF page {page_number} rendered at zoom={zoom}: {len(png_data)} bytes") + logger.info(f"PDF page {page_number} rendered at zoom={zoom} rot={rot}: {len(png_data)} bytes") except Exception as e: logger.error(f"PDF page image failed: {e}") raise HTTPException(status_code=500, detail=f"PDF rendering failed: {str(e)}") @@ -1272,10 +1295,11 @@ async def process_single_page( raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).") # --- OCR Pipeline path (use same render_pdf_high_res as admin OCR pipeline) --- + rotation_deg = 0 if OCR_PIPELINE_AVAILABLE: try: img_bgr = render_pdf_high_res(pdf_data, page_number, zoom=3.0) - page_vocabulary = await _run_ocr_pipeline_for_page( + page_vocabulary, rotation_deg = await _run_ocr_pipeline_for_page( img_bgr, page_number, session_id, ) except Exception as e: @@ -1317,6 +1341,9 @@ async def process_single_page( logger.info(f"Page {page_number + 1}: {len(page_vocabulary)} Vokabeln extrahiert") + # Store rotation for this page (used by image/thumbnail endpoints) + session.setdefault("page_rotations", {})[page_number] = rotation_deg + # Add to session's vocabulary (append, don't replace) existing_vocab = session.get("vocabulary", []) # Remove any existing entries from this page (in case of re-processing) @@ -1334,6 +1361,7 @@ async def process_single_page( "vocabulary_count": len(page_vocabulary), "total_vocabulary_count": len(existing_vocab), "extraction_confidence": 0.9, + "rotation": rotation_deg, } @@ -1341,7 +1369,7 @@ async def _run_ocr_pipeline_for_page( img_bgr: np.ndarray, page_number: int, vocab_session_id: str, -) -> list: +) -> tuple: """Run the full OCR pipeline on a single page image and return vocab entries. Uses the same pipeline as the admin OCR pipeline (ocr_pipeline_api.py). @@ -1352,7 +1380,8 @@ async def _run_ocr_pipeline_for_page( vocab_session_id: Vocab session ID for logging. 
 
     Steps: deskew → dewarp → columns → rows → words → (LLM review)
 
-    Returns list of dicts with keys: id, english, german, example_sentence, source_page
+    Returns (entries, rotation_deg) where entries is a list of dicts and
+    rotation_deg is the orientation correction applied (0, 90, 180, 270).
     """
     import time as _time
@@ -1418,6 +1447,7 @@
     header_y, footer_y = _detect_header_footer_gaps(inv, w, h) if inv is not None else (None, None)
     geometries = _detect_sub_columns(geometries, content_w, left_x=left_x, top_y=top_y,
                                      header_y=header_y, footer_y=footer_y)
+    geometries = _split_broad_columns(geometries, content_w, left_x=left_x)
     geometries = expand_narrow_columns(geometries, content_w, left_x, word_dicts)
     content_h = bottom_y - top_y
     regions = positional_column_regions(geometries, content_w, content_h, left_x)
@@ -1534,7 +1564,7 @@
     logger.info(f"OCR Pipeline page {page_number + 1}: "
                 f"{len(page_vocabulary)} vocab entries in {total_duration:.1f}s")
 
-    return page_vocabulary
+    return page_vocabulary, rotation
 
 
 @router.post("/sessions/{session_id}/process-pages")
diff --git a/studio-v2/app/vocab-worksheet/page.tsx b/studio-v2/app/vocab-worksheet/page.tsx
index 854a31c..77ef950 100644
--- a/studio-v2/app/vocab-worksheet/page.tsx
+++ b/studio-v2/app/vocab-worksheet/page.tsx
@@ -511,6 +511,26 @@ export default function VocabWorksheetPage() {
       setExtractionStatus(`Alle Seiten fehlgeschlagen.`)
     }
 
+    // Reload thumbnails for processed pages (server may have rotated them)
+    if (successful.length > 0 && session) {
+      const updatedThumbs = [...pagesThumbnails]
+      for (const pageNum of successful) {
+        const idx = pageNum - 1 // successful stores 1-indexed page numbers
+        try {
+          const thumbRes = await fetch(`${API_BASE}/api/v1/vocab/sessions/${session.id}/pdf-thumbnail/${idx}?hires=true&t=${Date.now()}`)
+          if (thumbRes.ok) {
+            const blob = await thumbRes.blob()
+            // Revoke old blob URL to avoid memory leaks
+            if (updatedThumbs[idx]) URL.revokeObjectURL(updatedThumbs[idx])
+            updatedThumbs[idx] = URL.createObjectURL(blob)
+          }
+        } catch (e) {
+          console.error(`Failed to refresh thumbnail for page ${pageNum}`, e)
+        }
+      }
+      setPagesThumbnails(updatedThumbs)
+    }
+
     setSession(prev => prev ? { ...prev, status: 'extracted' } : null)
   }
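
The core of _split_broad_columns is the coverage/gap scan. Below is a minimal standalone sketch of that step, runnable on a toy input; the helper name find_widest_gap and the bare (left, width) tuples are illustrative simplifications of the patch's word dicts, not part of the patch itself.

import numpy as np

def find_widest_gap(word_boxes, col_width, low_thresh=0.5):
    """Widest low-coverage run in a column, as in _split_broad_columns.

    word_boxes: list of (left, width) tuples relative to the column's left edge.
    Returns (start, end, width) of the widest gap, or None.
    """
    coverage = np.zeros(col_width, dtype=np.float32)
    for left, width in word_boxes:
        lo, hi = max(0, int(left)), min(col_width, int(left + width))
        if hi > lo:
            coverage[lo:hi] += 1.0

    # 3px box filter, then normalise to [0, 1] (mirrors the patch)
    if col_width > 3:
        coverage = np.convolve(coverage, np.ones(3, dtype=np.float32) / 3.0, mode='same')
    if coverage.max() > 0:
        coverage /= coverage.max()

    # Scan for the widest run of pixels with coverage below low_thresh
    best, start = None, None
    for px, low in enumerate(coverage < low_thresh):
        if low and start is None:
            start = px
        elif not low and start is not None:
            if best is None or px - start > best[2]:
                best = (start, px, px - start)
            start = None
    if start is not None and (best is None or col_width - start > best[2]):
        best = (start, col_width, col_width - start)
    return best

# Two word clusters: an EN block at 0..180 and a DE block at 240..400
boxes = [(0, 60), (70, 50), (130, 50), (240, 70), (320, 80)]
print(find_widest_gap(boxes, 400))  # (180, 240, 60)

With this input the widest low-coverage run is (180, 240, 60), so gap_center = 210 and the column splits cleanly between the two word clusters; the 10px inter-word gaps stay below _min_gap_px and never trigger a split.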
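On the backend, the rotation value makes a full round trip: _run_ocr_pipeline_for_page now returns it, process_single_page stores it in session["page_rotations"], and both render endpoints apply it before producing a pixmap. A sketch of that render step, assuming PyMuPDF — the render_with_rotation helper is hypothetical; the endpoints inline this logic:

import fitz  # PyMuPDF

def render_with_rotation(pdf_bytes: bytes, page_number: int, rot: int = 0,
                         zoom: float = 0.5) -> bytes:
    """Render one page as PNG, applying a stored orientation correction."""
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        page = doc[page_number]
        if rot:
            # set_rotation writes an absolute /Rotate value (multiple of 90),
            # so the stored correction is applied directly, not added on top.
            page.set_rotation(rot)
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        return pix.tobytes("png")
    finally:
        doc.close()  # unlike the endpoints, this also closes on render errors

The t=${Date.now()} query parameter on the frontend refetch exists for the same reason: once a page's rotation is stored server-side, any previously cached thumbnail is stale, and the cache-buster forces the rotated render through.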