From 271649525013398f6c0ab6d0d8406ed6fa190429 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Tue, 10 Mar 2026 08:43:26 +0100 Subject: [PATCH] =?UTF-8?q?fix:=20Sub-Session=20Zeilenerkennung=20?= =?UTF-8?q?=E2=80=94=20Tesseract+inv=20im=20Spalten-Schritt=20cachen?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bisher wurden _word_dicts, _inv und _content_bounds fuer Sub-Sessions nicht gecacht, sodass detect_rows auf detect_column_geometry() zurueckfiel. Das konnte bei kleinen Box-Bildern mit <5 Woertern fehlschlagen. Jetzt laeuft Tesseract + Binarisierung direkt im Pseudo-Spalten-Block, und die Intermediates werden gecacht. Zusaetzlich ausfuehrliche Kommentare zur Zeilenerkennung (detect_row_geometry, _regularize_row_grid). Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_layout.py | 101 +++++++++++++++----- klausur-service/backend/ocr_pipeline_api.py | 47 ++++++++- 2 files changed, 125 insertions(+), 23 deletions(-) diff --git a/klausur-service/backend/cv_layout.py b/klausur-service/backend/cv_layout.py index 6b49a47..d269fac 100644 --- a/klausur-service/backend/cv_layout.py +++ b/klausur-service/backend/cv_layout.py @@ -1528,9 +1528,38 @@ def detect_row_geometry( ) -> List['RowGeometry']: """Detect row geometry using horizontal whitespace-gap analysis. - Mirrors the vertical gap approach used for columns, but operates on - horizontal projection profiles to find gaps between text lines. - Also classifies header/footer rows based on gap size. + Algorithm overview (two phases): + + Phase 1 — Gap-based detection (Steps 1–6): + 1. Build a horizontal projection profile: for each y-pixel, sum the + ink density across the content width. Only pixels within/near + Tesseract word bounding boxes contribute (word_mask), so that + images/illustrations don't merge adjacent text rows. + 2. Smooth the projection and find contiguous regions below a + threshold (= gaps / horizontal whitespace between text lines). 
+ The threshold is 15% of the median non-zero density. + 3. Validate gaps against word bounding boxes — discard any gap + that overlaps a word, or shift the gap boundary to avoid the word. + 4. Build rows from the spans between validated gaps. + 5. Detect header/footer rows: gaps in the top/bottom 15% of the + page that are >= 2× the median gap size mark section boundaries. + + Phase 2 — Word-center regularization (_regularize_row_grid, Step 7): + For each word, compute its vertical center (top + height/2). + Group words into line clusters by Y-proximity (tolerance = 40% of + the median gap-based row height). + For each cluster, the line center = median of all word centers. + The "pitch" = distance between consecutive line centers. + Section breaks are detected where the pitch exceeds 1.8× the median. + Within each section, row boundaries are placed at the midpoints + between consecutive line centers: + - Row top = midpoint to previous line center (or center - pitch/2 for first) + - Row bottom = midpoint to next line center (or center + pitch/2 for last) + This ensures rows tile without gaps or overlaps. + + Fallback: + If < 2 gaps are found (very dense or uniform text), falls back to + _build_rows_from_word_grouping() which groups words by Y proximity. Args: inv: Inverted binarized image (white text on black bg, full page). @@ -1548,13 +1577,11 @@ def detect_row_geometry( logger.warning("detect_row_geometry: content area too small") return [] - # --- Step 1: Horizontal projection profile (text-only, images masked out) --- + # --- Step 1: Horizontal projection profile --- + # For each y-pixel row, sum ink density across the content width. + # A word-coverage mask ensures only pixels near Tesseract words contribute, + # so that illustrations/images don't inflate the density and merge rows. content_strip = inv[top_y:bottom_y, left_x:right_x] - - # Build a word-coverage mask so that image regions (high ink density but no - # Tesseract words) are ignored. 
Only pixels within/near word bounding boxes - # contribute to the projection. This prevents large illustrations from - # merging multiple vocabulary rows into one. WORD_PAD_Y = max(4, content_h // 300) # small vertical padding around words word_mask = np.zeros((content_h, content_w), dtype=np.uint8) for wd in word_dicts: @@ -1568,7 +1595,11 @@ def detect_row_geometry( h_proj = np.sum(masked_strip, axis=1).astype(float) h_proj_norm = h_proj / (content_w * 255) if content_w > 0 else h_proj - # --- Step 2: Smoothing + threshold --- + # --- Step 2: Smoothing + gap threshold --- + # Smooth the projection to reduce noise, then threshold at 15% of the + # median non-zero density. Pixels below this threshold are considered + # "gap" (horizontal whitespace between text lines). + # MIN_GAP_HEIGHT prevents tiny noise gaps from splitting rows. kernel_size = max(3, content_h // 200) if kernel_size % 2 == 0: kernel_size += 1 @@ -1602,6 +1633,9 @@ def detect_row_geometry( f"min_height={MIN_GAP_HEIGHT}px)") # --- Step 4: Validate gaps against word bounding boxes --- + # A gap is valid only if no word's bounding box overlaps it vertically. + # If a word overlaps, try to shift the gap boundary above or below the + # word. If neither shift yields enough room (>= MIN_GAP_HEIGHT), discard. validated_gaps = [] for gap_start_rel, gap_end_rel in raw_gaps: overlapping = False @@ -1688,7 +1722,9 @@ def detect_row_geometry( f"(gap={best_footer_gap[1] - best_footer_gap[0]}px)") # --- Step 6: Build RowGeometry objects from gaps --- - # Rows are the spans between gaps + # Rows are the spans between consecutive gaps. Each gap separates one + # row from the next, so a row's height extends + # from the end of the previous gap to the start of the next gap. 
row_boundaries = [] # (start_y_rel, end_y_rel) # Top of content to first gap @@ -1746,8 +1782,13 @@ def detect_row_geometry( )) # --- Step 7: Word-center grid regularization --- - # Derive precise row boundaries from word vertical centers. Detects - # section breaks (headings, paragraphs) and builds per-section grids. + # Refine the gap-based rows using word vertical centers. For each word, + # compute center_y = top + height/2. Group into line clusters, compute + # the pitch (distance between consecutive line centers), and place row + # boundaries at the midpoints between centers. This gives more precise + # and evenly-spaced rows than the gap-based approach alone. + # Also detects section breaks (headings, paragraphs) where the pitch + # exceeds 1.8× the median, and handles each section independently. rows = _regularize_row_grid(rows, word_dicts, left_x, right_x, top_y, content_w, content_h, inv) @@ -1772,15 +1813,31 @@ def _regularize_row_grid( Instead of overlaying a rigid grid, this derives row positions bottom-up from the words themselves: - 1. Group words into line clusters (by Y proximity). - 2. For each cluster compute center_y (median of word vertical centers) - and letter_height (median of word heights). - 3. Compute the pitch (distance between consecutive centers). - 4. Detect section breaks where the gap is >1.8× the median pitch - (headings, sub-headings, paragraph breaks). - 5. Within each section, use the local pitch to place row boundaries - at the midpoints between consecutive centers. - 6. Validate that ≥85% of words land in a grid row; otherwise fall back. + Step A: Group all content words into line clusters by Y-proximity. + Tolerance = 40% of median gap-based row height. 
+ Step B: For each cluster compute: + - center_y = median of (word_top + word_height/2) for all words + - letter_h = median of word heights (excluding outliers > 2× median) + Step B2: Merge clusters whose centers are closer than 30% of row height + (spurious splits from OCR jitter). + Step C: Compute pitches (distances between consecutive centers). + Detect section breaks where gap > 1.8× median pitch. + Step D: Split clusters into sections at the section breaks. + Step E: Within each section, place row boundaries at midpoints between + consecutive line centers: + - First row top = center - local_pitch/2 + - Last row bottom = center + local_pitch/2 + - Interior boundaries = (center_i + center_{i+1}) / 2 + This ensures rows tile seamlessly without gaps or overlaps. + Step F: Re-assign words to the nearest grid row by vertical center distance. + Step G: Validate that >= 85% of words land in a grid row; otherwise + fall back to the original gap-based rows. + Step H: Merge with preserved header/footer rows and re-index. + + Guard: Requires >= 5 content rows from gap-based detection to activate. + This prevents the regularizer from running on very small images (e.g. + box sub-sessions with only 3-6 rows) where the gap-based detection + is already accurate enough. Header/footer rows from the gap-based detection are preserved. """ diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py index 24f4268..e78084d 100644 --- a/klausur-service/backend/ocr_pipeline_api.py +++ b/klausur-service/backend/ocr_pipeline_api.py @@ -1209,10 +1209,55 @@ async def detect_columns(session_id: str): if img_bgr is None: raise HTTPException(status_code=400, detail="Crop or dewarp must be completed before column detection") - # Sub-sessions: skip column detection, create single pseudo-column + # ----------------------------------------------------------------------- + # Sub-sessions (box crops): skip column detection entirely. 
+ # Instead, create a single pseudo-column spanning the full image width. + # Also run Tesseract + binarization here so that the row detection step + # can reuse the cached intermediates (_word_dicts, _inv, _content_bounds) + # instead of falling back to detect_column_geometry() which may fail + # on small box images with < 5 words. + # ----------------------------------------------------------------------- session = await get_session_db(session_id) if session and session.get("parent_session_id"): h, w = img_bgr.shape[:2] + + # Binarize + invert for row detection (horizontal projection profile) + ocr_img = create_ocr_image(img_bgr) + inv = cv2.bitwise_not(ocr_img) + + # Run Tesseract to get word bounding boxes. + # Word positions are relative to the full image (no ROI crop needed + # because the sub-session image IS the cropped box already). + # detect_row_geometry expects word positions relative to content ROI, + # so with content_bounds = (0, w, 0, h) the coordinates are correct. + try: + from PIL import Image as PILImage + pil_img = PILImage.fromarray(cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)) + import pytesseract + data = pytesseract.image_to_data(pil_img, lang='eng+deu', output_type=pytesseract.Output.DICT) + word_dicts = [] + for i in range(len(data['text'])): + conf = int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1 + text = str(data['text'][i]).strip() + if conf < 30 or not text: + continue + word_dicts.append({ + 'text': text, 'conf': conf, + 'left': int(data['left'][i]), + 'top': int(data['top'][i]), + 'width': int(data['width'][i]), + 'height': int(data['height'][i]), + }) + logger.info(f"OCR Pipeline: sub-session {session_id}: Tesseract found {len(word_dicts)} words") + except Exception as e: + logger.warning(f"OCR Pipeline: sub-session {session_id}: Tesseract failed: {e}") + word_dicts = [] + + # Cache intermediates for row detection (detect_rows reuses these) + cached["_word_dicts"] = word_dicts + cached["_inv"] = inv + 
cached["_content_bounds"] = (0, w, 0, h) + column_result = { "columns": [{ "type": "column_text",