[split-required] Split remaining Python monoliths (Phase 1 continued)
klausur-service (7 monoliths): - grid_editor_helpers.py (1,737 → 5 files: columns, filters, headers, zones) - cv_cell_grid.py (1,675 → 7 files: build, legacy, streaming, merge, vocab) - worksheet_editor_api.py (1,305 → 4 files: models, AI, reconstruct, routes) - legal_corpus_ingestion.py (1,280 → 3 files: registry, chunking, ingestion) - cv_review.py (1,248 → 4 files: pipeline, spell, LLM, barrel) - cv_preprocessing.py (1,166 → 3 files: deskew, dewarp, barrel) - rbac.py, admin_api.py, routes/eh.py remain (next batch) backend-lehrer (1 monolith): - classroom_engine/repository.py (1,705 → 7 files by domain) All re-export barrels preserve backward compatibility. Zero import errors verified. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
498
klausur-service/backend/cv_cell_grid_build.py
Normal file
498
klausur-service/backend/cv_cell_grid_build.py
Normal file
@@ -0,0 +1,498 @@
|
||||
"""
|
||||
Cell-grid construction v2 (hybrid: broad columns via word lookup, narrow via cell-crop).
|
||||
Extracted from cv_cell_grid.py.
|
||||
Lizenz: Apache 2.0 — DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Any, Dict, List, Optional, Tuple

import numpy as np

from cv_vocab_types import PageRegion, RowGeometry
from cv_ocr_engines import (
    RAPIDOCR_AVAILABLE,
    _assign_row_words_to_columns,
    _clean_cell_text,
    _clean_cell_text_lite,
    _words_to_reading_order_text,
    _words_to_spaced_text,
    ocr_region,
    ocr_region_lighton,
    ocr_region_rapid,
    ocr_region_trocr,
)
from cv_cell_grid_helpers import (
    _MIN_WORD_CONF,
    _ensure_minimum_crop_size,
    _heal_row_gaps,
    _is_artifact_row,
    _select_psm_for_column,
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
try:
|
||||
import cv2
|
||||
except ImportError:
|
||||
cv2 = None # type: ignore[assignment]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _ocr_cell_crop — isolated cell-crop OCR for v2 hybrid mode
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _ocr_cell_crop(
    row_idx: int,
    col_idx: int,
    row: RowGeometry,
    col: PageRegion,
    ocr_img: np.ndarray,
    img_bgr: Optional[np.ndarray],
    img_w: int,
    img_h: int,
    engine_name: str,
    lang: str,
    lang_map: Dict[str, str],
) -> Dict[str, Any]:
    """OCR a single cell by cropping the exact column x row intersection.

    No padding beyond cell boundaries -> no neighbour bleeding.

    Args:
        row_idx, col_idx: grid position; used for the cell_id and log lines.
        row, col: geometry of the row band and column region being intersected.
        ocr_img: preprocessed (grayscale) image used for the Tesseract path and
            the pixel-density emptiness check. May be None.
        img_bgr: original BGR image for TrOCR / LightOn / RapidOCR. May be None.
        img_w, img_h: full-image dimensions, for bbox_pct and crop clamping.
        engine_name: "tesseract", "rapid", "lighton", "trocr-printed" or
            "trocr-handwritten".
        lang: default Tesseract language spec; lang_map may override it per
            column type.
        lang_map: column-type -> Tesseract language overrides.

    Returns:
        A cell dict with cell_id, row/col indices, text, confidence,
        bbox_px/bbox_pct (bbox always reflects the *exact* cell intersection,
        not the padded crop), ocr_engine, is_bold and — when any words were
        kept — word_boxes in absolute image coordinates.  Empty or skipped
        cells return text '' and confidence 0.0.
    """
    # Display bbox: exact column x row intersection
    disp_x = col.x
    disp_y = row.y
    disp_w = col.width
    disp_h = row.height

    # Crop boundaries: add small internal padding (3px each side) to avoid
    # clipping characters near column/row edges (e.g. parentheses, descenders).
    # Stays within image bounds but may extend slightly beyond strict cell.
    # 3px is small enough to avoid neighbour content at typical scan DPI (200-300).
    _PAD = 3
    cx = max(0, disp_x - _PAD)
    cy = max(0, disp_y - _PAD)
    cx2 = min(img_w, disp_x + disp_w + _PAD)
    cy2 = min(img_h, disp_y + disp_h + _PAD)
    cw = cx2 - cx
    ch = cy2 - cy

    # Template for the "nothing recognised" result; also the base dict that a
    # successful OCR result is copied from, so all keys stay consistent.
    empty_cell = {
        'cell_id': f"R{row_idx:02d}_C{col_idx}",
        'row_index': row_idx,
        'col_index': col_idx,
        'col_type': col.type,
        'text': '',
        'confidence': 0.0,
        'bbox_px': {'x': disp_x, 'y': disp_y, 'w': disp_w, 'h': disp_h},
        'bbox_pct': {
            'x': round(disp_x / img_w * 100, 2) if img_w else 0,
            'y': round(disp_y / img_h * 100, 2) if img_h else 0,
            'w': round(disp_w / img_w * 100, 2) if img_w else 0,
            'h': round(disp_h / img_h * 100, 2) if img_h else 0,
        },
        'ocr_engine': 'cell_crop_v2',
        'is_bold': False,
    }

    if cw <= 0 or ch <= 0:
        logger.debug("_ocr_cell_crop R%02d_C%d: zero-size crop (%dx%d)", row_idx, col_idx, cw, ch)
        return empty_cell

    # --- Pixel-density check: skip truly empty cells ---
    # Fewer than 0.5% "dark" pixels (< 180) means no ink worth OCRing.
    if ocr_img is not None:
        crop = ocr_img[cy:cy + ch, cx:cx + cw]
        if crop.size > 0:
            dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
            if dark_ratio < 0.005:
                logger.debug("_ocr_cell_crop R%02d_C%d: skip empty (dark_ratio=%.4f, crop=%dx%d)",
                             row_idx, col_idx, dark_ratio, cw, ch)
                return empty_cell

    # --- Prepare crop for OCR ---
    cell_lang = lang_map.get(col.type, lang)
    psm = _select_psm_for_column(col.type, col.width, row.height)
    text = ''
    avg_conf = 0.0
    used_engine = 'cell_crop_v2'

    if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
        # TrOCR/LightOn take the full image plus a region in absolute coords,
        # so no coordinate remapping is needed afterwards.
        cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch)
        words = ocr_region_trocr(img_bgr, cell_region,
                                 handwritten=(engine_name == "trocr-handwritten"))
    elif engine_name == "lighton" and img_bgr is not None:
        cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch)
        words = ocr_region_lighton(img_bgr, cell_region)
    elif engine_name == "rapid" and img_bgr is not None:
        # Upscale small BGR crops for RapidOCR.
        bgr_crop = img_bgr[cy:cy + ch, cx:cx + cw]
        if bgr_crop.size == 0:
            words = []
        else:
            crop_h, crop_w = bgr_crop.shape[:2]
            # FIX: cv2 is an optional import (module top guards ImportError),
            # so only take the cv2.resize fast path when it is available;
            # otherwise fall through to the numpy-based helper.
            if crop_h < 80 and cv2 is not None:
                # Force 3x upscale for short rows — small chars need more pixels
                scale = 3.0
                bgr_up = cv2.resize(bgr_crop, None, fx=scale, fy=scale,
                                    interpolation=cv2.INTER_CUBIC)
            else:
                bgr_up = _ensure_minimum_crop_size(bgr_crop, min_dim=150, max_scale=3)
            up_h, up_w = bgr_up.shape[:2]
            scale_x = up_w / max(crop_w, 1)
            scale_y = up_h / max(crop_h, 1)
            was_scaled = (up_w != crop_w or up_h != crop_h)
            logger.debug("_ocr_cell_crop R%02d_C%d: rapid %dx%d -> %dx%d (scale=%.1fx)",
                         row_idx, col_idx, crop_w, crop_h, up_w, up_h, scale_y)
            tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
            words = ocr_region_rapid(bgr_up, tmp_region)
            # Remap positions back to original image coords
            if words and was_scaled:
                for w in words:
                    w['left'] = int(w['left'] / scale_x) + cx
                    w['top'] = int(w['top'] / scale_y) + cy
                    w['width'] = int(w['width'] / scale_x)
                    w['height'] = int(w['height'] / scale_y)
            elif words:
                for w in words:
                    w['left'] += cx
                    w['top'] += cy
    else:
        # Tesseract: upscale tiny crops for better recognition
        if ocr_img is not None:
            crop_slice = ocr_img[cy:cy + ch, cx:cx + cw]
            upscaled = _ensure_minimum_crop_size(crop_slice)
            up_h, up_w = upscaled.shape[:2]
            tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
            words = ocr_region(upscaled, tmp_region, lang=cell_lang, psm=psm)
            # Remap word positions back to original image coordinates
            if words and (up_w != cw or up_h != ch):
                sx = cw / max(up_w, 1)
                sy = ch / max(up_h, 1)
                for w in words:
                    w['left'] = int(w['left'] * sx) + cx
                    w['top'] = int(w['top'] * sy) + cy
                    w['width'] = int(w['width'] * sx)
                    w['height'] = int(w['height'] * sy)
            elif words:
                for w in words:
                    w['left'] += cx
                    w['top'] += cy
        else:
            words = []

    # Filter low-confidence words
    if words:
        words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]

    if words:
        # y_tolerance spans the whole crop height so a single physical line
        # split by OCR jitter is still merged into one reading-order line.
        y_tol = max(15, ch)
        text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
        avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
        logger.debug("_ocr_cell_crop R%02d_C%d: OCR raw text=%r conf=%.1f nwords=%d crop=%dx%d psm=%s engine=%s",
                     row_idx, col_idx, text, avg_conf, len(words), cw, ch, psm, engine_name)
    else:
        logger.debug("_ocr_cell_crop R%02d_C%d: OCR returned NO words (crop=%dx%d psm=%s engine=%s)",
                     row_idx, col_idx, cw, ch, psm, engine_name)

    # --- PSM 7 fallback for still-empty Tesseract cells ---
    # PSM 7 treats the crop as a single text line, which often rescues short
    # cells that the column-selected PSM missed.
    if not text.strip() and engine_name == "tesseract" and ocr_img is not None:
        crop_slice = ocr_img[cy:cy + ch, cx:cx + cw]
        upscaled = _ensure_minimum_crop_size(crop_slice)
        up_h, up_w = upscaled.shape[:2]
        tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
        psm7_words = ocr_region(upscaled, tmp_region, lang=cell_lang, psm=7)
        if psm7_words:
            psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF]
        if psm7_words:
            p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10)
            if p7_text.strip():
                text = p7_text
                avg_conf = round(
                    sum(w['conf'] for w in psm7_words) / len(psm7_words), 1
                )
                used_engine = 'cell_crop_v2_psm7'
                # Remap PSM7 word positions back to original image coords
                if up_w != cw or up_h != ch:
                    sx = cw / max(up_w, 1)
                    sy = ch / max(up_h, 1)
                    for w in psm7_words:
                        w['left'] = int(w['left'] * sx) + cx
                        w['top'] = int(w['top'] * sy) + cy
                        w['width'] = int(w['width'] * sx)
                        w['height'] = int(w['height'] * sy)
                else:
                    for w in psm7_words:
                        w['left'] += cx
                        w['top'] += cy
                words = psm7_words

    # --- Noise filter ---
    # _clean_cell_text_lite may drop the whole text as noise; confidence is
    # zeroed in that case so downstream consumers treat the cell as empty.
    if text.strip():
        pre_filter = text
        text = _clean_cell_text_lite(text)
        if not text:
            logger.debug("_ocr_cell_crop R%02d_C%d: _clean_cell_text_lite REMOVED %r",
                         row_idx, col_idx, pre_filter)
            avg_conf = 0.0

    result = dict(empty_cell)
    result['text'] = text
    result['confidence'] = avg_conf
    result['ocr_engine'] = used_engine

    # Store individual word bounding boxes (absolute image coordinates)
    # for pixel-accurate overlay positioning in the frontend.
    if words and text.strip():
        result['word_boxes'] = [
            {
                'text': w.get('text', ''),
                'left': w['left'],
                'top': w['top'],
                'width': w['width'],
                'height': w['height'],
                'conf': w.get('conf', 0),
            }
            for w in words
            if w.get('text', '').strip()
        ]

    return result
|
||||
|
||||
|
||||
# Threshold: columns narrower than this (% of image width) use single-cell
# crop OCR instead of full-page word assignment.
# Rationale (see build_cell_grid_v2): narrow columns are the ones most prone
# to picking up bleed-over words from adjacent broad columns in a full-page
# pass, so they are re-OCRed in isolation.
_NARROW_COL_THRESHOLD_PCT = 15.0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# build_cell_grid_v2 — hybrid grid builder (current default)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def build_cell_grid_v2(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
    skip_heal_gaps: bool = False,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Hybrid Grid: full-page OCR for broad columns, cell-crop for narrow ones.

    Drop-in replacement for build_cell_grid() -- same signature & return type.

    Strategy:
    - Broad columns (>15% image width): Use pre-assigned full-page Tesseract
      words (from row.words). Handles IPA brackets, punctuation, sentence
      continuity correctly.
    - Narrow columns (<15% image width): Use isolated cell-crop OCR to prevent
      neighbour bleeding from adjacent broad columns.

    Args:
        ocr_img: preprocessed grayscale page image (Tesseract / density checks).
        column_regions: detected column regions; non-content types are skipped.
        row_geometries: detected rows; only 'content' rows produce cells.
        img_w, img_h: page dimensions in pixels.
        lang: default Tesseract language spec.
        ocr_engine: "auto"/"tesseract", "rapid", "lighton", "trocr-*".
        img_bgr: original BGR page image for the non-Tesseract engines.
        skip_heal_gaps: when True, keep cell positions at their exact row
            geometry positions without expanding to fill gaps left by
            removed rows.

    Returns:
        (cells, columns_meta): cells sorted by (row_index, col_index) with
        all-empty rows removed; columns_meta describes the retained columns.
    """
    # Engine selection: anything unrecognised (including "auto") falls back
    # to Tesseract; "rapid" additionally requires RapidOCR to be installed.
    engine_name = "tesseract"
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "rapid" and RAPIDOCR_AVAILABLE:
        engine_name = "rapid"

    logger.info("build_cell_grid_v2: using OCR engine '%s' (hybrid mode)", engine_name)

    # Filter to content rows only
    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        logger.warning("build_cell_grid_v2: no content rows found")
        return [], []

    # Filter phantom rows (word_count=0) and artifact rows
    before = len(content_rows)
    content_rows = [r for r in content_rows if r.word_count > 0]
    skipped = before - len(content_rows)
    if skipped > 0:
        logger.info("build_cell_grid_v2: skipped %s phantom rows (word_count=0)", skipped)
    if not content_rows:
        logger.warning("build_cell_grid_v2: no content rows with words found")
        return [], []

    before_art = len(content_rows)
    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
    artifact_skipped = before_art - len(content_rows)
    if artifact_skipped > 0:
        logger.info("build_cell_grid_v2: skipped %s artifact rows", artifact_skipped)
    if not content_rows:
        logger.warning("build_cell_grid_v2: no content rows after artifact filtering")
        return [], []

    # Filter columns
    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top',
                   'margin_bottom', 'margin_left', 'margin_right'}
    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
    if not relevant_cols:
        logger.warning("build_cell_grid_v2: no usable columns found")
        return [], []

    # Heal row gaps -- use header/footer boundaries so healed rows never grow
    # into the header or footer bands.
    content_rows.sort(key=lambda r: r.y)
    header_rows = [r for r in row_geometries if r.row_type == 'header']
    footer_rows = [r for r in row_geometries if r.row_type == 'footer']
    if header_rows:
        top_bound = max(r.y + r.height for r in header_rows)
    else:
        top_bound = content_rows[0].y
    if footer_rows:
        bottom_bound = min(r.y for r in footer_rows)
    else:
        bottom_bound = content_rows[-1].y + content_rows[-1].height

    # skip_heal_gaps: When True, keep cell positions at their exact row geometry
    # positions without expanding to fill gaps from removed rows.
    if not skip_heal_gaps:
        _heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound)

    relevant_cols.sort(key=lambda c: c.x)

    columns_meta = [
        {'index': ci, 'type': c.type, 'x': c.x, 'width': c.width}
        for ci, c in enumerate(relevant_cols)
    ]

    # Per-column-type Tesseract language overrides for cell-crop OCR.
    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    # --- Classify columns as broad vs narrow ---
    narrow_col_indices = set()
    for ci, col in enumerate(relevant_cols):
        col_pct = (col.width / img_w * 100) if img_w > 0 else 0
        if col_pct < _NARROW_COL_THRESHOLD_PCT:
            narrow_col_indices.add(ci)

    broad_col_count = len(relevant_cols) - len(narrow_col_indices)
    logger.info("build_cell_grid_v2: %s broad columns (full-page), "
                "%s narrow columns (cell-crop)",
                broad_col_count, len(narrow_col_indices))

    # Single full-width column (box sub-session): preserve spacing.
    # Hoisted out of the row/column loops -- it only depends on the column
    # layout, which is fixed from here on.
    is_single_full_column = (
        len(relevant_cols) == 1
        and img_w > 0
        and relevant_cols[0].width / img_w > 0.9
    )

    # --- Phase 1: Broad columns via full-page word assignment ---
    cells: List[Dict[str, Any]] = []

    for row_idx, row in enumerate(content_rows):
        # Assign full-page words to columns for this row
        col_words = _assign_row_words_to_columns(row, relevant_cols)

        for col_idx, col in enumerate(relevant_cols):
            if col_idx not in narrow_col_indices:
                # BROAD column: use pre-assigned full-page words
                words = col_words.get(col_idx, [])
                # Filter low-confidence words
                words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]

                if words:
                    y_tol = max(15, row.height)
                    if is_single_full_column:
                        text = _words_to_spaced_text(words, y_tolerance_px=y_tol)
                        logger.info("R%02d: %s words, text=%.100r",
                                    row_idx, len(words), text)
                    else:
                        text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
                    avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
                else:
                    text = ''
                    avg_conf = 0.0
                    if is_single_full_column:
                        logger.info("R%02d: 0 words (row has %s total, y=%s..%s)",
                                    row_idx, row.word_count, row.y, row.y + row.height)

                # Apply noise filter -- but NOT for single-column sub-sessions
                if not is_single_full_column:
                    text = _clean_cell_text(text)

                cell = {
                    'cell_id': f"R{row_idx:02d}_C{col_idx}",
                    'row_index': row_idx,
                    'col_index': col_idx,
                    'col_type': col.type,
                    'text': text,
                    'confidence': avg_conf,
                    'bbox_px': {
                        'x': col.x, 'y': row.y,
                        'w': col.width, 'h': row.height,
                    },
                    'bbox_pct': {
                        'x': round(col.x / img_w * 100, 2) if img_w else 0,
                        'y': round(row.y / img_h * 100, 2) if img_h else 0,
                        'w': round(col.width / img_w * 100, 2) if img_w else 0,
                        'h': round(row.height / img_h * 100, 2) if img_h else 0,
                    },
                    'ocr_engine': 'word_lookup',
                    'is_bold': False,
                }
                # Store word bounding boxes for pixel-accurate overlay
                if words and text.strip():
                    cell['word_boxes'] = [
                        {
                            'text': w.get('text', ''),
                            'left': w['left'],
                            'top': w['top'],
                            'width': w['width'],
                            'height': w['height'],
                            'conf': w.get('conf', 0),
                        }
                        for w in words
                        if w.get('text', '').strip()
                    ]
                cells.append(cell)

    # --- Phase 2: Narrow columns via cell-crop OCR (parallel) ---
    narrow_tasks = []
    for row_idx, row in enumerate(content_rows):
        for col_idx, col in enumerate(relevant_cols):
            if col_idx in narrow_col_indices:
                narrow_tasks.append((row_idx, col_idx, row, col))

    if narrow_tasks:
        # Tesseract is CPU-bound but releases the GIL; the heavier engines get
        # fewer workers to bound memory use.
        max_workers = 4 if engine_name == "tesseract" else 2
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            futures = {
                pool.submit(
                    _ocr_cell_crop,
                    ri, ci, row, col,
                    ocr_img, img_bgr, img_w, img_h,
                    engine_name, lang, lang_map,
                ): (ri, ci)
                for ri, ci, row, col in narrow_tasks
            }
            for future in as_completed(futures):
                try:
                    cell = future.result()
                    cells.append(cell)
                except Exception as e:
                    # A failed narrow cell is dropped, not fatal -- the grid
                    # simply lacks that cell.
                    ri, ci = futures[future]
                    logger.error("build_cell_grid_v2: narrow cell R%02d_C%d failed: %s", ri, ci, e)

    # Sort cells by (row_index, col_index)
    cells.sort(key=lambda c: (c['row_index'], c['col_index']))

    # Remove all-empty rows
    rows_with_text: set = set()
    for cell in cells:
        if cell['text'].strip():
            rows_with_text.add(cell['row_index'])
    before_filter = len(cells)
    cells = [c for c in cells if c['row_index'] in rows_with_text]
    # NOTE: approximate -- assumes every removed row contributed a full set of
    # column cells (a failed narrow cell above can make this undercount).
    empty_rows_removed = (before_filter - len(cells)) // max(len(relevant_cols), 1)
    if empty_rows_removed > 0:
        logger.info("build_cell_grid_v2: removed %s all-empty rows", empty_rows_removed)

    logger.info("build_cell_grid_v2: %s cells from %s rows x %s columns, "
                "engine=%s (hybrid)",
                len(cells), len(content_rows), len(relevant_cols), engine_name)

    return cells, columns_meta
|
||||
Reference in New Issue
Block a user