fix: Spaltenklassifikation im Vocab-Worksheet durch positionsbasierte Zuordnung ersetzen
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 33s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m47s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 20s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 33s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m47s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 20s
Sprachbasiertes Scoring (classify_column_types) verursachte vertauschte Spalten auf Seite 3 bei Beispielsätzen mit vielen englischen Funktionswörtern. Neue _positional_column_regions() ordnet Spalten rein geometrisch (links→rechts) zu. OCR Pipeline Admin bleibt unverändert. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -70,8 +70,9 @@ try:
|
|||||||
detect_column_geometry, analyze_layout_by_words, analyze_layout, create_layout_image,
|
detect_column_geometry, analyze_layout_by_words, analyze_layout, create_layout_image,
|
||||||
detect_row_geometry, build_cell_grid_v2,
|
detect_row_geometry, build_cell_grid_v2,
|
||||||
_cells_to_vocab_entries, _detect_sub_columns, _detect_header_footer_gaps,
|
_cells_to_vocab_entries, _detect_sub_columns, _detect_header_footer_gaps,
|
||||||
expand_narrow_columns, classify_column_types, llm_review_entries,
|
expand_narrow_columns, llm_review_entries,
|
||||||
_fix_phonetic_brackets,
|
_fix_phonetic_brackets,
|
||||||
|
render_pdf_high_res,
|
||||||
PageRegion, RowGeometry,
|
PageRegion, RowGeometry,
|
||||||
)
|
)
|
||||||
from ocr_pipeline_session_store import (
|
from ocr_pipeline_session_store import (
|
||||||
@@ -1269,14 +1270,12 @@ async def process_single_page(
|
|||||||
if page_number < 0 or page_number >= page_count:
|
if page_number < 0 or page_number >= page_count:
|
||||||
raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")
|
raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")
|
||||||
|
|
||||||
# Convert just this ONE page to PNG
|
# --- OCR Pipeline path (use same render_pdf_high_res as admin OCR pipeline) ---
|
||||||
image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
|
|
||||||
|
|
||||||
# --- OCR Pipeline path ---
|
|
||||||
if OCR_PIPELINE_AVAILABLE:
|
if OCR_PIPELINE_AVAILABLE:
|
||||||
try:
|
try:
|
||||||
|
img_bgr = render_pdf_high_res(pdf_data, page_number, zoom=3.0)
|
||||||
page_vocabulary = await _run_ocr_pipeline_for_page(
|
page_vocabulary = await _run_ocr_pipeline_for_page(
|
||||||
image_data, page_number, session_id,
|
img_bgr, page_number, session_id,
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"OCR pipeline failed for page {page_number + 1}: {e}", exc_info=True)
|
logger.error(f"OCR pipeline failed for page {page_number + 1}: {e}", exc_info=True)
|
||||||
@@ -1291,6 +1290,7 @@ async def process_single_page(
|
|||||||
else:
|
else:
|
||||||
# Fallback to LLM vision extraction
|
# Fallback to LLM vision extraction
|
||||||
logger.warning("OCR pipeline not available, falling back to LLM vision")
|
logger.warning("OCR pipeline not available, falling back to LLM vision")
|
||||||
|
image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
|
||||||
vocabulary, confidence, error = await extract_vocabulary_from_image(
|
vocabulary, confidence, error = await extract_vocabulary_from_image(
|
||||||
image_data,
|
image_data,
|
||||||
f"page_{page_number + 1}.png",
|
f"page_{page_number + 1}.png",
|
||||||
@@ -1336,13 +1336,89 @@ async def process_single_page(
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _positional_column_regions(
|
||||||
|
geometries: list,
|
||||||
|
content_w: int,
|
||||||
|
content_h: int,
|
||||||
|
left_x: int,
|
||||||
|
) -> list:
|
||||||
|
"""Classify columns by position only (no language scoring).
|
||||||
|
|
||||||
|
Structural columns (page_ref, column_marker) are identified by geometry.
|
||||||
|
Remaining content columns are labelled left→right as column_en, column_de,
|
||||||
|
column_example. The names are purely positional – no language analysis.
|
||||||
|
"""
|
||||||
|
structural = []
|
||||||
|
content_cols = []
|
||||||
|
|
||||||
|
for g in geometries:
|
||||||
|
rel_x = g.x - left_x
|
||||||
|
# page_ref: narrow column in the leftmost 20% region
|
||||||
|
if g.width_ratio < 0.12 and (rel_x / content_w if content_w else 0) < 0.20:
|
||||||
|
structural.append(PageRegion(
|
||||||
|
type='page_ref', x=g.x, y=g.y,
|
||||||
|
width=g.width, height=content_h,
|
||||||
|
classification_confidence=0.95,
|
||||||
|
classification_method='positional',
|
||||||
|
))
|
||||||
|
# column_marker: very narrow, few words
|
||||||
|
elif g.width_ratio < 0.06 and g.word_count <= 15:
|
||||||
|
structural.append(PageRegion(
|
||||||
|
type='column_marker', x=g.x, y=g.y,
|
||||||
|
width=g.width, height=content_h,
|
||||||
|
classification_confidence=0.95,
|
||||||
|
classification_method='positional',
|
||||||
|
))
|
||||||
|
else:
|
||||||
|
content_cols.append(g)
|
||||||
|
|
||||||
|
# Single content column → plain text page
|
||||||
|
if len(content_cols) == 1:
|
||||||
|
g = content_cols[0]
|
||||||
|
return structural + [PageRegion(
|
||||||
|
type='column_text', x=g.x, y=g.y,
|
||||||
|
width=g.width, height=content_h,
|
||||||
|
classification_confidence=0.9,
|
||||||
|
classification_method='positional',
|
||||||
|
)]
|
||||||
|
|
||||||
|
# No content columns
|
||||||
|
if not content_cols:
|
||||||
|
return structural
|
||||||
|
|
||||||
|
# Sort content columns left→right and assign positional labels
|
||||||
|
content_cols.sort(key=lambda g: g.x)
|
||||||
|
labels = ['column_en', 'column_de', 'column_example']
|
||||||
|
regions = list(structural)
|
||||||
|
for i, g in enumerate(content_cols):
|
||||||
|
label = labels[i] if i < len(labels) else 'column_example'
|
||||||
|
regions.append(PageRegion(
|
||||||
|
type=label, x=g.x, y=g.y,
|
||||||
|
width=g.width, height=content_h,
|
||||||
|
classification_confidence=0.95,
|
||||||
|
classification_method='positional',
|
||||||
|
))
|
||||||
|
|
||||||
|
logger.info(f"PositionalColumns: {len(structural)} structural, "
|
||||||
|
f"{len(content_cols)} content → "
|
||||||
|
f"{[r.type for r in regions]}")
|
||||||
|
return regions
|
||||||
|
|
||||||
|
|
||||||
async def _run_ocr_pipeline_for_page(
|
async def _run_ocr_pipeline_for_page(
|
||||||
png_data: bytes,
|
img_bgr: np.ndarray,
|
||||||
page_number: int,
|
page_number: int,
|
||||||
vocab_session_id: str,
|
vocab_session_id: str,
|
||||||
) -> list:
|
) -> list:
|
||||||
"""Run the full OCR pipeline on a single page image and return vocab entries.
|
"""Run the full OCR pipeline on a single page image and return vocab entries.
|
||||||
|
|
||||||
|
Uses the same pipeline as the admin OCR pipeline (ocr_pipeline_api.py).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
img_bgr: BGR numpy array (from render_pdf_high_res, same as admin pipeline).
|
||||||
|
page_number: 0-indexed page number.
|
||||||
|
vocab_session_id: Vocab session ID for logging.
|
||||||
|
|
||||||
Steps: deskew → dewarp → columns → rows → words → (LLM review)
|
Steps: deskew → dewarp → columns → rows → words → (LLM review)
|
||||||
Returns list of dicts with keys: id, english, german, example_sentence, source_page
|
Returns list of dicts with keys: id, english, german, example_sentence, source_page
|
||||||
"""
|
"""
|
||||||
@@ -1350,23 +1426,19 @@ async def _run_ocr_pipeline_for_page(
|
|||||||
|
|
||||||
t_total = _time.time()
|
t_total = _time.time()
|
||||||
|
|
||||||
# 1. Decode PNG → BGR numpy array
|
|
||||||
arr = np.frombuffer(png_data, dtype=np.uint8)
|
|
||||||
img_bgr = cv2.imdecode(arr, cv2.IMREAD_COLOR)
|
|
||||||
if img_bgr is None:
|
|
||||||
raise ValueError("Failed to decode page image")
|
|
||||||
|
|
||||||
img_h, img_w = img_bgr.shape[:2]
|
img_h, img_w = img_bgr.shape[:2]
|
||||||
logger.info(f"OCR Pipeline page {page_number + 1}: image {img_w}x{img_h}")
|
logger.info(f"OCR Pipeline page {page_number + 1}: image {img_w}x{img_h}")
|
||||||
|
|
||||||
# 2. Create pipeline session in DB (for debugging in admin UI)
|
# 2. Create pipeline session in DB (for debugging in admin UI)
|
||||||
pipeline_session_id = str(uuid.uuid4())
|
pipeline_session_id = str(uuid.uuid4())
|
||||||
try:
|
try:
|
||||||
|
_, png_buf = cv2.imencode(".png", img_bgr)
|
||||||
|
original_png = png_buf.tobytes()
|
||||||
await create_pipeline_session_db(
|
await create_pipeline_session_db(
|
||||||
pipeline_session_id,
|
pipeline_session_id,
|
||||||
name=f"vocab-ws-{vocab_session_id[:8]}-p{page_number + 1}",
|
name=f"vocab-ws-{vocab_session_id[:8]}-p{page_number + 1}",
|
||||||
filename=f"page_{page_number + 1}.png",
|
filename=f"page_{page_number + 1}.png",
|
||||||
original_png=png_data,
|
original_png=original_png,
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Could not create pipeline session in DB: {e}")
|
logger.warning(f"Could not create pipeline session in DB: {e}")
|
||||||
@@ -1406,8 +1478,8 @@ async def _run_ocr_pipeline_for_page(
|
|||||||
geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
|
geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
|
||||||
top_y=top_y, header_y=header_y, footer_y=footer_y)
|
top_y=top_y, header_y=header_y, footer_y=footer_y)
|
||||||
geometries = expand_narrow_columns(geometries, content_w, left_x, word_dicts)
|
geometries = expand_narrow_columns(geometries, content_w, left_x, word_dicts)
|
||||||
regions = classify_column_types(geometries, content_w, top_y, w, h, bottom_y,
|
content_h = bottom_y - top_y
|
||||||
left_x=left_x, right_x=right_x, inv=inv)
|
regions = _positional_column_regions(geometries, content_w, content_h, left_x)
|
||||||
content_bounds = (left_x, right_x, top_y, bottom_y)
|
content_bounds = (left_x, right_x, top_y, bottom_y)
|
||||||
|
|
||||||
logger.info(f" columns: {len(regions)} detected ({_time.time() - t0:.1f}s)")
|
logger.info(f" columns: {len(regions)} detected ({_time.time() - t0:.1f}s)")
|
||||||
|
|||||||
Reference in New Issue
Block a user