feat: breite Spalten per Word-Gap splitten + gedrehte Scans im Frontend anzeigen
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m52s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 15s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m52s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 15s
_split_broad_columns() erkennt EN/DE-Gemisch in breiten Spalten via Word-Coverage-Analyse und trennt sie am größten Lücken-Gap. Thumbnails und Page-Images werden serverseitig per fitz rotiert, Frontend lädt Thumbnails nach OCR-Processing neu. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2067,6 +2067,148 @@ def _detect_sub_columns(
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def _split_broad_columns(
    geometries: List[ColumnGeometry],
    content_w: int,
    left_x: int = 0,
    _broad_threshold: float = 0.35,
    _min_gap_px: int = 15,
    _min_words_per_split: int = 5,
) -> List[ColumnGeometry]:
    """Split overly broad columns that contain two language blocks (EN+DE).

    Uses word-coverage gap analysis: builds a per-pixel coverage array from the
    words inside each broad column, finds the largest horizontal gap, and splits
    the column at that gap.

    Args:
        geometries: Column geometries from _detect_sub_columns.
        content_w: Width of the content area in pixels.
        left_x: Left edge of content ROI in absolute image coordinates.
        _broad_threshold: Minimum width_ratio to consider a column "broad".
        _min_gap_px: Minimum gap width (pixels) to trigger a split.
        _min_words_per_split: Both halves must have at least this many words.

    Returns:
        Updated list of ColumnGeometry (possibly with more columns), re-indexed
        left-to-right.
    """
    result: List[ColumnGeometry] = []

    for geo in geometries:
        # Only attempt a split on genuinely broad, word-rich columns;
        # narrow or sparse columns pass through unchanged.
        if geo.width_ratio <= _broad_threshold or len(geo.words) < 10:
            result.append(geo)
            continue

        # Degenerate geometry guard: np.zeros() raises on a negative size,
        # so keep such columns as-is rather than crashing the pipeline.
        if geo.width <= 0:
            result.append(geo)
            continue

        # Build word-coverage array (one float per pixel within the column).
        col_left_rel = geo.x - left_x  # column left in content-relative coords
        coverage = np.zeros(geo.width, dtype=np.float32)

        for wd in geo.words:
            # wd['left'] is relative to left_x (content ROI)
            wl = wd['left'] - col_left_rel
            wr = wl + wd.get('width', 0)
            wl = max(0, int(wl))
            wr = min(geo.width, int(wr))
            if wr > wl:
                coverage[wl:wr] += 1.0

        # Light smoothing (kernel=3px) to avoid single-pixel noise.
        if len(coverage) > 3:
            kernel = np.ones(3, dtype=np.float32) / 3.0
            coverage = np.convolve(coverage, kernel, mode='same')

        # Normalise to [0, 1] so the gap threshold is density-independent.
        cmax = coverage.max()
        if cmax > 0:
            coverage /= cmax

        # Find the widest run of pixels where coverage < 0.5.
        low_mask = coverage < 0.5
        gap_start = None
        best_gap = None  # (start, end, width)
        for px, is_low in enumerate(low_mask):
            if is_low:
                if gap_start is None:
                    gap_start = px
            else:
                if gap_start is not None:
                    gw = px - gap_start
                    if best_gap is None or gw > best_gap[2]:
                        best_gap = (gap_start, px, gw)
                    gap_start = None
        # Handle a gap that runs to the column's right edge.
        if gap_start is not None:
            gw = len(low_mask) - gap_start
            if best_gap is None or gw > best_gap[2]:
                best_gap = (gap_start, len(low_mask), gw)

        if best_gap is None or best_gap[2] < _min_gap_px:
            result.append(geo)
            continue

        gap_center = (best_gap[0] + best_gap[1]) // 2

        # Assign each word to a side by its horizontal midpoint.
        left_words = []
        right_words = []
        for wd in geo.words:
            wl = wd['left'] - col_left_rel
            mid = wl + wd.get('width', 0) / 2.0
            if mid < gap_center:
                left_words.append(wd)
            else:
                right_words.append(wd)

        # Both halves must be substantial, otherwise the "gap" is probably
        # just ragged line endings and the column stays whole.
        if len(left_words) < _min_words_per_split or len(right_words) < _min_words_per_split:
            result.append(geo)
            continue

        # Build two new ColumnGeometry objects split at the gap center.
        split_x_abs = geo.x + gap_center
        left_w = gap_center
        right_w = geo.width - gap_center

        left_geo = ColumnGeometry(
            index=0,
            x=geo.x,
            y=geo.y,
            width=left_w,
            height=geo.height,
            word_count=len(left_words),
            words=left_words,
            width_ratio=left_w / content_w if content_w else 0,
            is_sub_column=True,
        )
        right_geo = ColumnGeometry(
            index=0,
            x=split_x_abs,
            y=geo.y,
            width=right_w,
            height=geo.height,
            word_count=len(right_words),
            words=right_words,
            width_ratio=right_w / content_w if content_w else 0,
            is_sub_column=True,
        )

        logger.info(
            f"SplitBroadCols: col {geo.index} SPLIT at gap_center={gap_center} "
            f"(gap {best_gap[2]}px @ [{best_gap[0]}..{best_gap[1]}]), "
            f"left={len(left_words)} words (w={left_w}), "
            f"right={len(right_words)} words (w={right_w})"
        )

        result.append(left_geo)
        result.append(right_geo)

    # Re-index left-to-right so downstream consumers see contiguous indices.
    result.sort(key=lambda g: g.x)
    for i, g in enumerate(result):
        g.index = i

    return result
def _build_geometries_from_starts(
|
def _build_geometries_from_starts(
|
||||||
col_starts: List[Tuple[int, int]],
|
col_starts: List[Tuple[int, int]],
|
||||||
word_dicts: List[Dict],
|
word_dicts: List[Dict],
|
||||||
@@ -4128,6 +4270,9 @@ def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Li
|
|||||||
geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
|
geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
|
||||||
top_y=top_y, header_y=header_y, footer_y=footer_y)
|
top_y=top_y, header_y=header_y, footer_y=footer_y)
|
||||||
|
|
||||||
|
# Split broad columns that contain EN+DE mixed via word-coverage gaps
|
||||||
|
geometries = _split_broad_columns(geometries, content_w, left_x=left_x)
|
||||||
|
|
||||||
# Phase B: Positional classification (no language scoring)
|
# Phase B: Positional classification (no language scoring)
|
||||||
content_h = bottom_y - top_y
|
content_h = bottom_y - top_y
|
||||||
regions = positional_column_regions(geometries, content_w, content_h, left_x)
|
regions = positional_column_regions(geometries, content_w, content_h, left_x)
|
||||||
|
|||||||
@@ -70,6 +70,7 @@ try:
|
|||||||
detect_column_geometry, analyze_layout_by_words, analyze_layout, create_layout_image,
|
detect_column_geometry, analyze_layout_by_words, analyze_layout, create_layout_image,
|
||||||
detect_row_geometry, build_cell_grid_v2,
|
detect_row_geometry, build_cell_grid_v2,
|
||||||
_cells_to_vocab_entries, _detect_sub_columns, _detect_header_footer_gaps,
|
_cells_to_vocab_entries, _detect_sub_columns, _detect_header_footer_gaps,
|
||||||
|
_split_broad_columns,
|
||||||
expand_narrow_columns, positional_column_regions, llm_review_entries,
|
expand_narrow_columns, positional_column_regions, llm_review_entries,
|
||||||
detect_and_fix_orientation,
|
detect_and_fix_orientation,
|
||||||
_fix_phonetic_brackets,
|
_fix_phonetic_brackets,
|
||||||
@@ -1182,6 +1183,9 @@ async def upload_pdf_get_info(
|
|||||||
async def get_pdf_thumbnail(session_id: str, page_number: int, hires: bool = Query(False)):
|
async def get_pdf_thumbnail(session_id: str, page_number: int, hires: bool = Query(False)):
|
||||||
"""Get a thumbnail image of a specific PDF page.
|
"""Get a thumbnail image of a specific PDF page.
|
||||||
|
|
||||||
|
Uses fitz for rendering so that page_rotations (from OCR orientation
|
||||||
|
detection) are applied consistently.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
hires: If True, return full-resolution image (zoom=2.0) instead of thumbnail (zoom=0.5).
|
hires: If True, return full-resolution image (zoom=2.0) instead of thumbnail (zoom=0.5).
|
||||||
"""
|
"""
|
||||||
@@ -1194,10 +1198,25 @@ async def get_pdf_thumbnail(session_id: str, page_number: int, hires: bool = Que
|
|||||||
if not pdf_data:
|
if not pdf_data:
|
||||||
raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
|
raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
|
||||||
|
|
||||||
thumbnail = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=not hires)
|
try:
|
||||||
|
import fitz
|
||||||
|
zoom = 2.0 if hires else 0.5
|
||||||
|
pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
|
||||||
|
page = pdf_document[page_number]
|
||||||
|
# Apply orientation correction detected during OCR processing
|
||||||
|
rot = session.get("page_rotations", {}).get(page_number, 0)
|
||||||
|
if rot:
|
||||||
|
page.set_rotation(rot)
|
||||||
|
mat = fitz.Matrix(zoom, zoom)
|
||||||
|
pix = page.get_pixmap(matrix=mat)
|
||||||
|
png_data = pix.tobytes("png")
|
||||||
|
pdf_document.close()
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"PDF thumbnail failed: {e}")
|
||||||
|
raise HTTPException(status_code=500, detail=f"PDF rendering failed: {str(e)}")
|
||||||
|
|
||||||
return StreamingResponse(
|
return StreamingResponse(
|
||||||
io.BytesIO(thumbnail),
|
io.BytesIO(png_data),
|
||||||
media_type="image/png",
|
media_type="image/png",
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -1226,11 +1245,15 @@ async def get_pdf_page_image(session_id: str, page_number: int, zoom: float = Qu
|
|||||||
import fitz
|
import fitz
|
||||||
pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
|
pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
|
||||||
page = pdf_document[page_number]
|
page = pdf_document[page_number]
|
||||||
|
# Apply orientation correction detected during OCR processing
|
||||||
|
rot = session.get("page_rotations", {}).get(page_number, 0)
|
||||||
|
if rot:
|
||||||
|
page.set_rotation(rot)
|
||||||
mat = fitz.Matrix(zoom, zoom)
|
mat = fitz.Matrix(zoom, zoom)
|
||||||
pix = page.get_pixmap(matrix=mat)
|
pix = page.get_pixmap(matrix=mat)
|
||||||
png_data = pix.tobytes("png")
|
png_data = pix.tobytes("png")
|
||||||
pdf_document.close()
|
pdf_document.close()
|
||||||
logger.info(f"PDF page {page_number} rendered at zoom={zoom}: {len(png_data)} bytes")
|
logger.info(f"PDF page {page_number} rendered at zoom={zoom} rot={rot}: {len(png_data)} bytes")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"PDF page image failed: {e}")
|
logger.error(f"PDF page image failed: {e}")
|
||||||
raise HTTPException(status_code=500, detail=f"PDF rendering failed: {str(e)}")
|
raise HTTPException(status_code=500, detail=f"PDF rendering failed: {str(e)}")
|
||||||
@@ -1272,10 +1295,11 @@ async def process_single_page(
|
|||||||
raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")
|
raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")
|
||||||
|
|
||||||
# --- OCR Pipeline path (use same render_pdf_high_res as admin OCR pipeline) ---
|
# --- OCR Pipeline path (use same render_pdf_high_res as admin OCR pipeline) ---
|
||||||
|
rotation_deg = 0
|
||||||
if OCR_PIPELINE_AVAILABLE:
|
if OCR_PIPELINE_AVAILABLE:
|
||||||
try:
|
try:
|
||||||
img_bgr = render_pdf_high_res(pdf_data, page_number, zoom=3.0)
|
img_bgr = render_pdf_high_res(pdf_data, page_number, zoom=3.0)
|
||||||
page_vocabulary = await _run_ocr_pipeline_for_page(
|
page_vocabulary, rotation_deg = await _run_ocr_pipeline_for_page(
|
||||||
img_bgr, page_number, session_id,
|
img_bgr, page_number, session_id,
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -1317,6 +1341,9 @@ async def process_single_page(
|
|||||||
|
|
||||||
logger.info(f"Page {page_number + 1}: {len(page_vocabulary)} Vokabeln extrahiert")
|
logger.info(f"Page {page_number + 1}: {len(page_vocabulary)} Vokabeln extrahiert")
|
||||||
|
|
||||||
|
# Store rotation for this page (used by image/thumbnail endpoints)
|
||||||
|
session.setdefault("page_rotations", {})[page_number] = rotation_deg
|
||||||
|
|
||||||
# Add to session's vocabulary (append, don't replace)
|
# Add to session's vocabulary (append, don't replace)
|
||||||
existing_vocab = session.get("vocabulary", [])
|
existing_vocab = session.get("vocabulary", [])
|
||||||
# Remove any existing entries from this page (in case of re-processing)
|
# Remove any existing entries from this page (in case of re-processing)
|
||||||
@@ -1334,6 +1361,7 @@ async def process_single_page(
|
|||||||
"vocabulary_count": len(page_vocabulary),
|
"vocabulary_count": len(page_vocabulary),
|
||||||
"total_vocabulary_count": len(existing_vocab),
|
"total_vocabulary_count": len(existing_vocab),
|
||||||
"extraction_confidence": 0.9,
|
"extraction_confidence": 0.9,
|
||||||
|
"rotation": rotation_deg,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -1341,7 +1369,7 @@ async def _run_ocr_pipeline_for_page(
|
|||||||
img_bgr: np.ndarray,
|
img_bgr: np.ndarray,
|
||||||
page_number: int,
|
page_number: int,
|
||||||
vocab_session_id: str,
|
vocab_session_id: str,
|
||||||
) -> list:
|
) -> tuple:
|
||||||
"""Run the full OCR pipeline on a single page image and return vocab entries.
|
"""Run the full OCR pipeline on a single page image and return vocab entries.
|
||||||
|
|
||||||
Uses the same pipeline as the admin OCR pipeline (ocr_pipeline_api.py).
|
Uses the same pipeline as the admin OCR pipeline (ocr_pipeline_api.py).
|
||||||
@@ -1352,7 +1380,8 @@ async def _run_ocr_pipeline_for_page(
|
|||||||
vocab_session_id: Vocab session ID for logging.
|
vocab_session_id: Vocab session ID for logging.
|
||||||
|
|
||||||
Steps: deskew → dewarp → columns → rows → words → (LLM review)
|
Steps: deskew → dewarp → columns → rows → words → (LLM review)
|
||||||
Returns list of dicts with keys: id, english, german, example_sentence, source_page
|
Returns (entries, rotation_deg) where entries is a list of dicts and
|
||||||
|
rotation_deg is the orientation correction applied (0, 90, 180, 270).
|
||||||
"""
|
"""
|
||||||
import time as _time
|
import time as _time
|
||||||
|
|
||||||
@@ -1418,6 +1447,7 @@ async def _run_ocr_pipeline_for_page(
|
|||||||
header_y, footer_y = _detect_header_footer_gaps(inv, w, h) if inv is not None else (None, None)
|
header_y, footer_y = _detect_header_footer_gaps(inv, w, h) if inv is not None else (None, None)
|
||||||
geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
|
geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
|
||||||
top_y=top_y, header_y=header_y, footer_y=footer_y)
|
top_y=top_y, header_y=header_y, footer_y=footer_y)
|
||||||
|
geometries = _split_broad_columns(geometries, content_w, left_x=left_x)
|
||||||
geometries = expand_narrow_columns(geometries, content_w, left_x, word_dicts)
|
geometries = expand_narrow_columns(geometries, content_w, left_x, word_dicts)
|
||||||
content_h = bottom_y - top_y
|
content_h = bottom_y - top_y
|
||||||
regions = positional_column_regions(geometries, content_w, content_h, left_x)
|
regions = positional_column_regions(geometries, content_w, content_h, left_x)
|
||||||
@@ -1534,7 +1564,7 @@ async def _run_ocr_pipeline_for_page(
|
|||||||
logger.info(f"OCR Pipeline page {page_number + 1}: "
|
logger.info(f"OCR Pipeline page {page_number + 1}: "
|
||||||
f"{len(page_vocabulary)} vocab entries in {total_duration:.1f}s")
|
f"{len(page_vocabulary)} vocab entries in {total_duration:.1f}s")
|
||||||
|
|
||||||
return page_vocabulary
|
return page_vocabulary, rotation
|
||||||
|
|
||||||
|
|
||||||
@router.post("/sessions/{session_id}/process-pages")
|
@router.post("/sessions/{session_id}/process-pages")
|
||||||
|
|||||||
@@ -511,6 +511,26 @@ export default function VocabWorksheetPage() {
|
|||||||
setExtractionStatus(`Alle Seiten fehlgeschlagen.`)
|
setExtractionStatus(`Alle Seiten fehlgeschlagen.`)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reload thumbnails for processed pages (server may have rotated them)
|
||||||
|
if (successful.length > 0 && session) {
|
||||||
|
const updatedThumbs = [...pagesThumbnails]
|
||||||
|
for (const pageNum of successful) {
|
||||||
|
const idx = pageNum - 1 // successful stores 1-indexed
|
||||||
|
try {
|
||||||
|
const thumbRes = await fetch(`${API_BASE}/api/v1/vocab/sessions/${session.id}/pdf-thumbnail/${idx}?hires=true&t=${Date.now()}`)
|
||||||
|
if (thumbRes.ok) {
|
||||||
|
const blob = await thumbRes.blob()
|
||||||
|
// Revoke old blob URL to avoid memory leaks
|
||||||
|
if (updatedThumbs[idx]) URL.revokeObjectURL(updatedThumbs[idx])
|
||||||
|
updatedThumbs[idx] = URL.createObjectURL(blob)
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
console.error(`Failed to refresh thumbnail for page ${pageNum}`)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
setPagesThumbnails(updatedThumbs)
|
||||||
|
}
|
||||||
|
|
||||||
setSession(prev => prev ? { ...prev, status: 'extracted' } : null)
|
setSession(prev => prev ? { ...prev, status: 'extracted' } : null)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user