feat: breite Spalten per Word-Gap splitten + gedrehte Scans im Frontend anzeigen
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m52s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 15s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m52s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 15s
_split_broad_columns() erkennt EN/DE-Gemisch in breiten Spalten via Word-Coverage-Analyse und trennt sie am größten Lücken-Gap. Thumbnails und Page-Images werden serverseitig per fitz rotiert, Frontend lädt Thumbnails nach OCR-Processing neu. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -70,6 +70,7 @@ try:
|
||||
detect_column_geometry, analyze_layout_by_words, analyze_layout, create_layout_image,
|
||||
detect_row_geometry, build_cell_grid_v2,
|
||||
_cells_to_vocab_entries, _detect_sub_columns, _detect_header_footer_gaps,
|
||||
_split_broad_columns,
|
||||
expand_narrow_columns, positional_column_regions, llm_review_entries,
|
||||
detect_and_fix_orientation,
|
||||
_fix_phonetic_brackets,
|
||||
@@ -1182,6 +1183,9 @@ async def upload_pdf_get_info(
|
||||
async def get_pdf_thumbnail(session_id: str, page_number: int, hires: bool = Query(False)):
|
||||
"""Get a thumbnail image of a specific PDF page.
|
||||
|
||||
Uses fitz for rendering so that page_rotations (from OCR orientation
|
||||
detection) are applied consistently.
|
||||
|
||||
Args:
|
||||
hires: If True, return full-resolution image (zoom=2.0) instead of thumbnail (zoom=0.5).
|
||||
"""
|
||||
@@ -1194,10 +1198,25 @@ async def get_pdf_thumbnail(session_id: str, page_number: int, hires: bool = Que
|
||||
if not pdf_data:
|
||||
raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
|
||||
|
||||
thumbnail = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=not hires)
|
||||
try:
|
||||
import fitz
|
||||
zoom = 2.0 if hires else 0.5
|
||||
pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
|
||||
page = pdf_document[page_number]
|
||||
# Apply orientation correction detected during OCR processing
|
||||
rot = session.get("page_rotations", {}).get(page_number, 0)
|
||||
if rot:
|
||||
page.set_rotation(rot)
|
||||
mat = fitz.Matrix(zoom, zoom)
|
||||
pix = page.get_pixmap(matrix=mat)
|
||||
png_data = pix.tobytes("png")
|
||||
pdf_document.close()
|
||||
except Exception as e:
|
||||
logger.error(f"PDF thumbnail failed: {e}")
|
||||
raise HTTPException(status_code=500, detail=f"PDF rendering failed: {str(e)}")
|
||||
|
||||
return StreamingResponse(
|
||||
io.BytesIO(thumbnail),
|
||||
io.BytesIO(png_data),
|
||||
media_type="image/png",
|
||||
)
|
||||
|
||||
@@ -1226,11 +1245,15 @@ async def get_pdf_page_image(session_id: str, page_number: int, zoom: float = Qu
|
||||
import fitz
|
||||
pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
|
||||
page = pdf_document[page_number]
|
||||
# Apply orientation correction detected during OCR processing
|
||||
rot = session.get("page_rotations", {}).get(page_number, 0)
|
||||
if rot:
|
||||
page.set_rotation(rot)
|
||||
mat = fitz.Matrix(zoom, zoom)
|
||||
pix = page.get_pixmap(matrix=mat)
|
||||
png_data = pix.tobytes("png")
|
||||
pdf_document.close()
|
||||
logger.info(f"PDF page {page_number} rendered at zoom={zoom}: {len(png_data)} bytes")
|
||||
logger.info(f"PDF page {page_number} rendered at zoom={zoom} rot={rot}: {len(png_data)} bytes")
|
||||
except Exception as e:
|
||||
logger.error(f"PDF page image failed: {e}")
|
||||
raise HTTPException(status_code=500, detail=f"PDF rendering failed: {str(e)}")
|
||||
@@ -1272,10 +1295,11 @@ async def process_single_page(
|
||||
raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")
|
||||
|
||||
# --- OCR Pipeline path (use same render_pdf_high_res as admin OCR pipeline) ---
|
||||
rotation_deg = 0
|
||||
if OCR_PIPELINE_AVAILABLE:
|
||||
try:
|
||||
img_bgr = render_pdf_high_res(pdf_data, page_number, zoom=3.0)
|
||||
page_vocabulary = await _run_ocr_pipeline_for_page(
|
||||
page_vocabulary, rotation_deg = await _run_ocr_pipeline_for_page(
|
||||
img_bgr, page_number, session_id,
|
||||
)
|
||||
except Exception as e:
|
||||
@@ -1317,6 +1341,9 @@ async def process_single_page(
|
||||
|
||||
logger.info(f"Page {page_number + 1}: {len(page_vocabulary)} Vokabeln extrahiert")
|
||||
|
||||
# Store rotation for this page (used by image/thumbnail endpoints)
|
||||
session.setdefault("page_rotations", {})[page_number] = rotation_deg
|
||||
|
||||
# Add to session's vocabulary (append, don't replace)
|
||||
existing_vocab = session.get("vocabulary", [])
|
||||
# Remove any existing entries from this page (in case of re-processing)
|
||||
@@ -1334,6 +1361,7 @@ async def process_single_page(
|
||||
"vocabulary_count": len(page_vocabulary),
|
||||
"total_vocabulary_count": len(existing_vocab),
|
||||
"extraction_confidence": 0.9,
|
||||
"rotation": rotation_deg,
|
||||
}
|
||||
|
||||
|
||||
@@ -1341,7 +1369,7 @@ async def _run_ocr_pipeline_for_page(
|
||||
img_bgr: np.ndarray,
|
||||
page_number: int,
|
||||
vocab_session_id: str,
|
||||
) -> list:
|
||||
) -> tuple:
|
||||
"""Run the full OCR pipeline on a single page image and return vocab entries.
|
||||
|
||||
Uses the same pipeline as the admin OCR pipeline (ocr_pipeline_api.py).
|
||||
@@ -1352,7 +1380,8 @@ async def _run_ocr_pipeline_for_page(
|
||||
vocab_session_id: Vocab session ID for logging.
|
||||
|
||||
Steps: deskew → dewarp → columns → rows → words → (LLM review)
|
||||
Returns list of dicts with keys: id, english, german, example_sentence, source_page
|
||||
Returns (entries, rotation_deg) where entries is a list of dicts and
|
||||
rotation_deg is the orientation correction applied (0, 90, 180, 270).
|
||||
"""
|
||||
import time as _time
|
||||
|
||||
@@ -1418,6 +1447,7 @@ async def _run_ocr_pipeline_for_page(
|
||||
header_y, footer_y = _detect_header_footer_gaps(inv, w, h) if inv is not None else (None, None)
|
||||
geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
|
||||
top_y=top_y, header_y=header_y, footer_y=footer_y)
|
||||
geometries = _split_broad_columns(geometries, content_w, left_x=left_x)
|
||||
geometries = expand_narrow_columns(geometries, content_w, left_x, word_dicts)
|
||||
content_h = bottom_y - top_y
|
||||
regions = positional_column_regions(geometries, content_w, content_h, left_x)
|
||||
@@ -1534,7 +1564,7 @@ async def _run_ocr_pipeline_for_page(
|
||||
logger.info(f"OCR Pipeline page {page_number + 1}: "
|
||||
f"{len(page_vocabulary)} vocab entries in {total_duration:.1f}s")
|
||||
|
||||
return page_vocabulary
|
||||
return page_vocabulary, rotation
|
||||
|
||||
|
||||
@router.post("/sessions/{session_id}/process-pages")
|
||||
|
||||
Reference in New Issue
Block a user