[split-required] Split remaining Python monoliths (Phase 1 continued)
klausur-service (7 monoliths): - grid_editor_helpers.py (1,737 → 5 files: columns, filters, headers, zones) - cv_cell_grid.py (1,675 → 7 files: build, legacy, streaming, merge, vocab) - worksheet_editor_api.py (1,305 → 4 files: models, AI, reconstruct, routes) - legal_corpus_ingestion.py (1,280 → 3 files: registry, chunking, ingestion) - cv_review.py (1,248 → 4 files: pipeline, spell, LLM, barrel) - cv_preprocessing.py (1,166 → 3 files: deskew, dewarp, barrel) - rbac.py, admin_api.py, routes/eh.py remain (next batch) backend-lehrer (1 monolith): - classroom_engine/repository.py (1,705 → 7 files by domain) All re-export barrels preserve backward compatibility. Zero import errors verified. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
430
klausur-service/backend/cv_review_pipeline.py
Normal file
430
klausur-service/backend/cv_review_pipeline.py
Normal file
@@ -0,0 +1,430 @@
|
||||
"""
|
||||
CV Review Pipeline — Multi-pass OCR, line alignment, LLM post-correction, and orchestration.
|
||||
|
||||
Stages 6-8 of the CV vocabulary pipeline plus the main orchestrator.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import time
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
from cv_vocab_types import (
|
||||
CV_PIPELINE_AVAILABLE,
|
||||
PageRegion,
|
||||
PipelineResult,
|
||||
VocabRow,
|
||||
)
|
||||
from cv_preprocessing import (
|
||||
deskew_image,
|
||||
dewarp_image,
|
||||
render_image_high_res,
|
||||
render_pdf_high_res,
|
||||
)
|
||||
from cv_layout import (
|
||||
analyze_layout,
|
||||
create_layout_image,
|
||||
create_ocr_image,
|
||||
)
|
||||
from cv_ocr_engines import (
|
||||
_group_words_into_lines,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
try:
|
||||
import cv2
|
||||
except ImportError:
|
||||
cv2 = None # type: ignore[assignment]
|
||||
|
||||
try:
|
||||
import pytesseract
|
||||
from PIL import Image
|
||||
except ImportError:
|
||||
pytesseract = None # type: ignore[assignment]
|
||||
Image = None # type: ignore[assignment,misc]
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Stage 6: Multi-Pass OCR
|
||||
# =============================================================================
|
||||
|
||||
def ocr_region(ocr_img: np.ndarray, region: PageRegion, lang: str,
               psm: int, fallback_psm: Optional[int] = None,
               min_confidence: float = 40.0) -> List[Dict[str, Any]]:
    """Run Tesseract OCR on one page region with the given PSM.

    Word coordinates are translated back into full-page space by adding
    the region offset. When *fallback_psm* is supplied and the average
    word confidence of the first pass falls below *min_confidence*, the
    region is re-OCRed line by line with the fallback PSM instead.

    Args:
        ocr_img: Binarized full-page image.
        region: Region to crop and OCR.
        lang: Tesseract language string.
        psm: Page Segmentation Mode.
        fallback_psm: If confidence too low, retry with this PSM per line.
        min_confidence: Minimum average confidence before fallback.

    Returns:
        List of word dicts with text, position, confidence.
    """
    snippet = ocr_img[region.y:region.y + region.height,
                      region.x:region.x + region.width]
    if snippet.size == 0:
        return []

    tess_config = f'--psm {psm} --oem 3'
    try:
        data = pytesseract.image_to_data(Image.fromarray(snippet), lang=lang,
                                         config=tess_config,
                                         output_type=pytesseract.Output.DICT)
    except Exception as e:
        logger.warning(f"Tesseract failed for region {region.type}: {e}")
        return []

    collected: List[Dict[str, Any]] = []
    for raw_text, left, top, width, height, raw_conf in zip(
            data['text'], data['left'], data['top'],
            data['width'], data['height'], data['conf']):
        token = raw_text.strip()
        confidence = int(raw_conf)
        # Drop empty tokens and Tesseract's near-noise detections.
        if not token or confidence < 10:
            continue
        collected.append({
            'text': token,
            'left': left + region.x,
            'top': top + region.y,
            'width': width,
            'height': height,
            'conf': confidence,
            'region_type': region.type,
        })

    # Fallback pass only when the first pass produced something but its
    # average confidence is too low to trust.
    if collected and fallback_psm is not None:
        mean_conf = sum(entry['conf'] for entry in collected) / len(collected)
        if mean_conf < min_confidence:
            logger.info(f"Region {region.type}: avg confidence {mean_conf:.0f}% < {min_confidence}%, "
                        f"trying fallback PSM {fallback_psm}")
            collected = _ocr_region_line_by_line(ocr_img, region, lang, fallback_psm)

    return collected
|
||||
|
||||
|
||||
def _ocr_region_line_by_line(ocr_img: np.ndarray, region: PageRegion,
                             lang: str, psm: int) -> List[Dict[str, Any]]:
    """OCR a region line by line (fallback for low-confidence regions).

    Segments the cropped region into text lines via a horizontal ink
    projection, then OCRs each line separately with the given PSM.
    Word coordinates are translated back into full-page space.

    Args:
        ocr_img: Binarized full-page image.
        region: Region to crop and OCR.
        lang: Tesseract language string.
        psm: Page Segmentation Mode applied to each individual line.

    Returns:
        List of word dicts with text, position, confidence (full-page coords).
    """
    crop = ocr_img[region.y:region.y + region.height,
                   region.x:region.x + region.width]

    if crop.size == 0:
        return []

    # Ink projection: invert (text becomes white) and sum each row, so rows
    # containing text have high values. Threshold at 5% of the peak.
    inv = cv2.bitwise_not(crop)
    h_proj = np.sum(inv, axis=1)
    threshold = np.max(h_proj) * 0.05 if np.max(h_proj) > 0 else 0

    # Scan the projection top-to-bottom collecting (start, end) row spans.
    # Spans of 5 px or less are discarded as noise/specks.
    lines = []
    in_text = False
    line_start = 0
    for y in range(len(h_proj)):
        if h_proj[y] > threshold and not in_text:
            line_start = y
            in_text = True
        elif h_proj[y] <= threshold and in_text:
            if y - line_start > 5:
                lines.append((line_start, y))
            in_text = False
    # Close a line that runs to the bottom edge of the crop.
    if in_text and len(h_proj) - line_start > 5:
        lines.append((line_start, len(h_proj)))

    all_words = []
    config = f'--psm {psm} --oem 3'

    for line_y_start, line_y_end in lines:
        # Small vertical padding so ascenders/descenders are not clipped.
        pad = 3
        y1 = max(0, line_y_start - pad)
        y2 = min(crop.shape[0], line_y_end + pad)
        line_crop = crop[y1:y2, :]

        if line_crop.size == 0:
            continue

        pil_img = Image.fromarray(line_crop)
        try:
            data = pytesseract.image_to_data(pil_img, lang=lang, config=config,
                                             output_type=pytesseract.Output.DICT)
        except Exception:
            # Best-effort fallback: a single unreadable line is skipped
            # rather than failing the whole region.
            continue

        for i in range(len(data['text'])):
            text = data['text'][i].strip()
            conf = int(data['conf'][i])
            # Drop empty tokens and near-noise detections (conf < 10).
            if not text or conf < 10:
                continue
            all_words.append({
                'text': text,
                'left': data['left'][i] + region.x,
                # Line crops are full-width, so only the vertical offset
                # (region origin + line start y1) must be re-added.
                'top': data['top'][i] + region.y + y1,
                'width': data['width'][i],
                'height': data['height'][i],
                'conf': conf,
                'region_type': region.type,
            })

    return all_words
|
||||
|
||||
|
||||
def run_multi_pass_ocr(ocr_img: np.ndarray,
                       regions: List[PageRegion],
                       lang: str = "eng+deu") -> Dict[str, List[Dict]]:
    """Run OCR on each detected region with optimized settings.

    Header/footer/margin regions are skipped. The English and German
    vocabulary columns are OCRed with a single fixed language and the
    column PSM (4); the example column uses block PSM 6 with a per-line
    fallback (PSM 7) when average confidence drops below 40%; any other
    region gets the default language with PSM 6.

    Returns:
        Mapping of region type -> list of word dicts.
    """
    skip_types = frozenset((
        'header', 'footer', 'margin_top',
        'margin_bottom', 'margin_left', 'margin_right',
    ))

    # Per-region-type OCR settings; anything not listed falls back to
    # the generic mixed-language block pass.
    special_settings = {
        'column_en': {'lang': 'eng', 'psm': 4},
        'column_de': {'lang': 'deu', 'psm': 4},
        'column_example': {'lang': lang, 'psm': 6,
                           'fallback_psm': 7, 'min_confidence': 40.0},
    }
    default_settings = {'lang': lang, 'psm': 6}

    by_region: Dict[str, List[Dict]] = {}
    for reg in regions:
        if reg.type in skip_types:
            continue

        settings = special_settings.get(reg.type, default_settings)
        extracted = ocr_region(ocr_img, reg, **settings)

        by_region[reg.type] = extracted
        logger.info(f"OCR {reg.type}: {len(extracted)} words")

    return by_region
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Stage 7: Line Alignment -> Vocabulary Entries
|
||||
# =============================================================================
|
||||
|
||||
def match_lines_to_vocab(ocr_results: Dict[str, List[Dict]],
                         regions: List[PageRegion],
                         y_tolerance_px: int = 25) -> List[VocabRow]:
    """Align OCR results from different columns into vocabulary rows.

    Each English-column line anchors one row; the German and example
    columns are matched to it by nearest vertical line centre within
    ``y_tolerance_px``. Example lines left unmatched are then treated as
    soft-wrapped continuations of the closest row above them.

    Args:
        ocr_results: Mapping of region type -> word dicts from the OCR stage.
        regions: Detected page regions (currently unused here; kept for
            interface stability with the orchestrator).
        y_tolerance_px: Max vertical distance in pixels for lines from two
            columns to be considered the same row.

    Returns:
        VocabRow list sorted by vertical position on the page.
    """
    if 'column_en' not in ocr_results and 'column_de' not in ocr_results:
        logger.info("match_lines_to_vocab: no column_en/column_de in OCR results, returning empty")
        return []

    # Cluster each column's words into lines by vertical proximity.
    en_lines = _group_words_into_lines(ocr_results.get('column_en', []), y_tolerance_px)
    de_lines = _group_words_into_lines(ocr_results.get('column_de', []), y_tolerance_px)
    ex_lines = _group_words_into_lines(ocr_results.get('column_example', []), y_tolerance_px)

    # Small helpers over a line (list of word dicts).
    # NOTE(review): line_y_center assumes non-empty lines — presumably
    # guaranteed by _group_words_into_lines; confirm if that changes.
    def line_y_center(line: List[Dict]) -> float:
        return sum(w['top'] + w['height'] / 2 for w in line) / len(line)

    def line_text(line: List[Dict]) -> str:
        return ' '.join(w['text'] for w in line)

    def line_confidence(line: List[Dict]) -> float:
        return sum(w['conf'] for w in line) / len(line) if line else 0

    vocab_rows: List[VocabRow] = []

    # Pass 1: one row per English line; attach nearest DE/example lines.
    for en_line in en_lines:
        en_y = line_y_center(en_line)
        en_text = line_text(en_line)
        en_conf = line_confidence(en_line)

        # Skip fragments too short to be a vocabulary entry.
        if len(en_text.strip()) < 2:
            continue

        # Nearest German line within tolerance.
        de_text = ""
        de_conf = 0.0
        best_de_dist = float('inf')
        best_de_idx = -1
        for idx, de_line in enumerate(de_lines):
            dist = abs(line_y_center(de_line) - en_y)
            if dist < y_tolerance_px and dist < best_de_dist:
                best_de_dist = dist
                best_de_idx = idx

        if best_de_idx >= 0:
            de_text = line_text(de_lines[best_de_idx])
            de_conf = line_confidence(de_lines[best_de_idx])

        # Nearest example line within tolerance.
        ex_text = ""
        ex_conf = 0.0
        best_ex_dist = float('inf')
        best_ex_idx = -1
        for idx, ex_line in enumerate(ex_lines):
            dist = abs(line_y_center(ex_line) - en_y)
            if dist < y_tolerance_px and dist < best_ex_dist:
                best_ex_dist = dist
                best_ex_idx = idx

        if best_ex_idx >= 0:
            ex_text = line_text(ex_lines[best_ex_idx])
            ex_conf = line_confidence(ex_lines[best_ex_idx])

        # Row confidence = mean over the columns that actually matched.
        avg_conf = en_conf
        conf_count = 1
        if de_conf > 0:
            avg_conf += de_conf
            conf_count += 1
        if ex_conf > 0:
            avg_conf += ex_conf
            conf_count += 1

        vocab_rows.append(VocabRow(
            english=en_text.strip(),
            german=de_text.strip(),
            example=ex_text.strip(),
            confidence=avg_conf / conf_count,
            y_position=int(en_y),
        ))

    # Handle multi-line wrapping in example column
    # Pass 2: example lines that matched no row are treated as wrapped
    # continuations of the closest row above them.
    matched_ex_ys = set()
    for row in vocab_rows:
        if row.example:
            matched_ex_ys.add(row.y_position)

    for ex_line in ex_lines:
        ex_y = line_y_center(ex_line)
        already_matched = any(abs(ex_y - y) < y_tolerance_px for y in matched_ex_ys)
        if already_matched:
            continue

        # Closest row strictly above, within 3x tolerance (a wrapped line
        # can sit noticeably lower than its row anchor).
        best_row = None
        best_dist = float('inf')
        for row in vocab_rows:
            dist = ex_y - row.y_position
            if 0 < dist < y_tolerance_px * 3 and dist < best_dist:
                best_dist = dist
                best_row = row

        if best_row:
            continuation = line_text(ex_line).strip()
            if continuation:
                best_row.example = (best_row.example + " " + continuation).strip()

    vocab_rows.sort(key=lambda r: r.y_position)

    return vocab_rows
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Stage 8: Optional LLM Post-Correction
|
||||
# =============================================================================
|
||||
|
||||
async def llm_post_correct(img: np.ndarray, vocab_rows: List[VocabRow],
                           confidence_threshold: float = 50.0,
                           enabled: bool = False) -> List[VocabRow]:
    """Optionally send low-confidence regions to Qwen-VL for correction.

    Currently a stub: even when *enabled*, it only logs that the step was
    skipped and returns the rows unchanged.

    Args:
        img: Full-resolution page image (reserved for the future crop/LLM step).
        vocab_rows: Aligned vocabulary rows from Stage 7.
        confidence_threshold: Rows below this confidence would be sent to the
            LLM once implemented (reserved, currently unused).
        enabled: Feature flag; when False this stage is a pure no-op.

    Returns:
        The input rows, unmodified.
    """
    if not enabled:
        return vocab_rows

    # Fixed F541: this log message has no placeholders, so it must not be
    # an f-string.
    logger.info("LLM post-correction skipped (not yet implemented)")
    return vocab_rows
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Orchestrator
|
||||
# =============================================================================
|
||||
|
||||
async def run_cv_pipeline(
    pdf_data: Optional[bytes] = None,
    image_data: Optional[bytes] = None,
    page_number: int = 0,
    zoom: float = 3.0,
    enable_dewarp: bool = True,
    enable_llm_correction: bool = False,
    lang: str = "eng+deu",
) -> PipelineResult:
    """Run the complete CV document reconstruction pipeline.

    Stages: 1 render (PDF page or image), 2 deskew, 3 optional dewarp,
    4 dual image prep (OCR + layout variants), 5 layout analysis,
    6 multi-pass OCR, 7 line alignment into vocab rows, 8 optional LLM
    post-correction. Per-stage timings are recorded in ``result.stages``.

    Args:
        pdf_data: Raw PDF bytes; takes precedence over *image_data*.
        image_data: Raw image bytes (used when *pdf_data* is not given).
        page_number: Zero-based PDF page to render.
        zoom: Render zoom factor for the PDF rasterization.
        enable_dewarp: Whether to run the Stage 3 dewarp step.
        enable_llm_correction: Whether to run the Stage 8 LLM step.
        lang: Tesseract language string for mixed-language regions.

    Returns:
        PipelineResult with vocabulary entries, timings, and dimensions;
        on failure, ``result.error`` is set instead of raising.
    """
    if not CV_PIPELINE_AVAILABLE:
        return PipelineResult(error="CV pipeline not available (OpenCV or Tesseract missing)")

    result = PipelineResult()
    total_start = time.time()

    try:
        # Stage 1: Render
        t = time.time()
        if pdf_data:
            img = render_pdf_high_res(pdf_data, page_number, zoom)
        elif image_data:
            img = render_image_high_res(image_data)
        else:
            return PipelineResult(error="No input data (pdf_data or image_data required)")
        result.stages['render'] = round(time.time() - t, 2)
        result.image_width = img.shape[1]
        result.image_height = img.shape[0]
        logger.info(f"Stage 1 (render): {img.shape[1]}x{img.shape[0]} in {result.stages['render']}s")

        # Stage 2: Deskew
        t = time.time()
        img, angle = deskew_image(img)
        result.stages['deskew'] = round(time.time() - t, 2)
        logger.info(f"Stage 2 (deskew): {angle:.2f}\u00b0 in {result.stages['deskew']}s")

        # Stage 3: Dewarp (optional; skipped entirely when disabled)
        if enable_dewarp:
            t = time.time()
            img, _dewarp_info = dewarp_image(img)
            result.stages['dewarp'] = round(time.time() - t, 2)

        # Stage 4: Dual image preparation (binarized OCR image + layout image)
        t = time.time()
        ocr_img = create_ocr_image(img)
        layout_img = create_layout_image(img)
        result.stages['image_prep'] = round(time.time() - t, 2)

        # Stage 5: Layout analysis
        t = time.time()
        regions = analyze_layout(layout_img, ocr_img)
        result.stages['layout'] = round(time.time() - t, 2)
        result.columns_detected = len([r for r in regions if r.type.startswith('column')])
        logger.info(f"Stage 5 (layout): {result.columns_detected} columns in {result.stages['layout']}s")

        # Stage 6: Multi-pass OCR
        t = time.time()
        ocr_results = run_multi_pass_ocr(ocr_img, regions, lang)
        result.stages['ocr'] = round(time.time() - t, 2)
        total_words = sum(len(w) for w in ocr_results.values())
        result.word_count = total_words
        logger.info(f"Stage 6 (OCR): {total_words} words in {result.stages['ocr']}s")

        # Stage 7: Line alignment
        t = time.time()
        vocab_rows = match_lines_to_vocab(ocr_results, regions)
        result.stages['alignment'] = round(time.time() - t, 2)

        # Stage 8: Optional LLM correction
        if enable_llm_correction:
            t = time.time()
            vocab_rows = await llm_post_correct(img, vocab_rows)
            result.stages['llm_correction'] = round(time.time() - t, 2)

        # Convert to output format, dropping rows with neither side.
        result.vocabulary = [
            {
                "english": row.english,
                "german": row.german,
                "example": row.example,
                "confidence": round(row.confidence, 1),
            }
            for row in vocab_rows
            if row.english or row.german
        ]

        result.duration_seconds = round(time.time() - total_start, 2)
        logger.info(f"CV Pipeline complete: {len(result.vocabulary)} entries in {result.duration_seconds}s")

    except Exception as e:
        # Top-level boundary: report the failure in the result rather than
        # propagating to the caller.
        logger.error(f"CV Pipeline error: {e}")
        import traceback
        logger.debug(traceback.format_exc())
        result.error = str(e)
        result.duration_seconds = round(time.time() - total_start, 2)

    return result
|
||||
Reference in New Issue
Block a user