feat: Full-Row OCR mit Spacing fuer Box-Sub-Sessions
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 40s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m16s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 22s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 40s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m16s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 22s
Sub-Sessions ueberspringen Spaltenerkennung und nutzen stattdessen eine Pseudo-Spalte ueber die volle Breite. Text wird mit proportionalem Spacing aus Wort-Positionen rekonstruiert, um raeumliches Layout zu erhalten.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -45,6 +45,7 @@ export function StepColumnDetection({ sessionId, onNext, onBoxSessionsCreated }:
|
|||||||
const [savedGtColumns, setSavedGtColumns] = useState<PageRegion[] | null>(null)
|
const [savedGtColumns, setSavedGtColumns] = useState<PageRegion[] | null>(null)
|
||||||
const [creatingBoxSessions, setCreatingBoxSessions] = useState(false)
|
const [creatingBoxSessions, setCreatingBoxSessions] = useState(false)
|
||||||
const [existingSubSessions, setExistingSubSessions] = useState<SubSession[] | null>(null)
|
const [existingSubSessions, setExistingSubSessions] = useState<SubSession[] | null>(null)
|
||||||
|
const [isSubSession, setIsSubSession] = useState(false)
|
||||||
|
|
||||||
// Fetch session info (image dimensions) + check for cached column result
|
// Fetch session info (image dimensions) + check for cached column result
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
@@ -58,14 +59,31 @@ export function StepColumnDetection({ sessionId, onNext, onBoxSessionsCreated }:
|
|||||||
if (info.image_width && info.image_height) {
|
if (info.image_width && info.image_height) {
|
||||||
setImageDimensions({ width: info.image_width, height: info.image_height })
|
setImageDimensions({ width: info.image_width, height: info.image_height })
|
||||||
}
|
}
|
||||||
|
const isSub = !!info.parent_session_id
|
||||||
|
setIsSubSession(isSub)
|
||||||
if (info.sub_sessions && info.sub_sessions.length > 0) {
|
if (info.sub_sessions && info.sub_sessions.length > 0) {
|
||||||
setExistingSubSessions(info.sub_sessions)
|
setExistingSubSessions(info.sub_sessions)
|
||||||
onBoxSessionsCreated?.(info.sub_sessions)
|
onBoxSessionsCreated?.(info.sub_sessions)
|
||||||
}
|
}
|
||||||
if (info.column_result) {
|
if (info.column_result) {
|
||||||
setColumnResult(info.column_result)
|
setColumnResult(info.column_result)
|
||||||
|
// Sub-session with pseudo-column already set → auto-advance
|
||||||
|
if (isSub) {
|
||||||
|
onNext()
|
||||||
|
return
|
||||||
|
}
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
// Sub-session without columns → auto-detect (creates pseudo-column)
|
||||||
|
if (isSub) {
|
||||||
|
const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/columns`, { method: 'POST' })
|
||||||
|
if (res.ok) {
|
||||||
|
const data: ColumnResult = await res.json()
|
||||||
|
setColumnResult(data)
|
||||||
|
onNext()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.error('Failed to fetch session info:', e)
|
console.error('Failed to fetch session info:', e)
|
||||||
|
|||||||
@@ -24,6 +24,7 @@ from cv_ocr_engines import (
|
|||||||
_fix_phonetic_brackets,
|
_fix_phonetic_brackets,
|
||||||
_split_comma_entries,
|
_split_comma_entries,
|
||||||
_words_to_reading_order_text,
|
_words_to_reading_order_text,
|
||||||
|
_words_to_spaced_text,
|
||||||
ocr_region_lighton,
|
ocr_region_lighton,
|
||||||
ocr_region_rapid,
|
ocr_region_rapid,
|
||||||
ocr_region_trocr,
|
ocr_region_trocr,
|
||||||
@@ -371,7 +372,16 @@ def build_cell_grid_v2(
|
|||||||
|
|
||||||
if words:
|
if words:
|
||||||
y_tol = max(15, row.height)
|
y_tol = max(15, row.height)
|
||||||
text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
|
# Single full-width column (box sub-session): preserve spacing
|
||||||
|
is_single_full_column = (
|
||||||
|
len(relevant_cols) == 1
|
||||||
|
and img_w > 0
|
||||||
|
and relevant_cols[0].width / img_w > 0.9
|
||||||
|
)
|
||||||
|
if is_single_full_column:
|
||||||
|
text = _words_to_spaced_text(words, y_tolerance_px=y_tol)
|
||||||
|
else:
|
||||||
|
text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
|
||||||
avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
|
avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
|
||||||
else:
|
else:
|
||||||
text = ''
|
text = ''
|
||||||
|
|||||||
@@ -124,6 +124,40 @@ def _words_to_reading_order_text(words: List[Dict], y_tolerance_px: int = 15) ->
|
|||||||
return '\n'.join(lines)
|
return '\n'.join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def _words_to_spaced_text(words: List[Dict], y_tolerance_px: int = 15) -> str:
    """Join OCR words preserving proportional horizontal spacing.

    Instead of single spaces between words, inserts multiple spaces based on
    the pixel gap between words relative to the average character width of
    the line. Useful for box sub-sessions where spatial layout matters.

    Args:
        words: OCR word dicts. Each word that carries text is read through
            its 'text', 'left' and 'width' keys.
        y_tolerance_px: Forwarded unchanged to ``_group_words_into_lines``
            (presumably the vertical tolerance for putting two words on the
            same line -- confirm against that helper).

    Returns:
        One output line per group of words, joined with newlines. Words
        without text are ignored, and a group with no text-bearing words
        produces no output line.
    """
    # Used when a line gives no usable width estimate (e.g. all widths 0).
    fallback_char_width = 10

    result_lines = []
    for line_words in _group_words_into_lines(words, y_tolerance_px=y_tolerance_px):
        # Words with missing/empty text would only contribute extra padding
        # (or break the join when the text is None), so drop them before
        # measuring gaps between neighbours.
        text_words = sorted(
            (w for w in line_words if w.get('text')),
            key=lambda w: w['left'],
        )
        if not text_words:
            continue

        # Average character width across the line; total_chars is > 0 here
        # because every remaining word has non-empty text.
        total_chars = sum(len(w['text']) for w in text_words)
        total_width = sum(w['width'] for w in text_words)
        if total_width > 0:
            avg_char_width = total_width / total_chars
        else:
            # Guard against division by zero (or a negative divisor) below.
            avg_char_width = fallback_char_width

        parts = [text_words[0]['text']]
        for prev_word, next_word in zip(text_words, text_words[1:]):
            gap_px = next_word['left'] - (prev_word['left'] + prev_word['width'])
            # Always keep at least one space, even for touching or
            # overlapping boxes (gap_px <= 0).
            num_spaces = max(1, round(gap_px / avg_char_width))
            parts.append(' ' * num_spaces)
            parts.append(next_word['text'])

        result_lines.append(''.join(parts))

    return '\n'.join(result_lines)
|
||||||
|
|
||||||
|
|
||||||
# --- RapidOCR integration (PaddleOCR models on ONNX Runtime) ---
|
# --- RapidOCR integration (PaddleOCR models on ONNX Runtime) ---
|
||||||
|
|
||||||
_rapid_engine = None
|
_rapid_engine = None
|
||||||
|
|||||||
@@ -1209,6 +1209,34 @@ async def detect_columns(session_id: str):
|
|||||||
if img_bgr is None:
|
if img_bgr is None:
|
||||||
raise HTTPException(status_code=400, detail="Crop or dewarp must be completed before column detection")
|
raise HTTPException(status_code=400, detail="Crop or dewarp must be completed before column detection")
|
||||||
|
|
||||||
|
# Sub-sessions: skip column detection, create single pseudo-column
|
||||||
|
session = await get_session_db(session_id)
|
||||||
|
if session and session.get("parent_session_id"):
|
||||||
|
h, w = img_bgr.shape[:2]
|
||||||
|
column_result = {
|
||||||
|
"columns": [{
|
||||||
|
"type": "column_text",
|
||||||
|
"x": 0, "y": 0,
|
||||||
|
"width": w, "height": h,
|
||||||
|
}],
|
||||||
|
"zones": None,
|
||||||
|
"boxes_detected": 0,
|
||||||
|
"duration_seconds": 0,
|
||||||
|
"method": "sub_session_pseudo_column",
|
||||||
|
}
|
||||||
|
await update_session_db(
|
||||||
|
session_id,
|
||||||
|
column_result=column_result,
|
||||||
|
row_result=None,
|
||||||
|
word_result=None,
|
||||||
|
current_step=6,
|
||||||
|
)
|
||||||
|
cached["column_result"] = column_result
|
||||||
|
cached.pop("row_result", None)
|
||||||
|
cached.pop("word_result", None)
|
||||||
|
logger.info(f"OCR Pipeline: sub-session {session_id}: pseudo-column {w}x{h}px")
|
||||||
|
return {"session_id": session_id, **column_result}
|
||||||
|
|
||||||
t0 = time.time()
|
t0 = time.time()
|
||||||
|
|
||||||
# Binarized image for layout analysis
|
# Binarized image for layout analysis
|
||||||
|
|||||||
Reference in New Issue
Block a user