fix(columns): filter phantom narrow columns + rename step to OCR-Zeichenkorrektur
Phantom column fix:
Adjacent tiny gaps (e.g. 11px + 35px) can create very narrow columns (< 3% of content width) with 0 words. These are scan artefacts, not real columns.

New Step 9 in detect_column_geometry():
- Filter out columns where width < max(20px, 3% of content_w) AND words < 3
- After filtering, extend each remaining column to close the gap with its right neighbor, and re-assign words to the correct column

Example from logs: 5 columns → 4 columns (phantom at x=710, width=36px eliminated; neighbors expanded to cover the gap)

UI rename:
- 'Schritt 6: LLM-Korrektur' → 'Schritt 6: OCR-Zeichenkorrektur'
- 'LLM-Korrektur starten' → 'Zeichenkorrektur starten'
- Error message updated accordingly

(No LLM is involved anymore — the spell-checker is the active engine.)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -342,7 +342,7 @@ export function StepLlmReview({ sessionId, onNext }: StepLlmReviewProps) {
|
||||
return (
|
||||
<div className="flex flex-col items-center justify-center py-12 text-center">
|
||||
<div className="text-5xl mb-4">⚠️</div>
|
||||
<h3 className="text-lg font-medium text-red-600 dark:text-red-400 mb-2">Fehler bei LLM-Korrektur</h3>
|
||||
<h3 className="text-lg font-medium text-red-600 dark:text-red-400 mb-2">Fehler bei OCR-Zeichenkorrektur</h3>
|
||||
<p className="text-sm text-gray-500 dark:text-gray-400 max-w-lg mb-4">{error}</p>
|
||||
<div className="flex gap-3">
|
||||
<button onClick={() => { setError(''); loadSessionData() }}
|
||||
@@ -387,7 +387,7 @@ export function StepLlmReview({ sessionId, onNext }: StepLlmReviewProps) {
|
||||
<div className="flex items-center justify-between">
|
||||
<div>
|
||||
<h3 className="text-base font-medium text-gray-700 dark:text-gray-300">
|
||||
Schritt 6: LLM-Korrektur
|
||||
Schritt 6: OCR-Zeichenkorrektur
|
||||
</h3>
|
||||
<p className="text-xs text-gray-400 mt-0.5">
|
||||
{status === 'ready' && `${vocabEntries.length} Eintraege bereit zur Pruefung`}
|
||||
@@ -405,7 +405,7 @@ export function StepLlmReview({ sessionId, onNext }: StepLlmReviewProps) {
|
||||
{status === 'ready' && (
|
||||
<button onClick={runReview}
|
||||
className="px-5 py-2 bg-teal-600 text-white rounded-lg hover:bg-teal-700 transition-colors text-sm font-medium">
|
||||
LLM-Korrektur starten
|
||||
Zeichenkorrektur starten
|
||||
</button>
|
||||
)}
|
||||
{status === 'running' && (
|
||||
|
||||
@@ -1680,6 +1680,33 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
|
||||
logger.info(f"ColumnGeometry: {len(geometries)} columns: "
|
||||
f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")
|
||||
|
||||
# --- Step 9: Filter phantom narrow columns ---
|
||||
# Tiny spurious gaps (e.g. 11px + 35px adjacent) can create very narrow
|
||||
# columns (< 3% of content width) with fewer than 3 words. These are not
|
||||
# real columns — remove them and close the gap between neighbors.
|
||||
min_real_col_w = max(20, int(content_w * 0.03))
|
||||
filtered_geoms = [g for g in geometries
|
||||
if not (g.word_count < 3 and g.width < min_real_col_w)]
|
||||
if len(filtered_geoms) < len(geometries):
|
||||
n_removed = len(geometries) - len(filtered_geoms)
|
||||
logger.info(f"ColumnGeometry: removed {n_removed} phantom column(s) "
|
||||
f"(width < {min_real_col_w}px and words < 3)")
|
||||
# Extend each remaining column to close gaps with its right neighbor
|
||||
for i, g in enumerate(filtered_geoms):
|
||||
if i + 1 < len(filtered_geoms):
|
||||
g.width = filtered_geoms[i + 1].x - g.x
|
||||
else:
|
||||
g.width = right_x - g.x
|
||||
g.index = i
|
||||
col_left_rel = g.x - left_x
|
||||
col_right_rel = col_left_rel + g.width
|
||||
g.words = [w for w in word_dicts
|
||||
if col_left_rel <= w['left'] < col_right_rel]
|
||||
g.word_count = len(g.words)
|
||||
geometries = filtered_geoms
|
||||
logger.info(f"ColumnGeometry: {len(geometries)} columns after phantom filter: "
|
||||
f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")
|
||||
|
||||
return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user