Flexible inhaltsbasierte Spaltenerkennung (2-Phasen)

Ersetzt hardcodierte Positionsregeln durch ein zweistufiges System:
Phase A erkennt Spaltengeometrie (Clustering), Phase B klassifiziert
Typen per Inhalt (Sprache/Rolle) mit 3-stufiger Fallback-Kette.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-02-26 23:33:35 +01:00
parent cf27a95308
commit 1393a994f9
4 changed files with 595 additions and 78 deletions

View File

@@ -64,11 +64,14 @@ export interface DewarpGroundTruth {
}
/** A detected page region as returned by the column-detection backend. */
export interface PageRegion {
  // Region type; 'column_text' marks a plain single-column text page.
  type: 'column_en' | 'column_de' | 'column_example' | 'page_ref'
    | 'column_marker' | 'column_text' | 'header' | 'footer'
  x: number
  y: number
  width: number
  height: number
  /** 0.0–1.0 classifier confidence; optional for legacy results. */
  classification_confidence?: number
  /** 'content' | 'position_enhanced' | 'position_fallback' — which level classified this region. */
  classification_method?: string
}
export interface ColumnResult {

View File

@@ -15,6 +15,7 @@ const TYPE_COLORS: Record<string, string> = {
column_en: 'bg-blue-100 text-blue-700 dark:bg-blue-900/30 dark:text-blue-400',
column_de: 'bg-green-100 text-green-700 dark:bg-green-900/30 dark:text-green-400',
column_example: 'bg-orange-100 text-orange-700 dark:bg-orange-900/30 dark:text-orange-400',
column_text: 'bg-cyan-100 text-cyan-700 dark:bg-cyan-900/30 dark:text-cyan-400',
page_ref: 'bg-purple-100 text-purple-700 dark:bg-purple-900/30 dark:text-purple-400',
column_marker: 'bg-red-100 text-red-700 dark:bg-red-900/30 dark:text-red-400',
header: 'bg-gray-100 text-gray-600 dark:bg-gray-700/50 dark:text-gray-400',
@@ -25,12 +26,19 @@ const TYPE_LABELS: Record<string, string> = {
column_en: 'EN',
column_de: 'DE',
column_example: 'Beispiel',
column_text: 'Text',
page_ref: 'Seite',
column_marker: 'Marker',
header: 'Header',
footer: 'Footer',
}
// Maps the backend's classification_method identifiers to the short
// (German) labels rendered next to each detected column.
const METHOD_LABELS: Record<string, string> = {
  content: 'Inhalt',
  position_enhanced: 'Position',
  position_fallback: 'Fallback',
}
export function ColumnControls({ columnResult, onRerun, onGroundTruth, onNext, isDetecting }: ColumnControlsProps) {
const [gtSaved, setGtSaved] = useState(false)
@@ -70,6 +78,16 @@ export function ColumnControls({ columnResult, onRerun, onGroundTruth, onNext, i
<span className={`px-2 py-0.5 rounded text-xs font-medium ${TYPE_COLORS[col.type] || ''}`}>
{TYPE_LABELS[col.type] || col.type}
</span>
{col.classification_confidence != null && col.classification_confidence < 1.0 && (
<span className="text-xs font-medium text-gray-600 dark:text-gray-300">
{Math.round(col.classification_confidence * 100)}%
</span>
)}
{col.classification_method && (
<span className="text-xs text-gray-400 dark:text-gray-500">
({METHOD_LABELS[col.classification_method] || col.classification_method})
</span>
)}
<span className="text-gray-500 dark:text-gray-400 text-xs font-mono">
x={col.x} y={col.y} {col.width}x{col.height}px
</span>

View File

@@ -48,16 +48,46 @@ except ImportError:
CV_PIPELINE_AVAILABLE = CV2_AVAILABLE and TESSERACT_AVAILABLE
# --- Language Detection Constants ---
# High-frequency German function words used as anchors for content-based
# language scoring. Umlauts are transliterated ('fuer', 'ueber') to match
# ASCII-normalized OCR output.
GERMAN_FUNCTION_WORDS = set(
    "der die das und ist ein eine nicht "
    "von zu mit auf fuer den dem sich auch wird "
    "nach bei aus wie oder wenn noch aber hat nur "
    "ueber kann als ich er sie es wir ihr haben "
    "sein werden war sind muss soll dieser diese diesem".split()
)

# High-frequency English function words used as anchors for language scoring.
ENGLISH_FUNCTION_WORDS = set(
    "the a an is are was were to of "
    "and in that it for on with as at by from "
    "or but not be have has had do does did will "
    "would can could should may might this they you he "
    "she we my your his her its our their which".split()
)
# --- Data Classes ---
@dataclass
class PageRegion:
    """A detected region on the page."""
    # One of: 'column_en', 'column_de', 'column_example', 'page_ref',
    # 'column_marker', 'column_text', 'header', 'footer'.
    type: str
    x: int
    y: int
    width: int
    height: int
    classification_confidence: float = 1.0  # 0.0-1.0
    classification_method: str = ""  # 'content', 'position_enhanced', 'position_fallback'
@dataclass
class ColumnGeometry:
    """Geometrically detected column, prior to type classification."""
    index: int  # 0-based, left-to-right
    x: int
    y: int
    width: int
    height: int
    word_count: int
    words: List[Dict]  # word dicts from Tesseract (text, conf, left, top, ...)
    width_ratio: float  # width / content_width (0.0-1.0)
@dataclass
@@ -840,22 +870,24 @@ def analyze_layout(layout_img: np.ndarray, ocr_img: np.ndarray) -> List[PageRegi
# =============================================================================
# Stage 5b: Word-Based Layout Analysis (5-Column Detection)
# Stage 5b: Word-Based Layout Analysis (Two-Phase Column Detection)
# =============================================================================
def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> List[PageRegion]:
"""Detect columns by clustering left-aligned word positions from Tesseract.
# --- Phase A: Geometry Detection ---
This approach works better than projection profiles for vocabulary pages
with 5 columns (page_ref, EN, DE, markers, examples) because it detects
column starts where left-aligned words cluster.
def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int]]:
"""Detect column geometry by clustering left-aligned word positions.
Phase A of the two-phase column detection. Returns untyped column
geometries with their words for subsequent content-based classification.
Args:
ocr_img: Binarized grayscale image for layout analysis.
dewarped_bgr: Original BGR image (for Tesseract word detection).
Returns:
List of PageRegion objects. Falls back to analyze_layout() if < 3 clusters.
Tuple of (geometries, left_x, right_x, top_y, bottom_y) or None if
fewer than 3 clusters are found (signals fallback needed).
"""
h, w = ocr_img.shape[:2]
@@ -870,7 +902,7 @@ def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Li
top_y, bottom_y = 0, h
content_w, content_h = w, h
logger.info(f"LayoutByWords: content bounds x=[{left_x}..{right_x}] ({content_w}px), "
logger.info(f"ColumnGeometry: content bounds x=[{left_x}..{right_x}] ({content_w}px), "
f"y=[{top_y}..{bottom_y}] ({content_h}px)")
# --- Get word bounding boxes from Tesseract ---
@@ -880,13 +912,12 @@ def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Li
try:
data = pytesseract.image_to_data(pil_img, lang='eng+deu', output_type=pytesseract.Output.DICT)
except Exception as e:
logger.warning(f"LayoutByWords: Tesseract image_to_data failed: {e}, falling back")
layout_img = create_layout_image(dewarped_bgr)
return analyze_layout(layout_img, ocr_img)
logger.warning(f"ColumnGeometry: Tesseract image_to_data failed: {e}")
return None
# Collect left edges of recognized words (confidence > 30)
# Collect words with their full info
word_dicts = []
left_edges = []
word_info = [] # (left, top, width, height, text, conf)
n_words = len(data['text'])
for i in range(n_words):
conf = int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1
@@ -898,20 +929,22 @@ def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Li
bw = int(data['width'][i])
bh = int(data['height'][i])
left_edges.append(lx)
word_info.append((lx, ty, bw, bh, text, conf))
word_dicts.append({
'text': text, 'conf': conf,
'left': lx, 'top': ty, 'width': bw, 'height': bh,
})
if len(left_edges) < 5:
logger.warning(f"LayoutByWords: only {len(left_edges)} words detected, falling back")
layout_img = create_layout_image(dewarped_bgr)
return analyze_layout(layout_img, ocr_img)
logger.warning(f"ColumnGeometry: only {len(left_edges)} words detected")
return None
logger.info(f"LayoutByWords: {len(left_edges)} words detected in content area")
logger.info(f"ColumnGeometry: {len(left_edges)} words detected in content area")
# --- Cluster left edges ---
tolerance = max(10, int(content_w * 0.01)) # ~1% of content width
tolerance = max(10, int(content_w * 0.01))
sorted_edges = sorted(left_edges)
clusters = [] # list of (center_x, count, edges)
clusters = []
current_cluster = [sorted_edges[0]]
for edge in sorted_edges[1:]:
if edge - current_cluster[-1] <= tolerance:
@@ -925,20 +958,18 @@ def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Li
significant = [(int(np.mean(c)), len(c), min(c), max(c)) for c in clusters if len(c) >= 2]
significant.sort(key=lambda s: s[0])
logger.info(f"LayoutByWords: {len(significant)} significant clusters "
logger.info(f"ColumnGeometry: {len(significant)} significant clusters "
f"(from {len(clusters)} total): "
f"{[(s[0]+left_x, s[1]) for s in significant[:10]]}")
if len(significant) < 3:
logger.info("LayoutByWords: < 3 clusters, falling back to projection-based layout")
layout_img = create_layout_image(dewarped_bgr)
return analyze_layout(layout_img, ocr_img)
logger.info("ColumnGeometry: < 3 clusters, signaling fallback")
return None
# --- Merge clusters that are very close (within 2*tolerance) ---
merged = [significant[0]]
for s in significant[1:]:
if s[0] - merged[-1][0] < 2 * tolerance:
# Merge: weighted average position, sum counts
prev = merged[-1]
total = prev[1] + s[1]
avg_x = (prev[0] * prev[1] + s[0] * s[1]) // total
@@ -946,114 +977,562 @@ def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Li
else:
merged.append(s)
logger.info(f"LayoutByWords: {len(merged)} clusters after merging: "
logger.info(f"ColumnGeometry: {len(merged)} clusters after merging: "
f"{[(m[0]+left_x, m[1]) for m in merged]}")
if len(merged) < 3:
logger.info("LayoutByWords: < 3 merged clusters, falling back")
layout_img = create_layout_image(dewarped_bgr)
return analyze_layout(layout_img, ocr_img)
logger.info("ColumnGeometry: < 3 merged clusters, signaling fallback")
return None
# --- Derive column boundaries ---
# 2mm margin before each cluster start (~8px at 100dpi, scale with image)
margin_px = max(5, int(content_w * 0.005))
col_starts = [] # (abs_x, word_count)
col_starts = []
for center_x, count, min_edge, max_edge in merged:
abs_start = max(0, left_x + min_edge - margin_px)
col_starts.append((abs_start, count))
# Calculate column widths
col_defs = [] # (abs_x, width, word_count)
# Calculate column widths and assign words to columns
geometries = []
for i, (start_x, count) in enumerate(col_starts):
if i + 1 < len(col_starts):
col_width = col_starts[i + 1][0] - start_x
else:
col_width = right_x - start_x
col_defs.append((start_x, col_width, count))
logger.info(f"LayoutByWords: column definitions: "
f"{[(d[0], d[1], d[2]) for d in col_defs]}")
# Assign words to this column based on left edge
col_left_rel = start_x - left_x
col_right_rel = col_left_rel + col_width
col_words = [w for w in word_dicts
if col_left_rel <= w['left'] < col_right_rel]
# --- Assign types based on rules ---
geometries.append(ColumnGeometry(
index=i,
x=start_x,
y=top_y,
width=col_width,
height=content_h,
word_count=len(col_words),
words=col_words,
width_ratio=col_width / content_w if content_w > 0 else 0.0,
))
logger.info(f"ColumnGeometry: {len(geometries)} columns: "
f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")
return (geometries, left_x, right_x, top_y, bottom_y)
# --- Phase B: Content-Based Classification ---
def _score_language(words: List[Dict]) -> Dict[str, float]:
"""Score the language of a column's words.
Analyzes function words, umlauts, and capitalization patterns
to determine whether text is English or German.
Args:
words: List of word dicts with 'text' and 'conf' keys.
Returns:
Dict with 'eng' and 'deu' scores (0.0-1.0).
"""
if not words:
return {'eng': 0.0, 'deu': 0.0}
# Only consider words with decent confidence
good_words = [w['text'].lower() for w in words if w.get('conf', 0) > 40 and len(w['text']) > 0]
if not good_words:
return {'eng': 0.0, 'deu': 0.0}
total = len(good_words)
en_hits = sum(1 for w in good_words if w in ENGLISH_FUNCTION_WORDS)
de_hits = sum(1 for w in good_words if w in GERMAN_FUNCTION_WORDS)
# Check for umlauts (strong German signal)
raw_texts = [w['text'] for w in words if w.get('conf', 0) > 40]
umlaut_count = sum(1 for t in raw_texts
for c in t if c in 'äöüÄÖÜß')
# German capitalization: nouns are capitalized mid-sentence
# Count words that start with uppercase but aren't at position 0
cap_words = sum(1 for t in raw_texts if t[0].isupper() and len(t) > 2)
en_score = en_hits / total if total > 0 else 0.0
de_score = de_hits / total if total > 0 else 0.0
# Boost German score for umlauts
if umlaut_count > 0:
de_score = min(1.0, de_score + 0.15 * min(umlaut_count, 5))
# Boost German score for high capitalization ratio (typical for German nouns)
if total > 5:
cap_ratio = cap_words / total
if cap_ratio > 0.3:
de_score = min(1.0, de_score + 0.1)
return {'eng': round(en_score, 3), 'deu': round(de_score, 3)}
def _score_role(geom: ColumnGeometry) -> Dict[str, float]:
    """Score the role of a column based on its geometry and content patterns.

    Args:
        geom: ColumnGeometry with words and dimensions.

    Returns:
        Dict with role scores: 'reference', 'marker', 'sentence', 'vocabulary'.
    """
    empty = {'reference': 0.0, 'marker': 0.0, 'sentence': 0.0, 'vocabulary': 0.0}
    if not geom.words:
        return empty
    confident = [w['text'] for w in geom.words if w.get('conf', 0) > 40]
    if not confident:
        return empty
    n = len(confident)
    mean_len = sum(map(len, confident)) / n
    punct_hits = sum(1 for t in confident if any(c in t for c in '.!?;:,'))
    digit_frac = sum(1 for t in confident if any(c.isdigit() for c in t)) / n

    # Reference: narrow column, stronger when mostly numeric (page numbers).
    reference = 0.0
    if geom.width_ratio < 0.12:
        reference = min(1.0, 0.5 + digit_frac * 0.5) if digit_frac > 0.4 else 0.5

    # Marker: very narrow with only a handful of short entries.
    marker = 0.0
    if geom.width_ratio < 0.08 and geom.word_count <= 10:
        marker = 0.9 if mean_len < 4 else 0.7

    # Sentence: wider column containing punctuation (running text).
    sentence = 0.0
    if geom.width_ratio > 0.15 and punct_hits > 2:
        sentence = 0.3 + min(0.5, punct_hits / n)
        if mean_len > 4:
            sentence = min(1.0, sentence + 0.2)

    # Vocabulary: medium width with medium-length words.
    vocabulary = 0.0
    if 0.10 < geom.width_ratio < 0.45:
        vocabulary = 0.4
        if 3 < mean_len < 8:
            vocabulary = min(1.0, vocabulary + 0.3)

    return {'reference': round(reference, 3), 'marker': round(marker, 3),
            'sentence': round(sentence, 3), 'vocabulary': round(vocabulary, 3)}
def classify_column_types(geometries: List[ColumnGeometry],
                          content_w: int,
                          top_y: int,
                          img_w: int,
                          img_h: int,
                          bottom_y: int) -> List[PageRegion]:
    """Classify column types using a 3-level fallback chain.

    Level 1: Content-based (language + role scoring)
    Level 2: Position + language (old rules enhanced with language detection)
    Level 3: Pure position (exact old code, no regression)

    Args:
        geometries: List of ColumnGeometry from Phase A.
        content_w: Total content width.
        top_y: Top Y of content area.
        img_w: Full image width.
        img_h: Full image height.
        bottom_y: Bottom Y of content area.

    Returns:
        List of PageRegion with types, confidence, and method.
    """
    content_h = bottom_y - top_y

    # A single column means a plain text page, not a vocabulary layout.
    if len(geometries) == 1:
        only = geometries[0]
        return [PageRegion(
            type='column_text', x=only.x, y=only.y,
            width=only.width, height=only.height,
            classification_confidence=0.9,
            classification_method='content',
        )]

    # Score every column once; the levels below reuse these.
    langs = [_score_language(g.words) for g in geometries]
    roles = [_score_role(g) for g in geometries]
    logger.info(f"ClassifyColumns: language scores: "
                f"{[(g.index, ls) for g, ls in zip(geometries, langs)]}")
    logger.info(f"ClassifyColumns: role scores: "
                f"{[(g.index, rs) for g, rs in zip(geometries, roles)]}")

    # Walk the fallback chain until one level produces a result.
    regions = _classify_by_content(geometries, langs, roles, content_w, content_h)
    if regions is not None:
        logger.info("ClassifyColumns: Level 1 (content-based) succeeded")
    else:
        regions = _classify_by_position_enhanced(geometries, langs, content_w, content_h)
        if regions is not None:
            logger.info("ClassifyColumns: Level 2 (position+language) succeeded")
        else:
            logger.info("ClassifyColumns: Level 3 (position fallback)")
            regions = _classify_by_position_fallback(geometries, content_w, content_h)

    _add_header_footer(regions, top_y, bottom_y, img_w, img_h)
    return regions
def _classify_by_content(geometries: List[ColumnGeometry],
                         lang_scores: List[Dict[str, float]],
                         role_scores: List[Dict[str, float]],
                         content_w: int,
                         content_h: int) -> Optional[List[PageRegion]]:
    """Level 1: Classify columns purely by content analysis.

    Requires clear language signals to distinguish EN/DE columns.

    Args:
        geometries: Column geometries from Phase A (left to right).
        lang_scores: Per-column language scores from _score_language().
        role_scores: Per-column role scores from _score_role().
        content_w: Total content width (unused here; kept for signature
            parity with the other classification levels).
        content_h: Content height used for region heights.

    Returns:
        List of PageRegion sorted by x, or None if language signals are
        too weak for content-based classification.
    """
    regions = []
    assigned = set()
    # Step 1: Assign structural roles first (reference, marker)
    for i, (geom, rs) in enumerate(zip(geometries, role_scores)):
        if rs['reference'] >= 0.5 and geom.width_ratio < 0.12:
            regions.append(PageRegion(
                type='page_ref', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=rs['reference'],
                classification_method='content',
            ))
            assigned.add(i)
        elif rs['marker'] >= 0.7 and geom.width_ratio < 0.08:
            regions.append(PageRegion(
                type='column_marker', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=rs['marker'],
                classification_method='content',
            ))
            assigned.add(i)
    # Step 2: Among remaining columns, find EN and DE by language scores
    remaining = [(i, geometries[i], lang_scores[i], role_scores[i])
                 for i in range(len(geometries)) if i not in assigned]
    if len(remaining) < 2:
        # Not enough columns for an EN/DE pair; a single leftover column
        # becomes generic text.
        if len(remaining) == 1:
            i, geom, ls, rs = remaining[0]
            regions.append(PageRegion(
                type='column_text', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=0.6,
                classification_method='content',
            ))
        regions.sort(key=lambda r: r.x)
        return regions
    # Require at least a weak, unambiguous language signal on both sides.
    en_candidates = [(i, g, ls) for i, g, ls, rs in remaining
                     if ls['eng'] > ls['deu'] and ls['eng'] > 0.05]
    de_candidates = [(i, g, ls) for i, g, ls, rs in remaining
                     if ls['deu'] > ls['eng'] and ls['deu'] > 0.05]
    if not en_candidates or not de_candidates:
        logger.info("ClassifyColumns: Level 1 failed - no clear EN/DE language split")
        return None
    # Pick the best EN and DE candidates
    best_en = max(en_candidates, key=lambda x: x[2]['eng'])
    best_de = max(de_candidates, key=lambda x: x[2]['deu'])
    if best_en[0] == best_de[0]:
        # Same column scored highest for both — ambiguous
        logger.info("ClassifyColumns: Level 1 failed - same column highest for EN and DE")
        return None
    regions.append(PageRegion(
        type='column_en', x=best_en[1].x, y=best_en[1].y,
        width=best_en[1].width, height=content_h,
        classification_confidence=round(best_en[2]['eng'], 2),
        classification_method='content',
    ))
    assigned.add(best_en[0])
    regions.append(PageRegion(
        type='column_de', x=best_de[1].x, y=best_de[1].y,
        width=best_de[1].width, height=content_h,
        classification_confidence=round(best_de[2]['deu'], 2),
        classification_method='content',
    ))
    assigned.add(best_de[0])
    # Step 3: Remaining columns become example columns; a clear sentence
    # score doubles as the confidence, otherwise a neutral 0.5 is used.
    for i, geom, ls, rs in remaining:
        if i in assigned:
            continue
        confidence = round(rs['sentence'], 2) if rs['sentence'] > 0.4 else 0.5
        regions.append(PageRegion(
            type='column_example', x=geom.x, y=geom.y,
            width=geom.width, height=content_h,
            classification_confidence=confidence,
            classification_method='content',
        ))
    regions.sort(key=lambda r: r.x)
    return regions
def _classify_by_position_enhanced(geometries: List[ColumnGeometry],
                                   lang_scores: List[Dict[str, float]],
                                   content_w: int,
                                   content_h: int) -> Optional[List[PageRegion]]:
    """Level 2: Position-based rules enhanced with language confirmation.

    Uses the old positional heuristics but confirms EN/DE assignment
    with language scores (swapping if needed).

    Args:
        geometries: Column geometries from Phase A (left to right).
        lang_scores: Per-column language scores from _score_language().
        content_w: Total content width (unused here; kept for signature
            parity with the other classification levels).
        content_h: Content height used for region heights.

    Returns:
        List of PageRegion sorted by x.
        NOTE(review): this level never returns None, so Level 3 is
        currently unreachable from classify_column_types() — confirm
        whether that is intended.
    """
    regions = []
    untyped = list(range(len(geometries)))
    # Rule 1: Leftmost narrow column → page_ref
    g0 = geometries[0]
    if g0.width_ratio < 0.12:
        regions.append(PageRegion(
            type='page_ref', x=g0.x, y=g0.y,
            width=g0.width, height=content_h,
            classification_confidence=0.8,
            classification_method='position_enhanced',
        ))
        untyped.remove(0)
    # Rule 2: Narrow columns with few words → column_marker
    for i in list(untyped):
        geom = geometries[i]
        if geom.width_ratio < 0.08 and geom.word_count <= 10:
            regions.append(PageRegion(
                type='column_marker', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=0.7,
                classification_method='position_enhanced',
            ))
            untyped.remove(i)
    # Rule 3: Rightmost remaining → column_example (if 3+ remaining)
    if len(untyped) >= 3:
        last_idx = untyped[-1]
        geom = geometries[last_idx]
        regions.append(PageRegion(
            type='column_example', x=geom.x, y=geom.y,
            width=geom.width, height=content_h,
            classification_confidence=0.7,
            classification_method='position_enhanced',
        ))
        untyped.remove(last_idx)
    # Rule 4: First two remaining → EN/DE, but check language to possibly swap
    if len(untyped) >= 2:
        idx_a, idx_b = untyped[0], untyped[1]
        ls_a, ls_b = lang_scores[idx_a], lang_scores[idx_b]
        # Default: first=EN, second=DE (old behavior)
        en_idx, de_idx = idx_a, idx_b
        conf = 0.7
        # Swap only when both columns clearly indicate the opposite order.
        if ls_a['deu'] > ls_a['eng'] and ls_b['eng'] > ls_b['deu']:
            en_idx, de_idx = idx_b, idx_a
            conf = 0.85
            logger.info("ClassifyColumns: Level 2 swapped EN/DE based on language scores")
        regions.append(PageRegion(
            type='column_en', x=geometries[en_idx].x, y=geometries[en_idx].y,
            width=geometries[en_idx].width, height=content_h,
            classification_confidence=conf,
            classification_method='position_enhanced',
        ))
        regions.append(PageRegion(
            type='column_de', x=geometries[de_idx].x, y=geometries[de_idx].y,
            width=geometries[de_idx].width, height=content_h,
            classification_confidence=conf,
            classification_method='position_enhanced',
        ))
        untyped = untyped[2:]
    elif len(untyped) == 1:
        # Only one column left — assume English.
        idx = untyped[0]
        geom = geometries[idx]
        regions.append(PageRegion(
            type='column_en', x=geom.x, y=geom.y,
            width=geom.width, height=content_h,
            classification_confidence=0.5,
            classification_method='position_enhanced',
        ))
        untyped = []
    # Remaining → example
    for idx in untyped:
        geom = geometries[idx]
        regions.append(PageRegion(
            type='column_example', x=geom.x, y=geom.y,
            width=geom.width, height=content_h,
            classification_confidence=0.5,
            classification_method='position_enhanced',
        ))
    regions.sort(key=lambda r: r.x)
    return regions
def _classify_by_position_fallback(geometries: List[ColumnGeometry],
                                   content_w: int,
                                   content_h: int) -> List[PageRegion]:
    """Level 3: Pure position-based fallback (identical to old code).

    Guarantees no regression from the previous behavior.

    Args:
        geometries: Column geometries from Phase A (left to right).
        content_w: Total content width (unused here; kept for signature
            parity with the other classification levels).
        content_h: Content height used for region heights.

    Returns:
        List of PageRegion sorted by x.
    """
    regions = []
    untyped = list(range(len(geometries)))
    # Rule 1: Leftmost narrow column → page_ref
    g0 = geometries[0]
    if g0.width_ratio < 0.12:
        regions.append(PageRegion(
            type='page_ref', x=g0.x, y=g0.y,
            width=g0.width, height=content_h,
            classification_confidence=1.0,
            classification_method='position_fallback',
        ))
        untyped.remove(0)
    # Rule 2: Narrow + few words → marker (old threshold: <= 8 words)
    for i in list(untyped):
        geom = geometries[i]
        if geom.width_ratio < 0.08 and geom.word_count <= 8:
            regions.append(PageRegion(
                type='column_marker', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=1.0,
                classification_method='position_fallback',
            ))
            untyped.remove(i)
    # Rule 3: Rightmost remaining → example (if 3+)
    if len(untyped) >= 3:
        last_idx = untyped[-1]
        geom = geometries[last_idx]
        regions.append(PageRegion(
            type='column_example', x=geom.x, y=geom.y,
            width=geom.width, height=content_h,
            classification_confidence=1.0,
            classification_method='position_fallback',
        ))
        untyped.remove(last_idx)
    # Rule 4: First remaining → EN, second → DE
    if len(untyped) >= 2:
        en_idx = untyped[0]
        de_idx = untyped[1]
        regions.append(PageRegion(
            type='column_en', x=geometries[en_idx].x, y=geometries[en_idx].y,
            width=geometries[en_idx].width, height=content_h,
            classification_confidence=1.0,
            classification_method='position_fallback',
        ))
        regions.append(PageRegion(
            type='column_de', x=geometries[de_idx].x, y=geometries[de_idx].y,
            width=geometries[de_idx].width, height=content_h,
            classification_confidence=1.0,
            classification_method='position_fallback',
        ))
        untyped = untyped[2:]
    elif len(untyped) == 1:
        # Only one left — call it column_en
        idx = untyped[0]
        geom = geometries[idx]
        regions.append(PageRegion(
            type='column_en', x=geom.x, y=geom.y,
            width=geom.width, height=content_h,
            classification_confidence=1.0,
            classification_method='position_fallback',
        ))
        untyped = []
    # Any remaining untyped columns get generic column_example type
    for idx in untyped:
        geom = geometries[idx]
        regions.append(PageRegion(
            type='column_example', x=geom.x, y=geom.y,
            width=geom.width, height=content_h,
            classification_confidence=1.0,
            classification_method='position_fallback',
        ))
    # Sort by x position for consistent output
    regions.sort(key=lambda r: r.x)
    return regions
# Add header/footer
def _add_header_footer(regions: List[PageRegion], top_y: int, bottom_y: int,
img_w: int, img_h: int) -> None:
"""Add header/footer regions in-place."""
if top_y > 10:
regions.append(PageRegion(type='header', x=0, y=0, width=w, height=top_y))
if bottom_y < h - 10:
regions.append(PageRegion(type='footer', x=0, y=bottom_y, width=w, height=h - bottom_y))
regions.append(PageRegion(type='header', x=0, y=0, width=img_w, height=top_y))
if bottom_y < img_h - 10:
regions.append(PageRegion(type='footer', x=0, y=bottom_y, width=img_w, height=img_h - bottom_y))
# --- Main Entry Point ---
def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> List[PageRegion]:
"""Detect columns using two-phase approach: geometry then content classification.
Phase A: detect_column_geometry() — clustering word positions into columns.
Phase B: classify_column_types() — content-based type assignment with fallback.
Falls back to projection-based analyze_layout() if geometry detection fails.
Args:
ocr_img: Binarized grayscale image for layout analysis.
dewarped_bgr: Original BGR image (for Tesseract word detection).
Returns:
List of PageRegion objects with types, confidence, and method.
"""
h, w = ocr_img.shape[:2]
# Phase A: Geometry detection
result = detect_column_geometry(ocr_img, dewarped_bgr)
if result is None:
# Fallback to projection-based layout
logger.info("LayoutByWords: geometry detection failed, falling back to projection profiles")
layout_img = create_layout_image(dewarped_bgr)
return analyze_layout(layout_img, ocr_img)
geometries, left_x, right_x, top_y, bottom_y = result
content_w = right_x - left_x
# Phase B: Content-based classification
regions = classify_column_types(geometries, content_w, top_y, w, h, bottom_y)
col_count = len([r for r in regions if r.type.startswith('column') or r.type == 'page_ref'])
logger.info(f"LayoutByWords: {col_count} columns detected: "
f"{[(r.type, r.x, r.width) for r in regions if r.type not in ('header','footer')]}")
methods = set(r.classification_method for r in regions if r.classification_method)
logger.info(f"LayoutByWords: {col_count} columns detected (methods: {methods}): "
f"{[(r.type, r.x, r.width, r.classification_confidence) for r in regions if r.type not in ('header','footer')]}")
return regions
@@ -1276,6 +1755,11 @@ def match_lines_to_vocab(ocr_results: Dict[str, List[Dict]],
Returns:
List of VocabRow objects.
"""
# If no vocabulary columns detected (e.g. plain text page), return empty
if 'column_en' not in ocr_results and 'column_de' not in ocr_results:
logger.info("match_lines_to_vocab: no column_en/column_de in OCR results, returning empty")
return []
# Group words into lines per column
en_lines = _group_words_into_lines(ocr_results.get('column_en', []), y_tolerance_px)
de_lines = _group_words_into_lines(ocr_results.get('column_de', []), y_tolerance_px)

View File

@@ -648,8 +648,16 @@ async def detect_columns(session_id: str):
duration = time.time() - t0
columns = [asdict(r) for r in regions]
# Determine classification methods used
methods = list(set(
c.get("classification_method", "") for c in columns
if c.get("classification_method")
))
column_result = {
"columns": columns,
"classification_methods": methods,
"duration_seconds": round(duration, 2),
}
@@ -742,6 +750,7 @@ async def _get_columns_overlay(session_id: str) -> Response:
"column_en": (255, 180, 0), # Blue
"column_de": (0, 200, 0), # Green
"column_example": (0, 140, 255), # Orange
"column_text": (200, 200, 0), # Cyan/Turquoise
"page_ref": (200, 0, 200), # Purple
"column_marker": (0, 0, 220), # Red
"header": (128, 128, 128), # Gray
@@ -760,8 +769,11 @@ async def _get_columns_overlay(session_id: str) -> Response:
# Solid border
cv2.rectangle(img, (x, y), (x + w, y + h), color, 3)
# Label
# Label with confidence
label = col.get("type", "unknown").replace("column_", "").upper()
conf = col.get("classification_confidence")
if conf is not None and conf < 1.0:
label = f"{label} {int(conf * 100)}%"
cv2.putText(img, label, (x + 10, y + 30),
cv2.FONT_HERSHEY_SIMPLEX, 0.8, color, 2)