Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
sed replacement left orphaned hostname references in story page and empty lines in getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
219 lines
8.0 KiB
Python
219 lines
8.0 KiB
Python
"""
|
||
Position-based column type classification for OCR layout analysis.
|
||
|
||
Contains Level 2 and Level 3 classification functions:
|
||
Level 2 – _classify_by_position_enhanced: Position + language confirmation
|
||
Level 3 – _classify_by_position_fallback: Pure positional (no regression)
|
||
|
||
Extracted from cv_layout_classify.py during file-size split.
|
||
"""
|
||
|
||
import logging
|
||
from typing import Dict, List, Optional
|
||
|
||
from cv_vocab_types import ColumnGeometry, PageRegion
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Level 2: Position-Enhanced Classification
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _classify_by_position_enhanced(geometries: List[ColumnGeometry],
|
||
lang_scores: List[Dict[str, float]],
|
||
content_w: int,
|
||
content_h: int) -> Optional[List[PageRegion]]:
|
||
"""Level 2: Position-based rules enhanced with language confirmation.
|
||
|
||
Uses the old positional heuristics but confirms EN/DE assignment
|
||
with language scores (swapping if needed).
|
||
"""
|
||
regions = []
|
||
untyped = list(range(len(geometries)))
|
||
first_x = geometries[0].x if geometries else 0
|
||
left_20_threshold = first_x + content_w * 0.20
|
||
|
||
# Rule 1: Leftmost narrow column -> page_ref (only if in left 20%, no strong language)
|
||
g0 = geometries[0]
|
||
ls0 = lang_scores[0]
|
||
has_strong_lang_0 = ls0['eng'] > 0.3 or ls0['deu'] > 0.3
|
||
if g0.width_ratio < 0.12 and g0.x < left_20_threshold and not has_strong_lang_0:
|
||
regions.append(PageRegion(
|
||
type='page_ref', x=g0.x, y=g0.y,
|
||
width=g0.width, height=content_h,
|
||
classification_confidence=0.8,
|
||
classification_method='position_enhanced',
|
||
))
|
||
untyped.remove(0)
|
||
|
||
# Rule 2: Narrow columns with few words -> marker
|
||
for i in list(untyped):
|
||
geom = geometries[i]
|
||
if geom.width_ratio < 0.06 and geom.word_count <= 15:
|
||
regions.append(PageRegion(
|
||
type='column_marker', x=geom.x, y=geom.y,
|
||
width=geom.width, height=content_h,
|
||
classification_confidence=0.7,
|
||
classification_method='position_enhanced',
|
||
))
|
||
untyped.remove(i)
|
||
|
||
# Rule 3: Rightmost remaining -> column_example (if 3+ remaining)
|
||
if len(untyped) >= 3:
|
||
last_idx = untyped[-1]
|
||
geom = geometries[last_idx]
|
||
regions.append(PageRegion(
|
||
type='column_example', x=geom.x, y=geom.y,
|
||
width=geom.width, height=content_h,
|
||
classification_confidence=0.7,
|
||
classification_method='position_enhanced',
|
||
))
|
||
untyped.remove(last_idx)
|
||
|
||
# Rule 4: First two remaining -> EN/DE, but check language to possibly swap
|
||
if len(untyped) >= 2:
|
||
idx_a = untyped[0]
|
||
idx_b = untyped[1]
|
||
ls_a = lang_scores[idx_a]
|
||
ls_b = lang_scores[idx_b]
|
||
|
||
# Default: first=EN, second=DE (old behavior)
|
||
en_idx, de_idx = idx_a, idx_b
|
||
conf = 0.7
|
||
|
||
# Swap if language signals clearly indicate the opposite
|
||
if ls_a['deu'] > ls_a['eng'] and ls_b['eng'] > ls_b['deu']:
|
||
en_idx, de_idx = idx_b, idx_a
|
||
conf = 0.85
|
||
logger.info(f"ClassifyColumns: Level 2 swapped EN/DE based on language scores")
|
||
|
||
regions.append(PageRegion(
|
||
type='column_en', x=geometries[en_idx].x, y=geometries[en_idx].y,
|
||
width=geometries[en_idx].width, height=content_h,
|
||
classification_confidence=conf,
|
||
classification_method='position_enhanced',
|
||
))
|
||
regions.append(PageRegion(
|
||
type='column_de', x=geometries[de_idx].x, y=geometries[de_idx].y,
|
||
width=geometries[de_idx].width, height=content_h,
|
||
classification_confidence=conf,
|
||
classification_method='position_enhanced',
|
||
))
|
||
untyped = untyped[2:]
|
||
elif len(untyped) == 1:
|
||
idx = untyped[0]
|
||
geom = geometries[idx]
|
||
regions.append(PageRegion(
|
||
type='column_en', x=geom.x, y=geom.y,
|
||
width=geom.width, height=content_h,
|
||
classification_confidence=0.5,
|
||
classification_method='position_enhanced',
|
||
))
|
||
untyped = []
|
||
|
||
# Remaining -> example
|
||
for idx in untyped:
|
||
geom = geometries[idx]
|
||
regions.append(PageRegion(
|
||
type='column_example', x=geom.x, y=geom.y,
|
||
width=geom.width, height=content_h,
|
||
classification_confidence=0.5,
|
||
classification_method='position_enhanced',
|
||
))
|
||
|
||
regions.sort(key=lambda r: r.x)
|
||
return regions
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Level 3: Position Fallback Classification
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _classify_by_position_fallback(geometries: List[ColumnGeometry],
|
||
content_w: int,
|
||
content_h: int) -> List[PageRegion]:
|
||
"""Level 3: Pure position-based fallback (identical to old code).
|
||
|
||
Guarantees no regression from the previous behavior.
|
||
"""
|
||
regions = []
|
||
untyped = list(range(len(geometries)))
|
||
first_x = geometries[0].x if geometries else 0
|
||
left_20_threshold = first_x + content_w * 0.20
|
||
|
||
# Rule 1: Leftmost narrow column -> page_ref (only if in left 20%)
|
||
g0 = geometries[0]
|
||
if g0.width_ratio < 0.12 and g0.x < left_20_threshold:
|
||
regions.append(PageRegion(
|
||
type='page_ref', x=g0.x, y=g0.y,
|
||
width=g0.width, height=content_h,
|
||
classification_confidence=1.0,
|
||
classification_method='position_fallback',
|
||
))
|
||
untyped.remove(0)
|
||
|
||
# Rule 2: Narrow + few words -> marker
|
||
for i in list(untyped):
|
||
geom = geometries[i]
|
||
if geom.width_ratio < 0.06 and geom.word_count <= 15:
|
||
regions.append(PageRegion(
|
||
type='column_marker', x=geom.x, y=geom.y,
|
||
width=geom.width, height=content_h,
|
||
classification_confidence=1.0,
|
||
classification_method='position_fallback',
|
||
))
|
||
untyped.remove(i)
|
||
|
||
# Rule 3: Rightmost remaining -> example (if 3+)
|
||
if len(untyped) >= 3:
|
||
last_idx = untyped[-1]
|
||
geom = geometries[last_idx]
|
||
regions.append(PageRegion(
|
||
type='column_example', x=geom.x, y=geom.y,
|
||
width=geom.width, height=content_h,
|
||
classification_confidence=1.0,
|
||
classification_method='position_fallback',
|
||
))
|
||
untyped.remove(last_idx)
|
||
|
||
# Rule 4: First remaining -> EN, second -> DE
|
||
if len(untyped) >= 2:
|
||
en_idx = untyped[0]
|
||
de_idx = untyped[1]
|
||
regions.append(PageRegion(
|
||
type='column_en', x=geometries[en_idx].x, y=geometries[en_idx].y,
|
||
width=geometries[en_idx].width, height=content_h,
|
||
classification_confidence=1.0,
|
||
classification_method='position_fallback',
|
||
))
|
||
regions.append(PageRegion(
|
||
type='column_de', x=geometries[de_idx].x, y=geometries[de_idx].y,
|
||
width=geometries[de_idx].width, height=content_h,
|
||
classification_confidence=1.0,
|
||
classification_method='position_fallback',
|
||
))
|
||
untyped = untyped[2:]
|
||
elif len(untyped) == 1:
|
||
idx = untyped[0]
|
||
geom = geometries[idx]
|
||
regions.append(PageRegion(
|
||
type='column_en', x=geom.x, y=geom.y,
|
||
width=geom.width, height=content_h,
|
||
classification_confidence=1.0,
|
||
classification_method='position_fallback',
|
||
))
|
||
untyped = []
|
||
|
||
for idx in untyped:
|
||
geom = geometries[idx]
|
||
regions.append(PageRegion(
|
||
type='column_example', x=geom.x, y=geom.y,
|
||
width=geom.width, height=content_h,
|
||
classification_confidence=1.0,
|
||
classification_method='position_fallback',
|
||
))
|
||
|
||
regions.sort(key=lambda r: r.x)
|
||
return regions
|