refactor: positional_column_regions auch in OCR Pipeline verwenden
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 24s
CI / test-go-edu-search (push) Successful in 24s
CI / test-python-klausur (push) Failing after 1m48s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 16s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 24s
CI / test-go-edu-search (push) Successful in 24s
CI / test-python-klausur (push) Failing after 1m48s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 16s
Shared Funktion positional_column_regions() in cv_vocab_pipeline.py, wird jetzt von beiden Pfaden (Vocab-Worksheet + OCR Pipeline Admin) genutzt. classify_column_types() bleibt als Legacy erhalten. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3321,6 +3321,75 @@ def _build_margin_regions(
|
|||||||
return margins
|
return margins
|
||||||
|
|
||||||
|
|
||||||
|
def positional_column_regions(
    geometries: List[ColumnGeometry],
    content_w: int,
    content_h: int,
    left_x: int,
) -> List[PageRegion]:
    """Assign region types to detected columns using geometry alone.

    ``page_ref`` and ``column_marker`` columns are recognised purely by
    width ratio and horizontal position.  Whatever remains is treated as
    content and labelled left-to-right ``column_en`` / ``column_de`` /
    ``column_example`` (any extra columns repeat ``column_example``).
    No language scoring is involved.

    Args:
        geometries: Detected column geometries for the page.
        content_w: Width of the content area in pixels (0 is tolerated).
        content_h: Height of the content area; every region spans it fully.
        left_x: Left edge of the content area, used to compute relative x.

    Returns:
        List of PageRegion objects (structural first, then content columns).
    """

    def _region(kind: str, geom: ColumnGeometry, conf: float) -> PageRegion:
        # All regions span the full content height of the page.
        return PageRegion(
            type=kind, x=geom.x, y=geom.y,
            width=geom.width, height=content_h,
            classification_confidence=conf,
            classification_method='positional',
        )

    structural: List[PageRegion] = []
    content_cols: List[ColumnGeometry] = []

    for geom in geometries:
        # Relative horizontal position; a zero content_w degrades to 0.0
        # so the page_ref test still works instead of dividing by zero.
        frac_x = (geom.x - left_x) / content_w if content_w else 0
        if geom.width_ratio < 0.12 and frac_x < 0.20:
            # Narrow column inside the leftmost fifth → page reference.
            structural.append(_region('page_ref', geom, 0.95))
        elif geom.width_ratio < 0.06 and geom.word_count <= 15:
            # Very narrow with few words → decorative column marker.
            structural.append(_region('column_marker', geom, 0.95))
        else:
            content_cols.append(geom)

    if not content_cols:
        # Only structural columns were found.
        return structural

    if len(content_cols) == 1:
        # A single content column → treat the page as plain running text.
        return structural + [_region('column_text', content_cols[0], 0.9)]

    # Multiple content columns: label them left→right by position only.
    content_cols.sort(key=lambda geom: geom.x)
    labels = ['column_en', 'column_de', 'column_example']
    regions = list(structural)
    for idx, geom in enumerate(content_cols):
        kind = labels[idx] if idx < len(labels) else 'column_example'
        regions.append(_region(kind, geom, 0.95))

    logger.info(f"PositionalColumns: {len(structural)} structural, "
                f"{len(content_cols)} content → "
                f"{[r.type for r in regions]}")
    return regions
|
||||||
|
|
||||||
|
|
||||||
def classify_column_types(geometries: List[ColumnGeometry],
|
def classify_column_types(geometries: List[ColumnGeometry],
|
||||||
content_w: int,
|
content_w: int,
|
||||||
top_y: int,
|
top_y: int,
|
||||||
@@ -3548,6 +3617,21 @@ def _classify_by_content(geometries: List[ColumnGeometry],
|
|||||||
best_en = max(en_candidates, key=lambda x: x[2]['eng'])
|
best_en = max(en_candidates, key=lambda x: x[2]['eng'])
|
||||||
best_de = max(de_candidates, key=lambda x: x[2]['deu'])
|
best_de = max(de_candidates, key=lambda x: x[2]['deu'])
|
||||||
|
|
||||||
|
# Position-aware EN selection: in typical textbooks the layout is EN | DE | Example.
|
||||||
|
# Example sentences contain English function words ("the", "a", "is") which inflate
|
||||||
|
# the eng score of the Example column. When the best EN candidate sits to the RIGHT
|
||||||
|
# of the DE column and there is another EN candidate to the LEFT, prefer the left one
|
||||||
|
# — it is almost certainly the real vocabulary column.
|
||||||
|
if best_de[2]['deu'] > 0.5 and best_en[1].x > best_de[1].x and len(en_candidates) > 1:
|
||||||
|
left_of_de = [c for c in en_candidates if c[1].x < best_de[1].x]
|
||||||
|
if left_of_de:
|
||||||
|
alt_en = max(left_of_de, key=lambda x: x[2]['eng'])
|
||||||
|
logger.info(
|
||||||
|
f"ClassifyColumns: Level 1 position fix — best EN col {best_en[0]} "
|
||||||
|
f"(eng={best_en[2]['eng']:.3f}) is right of DE col {best_de[0]}; "
|
||||||
|
f"preferring left col {alt_en[0]} (eng={alt_en[2]['eng']:.3f})")
|
||||||
|
best_en = alt_en
|
||||||
|
|
||||||
if best_en[0] == best_de[0]:
|
if best_en[0] == best_de[0]:
|
||||||
# Same column scored highest for both — ambiguous
|
# Same column scored highest for both — ambiguous
|
||||||
logger.info("ClassifyColumns: Level 1 failed - same column highest for EN and DE")
|
logger.info("ClassifyColumns: Level 1 failed - same column highest for EN and DE")
|
||||||
@@ -3996,9 +4080,9 @@ def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Li
|
|||||||
geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
|
geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
|
||||||
top_y=top_y, header_y=header_y, footer_y=footer_y)
|
top_y=top_y, header_y=header_y, footer_y=footer_y)
|
||||||
|
|
||||||
# Phase B: Content-based classification
|
# Phase B: Positional classification (no language scoring)
|
||||||
regions = classify_column_types(geometries, content_w, top_y, w, h, bottom_y,
|
content_h = bottom_y - top_y
|
||||||
left_x=left_x, right_x=right_x, inv=_inv)
|
regions = positional_column_regions(geometries, content_w, content_h, left_x)
|
||||||
|
|
||||||
col_count = len([r for r in regions if r.type.startswith('column') or r.type == 'page_ref'])
|
col_count = len([r for r in regions if r.type.startswith('column') or r.type == 'page_ref'])
|
||||||
methods = set(r.classification_method for r in regions if r.classification_method)
|
methods = set(r.classification_method for r in regions if r.classification_method)
|
||||||
|
|||||||
@@ -70,7 +70,7 @@ try:
|
|||||||
detect_column_geometry, analyze_layout_by_words, analyze_layout, create_layout_image,
|
detect_column_geometry, analyze_layout_by_words, analyze_layout, create_layout_image,
|
||||||
detect_row_geometry, build_cell_grid_v2,
|
detect_row_geometry, build_cell_grid_v2,
|
||||||
_cells_to_vocab_entries, _detect_sub_columns, _detect_header_footer_gaps,
|
_cells_to_vocab_entries, _detect_sub_columns, _detect_header_footer_gaps,
|
||||||
expand_narrow_columns, llm_review_entries,
|
expand_narrow_columns, positional_column_regions, llm_review_entries,
|
||||||
_fix_phonetic_brackets,
|
_fix_phonetic_brackets,
|
||||||
render_pdf_high_res,
|
render_pdf_high_res,
|
||||||
PageRegion, RowGeometry,
|
PageRegion, RowGeometry,
|
||||||
@@ -1336,75 +1336,6 @@ async def process_single_page(
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def _positional_column_regions(
    geometries: list,
    content_w: int,
    content_h: int,
    left_x: int,
) -> list:
    """Classify columns by position only (no language scoring).

    Structural columns (``page_ref``, ``column_marker``) are picked out by
    geometry; the remaining content columns are named left→right as
    ``column_en``, ``column_de``, ``column_example`` — purely positional
    labels, no language analysis.

    NOTE(review): this looks like a local duplicate of the shared
    positional_column_regions() helper — consider consolidating.
    """
    structural = []
    content_cols = []

    for col in geometries:
        offset = col.x - left_x
        # Falls back to 0 when content_w is 0 to avoid a ZeroDivisionError.
        in_left_fifth = (offset / content_w if content_w else 0) < 0.20
        if col.width_ratio < 0.12 and in_left_fifth:
            label = 'page_ref'          # narrow column in leftmost 20%
        elif col.width_ratio < 0.06 and col.word_count <= 15:
            label = 'column_marker'     # very narrow, few words
        else:
            content_cols.append(col)
            continue
        structural.append(PageRegion(
            type=label, x=col.x, y=col.y,
            width=col.width, height=content_h,
            classification_confidence=0.95,
            classification_method='positional',
        ))

    if len(content_cols) == 1:
        # Single content column → plain text page.
        only = content_cols[0]
        return structural + [PageRegion(
            type='column_text', x=only.x, y=only.y,
            width=only.width, height=content_h,
            classification_confidence=0.9,
            classification_method='positional',
        )]

    if not content_cols:
        # Nothing but structural columns on this page.
        return structural

    # Sort content columns left→right and hand out positional labels;
    # columns past the third all become 'column_example'.
    content_cols.sort(key=lambda col: col.x)
    names = ['column_en', 'column_de', 'column_example']
    regions = list(structural)
    for pos, col in enumerate(content_cols):
        regions.append(PageRegion(
            type=names[pos] if pos < len(names) else 'column_example',
            x=col.x, y=col.y,
            width=col.width, height=content_h,
            classification_confidence=0.95,
            classification_method='positional',
        ))

    logger.info(f"PositionalColumns: {len(structural)} structural, "
                f"{len(content_cols)} content → "
                f"{[r.type for r in regions]}")
    return regions
|
|
||||||
|
|
||||||
|
|
||||||
async def _run_ocr_pipeline_for_page(
|
async def _run_ocr_pipeline_for_page(
|
||||||
img_bgr: np.ndarray,
|
img_bgr: np.ndarray,
|
||||||
page_number: int,
|
page_number: int,
|
||||||
@@ -1479,7 +1410,7 @@ async def _run_ocr_pipeline_for_page(
|
|||||||
top_y=top_y, header_y=header_y, footer_y=footer_y)
|
top_y=top_y, header_y=header_y, footer_y=footer_y)
|
||||||
geometries = expand_narrow_columns(geometries, content_w, left_x, word_dicts)
|
geometries = expand_narrow_columns(geometries, content_w, left_x, word_dicts)
|
||||||
content_h = bottom_y - top_y
|
content_h = bottom_y - top_y
|
||||||
regions = _positional_column_regions(geometries, content_w, content_h, left_x)
|
regions = positional_column_regions(geometries, content_w, content_h, left_x)
|
||||||
content_bounds = (left_x, right_x, top_y, bottom_y)
|
content_bounds = (left_x, right_x, top_y, bottom_y)
|
||||||
|
|
||||||
logger.info(f" columns: {len(regions)} detected ({_time.time() - t0:.1f}s)")
|
logger.info(f" columns: {len(regions)} detected ({_time.time() - t0:.1f}s)")
|
||||||
|
|||||||
Reference in New Issue
Block a user