fix: Word-Validation auf Segment-Woerter beschraenken
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 17s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 17s
Woerter aus Sub-Header-Bereichen ueberlappten korrekte Spaltenluecken und liessen die Word-Validation faelschlich Gaps verwerfen. Jetzt werden nur Woerter aus dem gewaehlten Segment fuer die Validation verwendet.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2240,11 +2240,25 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
|
||||
f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in raw_gaps]}")
|
||||
|
||||
# --- Step 5: Validate gaps against word bounding boxes ---
|
||||
# When using a segment for projection, only validate against words
|
||||
# inside that segment — words from sub-headers or other sections
|
||||
# would incorrectly overlap with real column gaps.
|
||||
if segments and len(segments) > 1:
|
||||
seg_top_abs = best_seg[0] # relative to content strip
|
||||
seg_bot_abs = best_seg[1]
|
||||
segment_words = [wd for wd in word_dicts
|
||||
if wd['top'] >= seg_top_abs
|
||||
and wd['top'] + wd['height'] <= seg_bot_abs]
|
||||
logger.info(f"ColumnGeometry: filtering words to segment: "
|
||||
f"{len(segment_words)}/{len(word_dicts)} words")
|
||||
else:
|
||||
segment_words = word_dicts
|
||||
|
||||
validated_gaps = []
|
||||
for gap_start_rel, gap_end_rel in raw_gaps:
|
||||
# Check if any word overlaps with this gap region
|
||||
overlapping = False
|
||||
for wd in word_dicts:
|
||||
for wd in segment_words:
|
||||
word_left = wd['left']
|
||||
word_right = wd['left'] + wd['width']
|
||||
if word_left < gap_end_rel and word_right > gap_start_rel:
|
||||
@@ -2258,7 +2272,7 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
|
||||
# Find the tightest word boundaries within the gap region
|
||||
min_word_left = content_w
|
||||
max_word_right = 0
|
||||
for wd in word_dicts:
|
||||
for wd in segment_words:
|
||||
word_left = wd['left']
|
||||
word_right = wd['left'] + wd['width']
|
||||
if word_left < gap_end_rel and word_right > gap_start_rel:
|
||||
@@ -2287,7 +2301,7 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
|
||||
if len(validated_gaps) < 2:
|
||||
logger.info("ColumnGeometry: < 2 pixel-gaps, trying word-coverage gaps")
|
||||
word_coverage = np.zeros(content_w, dtype=np.int32)
|
||||
for wd in word_dicts:
|
||||
for wd in segment_words:
|
||||
wl = max(0, wd['left'])
|
||||
wr = min(wd['left'] + wd['width'], content_w)
|
||||
if wr > wl:
|
||||
|
||||
Reference in New Issue
Block a user