refactor: remove unused pages and backends (model-management, OCR legacy, GPU/vast.ai, video-chat, matrix)
Deleted pages: - /ai/model-management (mock data only, no real backend) - /ai/ocr-compare (old /vocab/ backend, replaced by ocr-kombi) - /ai/ocr-pipeline (minimal session browser, redundant) - /ai/ocr-overlay (legacy monolith, redundant) - /ai/gpu (vast.ai GPU management, no longer used) - /infrastructure/gpu (same) - /communication/video-chat (moved to core) - /communication/matrix (moved to core) Deleted backends: - backend-lehrer/infra/vast_client.py + vast_power.py - backend-lehrer/meetings_api.py + jitsi_api.py - website/app/api/admin/gpu/ - edu-search-service/scripts/vast_ai_extractor.py Total: ~7,800 LOC removed. All code preserved in git history. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
100
klausur-service/backend/tests/debug_shear.py
Normal file
100
klausur-service/backend/tests/debug_shear.py
Normal file
@@ -0,0 +1,100 @@
|
||||
#!/usr/bin/env python3
"""Debug script: analyze text line slopes on deskewed image to determine true residual shear."""
import sys
import math
import asyncio

# Make the backend package importable when run inside the container.
sys.path.insert(0, "/app/backend")

import cv2
import numpy as np
import pytesseract

from ocr_pipeline_session_store import get_session_db

# Session whose deskewed page image we want to inspect.
SESSION_ID = "3dcb1897-09a6-4b80-91b5-7e4207980bf3"


async def main():
    """Fetch the session's deskewed PNG, fit a straight line through each OCR
    text row, and print residual-slope statistics plus the output of the four
    dewarp shear detectors from cv_vocab_pipeline."""
    session = await get_session_db(SESSION_ID)
    if not session:
        print("Session not found")
        return

    deskewed_png = session.get("deskewed_png")
    if not deskewed_png:
        print("No deskewed_png stored")
        return

    arr = np.frombuffer(deskewed_png, dtype=np.uint8)
    img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
    h, w = img.shape[:2]
    print(f"Deskewed image: {w}x{h}")

    # Detect text line slopes using Tesseract word positions
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    data = pytesseract.image_to_data(gray, output_type=pytesseract.Output.DICT, config="--psm 6")

    # Group word centers by (block, paragraph, line) so each text row can be fitted.
    lines = {}
    for i in range(len(data["text"])):
        txt = (data["text"][i] or "").strip()
        if len(txt) < 2 or data["conf"][i] < 30:
            continue  # skip noise: tiny fragments and low-confidence words
        key = (data["block_num"][i], data["par_num"][i], data["line_num"][i])
        cx = data["left"][i] + data["width"][i] / 2
        cy = data["top"][i] + data["height"][i] / 2
        lines.setdefault(key, []).append((cx, cy))

    # Least-squares fit y = m*x + b per line; only lines spanning >= 20% of the
    # image width give a meaningful slope estimate.
    slopes = []
    for key, pts in lines.items():
        if len(pts) < 3:
            continue
        pts.sort(key=lambda p: p[0])
        xs = np.array([p[0] for p in pts])
        ys = np.array([p[1] for p in pts])
        if xs[-1] - xs[0] < w * 0.2:
            continue
        A = np.vstack([xs, np.ones(len(xs))]).T
        result = np.linalg.lstsq(A, ys, rcond=None)
        slope = result[0][0]
        angle_deg = math.degrees(math.atan(slope))
        slopes.append(angle_deg)

    if not slopes:
        print("No text lines detected")
        return

    median_slope = sorted(slopes)[len(slopes) // 2]
    mean_slope = sum(slopes) / len(slopes)
    print(f"Text lines found: {len(slopes)}")
    print(f"Median slope: {median_slope:.4f} deg")
    print(f"Mean slope: {mean_slope:.4f} deg")
    print(f"Range: [{min(slopes):.4f}, {max(slopes):.4f}]")
    print()
    print("Individual line slopes:")
    # BUGFIX: the original reused `s` here, shadowing the session variable above.
    for angle in sorted(slopes):
        print(f" {angle:+.4f}")

    # Also test the 4 dewarp methods directly
    print("\n--- Dewarp method results on deskewed image ---")
    from cv_vocab_pipeline import (
        _detect_shear_angle, _detect_shear_by_projection,
        _detect_shear_by_hough, _detect_shear_by_text_lines,
    )
    for name, fn in [
        ("vertical_edge", _detect_shear_angle),
        ("projection", _detect_shear_by_projection),
        ("hough_lines", _detect_shear_by_hough),
        ("text_lines", _detect_shear_by_text_lines),
    ]:
        r = fn(img)
        print(f" {name}: shear={r['shear_degrees']:.4f} conf={r['confidence']:.3f}")

    # The user says "right side needs to come down 3mm"
    # For a ~85mm wide image (1002px at ~300DPI), 3mm ~ 35px
    # shear angle = atan(35 / 1556) ~ 1.29 degrees
    # Let's check: what does the image look like if we apply 0.5, 1.0, 1.5 deg shear?
    print("\n--- Pixel shift at right edge for various shear angles ---")
    for deg in [0.5, 0.8, 1.0, 1.3, 1.5, 2.0]:
        shift_px = h * math.tan(math.radians(deg))
        shift_mm = shift_px / (w / 85.0)  # approximate mm
        print(f" {deg:.1f} deg -> {shift_px:.0f}px shift -> ~{shift_mm:.1f}mm")


# Guard so importing this module (e.g. from tests) doesn't trigger the DB fetch.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
256
klausur-service/backend/tests/test_box_boundary_rows.py
Normal file
256
klausur-service/backend/tests/test_box_boundary_rows.py
Normal file
@@ -0,0 +1,256 @@
|
||||
"""
|
||||
Tests for box boundary row filtering logic (box_ranges_inner).
|
||||
|
||||
Verifies that rows at the border of box zones are NOT excluded during
|
||||
row detection and word filtering. This prevents the last row above a
|
||||
box from being clipped by the box's border pixels.
|
||||
|
||||
Related fix in ocr_pipeline_api.py: detect_rows() and detect_words()
|
||||
use box_ranges_inner (shrunk by border_thickness, min 5px) instead of
|
||||
full box_ranges for row exclusion.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import numpy as np
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Simulate the box_ranges_inner calculation from ocr_pipeline_api.py
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def compute_box_ranges(zones: list[dict]) -> tuple[list, list]:
    """
    Replicates the box_ranges / box_ranges_inner calculation
    from detect_rows() in ocr_pipeline_api.py.

    Returns (full_ranges, inner_ranges): the raw (y_start, y_end) extents of
    every box zone, and the same extents shrunk by the border thickness
    (at least 5px) on each side.
    """
    full_ranges: list = []
    inner_ranges: list = []
    boxes = (z["box"] for z in zones if z.get("zone_type") == "box" and z.get("box"))
    for box in boxes:
        top = box["y"]
        bottom = box["y"] + box["height"]
        margin = max(box.get("border_thickness", 0), 5)  # minimum 5px margin
        full_ranges.append((top, bottom))
        inner_ranges.append((top + margin, bottom - margin))
    return full_ranges, inner_ranges
|
||||
|
||||
|
||||
def build_content_strips(box_ranges_inner: list, top_y: int, bottom_y: int) -> list:
    """
    Replicates the content_strips calculation from detect_rows() in ocr_pipeline_api.py.

    Walks top-to-bottom, emitting the vertical gaps between the (sorted) box
    inner ranges as content strips; strips shorter than 20px are dropped.
    """
    strips = []
    cursor = top_y
    for box_top, box_bottom in sorted(box_ranges_inner, key=lambda r: r[0]):
        if box_top > cursor:
            strips.append((cursor, box_top))
        if box_bottom > cursor:
            cursor = box_bottom
    if cursor < bottom_y:
        strips.append((cursor, bottom_y))
    # Discard degenerate strips (< 20px tall).
    return [strip for strip in strips if strip[1] - strip[0] >= 20]
|
||||
|
||||
|
||||
def row_in_box(row_y: int, row_height: int, box_ranges_inner: list) -> bool:
    """
    Replicates the _row_in_box filter from detect_words() in ocr_pipeline_api.py.

    A row counts as "inside" a box when its vertical center falls within any
    of the half-open inner ranges [start, end).
    """
    mid_y = row_y + row_height / 2
    for range_start, range_end in box_ranges_inner:
        if range_start <= mid_y < range_end:
            return True
    return False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestBoxRangesInner:
    """Tests for box_ranges_inner calculation."""

    def test_border_thickness_shrinks_inner_range(self):
        """Inner range should be shrunk by border_thickness."""
        box = {"x": 50, "y": 500, "width": 1100, "height": 200, "border_thickness": 10}
        box_ranges, inner = compute_box_ranges([{"zone_type": "box", "box": box}])

        # Full range is the raw box extent; inner loses 10px at top and bottom.
        assert box_ranges == [(500, 700)]
        assert inner == [(510, 690)]

    def test_minimum_5px_margin(self):
        """Even with border_thickness=0, minimum 5px margin should apply."""
        box = {"x": 50, "y": 500, "width": 1100, "height": 200, "border_thickness": 0}
        _, inner = compute_box_ranges([{"zone_type": "box", "box": box}])

        assert inner == [(505, 695)]  # minimum 5px applied

    def test_no_box_zones_returns_empty(self):
        """Without box zones, both ranges should be empty."""
        content_only = [{"zone_type": "content", "y": 0, "height": 500}]
        box_ranges, inner = compute_box_ranges(content_only)

        assert box_ranges == []
        assert inner == []

    def test_multiple_boxes(self):
        """Multiple boxes should each get their own inner range."""
        zones = [
            {"zone_type": "box", "box": {"x": 50, "y": 300, "width": 1100, "height": 150, "border_thickness": 8}},
            {"zone_type": "box", "box": {"x": 50, "y": 700, "width": 1100, "height": 150, "border_thickness": 3}},
        ]
        box_ranges, inner = compute_box_ranges(zones)

        assert len(box_ranges) == 2
        assert len(inner) == 2
        assert inner[0] == (308, 442)  # 300+8 to 450-8
        assert inner[1] == (705, 845)  # 5px minimum overrides border_thickness=3
|
||||
|
||||
|
||||
class TestContentStrips:
    """Tests for content strip building with box_ranges_inner."""

    def test_single_box_creates_two_strips(self):
        """A single box in the middle should create two content strips."""
        strips = build_content_strips([(505, 695)], top_y=100, bottom_y=1700)

        # One strip above the box, one below it.
        assert strips == [(100, 505), (695, 1700)]

    def test_content_strip_includes_box_border_area(self):
        """Content strips should INCLUDE the box border area (not just stop at box outer edge)."""
        # Box at y=500, height=200, border=10 -> inner=(510, 690)
        strips = build_content_strips([(510, 690)], top_y=100, bottom_y=1700)

        # Strip above extends to 510 (not 500), including border area
        assert strips[0] == (100, 510)
        # Strip below starts at 690 (not 700), including border area
        assert strips[1] == (690, 1700)

    def test_row_at_box_border_is_in_content_strip(self):
        """A row at y=495 (just above box at y=500) should be in the content strip."""
        # Box at y=500, height=200, border=10 -> inner=(510, 690)
        strips = build_content_strips([(510, 690)], top_y=100, bottom_y=1700)

        # Row at y=495, height=30 -> center exactly at the strip edge (510).
        center = 495 + 15  # = 510
        assert strips[0][0] <= center <= strips[0][1]

    def test_no_boxes_single_strip(self):
        """Without boxes, a single strip covering the full content should be returned."""
        strips = build_content_strips([], top_y=100, bottom_y=1700)

        assert strips == [(100, 1700)]
|
||||
|
||||
|
||||
class TestRowInBoxFilter:
    """Tests for the _row_in_box filter using box_ranges_inner."""

    def test_row_inside_box_is_excluded(self):
        """A row clearly inside the box inner range should be excluded."""
        # Row at y=550, height=30 -> center 565, well inside (510, 690).
        assert row_in_box(550, 30, [(510, 690)]) is True

    def test_row_above_box_not_excluded(self):
        """A row above the box (at the border area) should NOT be excluded."""
        # Row at y=490, height=30 -> center 505, below the inner start (510).
        assert row_in_box(490, 30, [(510, 690)]) is False

    def test_row_below_box_not_excluded(self):
        """A row below the box (at the border area) should NOT be excluded."""
        # Row at y=695, height=30 -> center 710, past the inner end (690).
        assert row_in_box(695, 30, [(510, 690)]) is False

    def test_row_at_box_border_not_excluded(self):
        """A row overlapping with the box border should NOT be excluded.

        This is the key fix: previously, box_ranges (not inner) was used,
        which would exclude this row because its center (505) falls within
        the full box range (500-700).
        """
        # Full box range: (500, 700), inner: (510, 690); row center = 505.
        # With box_ranges (500, 700): 500 <= 505 < 700 -> excluded (BUG!)
        # With box_ranges_inner (510, 690): 505 < 510 -> not excluded (FIXED!)
        assert row_in_box(490, 30, [(510, 690)]) is False

    def test_row_at_bottom_border_not_excluded(self):
        """A row overlapping with the bottom box border should NOT be excluded."""
        # Row at y=685, height=30 -> center 700, outside inner (510, 690).
        assert row_in_box(685, 30, [(510, 690)]) is False

    def test_no_boxes_nothing_excluded(self):
        """Without box zones, no rows should be excluded."""
        assert row_in_box(500, 30, []) is False
|
||||
|
||||
|
||||
class TestBoxBoundaryIntegration:
    """Integration test: simulate the full row -> content strip -> filter pipeline."""

    def test_boundary_row_preserved_with_inner_ranges(self):
        """
        End-to-end: A row at the box boundary is preserved in content strips
        and not filtered out by _row_in_box.

        Simulates the real scenario: page with a box at y=500..700,
        border_thickness=10. Row at y=488..518 (center=503) sits just
        above the box border.
        """
        zones = [{
            "zone_type": "box",
            "box": {"x": 50, "y": 500, "width": 1100, "height": 200, "border_thickness": 10},
        }]

        # Step 1: inner ranges are shrunk by the border thickness.
        box_ranges, inner = compute_box_ranges(zones)
        assert inner == [(510, 690)]

        # Step 2: the first content strip runs all the way to 510, so the
        # border area (500-510) stays inside the strip.
        strips = build_content_strips(inner, top_y=20, bottom_y=2400)
        assert len(strips) == 2
        assert strips[0] == (20, 510)

        # Step 3: the boundary row is not swallowed by the box filter.
        row_y, row_h = 488, 30  # center = 503
        assert row_in_box(row_y, row_h, inner) is False

        # Step 4: and its center lies inside one of the content strips.
        row_center = row_y + row_h / 2  # 503
        covered = any(ys <= row_center < ye for ys, ye in strips)
        assert covered, f"Row center {row_center} should be in content strips {strips}"

    def test_boundary_row_would_be_lost_with_full_ranges(self):
        """
        Demonstrates the bug: using full box_ranges (not inner) WOULD
        exclude the boundary row.
        """
        zones = [{
            "zone_type": "box",
            "box": {"x": 50, "y": 500, "width": 1100, "height": 200, "border_thickness": 10},
        }]
        box_ranges, _ = compute_box_ranges(zones)

        # The full range is (500, 700); the row center 503 falls inside it,
        # so the old logic would have dropped this row.
        row_center = 488 + 30 / 2  # 503
        in_box_full = any(by_s <= row_center < by_e for by_s, by_e in box_ranges)
        assert in_box_full is True, "Full range SHOULD incorrectly exclude this row"
|
||||
285
klausur-service/backend/tests/test_dictionary_detection.py
Normal file
285
klausur-service/backend/tests/test_dictionary_detection.py
Normal file
@@ -0,0 +1,285 @@
|
||||
"""Tests for dictionary/Wörterbuch page detection.
|
||||
|
||||
Tests the _score_dictionary_signals() function and _classify_dictionary_columns()
|
||||
from cv_layout.py.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Add backend to path for imports
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
|
||||
from cv_vocab_types import ColumnGeometry
|
||||
from cv_layout import _score_dictionary_signals, _classify_dictionary_columns, _score_language
|
||||
|
||||
|
||||
def _make_words(texts, start_y=0, y_step=30, x=100, conf=80):
|
||||
"""Create a list of word dicts from text strings."""
|
||||
return [
|
||||
{
|
||||
"text": t,
|
||||
"conf": conf,
|
||||
"top": start_y + i * y_step,
|
||||
"left": x,
|
||||
"height": 20,
|
||||
"width": len(t) * 10,
|
||||
}
|
||||
for i, t in enumerate(texts)
|
||||
]
|
||||
|
||||
|
||||
def _make_geom(index, words, x=0, width=200, width_ratio=0.15):
    """Create a ColumnGeometry with given words.

    y, height, and word_count are fixed/derived; only index, position,
    width, and width_ratio vary per test.
    """
    return ColumnGeometry(
        index=index,
        x=x,
        y=0,
        height=1000,
        width=width,
        width_ratio=width_ratio,
        words=words,
        word_count=len(words),
    )
|
||||
|
||||
|
||||
class TestDictionarySignals:
    """Test _score_dictionary_signals with synthetic data."""

    def test_alphabetical_column_detected(self):
        """A column with alphabetically ordered words should score high."""
        # Headword column: alphabetically sorted Z-words, as in a dictionary.
        z_headwords = _make_words([
            "Zahl", "Zahn", "zart", "Zauber", "Zaun",
            "Zeichen", "zeigen", "Zeit", "Zelt", "Zentrum",
            "zerbrechen", "Zeug", "Ziel", "Zimmer", "Zitrone",
            "Zoll", "Zone", "Zoo", "Zucker", "Zug",
        ])
        # Narrow der/die/das article column.
        article_words = _make_words(
            ["die", "der", "das", "der", "der",
             "das", "die", "die", "das", "das",
             "der", "das", "das", "das", "die",
             "der", "die", "der", "der", "der"],
            x=0,
        )
        # English translation column.
        translation_words = _make_words(
            ["number", "tooth", "tender", "magic", "fence",
             "sign", "to show", "time", "tent", "centre",
             "to break", "stuff", "goal", "room", "lemon",
             "customs", "zone", "zoo", "sugar", "train"],
            x=400,
        )

        result = _score_dictionary_signals([
            _make_geom(0, article_words, x=0, width=60, width_ratio=0.05),
            _make_geom(1, z_headwords, x=80, width=200, width_ratio=0.15),
            _make_geom(2, translation_words, x=400, width=200, width_ratio=0.15),
        ])

        assert result["signals"]["alphabetical_score"] >= 0.80, (
            f"Expected alphabetical_score >= 0.80, got {result['signals']['alphabetical_score']}"
        )
        assert result["signals"]["article_density"] >= 0.80, (
            f"Expected article_density >= 0.80, got {result['signals']['article_density']}"
        )
        assert result["signals"]["first_letter_uniformity"] >= 0.60, (
            f"Expected first_letter_uniformity >= 0.60, got {result['signals']['first_letter_uniformity']}"
        )
        assert result["is_dictionary"] is True
        assert result["confidence"] >= 0.40

    def test_non_dictionary_vocab_table(self):
        """A normal vocab table (topic-grouped, no alphabetical order) should NOT be detected."""
        english_words = _make_words([
            "school", "teacher", "homework", "pencil", "break",
            "lunch", "friend", "computer", "book", "bag",
        ])
        german_words = _make_words([
            "Schule", "Lehrer", "Hausaufgaben", "Bleistift", "Pause",
            "Mittagessen", "Freund", "Computer", "Buch", "Tasche",
        ], x=300)

        result = _score_dictionary_signals([
            _make_geom(0, english_words, x=0, width=200, width_ratio=0.20),
            _make_geom(1, german_words, x=300, width=200, width_ratio=0.20),
        ])

        # Topic-ordered words give at best a moderate alphabetical score.
        assert result["is_dictionary"] is False, (
            f"Normal vocab table should NOT be detected as dictionary, "
            f"confidence={result['confidence']}"
        )

    def test_article_column_detection(self):
        """A narrow column with mostly articles should be identified."""
        article_words = _make_words(
            ["der", "die", "das", "der", "die", "das", "der", "die", "das", "der"],
            x=0,
        )
        headword_words = _make_words(
            ["Apfel", "Birne", "Dose", "Eis", "Fisch",
             "Gabel", "Haus", "Igel", "Jacke", "Kuchen"],
        )
        translation_words = _make_words(
            ["apple", "pear", "can", "ice", "fish",
             "fork", "house", "hedgehog", "jacket", "cake"],
            x=400,
        )

        result = _score_dictionary_signals([
            _make_geom(0, article_words, x=0, width=50, width_ratio=0.04),
            _make_geom(1, headword_words, x=80, width=200, width_ratio=0.15),
            _make_geom(2, translation_words, x=400, width=200, width_ratio=0.15),
        ])

        assert result["signals"]["article_density"] >= 0.80
        assert result["signals"]["article_col"] == 0

    def test_first_letter_uniformity(self):
        """Words all starting with same letter should have high uniformity."""
        z_column = _make_words([
            "Zahl", "Zahn", "zart", "Zauber", "Zaun",
            "Zeichen", "zeigen", "Zeit", "Zelt", "Zentrum",
        ])
        english_column = _make_words(
            ["number", "tooth", "tender", "magic", "fence",
             "sign", "to show", "time", "tent", "centre"],
            x=300,
        )

        result = _score_dictionary_signals([
            _make_geom(0, z_column, x=0, width=200, width_ratio=0.15),
            _make_geom(1, english_column, x=300, width=200, width_ratio=0.15),
        ])

        assert result["signals"]["first_letter_uniformity"] >= 0.80

    def test_letter_transition_detected(self):
        """Words transitioning from one letter to next (A→B) should be detected."""
        ab_column = _make_words([
            "Apfel", "Arm", "Auto", "Auge", "Abend",
            "Ball", "Baum", "Berg", "Blume", "Boot",
        ])
        english_column = _make_words(
            ["apple", "arm", "car", "eye", "evening",
             "ball", "tree", "mountain", "flower", "boat"],
            x=300,
        )

        result = _score_dictionary_signals([
            _make_geom(0, ab_column, x=0, width=200, width_ratio=0.15),
            _make_geom(1, english_column, x=300, width=200, width_ratio=0.15),
        ])

        assert result["signals"]["has_letter_transition"] is True

    def test_category_boost(self):
        """document_category='woerterbuch' should boost confidence."""
        # Weak signals that normally wouldn't trigger dictionary detection.
        left = _make_words(["cat", "dog", "fish", "hat", "map"], x=0)
        right = _make_words(["Katze", "Hund", "Fisch", "Hut", "Karte"], x=300)
        geoms = [
            _make_geom(0, left, x=0, width=200, width_ratio=0.15),
            _make_geom(1, right, x=300, width=200, width_ratio=0.15),
        ]

        plain = _score_dictionary_signals(geoms)
        boosted = _score_dictionary_signals(geoms, document_category="woerterbuch")

        assert boosted["confidence"] > plain["confidence"]
        assert boosted["confidence"] - plain["confidence"] >= 0.19  # ~0.20 boost

    def test_margin_strip_signal(self):
        """margin_strip_detected=True should contribute to confidence."""
        left = _make_words(["Apfel", "Arm", "Auto", "Auge", "Abend"], x=0)
        right = _make_words(["apple", "arm", "car", "eye", "evening"], x=300)
        geoms = [
            _make_geom(0, left, x=0, width=200, width_ratio=0.15),
            _make_geom(1, right, x=300, width=200, width_ratio=0.15),
        ]

        plain = _score_dictionary_signals(geoms, margin_strip_detected=False)
        with_strip = _score_dictionary_signals(geoms, margin_strip_detected=True)

        assert with_strip["confidence"] > plain["confidence"]
        assert with_strip["signals"]["margin_strip_detected"] is True

    def test_too_few_columns(self):
        """Single column should return is_dictionary=False."""
        only_column = _make_geom(0, _make_words(["Zahl", "Zahn", "zart", "Zauber", "Zaun"]))

        result = _score_dictionary_signals([only_column])
        assert result["is_dictionary"] is False

    def test_empty_words(self):
        """Columns with no words should return is_dictionary=False."""
        result = _score_dictionary_signals([
            _make_geom(0, [], x=0),
            _make_geom(1, [], x=300),
        ])
        assert result["is_dictionary"] is False
|
||||
|
||||
|
||||
class TestClassifyDictionaryColumns:
    """Test _classify_dictionary_columns with dictionary-detected data."""

    def test_assigns_article_and_headword(self):
        """When dictionary detected, assigns column_article and column_headword."""
        article_words = _make_words(
            ["der", "die", "das", "der", "die", "das", "der", "die", "das", "der"],
            x=0,
        )
        headword_words = _make_words([
            "Zahl", "Zahn", "zart", "Zauber", "Zaun",
            "Zeichen", "zeigen", "Zeit", "Zelt", "Zentrum",
        ])
        translation_words = _make_words(
            ["number", "tooth", "tender", "magic", "fence",
             "sign", "to show", "time", "tent", "centre"],
            x=400,
        )
        geoms = [
            _make_geom(0, article_words, x=0, width=50, width_ratio=0.04),
            _make_geom(1, headword_words, x=80, width=200, width_ratio=0.15),
            _make_geom(2, translation_words, x=400, width=200, width_ratio=0.15),
        ]

        # Precondition: these columns must be recognized as a dictionary page.
        dict_signals = _score_dictionary_signals(geoms)
        assert dict_signals["is_dictionary"] is True

        lang_scores = [_score_language(g.words) for g in geoms]
        regions = _classify_dictionary_columns(geoms, dict_signals, lang_scores, 1000)

        assert regions is not None
        types = [r.type for r in regions]
        assert "column_article" in types, f"Expected column_article in {types}"
        assert "column_headword" in types, f"Expected column_headword in {types}"
        # Every region must be tagged with classification_method='dictionary'.
        for region in regions:
            assert region.classification_method == "dictionary"

    def test_returns_none_when_not_dictionary(self):
        """Should return None when dict_signals says not a dictionary."""
        geoms = [
            _make_geom(0, _make_words(["cat", "dog"]), x=0),
            _make_geom(1, _make_words(["Katze", "Hund"]), x=300),
        ]
        lang_scores = [_score_language(g.words) for g in geoms]
        negative_signals = {"is_dictionary": False, "confidence": 0.1}

        result = _classify_dictionary_columns(geoms, negative_signals, lang_scores, 1000)
        assert result is None
|
||||
Reference in New Issue
Block a user