fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
224
backend/tests/test_alerts_agent/test_dedup.py
Normal file
224
backend/tests/test_alerts_agent/test_dedup.py
Normal file
@@ -0,0 +1,224 @@
|
||||
"""
|
||||
Tests für Deduplication Module.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from alerts_agent.processing.dedup import (
|
||||
compute_simhash,
|
||||
hamming_distance,
|
||||
are_similar,
|
||||
find_duplicates,
|
||||
exact_url_duplicates,
|
||||
)
|
||||
from alerts_agent.models.alert_item import AlertItem
|
||||
|
||||
|
||||
class TestSimHash:
    """SimHash computation tests."""

    def test_compute_simhash_returns_hex(self):
        """The hash is a 16-character hexadecimal string."""
        digest = compute_simhash("Dies ist ein Test für SimHash Berechnung")

        assert isinstance(digest, str)
        assert len(digest) == 16
        # Parsing as base-16 proves every character is a valid hex digit.
        int(digest, 16)

    def test_empty_text_returns_zeros(self):
        """Empty or missing text maps to the all-zero hash."""
        zero_hash = "0" * 16
        assert compute_simhash("") == zero_hash
        assert compute_simhash(None) == zero_hash

    def test_identical_texts_same_hash(self):
        """Hashing the same text twice is deterministic."""
        sample = "Inklusion in bayerischen Schulen wird verstärkt"
        assert compute_simhash(sample) == compute_simhash(sample)

    def test_similar_texts_similar_hash(self):
        """Near-identical texts land within a small Hamming distance."""
        first = compute_simhash("Inklusion in bayerischen Schulen wird verstärkt")
        second = compute_simhash("Inklusion in bayerischen Schulen wurde verstärkt")

        # One changed word should flip only a minority of the 64 bits.
        assert hamming_distance(first, second) < 20

    def test_different_texts_different_hash(self):
        """Unrelated texts produce distinct hashes."""
        first = compute_simhash("Inklusion in bayerischen Schulen")
        second = compute_simhash("Fußball Bundesliga Spieltag")

        assert first != second

    def test_stopwords_ignored(self):
        """Stopword differences barely move the hash."""
        first = compute_simhash("Die neue Regelung für Inklusion")
        second = compute_simhash("Eine neue Regelung für die Inklusion")

        # Only stopwords differ between the texts, so the distance stays small.
        assert hamming_distance(first, second) < 10
|
||||
|
||||
|
||||
class TestHammingDistance:
    """Hamming-distance tests."""

    def test_identical_hashes_zero_distance(self):
        """Equal hashes are zero bits apart."""
        value = "abcdef0123456789"
        assert hamming_distance(value, value) == 0

    def test_completely_different_max_distance(self):
        """All-zero vs. all-one hashes differ in every one of the 64 bits."""
        assert hamming_distance("0000000000000000", "ffffffffffffffff") == 64

    def test_one_bit_difference(self):
        """Flipping a single bit yields distance 1."""
        assert hamming_distance("0000000000000000", "0000000000000001") == 1

    def test_invalid_hash_returns_max(self):
        """Malformed input falls back to the maximum distance."""
        assert hamming_distance("", "abc") == 64
        assert hamming_distance("invalid", "abc") == 64

    def test_symmetric(self):
        """Argument order does not change the result."""
        left = "abcd1234abcd1234"
        right = "1234abcd1234abcd"

        assert hamming_distance(left, right) == hamming_distance(right, left)
|
||||
|
||||
|
||||
class TestAreSimilar:
    """Similarity-check tests."""

    def test_identical_are_similar(self):
        """A hash is always similar to itself."""
        digest = "abcdef0123456789"
        assert are_similar(digest, digest)

    def test_threshold_respected(self):
        """The distance cutoff is honored and inclusive."""
        base = "0000000000000000"
        nearby = "0000000000000003"  # two bits flipped

        assert are_similar(base, nearby, threshold=5)
        # Distance equals the threshold: still counts as similar (inclusive).
        assert are_similar(base, nearby, threshold=2)
        assert not are_similar(base, nearby, threshold=1)
|
||||
|
||||
|
||||
class TestFindDuplicates:
    """Duplicate-detection tests."""

    def test_no_duplicates(self):
        """Items with distant hashes form no clusters."""
        first = AlertItem(title="Unique 1", url="https://example.com/1")
        second = AlertItem(title="Unique 2", url="https://example.com/2")
        # Maximally distant hashes: no pair can be considered similar.
        first.content_hash = "0000000000000000"
        second.content_hash = "ffffffffffffffff"

        assert len(find_duplicates([first, second])) == 0

    def test_finds_duplicates(self):
        """Items with near-identical hashes end up in the same cluster."""
        original = AlertItem(title="Original", url="https://example.com/1")
        duplicate = AlertItem(title="Duplicate", url="https://example.com/2")
        outlier = AlertItem(title="Different", url="https://example.com/3")
        original.content_hash = "0000000000000000"
        duplicate.content_hash = "0000000000000001"  # one bit away from original
        outlier.content_hash = "ffffffffffffffff"  # far from both

        clusters = find_duplicates([original, duplicate, outlier], threshold=3)

        # The first two items are grouped together; the outlier stays out.
        assert len(clusters) == 2
        assert clusters[original.id] == clusters[duplicate.id]

    def test_empty_list(self):
        """An empty input yields an empty mapping."""
        assert len(find_duplicates([])) == 0

    def test_items_without_hash_skipped(self):
        """Items lacking a content hash are ignored."""
        item = AlertItem(title="No Hash", url="https://example.com/1")
        # content_hash remains None, so the item cannot be clustered.

        assert len(find_duplicates([item])) == 0
|
||||
|
||||
|
||||
class TestExactUrlDuplicates:
    """Exact-URL duplicate tests."""

    def test_finds_exact_duplicates(self):
        """A repeated URL marks only the later item as a duplicate."""
        first = AlertItem(title="First", url="https://example.com/article")
        second = AlertItem(title="Second", url="https://example.com/article")
        third = AlertItem(title="Third", url="https://example.com/other")

        flagged = exact_url_duplicates([first, second, third])

        assert len(flagged) == 1
        assert second.id in flagged
        # The first occurrence counts as the original, not a duplicate.
        assert first.id not in flagged

    def test_no_duplicates(self):
        """Distinct URLs produce no duplicates."""
        items = [
            AlertItem(title="First", url="https://example.com/1"),
            AlertItem(title="Second", url="https://example.com/2"),
        ]

        assert len(exact_url_duplicates(items)) == 0

    def test_multiple_duplicates(self):
        """Every repeat after the first occurrence is flagged."""
        first = AlertItem(title="First", url="https://example.com/same")
        second = AlertItem(title="Second", url="https://example.com/same")
        third = AlertItem(title="Third", url="https://example.com/same")

        flagged = exact_url_duplicates([first, second, third])

        # The second and third items are duplicates of the first.
        assert len(flagged) == 2
        assert first.id not in flagged
        assert second.id in flagged
        assert third.id in flagged
|
||||
Reference in New Issue
Block a user