This repository has been archived on 2026-02-15. You can view files and clone it, but you cannot open issues, create pull requests, or push commits.
Files
breakpilot-pwa/backend/tests/test_alerts_agent/test_dedup.py
Benjamin Admin 21a844cb8a fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.

This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 09:51:32 +01:00

225 lines
7.3 KiB
Python

"""
Tests für Deduplication Module.
"""
import pytest
from alerts_agent.processing.dedup import (
compute_simhash,
hamming_distance,
are_similar,
find_duplicates,
exact_url_duplicates,
)
from alerts_agent.models.alert_item import AlertItem
class TestSimHash:
    """Tests for SimHash computation."""

    def test_compute_simhash_returns_hex(self):
        """The digest is a 16-character string of valid hexadecimal."""
        digest = compute_simhash("Dies ist ein Test für SimHash Berechnung")
        assert isinstance(digest, str)
        assert len(digest) == 16
        # Raises ValueError if the digest is not valid hex.
        int(digest, 16)

    def test_empty_text_returns_zeros(self):
        """Empty or missing input maps to the all-zero digest."""
        zero_digest = "0" * 16
        assert compute_simhash("") == zero_digest
        assert compute_simhash(None) == zero_digest

    def test_identical_texts_same_hash(self):
        """Hashing is deterministic: the same text yields the same digest."""
        sample = "Inklusion in bayerischen Schulen wird verstärkt"
        assert compute_simhash(sample) == compute_simhash(sample)

    def test_similar_texts_similar_hash(self):
        """Near-identical texts stay within a small Hamming distance."""
        digest_a = compute_simhash("Inklusion in bayerischen Schulen wird verstärkt")
        digest_b = compute_simhash("Inklusion in bayerischen Schulen wurde verstärkt")
        # Similar texts should be relatively close in hash space.
        assert hamming_distance(digest_a, digest_b) < 20

    def test_different_texts_different_hash(self):
        """Unrelated texts produce different digests."""
        digest_a = compute_simhash("Inklusion in bayerischen Schulen")
        digest_b = compute_simhash("Fußball Bundesliga Spieltag")
        assert digest_a != digest_b

    def test_stopwords_ignored(self):
        """Differing stopwords barely move the digest."""
        digest_a = compute_simhash("Die neue Regelung für Inklusion")
        digest_b = compute_simhash("Eine neue Regelung für die Inklusion")
        # Despite different stopwords the digests remain close.
        assert hamming_distance(digest_a, digest_b) < 10
class TestHammingDistance:
    """Tests for the Hamming distance between hex digests."""

    def test_identical_hashes_zero_distance(self):
        """Identical digests are at distance zero."""
        digest = "abcdef0123456789"
        assert hamming_distance(digest, digest) == 0

    def test_completely_different_max_distance(self):
        """Bitwise-complementary digests hit the 64-bit maximum."""
        assert hamming_distance("0000000000000000", "ffffffffffffffff") == 64

    def test_one_bit_difference(self):
        """A single flipped bit gives distance one."""
        assert hamming_distance("0000000000000000", "0000000000000001") == 1

    def test_invalid_hash_returns_max(self):
        """Malformed digests fall back to the maximum distance."""
        for bad_digest in ("", "invalid"):
            assert hamming_distance(bad_digest, "abc") == 64

    def test_symmetric(self):
        """Distance does not depend on argument order."""
        left, right = "abcd1234abcd1234", "1234abcd1234abcd"
        assert hamming_distance(left, right) == hamming_distance(right, left)
class TestAreSimilar:
    """Tests for the similarity predicate."""

    def test_identical_are_similar(self):
        """A digest is always similar to itself."""
        digest = "abcdef0123456789"
        assert are_similar(digest, digest)

    def test_threshold_respected(self):
        """Similarity flips exactly at the configured bit threshold."""
        base = "0000000000000000"
        # Two bits differ between base and variant.
        variant = "0000000000000003"
        assert are_similar(base, variant, threshold=5)
        assert are_similar(base, variant, threshold=2)
        assert not are_similar(base, variant, threshold=1)
class TestFindDuplicates:
    """Tests for near-duplicate detection via content hashes."""

    def test_no_duplicates(self):
        """Items with maximally distant hashes form no clusters."""
        first = AlertItem(title="Unique 1", url="https://example.com/1")
        second = AlertItem(title="Unique 2", url="https://example.com/2")
        first.content_hash = "0000000000000000"
        second.content_hash = "ffffffffffffffff"
        assert len(find_duplicates([first, second])) == 0

    def test_finds_duplicates(self):
        """Items within the threshold land in the same cluster."""
        items = [
            AlertItem(title="Original", url="https://example.com/1"),
            AlertItem(title="Duplicate", url="https://example.com/2"),
            AlertItem(title="Different", url="https://example.com/3"),
        ]
        # First two hashes differ by one bit; the third is far away.
        items[0].content_hash = "0000000000000000"
        items[1].content_hash = "0000000000000001"
        items[2].content_hash = "ffffffffffffffff"
        clusters = find_duplicates(items, threshold=3)
        # Only the near-identical pair is reported, in one shared cluster.
        assert len(clusters) == 2
        assert clusters[items[0].id] == clusters[items[1].id]

    def test_empty_list(self):
        """An empty input yields an empty result."""
        assert len(find_duplicates([])) == 0

    def test_items_without_hash_skipped(self):
        """Items whose content_hash is still None are ignored."""
        unhashed = AlertItem(title="No Hash", url="https://example.com/1")
        assert len(find_duplicates([unhashed])) == 0
class TestExactUrlDuplicates:
    """Tests for exact-URL duplicate detection."""

    def test_finds_exact_duplicates(self):
        """A repeated URL marks only the later item as a duplicate."""
        items = [
            AlertItem(title="First", url="https://example.com/article"),
            AlertItem(title="Second", url="https://example.com/article"),
            AlertItem(title="Third", url="https://example.com/other"),
        ]
        flagged = exact_url_duplicates(items)
        assert len(flagged) == 1
        assert items[1].id in flagged
        # The first occurrence counts as the original, not a duplicate.
        assert items[0].id not in flagged

    def test_no_duplicates(self):
        """Distinct URLs produce no flags."""
        items = [
            AlertItem(title="First", url="https://example.com/1"),
            AlertItem(title="Second", url="https://example.com/2"),
        ]
        assert len(exact_url_duplicates(items)) == 0

    def test_multiple_duplicates(self):
        """Every repeat of a URL after the first is flagged."""
        items = [
            AlertItem(title="First", url="https://example.com/same"),
            AlertItem(title="Second", url="https://example.com/same"),
            AlertItem(title="Third", url="https://example.com/same"),
        ]
        flagged = exact_url_duplicates(items)
        # The second and third items are duplicates of the first.
        assert len(flagged) == 2
        assert items[0].id not in flagged
        assert items[1].id in flagged
        assert items[2].id in flagged