Some checks failed
Tests / Go Tests (push) Has been cancelled
Tests / Python Tests (push) Has been cancelled
Tests / Integration Tests (push) Has been cancelled
Tests / Go Lint (push) Has been cancelled
Tests / Python Lint (push) Has been cancelled
Tests / Security Scan (push) Has been cancelled
Tests / All Checks Passed (push) Has been cancelled
Security Scanning / Secret Scanning (push) Has been cancelled
Security Scanning / Dependency Vulnerability Scan (push) Has been cancelled
Security Scanning / Go Security Scan (push) Has been cancelled
Security Scanning / Python Security Scan (push) Has been cancelled
Security Scanning / Node.js Security Scan (push) Has been cancelled
Security Scanning / Docker Image Security (push) Has been cancelled
Security Scanning / Security Summary (push) Has been cancelled
CI/CD Pipeline / Go Tests (push) Has been cancelled
CI/CD Pipeline / Python Tests (push) Has been cancelled
CI/CD Pipeline / Website Tests (push) Has been cancelled
CI/CD Pipeline / Linting (push) Has been cancelled
CI/CD Pipeline / Security Scan (push) Has been cancelled
CI/CD Pipeline / Docker Build & Push (push) Has been cancelled
CI/CD Pipeline / Integration Tests (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / CI Summary (push) Has been cancelled
ci/woodpecker/manual/build-ci-image Pipeline was successful
ci/woodpecker/manual/main Pipeline failed
All services: admin-v2, studio-v2, website, ai-compliance-sdk, consent-service, klausur-service, voice-service, and infrastructure. Large PDFs and compiled binaries excluded via .gitignore.
225 lines
7.3 KiB
Python
225 lines
7.3 KiB
Python
"""
|
|
Tests für Deduplication Module.
|
|
"""
|
|
|
|
import pytest
|
|
|
|
from alerts_agent.processing.dedup import (
|
|
compute_simhash,
|
|
hamming_distance,
|
|
are_similar,
|
|
find_duplicates,
|
|
exact_url_duplicates,
|
|
)
|
|
from alerts_agent.models.alert_item import AlertItem
|
|
|
|
|
|
class TestSimHash:
    """Tests for SimHash computation."""

    def test_compute_simhash_returns_hex(self):
        """The SimHash of a text is a 16-character hex string."""
        digest = compute_simhash("Dies ist ein Test für SimHash Berechnung")

        assert isinstance(digest, str)
        assert len(digest) == 16
        # Raises ValueError (and thus fails the test) if not valid hex.
        int(digest, 16)

    def test_empty_text_returns_zeros(self):
        """Empty or missing text yields the all-zero hash."""
        zero_hash = "0" * 16
        assert compute_simhash("") == zero_hash
        assert compute_simhash(None) == zero_hash

    def test_identical_texts_same_hash(self):
        """Hashing the same text twice is deterministic."""
        text = "Inklusion in bayerischen Schulen wird verstärkt"
        assert compute_simhash(text) == compute_simhash(text)

    def test_similar_texts_similar_hash(self):
        """Near-identical texts produce hashes with a small Hamming distance."""
        hash_a = compute_simhash("Inklusion in bayerischen Schulen wird verstärkt")
        hash_b = compute_simhash("Inklusion in bayerischen Schulen wurde verstärkt")

        # Only one word differs, so most hash bits should agree.
        assert hamming_distance(hash_a, hash_b) < 20

    def test_different_texts_different_hash(self):
        """Unrelated texts produce distinct hashes."""
        hash_a = compute_simhash("Inklusion in bayerischen Schulen")
        hash_b = compute_simhash("Fußball Bundesliga Spieltag")

        assert hash_a != hash_b

    def test_stopwords_ignored(self):
        """Differences consisting only of stopwords barely change the hash."""
        hash_a = compute_simhash("Die neue Regelung für Inklusion")
        hash_b = compute_simhash("Eine neue Regelung für die Inklusion")

        assert hamming_distance(hash_a, hash_b) < 10
class TestHammingDistance:
    """Tests for the Hamming distance between hex hashes."""

    def test_identical_hashes_zero_distance(self):
        """A hash compared with an equal hash has distance 0."""
        digest = "abcdef0123456789"
        assert hamming_distance(digest, digest) == 0

    def test_completely_different_max_distance(self):
        """Bitwise-complementary 64-bit hashes hit the maximum distance of 64."""
        assert hamming_distance("0000000000000000", "ffffffffffffffff") == 64

    def test_one_bit_difference(self):
        """Flipping a single bit yields distance 1."""
        assert hamming_distance("0000000000000000", "0000000000000001") == 1

    def test_invalid_hash_returns_max(self):
        """Malformed input falls back to the maximum distance."""
        assert hamming_distance("", "abc") == 64
        assert hamming_distance("invalid", "abc") == 64

    def test_symmetric(self):
        """The distance is independent of argument order."""
        left, right = "abcd1234abcd1234", "1234abcd1234abcd"
        assert hamming_distance(left, right) == hamming_distance(right, left)
class TestAreSimilar:
    """Tests for the similarity predicate."""

    def test_identical_are_similar(self):
        """A hash is always similar to itself."""
        digest = "abcdef0123456789"
        assert are_similar(digest, digest)

    def test_threshold_respected(self):
        """Similarity holds only while the distance stays within the threshold."""
        base = "0000000000000000"
        other = "0000000000000003"  # differs in exactly 2 bits

        assert are_similar(base, other, threshold=5)
        assert are_similar(base, other, threshold=2)  # inclusive boundary
        assert not are_similar(base, other, threshold=1)
class TestFindDuplicates:
    """Tests for SimHash-based duplicate detection."""

    def test_no_duplicates(self):
        """Items with maximally different hashes form no cluster."""
        first = AlertItem(title="Unique 1", url="https://example.com/1")
        second = AlertItem(title="Unique 2", url="https://example.com/2")
        # Assign hashes that share no bits at all.
        first.content_hash = "0000000000000000"
        second.content_hash = "ffffffffffffffff"

        assert len(find_duplicates([first, second])) == 0

    def test_finds_duplicates(self):
        """Items with near-identical hashes land in the same cluster."""
        original = AlertItem(title="Original", url="https://example.com/1")
        duplicate = AlertItem(title="Duplicate", url="https://example.com/2")
        outlier = AlertItem(title="Different", url="https://example.com/3")
        original.content_hash = "0000000000000000"
        duplicate.content_hash = "0000000000000001"  # off by a single bit
        outlier.content_hash = "ffffffffffffffff"  # far from the others

        clusters = find_duplicates([original, duplicate, outlier], threshold=3)

        # Exactly the two close items are grouped, under one shared cluster id.
        assert len(clusters) == 2
        assert clusters[original.id] == clusters[duplicate.id]

    def test_empty_list(self):
        """An empty input yields an empty result."""
        assert len(find_duplicates([])) == 0

    def test_items_without_hash_skipped(self):
        """Items whose content_hash is still None are ignored."""
        item = AlertItem(title="No Hash", url="https://example.com/1")
        # content_hash is deliberately left unset (None).

        assert len(find_duplicates([item])) == 0
class TestExactUrlDuplicates:
    """Tests for exact-URL duplicate detection."""

    def test_finds_exact_duplicates(self):
        """Only the later item sharing a URL is flagged as a duplicate."""
        first = AlertItem(title="First", url="https://example.com/article")
        second = AlertItem(title="Second", url="https://example.com/article")
        third = AlertItem(title="Third", url="https://example.com/other")

        flagged = exact_url_duplicates([first, second, third])

        assert len(flagged) == 1
        assert second.id in flagged
        assert first.id not in flagged  # first occurrence counts as the original

    def test_no_duplicates(self):
        """Distinct URLs produce no duplicates."""
        items = [
            AlertItem(title="First", url="https://example.com/1"),
            AlertItem(title="Second", url="https://example.com/2"),
        ]

        assert len(exact_url_duplicates(items)) == 0

    def test_multiple_duplicates(self):
        """Every repeat of a URL after the first occurrence is flagged."""
        items = [
            AlertItem(title="First", url="https://example.com/same"),
            AlertItem(title="Second", url="https://example.com/same"),
            AlertItem(title="Third", url="https://example.com/same"),
        ]

        flagged = exact_url_duplicates(items)

        # The second and third items are duplicates; the first is kept.
        assert len(flagged) == 2
        assert items[0].id not in flagged
        assert items[1].id in flagged
        assert items[2].id in flagged