feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)

New standalone Python/FastAPI service for automatic compliance document
scanning, LLM-based classification, IPFS archival, and gap analysis.
Includes extractors (PDF, DOCX, XLSX, PPTX), keyword fallback classifier,
compliance matrix, and full REST API on port 8098.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Boenisch
2026-02-13 20:35:15 +01:00
parent 0923c03756
commit 364d2c69ff
34 changed files with 1633 additions and 0 deletions

View File

View File

@@ -0,0 +1,36 @@
"""Tests for keyword fallback classifier."""
import pytest
from classifiers.keyword_fallback import keyword_classify
def test_vvt_detection():
    """Art. 30 processing-register wording is classified as VVT at fallback confidence."""
    sample = "Verzeichnis von Verarbeitungstaetigkeiten gemaess Art. 30 DSGVO"
    outcome = keyword_classify(sample, "vvt.pdf")
    assert outcome["classification"] == "VVT"
    # Keyword fallback never claims more than 0.3 confidence.
    assert outcome["confidence"] <= 0.3
def test_tom_detection():
    """Text listing technical-organisational measures is classified as TOM."""
    sample = "Technisch-organisatorische Massnahmen: Zutrittskontrolle, Zugangskontrolle, Verschluesselungskonzept"
    outcome = keyword_classify(sample, "toms.docx")
    assert outcome["classification"] == "TOM"
def test_dse_detection():
    """Privacy-notice wording (Art. 13 information duties) is classified as DSE."""
    sample = "Datenschutzerklaerung: Informationspflichten nach Art. 13 DSGVO"
    outcome = keyword_classify(sample, "datenschutz.pdf")
    assert outcome["classification"] == "DSE"
def test_unknown_document():
    """Unrelated text falls through to 'Sonstiges' with the floor confidence of 0.1."""
    outcome = keyword_classify("Lorem ipsum dolor sit amet", "random.pdf")
    assert outcome["classification"] == "Sonstiges"
    assert outcome["confidence"] == 0.1
def test_confidence_capped():
    """Even keyword-dense VVT text stays at or below the 0.3 fallback cap."""
    sample = "Verarbeitungsverzeichnis Art. 30 Kategorie betroffener Personen Datenkategorien Zweck der Verarbeitung"
    outcome = keyword_classify(sample, "vvt_complete.pdf")
    assert outcome["confidence"] <= 0.3

View File

@@ -0,0 +1,16 @@
"""Tests for document text extractors."""
import pytest
from extractors.dispatcher import extract_text, EXTRACTORS
def test_supported_extensions():
    """All four supported office formats have a registered extractor."""
    for ext in (".pdf", ".docx", ".xlsx", ".pptx"):
        assert ext in EXTRACTORS
def test_unsupported_extension():
    """Dispatching an unknown extension raises ValueError mentioning 'Unsupported'."""
    bogus_path = "/tmp/test.txt"
    with pytest.raises(ValueError, match="Unsupported"):
        extract_text(bogus_path, ".txt")

View File

@@ -0,0 +1,46 @@
"""Tests for gap analysis."""
import pytest
from gap_analysis.analyzer import generate_gap_analysis
def test_full_compliance():
    """One document per required category yields a perfect score and no gaps."""
    all_categories = [
        "VVT", "TOM", "DSE", "Loeschkonzept",
        "Richtlinie", "Schulungsnachweis", "AVV", "DSFA",
    ]
    report = generate_gap_analysis(dict.fromkeys(all_categories, 1))
    assert report["compliance_score"] == 100.0
    assert len(report["gaps"]) == 0
def test_no_documents():
    """An empty document corpus scores zero and reports critical gaps."""
    report = generate_gap_analysis({})
    assert report["compliance_score"] == 0.0
    assert report["gaps"]  # at least one gap must be listed
    assert report["gap_summary"]["critical"] > 0
def test_partial_compliance():
    """Only VVT and TOM present: mid-range score, missing categories reported."""
    report = generate_gap_analysis({"VVT": 1, "TOM": 1})
    assert 0 < report["compliance_score"] < 100
    # DSE, Loeschkonzept, Richtlinie, Schulungsnachweis, AVV, DSFA should be gaps
    missing = {gap["category"] for gap in report["gaps"]}
    assert "DSE" in missing
    assert "Loeschkonzept" in missing
def test_universal_only():
    """Universal profile requires six categories; four are covered here."""
    present = {"VVT": 1, "TOM": 1, "DSE": 1, "Loeschkonzept": 1}
    report = generate_gap_analysis(present, company_profiles=["universal"])
    # Universal requires VVT, TOM, DSE, Loeschkonzept, Richtlinie, Schulungsnachweis
    # 4 out of 6 covered
    assert report["covered"] == 4
    assert report["total_required"] == 6
def test_gap_severity():
    """With no documents at all, at least one gap is flagged CRITICAL."""
    report = generate_gap_analysis({}, company_profiles=["universal"])
    observed = {gap["severity"] for gap in report["gaps"]}
    assert "CRITICAL" in observed