feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)

New standalone Python/FastAPI service for automatic compliance document
scanning, LLM-based classification, IPFS archival, and gap analysis.
Includes extractors (PDF, DOCX, XLSX, PPTX), keyword fallback classifier,
compliance matrix, and full REST API on port 8098.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Boenisch
2026-02-13 20:35:15 +01:00
parent 0923c03756
commit 364d2c69ff
34 changed files with 1633 additions and 0 deletions

View File

View File

@@ -0,0 +1,36 @@
"""Tests for keyword fallback classifier."""
import pytest
from classifiers.keyword_fallback import keyword_classify
def test_vvt_detection():
    """Art. 30 processing-register wording is classified as VVT at fallback confidence."""
    sample = "Verzeichnis von Verarbeitungstaetigkeiten gemaess Art. 30 DSGVO"
    outcome = keyword_classify(sample, "vvt.pdf")
    assert outcome["classification"] == "VVT"
    # Keyword fallback never claims more than 0.3 confidence.
    assert outcome["confidence"] <= 0.3
def test_tom_detection():
    """Text listing technical-organisational measures is classified as TOM."""
    sample = "Technisch-organisatorische Massnahmen: Zutrittskontrolle, Zugangskontrolle, Verschluesselungskonzept"
    outcome = keyword_classify(sample, "toms.docx")
    assert outcome["classification"] == "TOM"
def test_dse_detection():
    """Privacy-notice wording (Art. 13 information duties) is classified as DSE."""
    sample = "Datenschutzerklaerung: Informationspflichten nach Art. 13 DSGVO"
    outcome = keyword_classify(sample, "datenschutz.pdf")
    assert outcome["classification"] == "DSE"
def test_unknown_document():
    """Unrelated text falls through to 'Sonstiges' with the floor confidence of 0.1."""
    outcome = keyword_classify("Lorem ipsum dolor sit amet", "random.pdf")
    assert outcome["classification"] == "Sonstiges"
    assert outcome["confidence"] == 0.1
def test_confidence_capped():
    """Even keyword-dense VVT text stays at or below the 0.3 fallback cap."""
    sample = "Verarbeitungsverzeichnis Art. 30 Kategorie betroffener Personen Datenkategorien Zweck der Verarbeitung"
    outcome = keyword_classify(sample, "vvt_complete.pdf")
    assert outcome["confidence"] <= 0.3

View File

@@ -0,0 +1,16 @@
"""Tests for document text extractors."""
import pytest
from extractors.dispatcher import extract_text, EXTRACTORS
def test_supported_extensions():
    """All four supported office formats have a registered extractor."""
    for ext in (".pdf", ".docx", ".xlsx", ".pptx"):
        assert ext in EXTRACTORS
def test_unsupported_extension():
    """Dispatching an unknown extension raises ValueError mentioning 'Unsupported'."""
    bogus_path = "/tmp/test.txt"
    with pytest.raises(ValueError, match="Unsupported"):
        extract_text(bogus_path, ".txt")

View File

@@ -0,0 +1,46 @@
"""Tests for gap analysis."""
import pytest
from gap_analysis.analyzer import generate_gap_analysis
def test_full_compliance():
    """One document per required category yields a perfect score and no gaps."""
    all_categories = [
        "VVT", "TOM", "DSE", "Loeschkonzept",
        "Richtlinie", "Schulungsnachweis", "AVV", "DSFA",
    ]
    report = generate_gap_analysis(dict.fromkeys(all_categories, 1))
    assert report["compliance_score"] == 100.0
    assert len(report["gaps"]) == 0
def test_no_documents():
    """An empty document corpus scores zero and reports critical gaps."""
    report = generate_gap_analysis({})
    assert report["compliance_score"] == 0.0
    assert report["gaps"]  # at least one gap must be listed
    assert report["gap_summary"]["critical"] > 0
def test_partial_compliance():
    """Only VVT and TOM present: mid-range score, missing categories reported."""
    report = generate_gap_analysis({"VVT": 1, "TOM": 1})
    assert 0 < report["compliance_score"] < 100
    # DSE, Loeschkonzept, Richtlinie, Schulungsnachweis, AVV, DSFA should be gaps
    missing = {gap["category"] for gap in report["gaps"]}
    assert "DSE" in missing
    assert "Loeschkonzept" in missing
def test_universal_only():
    """Universal profile requires six categories; four are covered here."""
    present = {"VVT": 1, "TOM": 1, "DSE": 1, "Loeschkonzept": 1}
    report = generate_gap_analysis(present, company_profiles=["universal"])
    # Universal requires VVT, TOM, DSE, Loeschkonzept, Richtlinie, Schulungsnachweis
    # 4 out of 6 covered
    assert report["covered"] == 4
    assert report["total_required"] == 6
def test_gap_severity():
    """With no documents at all, at least one gap is flagged CRITICAL."""
    report = generate_gap_analysis({}, company_profiles=["universal"])
    observed = {gap["severity"] for gap in report["gaps"]}
    assert "CRITICAL" in observed