feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)
New standalone Python/FastAPI service for automatic compliance document scanning, LLM-based classification, IPFS archival, and gap analysis. Includes extractors (PDF, DOCX, XLSX, PPTX), a keyword fallback classifier, a compliance matrix, and a full REST API on port 8098.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
16
document-crawler/tests/test_extractors.py
Normal file
16
document-crawler/tests/test_extractors.py
Normal file
@@ -0,0 +1,16 @@
|
||||
"""Tests for document text extractors."""
|
||||
|
||||
import pytest
|
||||
from extractors.dispatcher import extract_text, EXTRACTORS
|
||||
|
||||
|
||||
def test_supported_extensions():
    """Check that every expected document extension is registered in EXTRACTORS."""
    # Same extensions, same order as the individual assertions they replace,
    # so the first missing entry is still the one that fails the test.
    for suffix in (".pdf", ".docx", ".xlsx", ".pptx"):
        assert suffix in EXTRACTORS
|
||||
|
||||
|
||||
def test_unsupported_extension():
    """Check that extract_text rejects an extension it has no extractor for.

    The call must raise ValueError with a message matching "Unsupported".
    """
    sample_path = "/tmp/test.txt"
    sample_suffix = ".txt"
    expected_failure = pytest.raises(ValueError, match="Unsupported")
    with expected_failure:
        extract_text(sample_path, sample_suffix)
|
||||
Reference in New Issue
Block a user