feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)
New standalone Python/FastAPI service for automatic compliance document scanning, LLM-based classification, IPFS archival, and gap analysis. Includes extractors (PDF, DOCX, XLSX, PPTX), keyword fallback classifier, compliance matrix, and full REST API on port 8098. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
1
document-crawler/archiver/__init__.py
Normal file
1
document-crawler/archiver/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
from .dsms_client import archive_document
|
||||
37
document-crawler/archiver/dsms_client.py
Normal file
37
document-crawler/archiver/dsms_client.py
Normal file
@@ -0,0 +1,37 @@
|
||||
"""Client for dsms-gateway (IPFS) document archival."""
|
||||
|
||||
import httpx
|
||||
|
||||
from config import settings
|
||||
|
||||
|
||||
async def archive_document(
    file_path: str,
    file_name: str,
    document_type: str,
    document_id: str,
    auth_token: str = "Bearer system-crawler",
) -> dict:
    """Archive a document to IPFS via the DSMS gateway.

    Args:
        file_path: Local filesystem path of the file to upload.
        file_name: Filename recorded in the multipart upload.
        document_type: Document type forwarded to the gateway.
            (Bug fix: this parameter was previously ignored and the
            literal "compliance_document" was always sent instead.)
        document_id: Identifier stored alongside the archived file.
        auth_token: Full value of the Authorization header; defaults
            to the system crawler's bearer token.

    Returns:
        dict with cid, size, gateway_url on success.

    Raises:
        RuntimeError: If the gateway responds with a non-200 status.
    """
    # Generous timeout: pinning large documents to IPFS can be slow.
    async with httpx.AsyncClient(timeout=120.0) as client:
        # Stream the file handle into the multipart body; the context
        # managers guarantee both the client and the file are closed.
        with open(file_path, "rb") as f:
            resp = await client.post(
                f"{settings.DSMS_GATEWAY_URL}/api/v1/documents",
                files={"file": (file_name, f)},
                data={
                    # Forward the caller-supplied type instead of the
                    # previously hard-coded "compliance_document".
                    "document_type": document_type,
                    "document_id": document_id,
                    "version": "1",
                    # NOTE(review): language is hard-coded to German —
                    # confirm whether callers ever need to override this.
                    "language": "de",
                },
                headers={"Authorization": auth_token},
            )

    if resp.status_code != 200:
        raise RuntimeError(f"DSMS archive failed ({resp.status_code}): {resp.text}")

    return resp.json()
|
||||
Reference in New Issue
Block a user